aury-boot 0.0.39__py3-none-any.whl → 0.0.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/adapter/http.py +17 -6
- aury/boot/application/app/base.py +1 -0
- aury/boot/application/app/components.py +81 -2
- aury/boot/application/config/settings.py +73 -0
- aury/boot/commands/init.py +20 -0
- aury/boot/commands/pkg.py +31 -1
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +1 -0
- aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl +239 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +15 -0
- aury/boot/common/logging/setup.py +8 -3
- aury/boot/infrastructure/database/manager.py +6 -4
- aury/boot/infrastructure/monitoring/__init__.py +10 -2
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +32 -16
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +14 -13
- aury/boot/infrastructure/monitoring/profiling/__init__.py +573 -0
- aury/boot/infrastructure/scheduler/manager.py +15 -3
- aury/boot/toolkit/http/__init__.py +180 -85
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.40.dist-info}/METADATA +10 -4
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.40.dist-info}/RECORD +22 -20
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.40.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.39.dist-info → aury_boot-0.0.40.dist-info}/entry_points.txt +0 -0
aury/boot/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.0.39'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0, 39)
|
|
31
|
+
__version__ = version = '0.0.40'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 0, 40)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
aury/boot/application/adapter/http.py
CHANGED
|
@@ -25,10 +25,14 @@ from __future__ import annotations
|
|
|
25
25
|
|
|
26
26
|
from typing import Any
|
|
27
27
|
|
|
28
|
-
import httpx
|
|
29
|
-
|
|
30
28
|
from aury.boot.common.logging import get_trace_id, logger
|
|
31
|
-
from aury.boot.toolkit.http import
|
|
29
|
+
from aury.boot.toolkit.http import (
|
|
30
|
+
HttpClient,
|
|
31
|
+
HttpNetworkError,
|
|
32
|
+
HttpStatusError,
|
|
33
|
+
HttpTimeoutError,
|
|
34
|
+
RetryConfig,
|
|
35
|
+
)
|
|
32
36
|
|
|
33
37
|
from .base import BaseAdapter
|
|
34
38
|
from .config import AdapterSettings
|
|
@@ -185,7 +189,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
185
189
|
json: JSON 请求体
|
|
186
190
|
data: 表单数据
|
|
187
191
|
files: 上传文件
|
|
188
|
-
**kwargs: 其他
|
|
192
|
+
**kwargs: 其他 aiohttp 参数
|
|
189
193
|
|
|
190
194
|
Returns:
|
|
191
195
|
dict: 响应 JSON
|
|
@@ -228,7 +232,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
228
232
|
"content": response.text,
|
|
229
233
|
}
|
|
230
234
|
|
|
231
|
-
except
|
|
235
|
+
except HttpTimeoutError as exc:
|
|
232
236
|
raise AdapterTimeoutError(
|
|
233
237
|
f"请求超时: {method} {path}",
|
|
234
238
|
adapter_name=self.name,
|
|
@@ -236,7 +240,7 @@ class HttpAdapter(BaseAdapter):
|
|
|
236
240
|
cause=exc,
|
|
237
241
|
) from exc
|
|
238
242
|
|
|
239
|
-
except
|
|
243
|
+
except HttpStatusError as exc:
|
|
240
244
|
# HTTP 错误状态码
|
|
241
245
|
response = exc.response
|
|
242
246
|
try:
|
|
@@ -255,6 +259,13 @@ class HttpAdapter(BaseAdapter):
|
|
|
255
259
|
cause=exc,
|
|
256
260
|
) from exc
|
|
257
261
|
|
|
262
|
+
except HttpNetworkError as exc:
|
|
263
|
+
raise AdapterError(
|
|
264
|
+
f"网络错误: {method} {path} - {exc}",
|
|
265
|
+
adapter_name=self.name,
|
|
266
|
+
cause=exc,
|
|
267
|
+
) from exc
|
|
268
|
+
|
|
258
269
|
except Exception as exc:
|
|
259
270
|
raise AdapterError(
|
|
260
271
|
f"请求失败: {method} {path} - {type(exc).__name__}: {exc}",
|
|
aury/boot/application/app/components.py
CHANGED
|
@@ -320,11 +320,24 @@ class SchedulerComponent(Component):
|
|
|
320
320
|
url = scheduler_config.jobstore_url
|
|
321
321
|
if url.startswith("redis://"):
|
|
322
322
|
try:
|
|
323
|
+
from urllib.parse import urlparse
|
|
323
324
|
from apscheduler.jobstores.redis import RedisJobStore
|
|
325
|
+
|
|
326
|
+
# 解析 Redis URL
|
|
327
|
+
parsed = urlparse(url)
|
|
328
|
+
redis_kwargs: dict = {
|
|
329
|
+
"host": parsed.hostname or "localhost",
|
|
330
|
+
"port": parsed.port or 6379,
|
|
331
|
+
}
|
|
332
|
+
if parsed.password:
|
|
333
|
+
redis_kwargs["password"] = parsed.password
|
|
334
|
+
if parsed.path and parsed.path != "/":
|
|
335
|
+
redis_kwargs["db"] = int(parsed.path.lstrip("/") or 0)
|
|
336
|
+
|
|
324
337
|
scheduler_kwargs["jobstores"] = {
|
|
325
|
-
"default": RedisJobStore
|
|
338
|
+
"default": RedisJobStore(**redis_kwargs)
|
|
326
339
|
}
|
|
327
|
-
logger.info(f"调度器使用 Redis 存储: {
|
|
340
|
+
logger.info(f"调度器使用 Redis 存储: {parsed.hostname}:{parsed.port}")
|
|
328
341
|
except ImportError:
|
|
329
342
|
logger.warning("Redis jobstore 需要安装 redis: pip install redis")
|
|
330
343
|
else:
|
|
@@ -766,6 +779,70 @@ class AlertComponent(Component):
|
|
|
766
779
|
pass
|
|
767
780
|
|
|
768
781
|
|
|
782
|
+
class ProfilingComponent(Component):
|
|
783
|
+
"""Profiling 组件。
|
|
784
|
+
|
|
785
|
+
提供持续性能分析和事件循环阻塞检测:
|
|
786
|
+
- Pyroscope:持续采样生成火焰图
|
|
787
|
+
- 阻塞检测:检测同步代码阻塞事件循环
|
|
788
|
+
"""
|
|
789
|
+
|
|
790
|
+
name = "profiling"
|
|
791
|
+
enabled = True
|
|
792
|
+
depends_on: ClassVar[list[str]] = ["alert"] # 告警依赖
|
|
793
|
+
|
|
794
|
+
def can_enable(self, config: BaseConfig) -> bool:
|
|
795
|
+
"""当启用 Pyroscope 或阻塞检测时启用。"""
|
|
796
|
+
return self.enabled and (
|
|
797
|
+
config.profiling.enabled or config.profiling.blocking_detector_enabled
|
|
798
|
+
)
|
|
799
|
+
|
|
800
|
+
async def setup(self, app: FoundationApp, config: BaseConfig) -> None:
|
|
801
|
+
"""初始化 Profiling 组件。"""
|
|
802
|
+
try:
|
|
803
|
+
from aury.boot.infrastructure.monitoring.profiling import (
|
|
804
|
+
ProfilingConfig,
|
|
805
|
+
ProfilingManager,
|
|
806
|
+
)
|
|
807
|
+
|
|
808
|
+
profiling_config = ProfilingConfig(
|
|
809
|
+
enabled=config.profiling.enabled,
|
|
810
|
+
pyroscope_endpoint=config.profiling.pyroscope_endpoint,
|
|
811
|
+
pyroscope_auth_token=config.profiling.pyroscope_auth_token,
|
|
812
|
+
service_name=config.service.name,
|
|
813
|
+
environment=config.service.environment,
|
|
814
|
+
blocking_detector_enabled=config.profiling.blocking_detector_enabled,
|
|
815
|
+
blocking_check_interval_ms=config.profiling.blocking_check_interval_ms,
|
|
816
|
+
blocking_threshold_ms=config.profiling.blocking_threshold_ms,
|
|
817
|
+
blocking_severe_threshold_ms=config.profiling.blocking_severe_threshold_ms,
|
|
818
|
+
blocking_alert_enabled=config.profiling.blocking_alert_enabled,
|
|
819
|
+
blocking_alert_cooldown_seconds=config.profiling.blocking_alert_cooldown_seconds,
|
|
820
|
+
blocking_max_history=config.profiling.blocking_max_history,
|
|
821
|
+
tags=config.profiling.pyroscope_tags,
|
|
822
|
+
)
|
|
823
|
+
|
|
824
|
+
manager = ProfilingManager.get_instance()
|
|
825
|
+
manager.configure(profiling_config)
|
|
826
|
+
await manager.start()
|
|
827
|
+
|
|
828
|
+
# 保存到 app.state
|
|
829
|
+
app.state.profiling_manager = manager
|
|
830
|
+
|
|
831
|
+
except ImportError as e:
|
|
832
|
+
logger.warning(f"Profiling 依赖未安装,跳过初始化: {e}")
|
|
833
|
+
except Exception as e:
|
|
834
|
+
logger.warning(f"Profiling 初始化失败(非关键): {e}")
|
|
835
|
+
|
|
836
|
+
async def teardown(self, app: FoundationApp) -> None:
|
|
837
|
+
"""停止 Profiling 组件。"""
|
|
838
|
+
try:
|
|
839
|
+
manager = getattr(app.state, "profiling_manager", None)
|
|
840
|
+
if manager:
|
|
841
|
+
await manager.stop()
|
|
842
|
+
except Exception as e:
|
|
843
|
+
logger.warning(f"Profiling 关闭失败: {e}")
|
|
844
|
+
|
|
845
|
+
|
|
769
846
|
class EventBusComponent(Component):
|
|
770
847
|
"""事件总线组件。
|
|
771
848
|
|
|
@@ -815,6 +892,7 @@ FoundationApp.plugins = [
|
|
|
815
892
|
# 设置默认组件
|
|
816
893
|
FoundationApp.components = [
|
|
817
894
|
AlertComponent, # 最先初始化告警管理器
|
|
895
|
+
ProfilingComponent, # Profiling 依赖告警
|
|
818
896
|
DatabaseComponent,
|
|
819
897
|
MigrationComponent,
|
|
820
898
|
AdminConsoleComponent,
|
|
@@ -837,6 +915,7 @@ __all__ = [
|
|
|
837
915
|
"EventBusComponent",
|
|
838
916
|
"MessageQueueComponent",
|
|
839
917
|
"MigrationComponent",
|
|
918
|
+
"ProfilingComponent",
|
|
840
919
|
"SchedulerComponent",
|
|
841
920
|
"StorageComponent",
|
|
842
921
|
"TaskComponent",
|
|
aury/boot/application/config/settings.py
CHANGED
|
@@ -401,6 +401,14 @@ class LogSettings(BaseModel):
|
|
|
401
401
|
default=False,
|
|
402
402
|
description="是否记录 WebSocket 消息内容(注意性能和敏感数据)"
|
|
403
403
|
)
|
|
404
|
+
enqueue: bool = Field(
|
|
405
|
+
default=False,
|
|
406
|
+
description=(
|
|
407
|
+
"是否启用多进程安全队列。"
|
|
408
|
+
"启用后日志通过 multiprocessing.Queue 传输,"
|
|
409
|
+
"可能导致事件循环阻塞。建议在 asyncio 应用中保持 False"
|
|
410
|
+
)
|
|
411
|
+
)
|
|
404
412
|
|
|
405
413
|
|
|
406
414
|
class ServiceSettings(BaseModel):
|
|
@@ -741,6 +749,70 @@ class AlertSettings(BaseModel):
|
|
|
741
749
|
return self._notifiers
|
|
742
750
|
|
|
743
751
|
|
|
752
|
+
class ProfilingSettings(BaseModel):
|
|
753
|
+
"""Profiling 配置。
|
|
754
|
+
|
|
755
|
+
环境变量格式: PROFILING__{FIELD}
|
|
756
|
+
示例: PROFILING__ENABLED, PROFILING__PYROSCOPE_ENDPOINT
|
|
757
|
+
|
|
758
|
+
功能说明:
|
|
759
|
+
- Pyroscope:持续采样生成火焰图(需安装 pyroscope-io)
|
|
760
|
+
- 阻塞检测:检测同步代码阻塞事件循环(需安装 psutil)
|
|
761
|
+
"""
|
|
762
|
+
|
|
763
|
+
# Pyroscope 持续 Profiling
|
|
764
|
+
enabled: bool = Field(
|
|
765
|
+
default=False,
|
|
766
|
+
description="是否启用 Pyroscope 持续 profiling"
|
|
767
|
+
)
|
|
768
|
+
pyroscope_endpoint: str | None = Field(
|
|
769
|
+
default=None,
|
|
770
|
+
description="Pyroscope 服务端点(如 http://pyroscope:4040)"
|
|
771
|
+
)
|
|
772
|
+
pyroscope_auth_token: str | None = Field(
|
|
773
|
+
default=None,
|
|
774
|
+
description="Pyroscope 认证 token(可选)"
|
|
775
|
+
)
|
|
776
|
+
pyroscope_sample_rate: int = Field(
|
|
777
|
+
default=100,
|
|
778
|
+
description="Pyroscope 采样率 (Hz),降低可减少开销"
|
|
779
|
+
)
|
|
780
|
+
pyroscope_tags: dict[str, str] = Field(
|
|
781
|
+
default_factory=dict,
|
|
782
|
+
description="Pyroscope 自定义标签"
|
|
783
|
+
)
|
|
784
|
+
|
|
785
|
+
# 事件循环阻塞检测
|
|
786
|
+
blocking_detector_enabled: bool = Field(
|
|
787
|
+
default=False,
|
|
788
|
+
description="是否启用事件循环阻塞检测"
|
|
789
|
+
)
|
|
790
|
+
blocking_check_interval_ms: float = Field(
|
|
791
|
+
default=100,
|
|
792
|
+
description="阻塞检测间隔 (ms)"
|
|
793
|
+
)
|
|
794
|
+
blocking_threshold_ms: float = Field(
|
|
795
|
+
default=100,
|
|
796
|
+
description="阻塞阈值 (ms),超过此时间记录阻塞事件"
|
|
797
|
+
)
|
|
798
|
+
blocking_severe_threshold_ms: float = Field(
|
|
799
|
+
default=500,
|
|
800
|
+
description="严重阻塞阈值 (ms),超过此时间触发严重告警"
|
|
801
|
+
)
|
|
802
|
+
blocking_alert_enabled: bool = Field(
|
|
803
|
+
default=True,
|
|
804
|
+
description="检测到阻塞时是否发送告警"
|
|
805
|
+
)
|
|
806
|
+
blocking_alert_cooldown_seconds: float = Field(
|
|
807
|
+
default=60,
|
|
808
|
+
description="阻塞告警冷却时间 (秒),避免告警风暴"
|
|
809
|
+
)
|
|
810
|
+
blocking_max_history: int = Field(
|
|
811
|
+
default=50,
|
|
812
|
+
description="保留的阻塞事件历史数量"
|
|
813
|
+
)
|
|
814
|
+
|
|
815
|
+
|
|
744
816
|
class MigrationSettings(BaseModel):
|
|
745
817
|
"""数据库迁移配置。
|
|
746
818
|
|
|
@@ -1014,6 +1086,7 @@ class BaseConfig(BaseSettings):
|
|
|
1014
1086
|
# ========== 监控告警 ==========
|
|
1015
1087
|
telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
|
|
1016
1088
|
alert: AlertSettings = Field(default_factory=AlertSettings)
|
|
1089
|
+
profiling: ProfilingSettings = Field(default_factory=ProfilingSettings)
|
|
1017
1090
|
|
|
1018
1091
|
model_config = SettingsConfigDict(
|
|
1019
1092
|
case_sensitive=False,
|
aury/boot/commands/init.py
CHANGED
|
@@ -542,6 +542,18 @@ def _collect_interactive_config() -> dict:
|
|
|
542
542
|
|
|
543
543
|
config["features"] = features
|
|
544
544
|
|
|
545
|
+
# 5.5 监控配置
|
|
546
|
+
console.print()
|
|
547
|
+
console.print("[bold]📊 监控配置[/bold]")
|
|
548
|
+
config["with_otel"] = Confirm.ask(
|
|
549
|
+
" 启用 OpenTelemetry 链路追踪",
|
|
550
|
+
default=True,
|
|
551
|
+
)
|
|
552
|
+
config["with_profiling"] = Confirm.ask(
|
|
553
|
+
" 启用 Profiling (火焰图/阻塞检测)",
|
|
554
|
+
default=False,
|
|
555
|
+
)
|
|
556
|
+
|
|
545
557
|
# 6. 开发工具
|
|
546
558
|
console.print()
|
|
547
559
|
config["with_dev"] = Confirm.ask(
|
|
@@ -589,6 +601,12 @@ def _build_dependency_list(config: dict) -> list[str]:
|
|
|
589
601
|
if config.get("with_admin_console", True):
|
|
590
602
|
extras.add("admin")
|
|
591
603
|
|
|
604
|
+
# 监控
|
|
605
|
+
if config.get("with_otel", True):
|
|
606
|
+
extras.add("otel")
|
|
607
|
+
if config.get("with_profiling", False):
|
|
608
|
+
extras.add("profiling")
|
|
609
|
+
|
|
592
610
|
# 开发工具
|
|
593
611
|
if config.get("with_dev"):
|
|
594
612
|
extras.add("dev")
|
|
@@ -616,6 +634,8 @@ def _show_config_summary(config: dict) -> None:
|
|
|
616
634
|
("服务模式", config.get("service_mode", "api")),
|
|
617
635
|
("管理后台", "是" if config.get("with_admin_console", True) else "否"),
|
|
618
636
|
("可选功能", ", ".join(config.get("features", [])) or "无"),
|
|
637
|
+
("OpenTelemetry", "是" if config.get("with_otel", True) else "否"),
|
|
638
|
+
("Profiling", "是" if config.get("with_profiling", False) else "否"),
|
|
619
639
|
("开发工具", "是" if config.get("with_dev") else "否"),
|
|
620
640
|
("Docker", "是" if config.get("with_docker") else "否"),
|
|
621
641
|
]
|
aury/boot/commands/pkg.py
CHANGED
|
@@ -45,6 +45,7 @@ class Category(str, Enum):
|
|
|
45
45
|
SCHEDULER = "scheduler"
|
|
46
46
|
ADMIN = "admin"
|
|
47
47
|
STORAGE = "storage"
|
|
48
|
+
MONITORING = "monitoring"
|
|
48
49
|
ECOSYSTEM = "ecosystem"
|
|
49
50
|
|
|
50
51
|
|
|
@@ -132,6 +133,29 @@ MODULES: dict[str, ModuleInfo] = {
|
|
|
132
133
|
category=Category.STORAGE,
|
|
133
134
|
deps=["aury-sdk-storage[aws]"],
|
|
134
135
|
),
|
|
136
|
+
# 监控
|
|
137
|
+
"otel": ModuleInfo(
|
|
138
|
+
name="otel",
|
|
139
|
+
desc="OpenTelemetry 链路追踪",
|
|
140
|
+
usage="启用 TELEMETRY__ENABLED 自动 instrument FastAPI/SQLAlchemy/httpx",
|
|
141
|
+
category=Category.MONITORING,
|
|
142
|
+
deps=["opentelemetry-api", "opentelemetry-sdk", "opentelemetry-instrumentation-fastapi",
|
|
143
|
+
"opentelemetry-instrumentation-sqlalchemy", "opentelemetry-instrumentation-httpx"],
|
|
144
|
+
),
|
|
145
|
+
"otel-exporter": ModuleInfo(
|
|
146
|
+
name="otel-exporter",
|
|
147
|
+
desc="OpenTelemetry OTLP 导出器",
|
|
148
|
+
usage="导出 Traces/Metrics/Logs 到 Jaeger/Prometheus/Loki",
|
|
149
|
+
category=Category.MONITORING,
|
|
150
|
+
deps=["opentelemetry-exporter-otlp"],
|
|
151
|
+
),
|
|
152
|
+
"profiling": ModuleInfo(
|
|
153
|
+
name="profiling",
|
|
154
|
+
desc="Profiling 性能分析",
|
|
155
|
+
usage="Pyroscope 火焰图 + 事件循环阻塞检测",
|
|
156
|
+
category=Category.MONITORING,
|
|
157
|
+
deps=["pyroscope-io", "psutil"],
|
|
158
|
+
),
|
|
135
159
|
# 生态包
|
|
136
160
|
"storage-aws": ModuleInfo(
|
|
137
161
|
name="storage-aws",
|
|
@@ -182,7 +206,12 @@ PRESETS: dict[str, PresetInfo] = {
|
|
|
182
206
|
"full": PresetInfo(
|
|
183
207
|
name="full",
|
|
184
208
|
desc="完整功能(所有模块)",
|
|
185
|
-
modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos"],
|
|
209
|
+
modules=["postgres", "redis", "tasks", "rabbitmq", "scheduler", "admin", "storage-cos", "otel", "profiling"],
|
|
210
|
+
),
|
|
211
|
+
"monitoring": PresetInfo(
|
|
212
|
+
name="monitoring",
|
|
213
|
+
desc="完整监控(OTel + Profiling)",
|
|
214
|
+
modules=["otel", "otel-exporter", "profiling"],
|
|
186
215
|
),
|
|
187
216
|
}
|
|
188
217
|
|
|
@@ -195,6 +224,7 @@ CATEGORY_NAMES: dict[Category, str] = {
|
|
|
195
224
|
Category.SCHEDULER: "📦 定时调度",
|
|
196
225
|
Category.ADMIN: "📦 管理后台",
|
|
197
226
|
Category.STORAGE: "📦 对象存储",
|
|
227
|
+
Category.MONITORING: "📊 监控分析",
|
|
198
228
|
Category.ECOSYSTEM: "🌐 生态包",
|
|
199
229
|
}
|
|
200
230
|
|
|
aury/boot/commands/templates/project/aury_docs/18-monitoring-profiling.md.tpl
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# 监控与 Profiling
|
|
2
|
+
|
|
3
|
+
本文档介绍 {project_name} 项目中的监控和性能分析配置。
|
|
4
|
+
|
|
5
|
+
## 监控能力概览
|
|
6
|
+
|
|
7
|
+
| 功能 | 用途 | 建议环境 |
|
|
8
|
+
|------|------|----------|
|
|
9
|
+
| OpenTelemetry | 链路追踪、慢请求检测 | 所有环境 |
|
|
10
|
+
| 告警系统 | 异常/慢请求通知 | 所有环境 |
|
|
11
|
+
| Pyroscope | 持续 Profiling、火焰图 | 测试/灰度 |
|
|
12
|
+
| 阻塞检测 | 检测同步代码阻塞协程 | 测试/按需 |
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 不同环境的最佳实践
|
|
17
|
+
|
|
18
|
+
### 开发环境
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# .env.development
|
|
22
|
+
TELEMETRY__ENABLED=false
|
|
23
|
+
ALERT__ENABLED=false
|
|
24
|
+
PROFILING__ENABLED=false
|
|
25
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true # 开发时检测阻塞问题
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### 测试/灰度环境
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# .env.staging
|
|
32
|
+
TELEMETRY__ENABLED=true
|
|
33
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
|
|
34
|
+
|
|
35
|
+
ALERT__ENABLED=true
|
|
36
|
+
ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
37
|
+
ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
|
|
38
|
+
|
|
39
|
+
# 开启 Profiling 排查性能问题
|
|
40
|
+
PROFILING__ENABLED=true
|
|
41
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
42
|
+
|
|
43
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 生产环境
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# .env.production
|
|
50
|
+
# 链路追踪 - 必开
|
|
51
|
+
TELEMETRY__ENABLED=true
|
|
52
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
|
|
53
|
+
TELEMETRY__SAMPLING_RATE=0.1 # 采样 10% 减少开销
|
|
54
|
+
|
|
55
|
+
# 告警 - 必开
|
|
56
|
+
ALERT__ENABLED=true
|
|
57
|
+
ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
58
|
+
ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://...
|
|
59
|
+
|
|
60
|
+
# Profiling - 按需(有约 2-5% CPU 开销)
|
|
61
|
+
PROFILING__ENABLED=false # 出问题时临时开启
|
|
62
|
+
# PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
63
|
+
# PROFILING__PYROSCOPE_SAMPLE_RATE=10 # 降低采样率减少开销
|
|
64
|
+
|
|
65
|
+
# 阻塞检测 - 按需
|
|
66
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=false # 出问题时临时开启
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## OpenTelemetry
|
|
72
|
+
|
|
73
|
+
自动 instrument:
|
|
74
|
+
- FastAPI 请求
|
|
75
|
+
- SQLAlchemy SQL 查询
|
|
76
|
+
- httpx 外部调用
|
|
77
|
+
|
|
78
|
+
### 配置
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
TELEMETRY__ENABLED=true
|
|
82
|
+
TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317 # 可选
|
|
83
|
+
TELEMETRY__SAMPLING_RATE=1.0 # 采样率,1.0=100%
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 手动 Span
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from aury.boot.infrastructure.monitoring.tracing import span, trace_span
|
|
90
|
+
|
|
91
|
+
# 装饰器方式
|
|
92
|
+
@trace_span(name="call_external_api")
|
|
93
|
+
async def call_api():
|
|
94
|
+
...
|
|
95
|
+
|
|
96
|
+
# 上下文管理器
|
|
97
|
+
async def process():
|
|
98
|
+
with span("step_1"):
|
|
99
|
+
await do_step_1()
|
|
100
|
+
with span("step_2"):
|
|
101
|
+
await do_step_2()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Pyroscope 持续 Profiling
|
|
107
|
+
|
|
108
|
+
生成 CPU 火焰图,定位性能瓶颈。
|
|
109
|
+
|
|
110
|
+
### 安装
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install pyroscope-io
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 配置
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
PROFILING__ENABLED=true
|
|
120
|
+
PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
121
|
+
PROFILING__PYROSCOPE_SAMPLE_RATE=100 # 采样率 Hz
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### 部署 Pyroscope
|
|
125
|
+
|
|
126
|
+
```yaml
|
|
127
|
+
# docker-compose.yml
|
|
128
|
+
services:
|
|
129
|
+
pyroscope:
|
|
130
|
+
image: grafana/pyroscope:latest
|
|
131
|
+
ports:
|
|
132
|
+
- "4040:4040"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
访问 http://localhost:4040 查看火焰图。
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## 事件循环阻塞检测
|
|
140
|
+
|
|
141
|
+
检测同步代码阻塞 asyncio 事件循环的问题。
|
|
142
|
+
|
|
143
|
+
### 安装
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
pip install psutil
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### 配置
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=true
|
|
153
|
+
PROFILING__BLOCKING_THRESHOLD_MS=100 # 阻塞阈值
|
|
154
|
+
PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500 # 严重阈值
|
|
155
|
+
PROFILING__BLOCKING_ALERT_ENABLED=true # 阻塞时发送告警
|
|
156
|
+
PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60 # 告警冷却
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 工作原理
|
|
160
|
+
|
|
161
|
+
1. 后台线程每 100ms 向事件循环投递空任务
|
|
162
|
+
2. 如果响应延迟 > 阈值,说明事件循环被阻塞
|
|
163
|
+
3. 自动捕获主线程调用栈 + 进程状态
|
|
164
|
+
4. 发送告警(含阻塞代码位置)
|
|
165
|
+
|
|
166
|
+
### 告警示例
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
事件循环阻塞(严重): 520ms
|
|
170
|
+
|
|
171
|
+
调用栈:
|
|
172
|
+
app/services/sync_io.py:42 in read_file
|
|
173
|
+
> data = open(path).read() # 同步 IO!
|
|
174
|
+
|
|
175
|
+
进程状态:
|
|
176
|
+
cpu: 95%, memory: 256MB, threads: 12
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 常见阻塞原因
|
|
180
|
+
|
|
181
|
+
- `time.sleep()` 应使用 `asyncio.sleep()`
|
|
182
|
+
- `open().read()` 应使用 `aiofiles`
|
|
183
|
+
- `requests.get()` 应使用 `httpx` 或 `aiohttp`
|
|
184
|
+
- CPU 密集计算 应使用 `run_in_executor()`
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## 环境变量参考
|
|
189
|
+
|
|
190
|
+
### Telemetry
|
|
191
|
+
|
|
192
|
+
| 变量 | 说明 | 默认值 |
|
|
193
|
+
|------|------|--------|
|
|
194
|
+
| `TELEMETRY__ENABLED` | 是否启用 | `false` |
|
|
195
|
+
| `TELEMETRY__TRACES_ENDPOINT` | Traces 导出端点 | - |
|
|
196
|
+
| `TELEMETRY__LOGS_ENDPOINT` | Logs 导出端点 | - |
|
|
197
|
+
| `TELEMETRY__METRICS_ENDPOINT` | Metrics 导出端点 | - |
|
|
198
|
+
| `TELEMETRY__SAMPLING_RATE` | 采样率 | `1.0` |
|
|
199
|
+
|
|
200
|
+
### Profiling
|
|
201
|
+
|
|
202
|
+
| 变量 | 说明 | 默认值 |
|
|
203
|
+
|------|------|--------|
|
|
204
|
+
| `PROFILING__ENABLED` | 是否启用 Pyroscope | `false` |
|
|
205
|
+
| `PROFILING__PYROSCOPE_ENDPOINT` | Pyroscope 端点 | - |
|
|
206
|
+
| `PROFILING__PYROSCOPE_SAMPLE_RATE` | 采样率 (Hz) | `100` |
|
|
207
|
+
| `PROFILING__BLOCKING_DETECTOR_ENABLED` | 阻塞检测 | `false` |
|
|
208
|
+
| `PROFILING__BLOCKING_THRESHOLD_MS` | 阻塞阈值 | `100` |
|
|
209
|
+
| `PROFILING__BLOCKING_SEVERE_THRESHOLD_MS` | 严重阈值 | `500` |
|
|
210
|
+
| `PROFILING__BLOCKING_ALERT_ENABLED` | 阻塞告警 | `true` |
|
|
211
|
+
| `PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS` | 告警冷却 | `60` |
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## 推荐的监控栈
|
|
216
|
+
|
|
217
|
+
### 开源方案
|
|
218
|
+
|
|
219
|
+
```
|
|
220
|
+
OpenTelemetry → Jaeger (Traces)
|
|
221
|
+
→ Grafana Loki (Logs)
|
|
222
|
+
→ Prometheus (Metrics)
|
|
223
|
+
|
|
224
|
+
Pyroscope → 火焰图
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### 云服务方案
|
|
228
|
+
|
|
229
|
+
- **阿里云 ARMS** - APM + 告警
|
|
230
|
+
- **腾讯云 APM** - 类似
|
|
231
|
+
- **Datadog** - 全功能 APM
|
|
232
|
+
- **Sentry** - 错误监控
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## 相关文档
|
|
237
|
+
|
|
238
|
+
- [17-alerting.md](./17-alerting.md) - 告警系统详细配置
|
|
239
|
+
- [11-logging.md](./11-logging.md) - 日志配置
|
|
aury/boot/commands/templates/project/env_templates/monitoring.tpl
CHANGED
|
@@ -12,6 +12,21 @@ TELEMETRY__ENABLED=false
|
|
|
12
12
|
# TELEMETRY__LOGS_ENDPOINT=http://loki:3100
|
|
13
13
|
# TELEMETRY__METRICS_ENDPOINT=http://prometheus:9090
|
|
14
14
|
|
|
15
|
+
# ---------- Profiling 性能分析 ----------
|
|
16
|
+
# Pyroscope 持续 Profiling(生成火焰图)
|
|
17
|
+
# 需安装:pip install pyroscope-io
|
|
18
|
+
PROFILING__ENABLED=false
|
|
19
|
+
# PROFILING__PYROSCOPE_ENDPOINT=http://pyroscope:4040
|
|
20
|
+
# PROFILING__PYROSCOPE_SAMPLE_RATE=100
|
|
21
|
+
|
|
22
|
+
# 事件循环阻塞检测(检测同步代码阻塞协程)
|
|
23
|
+
# 需安装:pip install psutil
|
|
24
|
+
PROFILING__BLOCKING_DETECTOR_ENABLED=false
|
|
25
|
+
# PROFILING__BLOCKING_THRESHOLD_MS=100
|
|
26
|
+
# PROFILING__BLOCKING_SEVERE_THRESHOLD_MS=500
|
|
27
|
+
# PROFILING__BLOCKING_ALERT_ENABLED=true
|
|
28
|
+
# PROFILING__BLOCKING_ALERT_COOLDOWN_SECONDS=60
|
|
29
|
+
|
|
15
30
|
# ---------- 告警系统 ----------
|
|
16
31
|
ALERT__ENABLED=false
|
|
17
32
|
# 慢操作阈值
|