aury-boot 0.0.28__py3-none-any.whl → 0.0.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. aury/boot/_version.py +2 -2
  2. aury/boot/application/app/base.py +126 -2
  3. aury/boot/application/app/components.py +224 -1
  4. aury/boot/application/config/settings.py +195 -3
  5. aury/boot/application/constants/components.py +3 -0
  6. aury/boot/application/middleware/logging.py +45 -6
  7. aury/boot/commands/docs.py +40 -0
  8. aury/boot/commands/init.py +2 -0
  9. aury/boot/commands/templates/project/AGENTS.md.tpl +16 -1
  10. aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
  11. aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
  12. aury/boot/commands/templates/project/aury_docs/03-service.md.tpl +60 -0
  13. aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
  14. aury/boot/commands/templates/project/env_templates/monitoring.tpl +61 -0
  15. aury/boot/common/logging/context.py +17 -1
  16. aury/boot/common/logging/format.py +4 -0
  17. aury/boot/domain/transaction/__init__.py +57 -0
  18. aury/boot/infrastructure/channel/base.py +6 -2
  19. aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
  20. aury/boot/infrastructure/monitoring/__init__.py +210 -6
  21. aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
  22. aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
  23. aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
  24. aury/boot/infrastructure/monitoring/alerting/manager.py +428 -0
  25. aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
  26. aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
  27. aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
  28. aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
  29. aury/boot/infrastructure/monitoring/alerting/rules.py +163 -0
  30. aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
  31. aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
  32. aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
  33. aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
  34. aury/boot/infrastructure/monitoring/tracing/processor.py +327 -0
  35. aury/boot/infrastructure/monitoring/tracing/provider.py +320 -0
  36. aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
  37. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/METADATA +14 -1
  38. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/RECORD +40 -21
  39. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/WHEEL +0 -0
  40. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,327 @@
1
+ """自定义 SpanProcessor,用于检测慢 span 和异常 span 并触发告警。"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import Awaitable, Callable
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ from aury.boot.common.logging import logger
10
+
11
+ # OTel 可选依赖
12
+ try:
13
+ from opentelemetry.trace import SpanKind as OTelSpanKind
14
+ from opentelemetry.trace import StatusCode
15
+ except ImportError:
16
+ OTelSpanKind = None # type: ignore[assignment, misc]
17
+ StatusCode = None # type: ignore[assignment, misc]
18
+
19
+ if TYPE_CHECKING:
20
+ from asyncio import Task
21
+
22
+ from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
23
+
24
+ # 告警回调类型: async def callback(event_type, message, **metadata)
25
+ AlertCallback = Callable[[str, str], Awaitable[None]] | Callable[..., Awaitable[None]]
26
+
27
+
28
class AlertingSpanProcessor:
    """Span processor that turns slow or failed spans into alert callbacks.

    When a span ends, two checks run:
    - slow span: the duration is at or above the threshold for the span category
    - failed span: the status is ERROR on an HTTP server span (4xx is skipped)

    Alerts are delivered through ``alert_callback`` so this module does not
    import the alerting package directly. ``on_end`` is synchronous; the
    callback coroutine is scheduled on the running event loop and not awaited.
    """

    # Strong references to in-flight alert tasks so they are not garbage
    # collected before they finish. This is a class attribute, so the set is
    # shared by every processor instance.
    _background_tasks: set["Task"] = set()

    def __init__(
        self,
        *,
        slow_request_threshold: float = 1.0,
        slow_sql_threshold: float = 0.5,
        alert_on_slow_request: bool = True,
        alert_on_slow_sql: bool = True,
        alert_on_error: bool = True,
        alert_callback: "AlertCallback | None" = None,
    ) -> None:
        """Store thresholds, switches and the alert callback.

        Args:
            slow_request_threshold: Slow threshold in seconds for HTTP spans
                (also used for every span category other than database).
            slow_sql_threshold: Slow threshold in seconds for database spans.
            alert_on_slow_request: Whether slow non-database spans raise alerts.
            alert_on_slow_sql: Whether slow database spans raise alerts.
            alert_on_error: Whether failed HTTP server spans raise alerts.
            alert_callback: Async callable invoked as
                ``callback(event_type, message, **metadata)``. When ``None``,
                no alert is ever emitted.
        """
        self._slow_request_threshold = slow_request_threshold
        self._slow_sql_threshold = slow_sql_threshold
        self._alert_on_slow_request = alert_on_slow_request
        self._alert_on_slow_sql = alert_on_slow_sql
        self._alert_on_error = alert_on_error
        self._alert_callback = alert_callback

    def on_start(self, span: "Span", parent_context: object = None) -> None:
        """Called when a span starts; nothing to do here."""
        pass

    def on_end(self, span: "ReadableSpan") -> None:
        """Inspect a finished span and emit slow / error alerts as configured."""
        if StatusCode is None:
            # OpenTelemetry is not installed; nothing can be inspected.
            return

        name = span.name
        # Span timestamps are in nanoseconds; a missing one is treated as 0.
        duration_s = ((span.end_time or 0) - (span.start_time or 0)) / 1e9
        status = span.status
        trace_id = format(span.context.trace_id, "032x") if span.context else ""
        attributes = dict(span.attributes) if span.attributes else {}
        span_kind = _get_span_kind(span)

        # Slow-span check, using the threshold and switch of the span category.
        threshold = self._get_slow_threshold(span_kind)
        if self._should_alert_slow(span_kind) and threshold > 0 and duration_s >= threshold:
            self._emit_slow_alert(
                name=name,
                duration=duration_s,
                trace_id=trace_id,
                span_kind=span_kind,
                attributes=attributes,
                threshold=threshold,
            )

        # Error check: only HTTP server spans alert, so that one failing request
        # does not produce an alert per nested span.
        failed = bool(status) and status.status_code == StatusCode.ERROR
        if not (self._alert_on_error and failed and span_kind == "http"):
            return

        # 4xx responses (401/403/404, ...) are business errors, not system
        # failures, so they are not alerted on.
        if 400 <= self._http_status(attributes) < 500:
            return

        exception_info = _extract_exception_info(span)
        self._emit_error_alert(
            name=name,
            duration=duration_s,
            trace_id=trace_id,
            span_kind=span_kind,
            error_message=exception_info.get("message") or status.description or "Unknown error",
            attributes=attributes,
            exception_info=exception_info,
        )

    def shutdown(self) -> None:
        """Shut the processor down; there is nothing to release."""
        pass

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Flush pending work; nothing is buffered, so this always succeeds."""
        return True

    @staticmethod
    def _http_status(attributes: dict) -> int:
        """Return the HTTP status code recorded on the span, or 0 if unusable.

        Reads ``http.status_code`` first and falls back to
        ``http.response.status_code``. Values that cannot be converted to an
        integer yield 0, which callers treat as "not a 4xx response".
        """
        raw = attributes.get(
            "http.status_code",
            attributes.get("http.response.status_code", 0),
        )
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 0

    def _get_slow_threshold(self, span_kind: str) -> float:
        """Return the slow threshold (seconds) for the given span category."""
        if span_kind == "database":
            return self._slow_sql_threshold
        # HTTP server/client spans and every other category share this one.
        return self._slow_request_threshold

    def _should_alert_slow(self, span_kind: str) -> bool:
        """Return whether slow alerts are enabled for the given span category."""
        if span_kind == "database":
            return self._alert_on_slow_sql
        # HTTP server/client spans and every other category share this switch.
        return self._alert_on_slow_request

    def _emit_slow_alert(
        self,
        name: str,
        duration: float,
        trace_id: str,
        span_kind: str,
        attributes: dict,
        threshold: float,
    ) -> None:
        """Schedule a "slow span" alert (severity ``warning``)."""
        if not self._alert_callback:
            return

        self._schedule_alert(
            f"无法发送慢 span 告警(无事件循环): {name}",
            _get_event_type_for_slow(span_kind),
            f"慢 {span_kind}: {name}",
            severity="warning",
            trace_id=trace_id,
            source=span_kind,
            duration=duration,
            threshold=threshold,
            **_extract_alert_context(attributes),
        )

    def _emit_error_alert(
        self,
        name: str,
        duration: float,
        trace_id: str,
        span_kind: str,
        error_message: str,
        attributes: dict,
        exception_info: dict | None = None,
    ) -> None:
        """Schedule an "exception" alert (severity ``error``)."""
        if not self._alert_callback:
            return

        # Merge the exception type / stack trace into the alert metadata.
        extra_context = _extract_alert_context(attributes)
        if exception_info:
            if exception_info.get("type"):
                extra_context["error_type"] = exception_info["type"]
            if exception_info.get("stacktrace"):
                extra_context["stacktrace"] = exception_info["stacktrace"]

        self._schedule_alert(
            f"无法发送异常 span 告警(无事件循环): {name}",
            "exception",
            f"异常: {error_message}",
            severity="error",
            trace_id=trace_id,
            source=span_kind,
            duration=duration,
            error_message=error_message,
            **extra_context,
        )

    def _schedule_alert(
        self,
        no_loop_log: str,
        event_type: str,
        message: str,
        /,
        **metadata: Any,
    ) -> None:
        """Run the alert callback as a background task on the running loop.

        The running loop is looked up *before* the callback is invoked: creating
        the coroutine first and then failing to schedule it would leave a
        never-awaited coroutine behind. Without a running loop the alert is
        dropped and ``no_loop_log`` is written at debug level.
        """
        callback = self._alert_callback
        if not callback:
            return

        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            logger.debug(no_loop_log)
            return

        task = loop.create_task(callback(event_type, message, **metadata))
        self._background_tasks.add(task)
        task.add_done_callback(self._on_alert_task_done)

    @classmethod
    def _on_alert_task_done(cls, task: "Task") -> None:
        """Drop the finished task and log a callback failure, if any."""
        cls._background_tasks.discard(task)
        if task.cancelled():
            return
        # Retrieving the exception also keeps asyncio from reporting
        # "Task exception was never retrieved" later on.
        error = task.exception()
        if error is not None:
            logger.warning(f"告警回调执行失败: {error!r}")
227
+
228
+
229
def _get_span_kind(span: "ReadableSpan") -> str:
    """Map a span's OpenTelemetry kind (plus attributes) to a category label.

    Returns one of ``"http"``, ``"database"``, ``"http_client"``, ``"client"``,
    ``"internal"``, ``"producer"``, ``"consumer"`` or ``"unknown"``. When
    OpenTelemetry is not installed the result is always ``"unknown"``.
    """
    if OTelSpanKind is None:
        return "unknown"

    otel_kind = span.kind

    # SERVER spans are treated as incoming HTTP requests.
    if otel_kind == OTelSpanKind.SERVER:
        return "http"

    # CLIENT spans are outgoing calls; the attributes tell DB from HTTP.
    if otel_kind == OTelSpanKind.CLIENT:
        span_attrs = dict(span.attributes) if span.attributes else {}
        if "db.system" in span_attrs:
            return "database"
        if "http.url" in span_attrs or "http.target" in span_attrs:
            return "http_client"
        return "client"

    remaining = (
        (OTelSpanKind.INTERNAL, "internal"),
        (OTelSpanKind.PRODUCER, "producer"),
        (OTelSpanKind.CONSUMER, "consumer"),
    )
    for member, label in remaining:
        if otel_kind == member:
            return label

    return "unknown"
255
+
256
+
257
+ def _get_event_type_for_slow(span_kind: str) -> str:
258
+ """根据 span 类型获取慢操作的告警事件类型。"""
259
+ mapping = {
260
+ "http": "slow_request",
261
+ "database": "slow_sql",
262
+ "http_client": "slow_request",
263
+ }
264
+ return mapping.get(span_kind, "custom")
265
+
266
+
267
+ def _extract_exception_info(span: "ReadableSpan") -> dict:
268
+ """从 span events 中提取异常信息。
269
+
270
+ OTEL 会将异常作为 span event 记录,包含:
271
+ - exception.type: 异常类型
272
+ - exception.message: 异常消息
273
+ - exception.stacktrace: 堆栈信息
274
+ """
275
+ info = {}
276
+
277
+ if not span.events:
278
+ return info
279
+
280
+ for event in span.events:
281
+ if event.name == "exception":
282
+ attrs = dict(event.attributes) if event.attributes else {}
283
+ if "exception.type" in attrs:
284
+ info["type"] = str(attrs["exception.type"])
285
+ if "exception.message" in attrs:
286
+ info["message"] = str(attrs["exception.message"])
287
+ if "exception.stacktrace" in attrs:
288
+ info["stacktrace"] = str(attrs["exception.stacktrace"])
289
+ break # 只取第一个异常事件
290
+
291
+ return info
292
+
293
+
294
+ def _extract_alert_context(attributes: dict) -> dict:
295
+ """从 span 属性中提取告警上下文。"""
296
+ context = {}
297
+
298
+ # HTTP 相关
299
+ if "http.method" in attributes:
300
+ context["method"] = attributes["http.method"]
301
+ if "http.route" in attributes:
302
+ context["route"] = attributes["http.route"]
303
+ if "http.target" in attributes:
304
+ context["endpoint"] = attributes["http.target"]
305
+ if "http.url" in attributes:
306
+ context["url"] = attributes["http.url"]
307
+ if "http.status_code" in attributes:
308
+ context["status_code"] = attributes["http.status_code"]
309
+
310
+ # 数据库相关
311
+ if "db.system" in attributes:
312
+ context["db_system"] = attributes["db.system"]
313
+ if "db.statement" in attributes:
314
+ context["sql"] = _normalize_sql(str(attributes["db.statement"]))
315
+
316
+ return context
317
+
318
+
319
+ def _normalize_sql(sql: str) -> str:
320
+ """清理 SQL 多余空白。"""
321
+ import re
322
+ # 将多个空白字符(包括换行、制表符)合并为单个空格
323
+ sql = re.sub(r'\s+', ' ', sql)
324
+ return sql.strip()
325
+
326
+
327
+ __all__ = ["AlertingSpanProcessor"]
@@ -0,0 +1,320 @@
1
+ """OpenTelemetry TracerProvider 配置和初始化。"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ from aury.boot.common.logging import logger
9
+
10
+ from .processor import AlertingSpanProcessor
11
+
12
+ # OTel 核心可选依赖
13
+ try:
14
+ from opentelemetry import trace as otel_trace
15
+ from opentelemetry.sdk.resources import Resource
16
+ from opentelemetry.sdk.trace import TracerProvider as OTelTracerProvider
17
+ except ImportError:
18
+ otel_trace = None # type: ignore[assignment]
19
+ Resource = None # type: ignore[assignment, misc]
20
+ OTelTracerProvider = None # type: ignore[assignment, misc]
21
+
22
+ # OTel Traces 导出可选依赖
23
+ try:
24
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
25
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
26
+ except ImportError:
27
+ OTLPSpanExporter = None # type: ignore[assignment, misc]
28
+ BatchSpanProcessor = None # type: ignore[assignment, misc]
29
+
30
+ # OTel Metrics 导出可选依赖
31
+ try:
32
+ from opentelemetry import metrics as otel_metrics
33
+ from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
34
+ from opentelemetry.sdk.metrics import MeterProvider
35
+ from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
36
+ except ImportError:
37
+ otel_metrics = None # type: ignore[assignment]
38
+ OTLPMetricExporter = None # type: ignore[assignment, misc]
39
+ MeterProvider = None # type: ignore[assignment, misc]
40
+ PeriodicExportingMetricReader = None # type: ignore[assignment, misc]
41
+
42
+ # OTel Instrumentation 可选依赖
43
+ try:
44
+ from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
45
+ except ImportError:
46
+ FastAPIInstrumentor = None # type: ignore[assignment, misc]
47
+
48
+ try:
49
+ from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
50
+ except ImportError:
51
+ SQLAlchemyInstrumentor = None # type: ignore[assignment, misc]
52
+
53
+ try:
54
+ from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
55
+ except ImportError:
56
+ HTTPXClientInstrumentor = None # type: ignore[assignment, misc]
57
+
58
+ if TYPE_CHECKING:
59
+ from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
60
+
61
+
62
+ @dataclass
63
+ class TelemetryConfig:
64
+ """遍测配置。"""
65
+
66
+ # 基础配置
67
+ service_name: str = "aury-service"
68
+ service_version: str = ""
69
+ environment: str = "development"
70
+
71
+ # 启用的 instrumentation
72
+ instrument_fastapi: bool = True
73
+ instrument_sqlalchemy: bool = True
74
+ instrument_httpx: bool = True
75
+
76
+ # 告警配置
77
+ alert_enabled: bool = True
78
+ slow_request_threshold: float = 1.0 # HTTP 请求慢阈值(秒)
79
+ slow_sql_threshold: float = 0.5 # SQL 查询慢阈值(秒)
80
+ alert_on_slow_request: bool = True # 是否对慢 HTTP 请求发送告警
81
+ alert_on_slow_sql: bool = True # 是否对慢 SQL 发送告警
82
+ alert_on_error: bool = True
83
+ alert_callback: Any = None # 告警回调函数
84
+
85
+ # OTLP Traces 导出配置
86
+ traces_endpoint: str | None = None
87
+ traces_headers: dict[str, str] = field(default_factory=dict)
88
+
89
+ # OTLP Logs 导出配置
90
+ logs_endpoint: str | None = None
91
+ logs_headers: dict[str, str] = field(default_factory=dict)
92
+
93
+ # OTLP Metrics 导出配置
94
+ metrics_endpoint: str | None = None
95
+ metrics_headers: dict[str, str] = field(default_factory=dict)
96
+
97
+ # 采样配置
98
+ sampling_rate: float = 1.0 # 1.0 = 100%
99
+
100
+
101
class TelemetryProvider:
    """Build and own the OpenTelemetry ``TracerProvider`` for a service.

    Wraps provider creation, span processor / OTLP exporter setup and the
    automatic instrumentations selected in ``TelemetryConfig``. Every
    OpenTelemetry package is optional: a missing package disables the matching
    feature instead of raising.
    """

    def __init__(self, config: "TelemetryConfig") -> None:
        """Store the configuration; nothing is created until ``initialize()``.

        Args:
            config: Telemetry configuration.
        """
        self._config = config
        self._provider: "TracerProvider | None" = None
        self._initialized = False

    def initialize(self) -> bool:
        """Create the tracer provider and register it globally.

        Calling this again after a successful initialization is a no-op.

        Returns:
            bool: ``True`` on success (or if already initialized), ``False``
            when the OpenTelemetry SDK is missing or setup raised.
        """
        if self._initialized:
            return True

        if OTelTracerProvider is None:
            logger.warning("OpenTelemetry 初始化失败(缺少依赖)")
            return False

        try:
            provider_kwargs: dict[str, Any] = {"resource": self._build_resource()}
            # Pass a sampler only when sampling is actually reduced, so the
            # SDK default is untouched for the default rate of 1.0.
            sampler = self._build_sampler()
            if sampler is not None:
                provider_kwargs["sampler"] = sampler
            self._provider = OTelTracerProvider(**provider_kwargs)

            # Processors are attached before the provider becomes global.
            self._setup_processors()
            otel_trace.set_tracer_provider(self._provider)
            self._setup_instrumentations()

            self._initialized = True
            logger.info(
                f"OpenTelemetry 初始化完成: "
                f"service={self._config.service_name}, "
                f"alert_enabled={self._config.alert_enabled}"
            )
            return True
        except Exception as e:
            # Telemetry is best-effort: report the failure, do not propagate.
            logger.error(f"OpenTelemetry 初始化失败: {e}")
            return False

    def shutdown(self) -> None:
        """Shut down the tracer provider, if one was created."""
        if self._provider:
            try:
                self._provider.shutdown()
                logger.info("OpenTelemetry 已关闭")
            except Exception as e:
                logger.warning(f"OpenTelemetry 关闭失败: {e}")

    def _build_resource(self):
        """Build the OpenTelemetry ``Resource`` describing this service."""
        return Resource.create({
            "service.name": self._config.service_name,
            "service.version": self._config.service_version,
            "deployment.environment": self._config.environment,
        })

    def _build_sampler(self):
        """Return a sampler for ``sampling_rate``, or ``None`` for the default.

        A rate of 1.0 or more returns ``None`` so the SDK keeps its own
        default sampler. Lower rates return a parent-based trace-id-ratio
        sampler; negative rates are clamped to 0.

        NOTE(review): presumably spans dropped by the sampler are never passed
        to span processors, which would also reduce the alerts raised by
        ``AlertingSpanProcessor`` — confirm before lowering the rate.
        """
        rate = self._config.sampling_rate
        if rate >= 1.0:
            return None

        # Imported lazily: only needed when sampling is reduced, and only
        # available when the OpenTelemetry SDK is installed.
        from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased

        return ParentBased(TraceIdRatioBased(max(0.0, rate)))

    def _setup_processors(self) -> None:
        """Attach the alerting processor and the configured OTLP exporters."""
        if not self._provider:
            return

        # Alerting processor.
        if self._config.alert_enabled:
            alerting_processor = AlertingSpanProcessor(
                slow_request_threshold=self._config.slow_request_threshold,
                slow_sql_threshold=self._config.slow_sql_threshold,
                alert_on_slow_request=self._config.alert_on_slow_request,
                alert_on_slow_sql=self._config.alert_on_slow_sql,
                alert_on_error=self._config.alert_on_error,
                alert_callback=self._config.alert_callback,
            )
            self._provider.add_span_processor(alerting_processor)
            logger.debug("已添加 AlertingSpanProcessor")

        # OTLP traces exporter.
        if self._config.traces_endpoint:
            self._setup_traces_exporter()

        # OTLP metrics exporter.
        if self._config.metrics_endpoint:
            self._setup_metrics_exporter()

    def _setup_traces_exporter(self) -> None:
        """Attach a batching OTLP trace exporter for ``traces_endpoint``."""
        if not self._provider or not self._config.traces_endpoint:
            return

        if OTLPSpanExporter is None:
            logger.warning("Traces OTLP 导出器未安装,跳过配置")
            return

        try:
            exporter = OTLPSpanExporter(
                endpoint=self._config.traces_endpoint,
                # An empty header dict is passed as None.
                headers=self._config.traces_headers or None,
            )
            self._provider.add_span_processor(BatchSpanProcessor(exporter))
            logger.info(f"Traces OTLP 导出器已配置: {self._config.traces_endpoint}")
        except Exception as e:
            logger.warning(f"Traces OTLP 导出器配置失败: {e}")

    def _setup_metrics_exporter(self) -> None:
        """Create a ``MeterProvider`` exporting to ``metrics_endpoint`` and register it globally."""
        if not self._config.metrics_endpoint:
            return

        if OTLPMetricExporter is None:
            logger.warning("Metrics OTLP 导出器未安装,跳过配置")
            return

        try:
            exporter = OTLPMetricExporter(
                endpoint=self._config.metrics_endpoint,
                # An empty header dict is passed as None.
                headers=self._config.metrics_headers or None,
            )
            reader = PeriodicExportingMetricReader(exporter)
            meter_provider = MeterProvider(
                resource=self._build_resource(),
                metric_readers=[reader],
            )
            otel_metrics.set_meter_provider(meter_provider)

            logger.info(f"Metrics OTLP 导出器已配置: {self._config.metrics_endpoint}")
        except Exception as e:
            logger.warning(f"Metrics OTLP 导出器配置失败: {e}")

    def _setup_instrumentations(self) -> None:
        """Enable the automatic instrumentations that can be enabled here.

        Note:
            - FastAPI is instrumented separately via ``instrument_fastapi_app()``.
            - SQLAlchemy needs an engine, so it is only reported as pending
              here (the original note says DatabaseComponent handles it).
        """
        enabled = []
        pending = []

        # SQLAlchemy instrumentation needs an engine; it is not enabled here.
        if self._config.instrument_sqlalchemy:
            pending.append("SQLAlchemy")

        # HTTPX instrumentation.
        if self._config.instrument_httpx and self._instrument_httpx():
            enabled.append("HTTPX")

        if enabled:
            logger.info(f"Instrumentation 已启用: {', '.join(enabled)}")
        if pending:
            logger.debug(f"Instrumentation 待启用(需要 engine): {', '.join(pending)}")

    def instrument_fastapi_app(self, app) -> None:
        """Instrument an already-created FastAPI application.

        Does nothing when the FastAPI instrumentation package is missing or
        ``instrument_fastapi`` is disabled.

        Args:
            app: FastAPI application instance.
        """
        if FastAPIInstrumentor is None:
            logger.debug("FastAPI instrumentation 未安装")
            return

        if not self._config.instrument_fastapi:
            return

        try:
            FastAPIInstrumentor.instrument_app(app)
            logger.info("FastAPI instrumentation 已启用")
        except Exception as e:
            logger.warning(f"FastAPI instrumentation 配置失败: {e}")

    def _instrument_httpx(self) -> bool:
        """Enable HTTPX client instrumentation.

        Returns:
            bool: ``True`` when instrumentation was enabled, ``False`` when the
            package is missing or enabling it raised.
        """
        if HTTPXClientInstrumentor is None:
            logger.debug("HTTPX instrumentation 未安装")
            return False

        try:
            HTTPXClientInstrumentor().instrument()
            return True
        except Exception as e:
            logger.warning(f"HTTPX instrumentation 配置失败: {e}")
            return False

    def add_processor(self, processor: "SpanProcessor") -> None:
        """Attach a custom span processor.

        Silently does nothing when no tracer provider has been created yet.

        Args:
            processor: SpanProcessor instance.
        """
        if self._provider:
            self._provider.add_span_processor(processor)

    @property
    def provider(self) -> "TracerProvider | None":
        """The underlying TracerProvider, or ``None`` if none was created."""
        return self._provider

    @property
    def is_initialized(self) -> bool:
        """Whether ``initialize()`` has completed successfully."""
        return self._initialized
318
+
319
+
320
+ __all__ = ["TelemetryConfig", "TelemetryProvider"]