aury-boot 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/__init__.py +2 -4
- aury/boot/application/app/base.py +126 -2
- aury/boot/application/app/components.py +226 -1
- aury/boot/application/config/settings.py +201 -3
- aury/boot/application/constants/components.py +3 -0
- aury/boot/application/middleware/logging.py +45 -6
- aury/boot/commands/docs.py +40 -0
- aury/boot/commands/init.py +2 -0
- aury/boot/commands/templates/project/AGENTS.md.tpl +59 -0
- aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
- aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
- aury/boot/commands/templates/project/env_templates/messaging.tpl +21 -13
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +63 -0
- aury/boot/common/logging/context.py +17 -1
- aury/boot/common/logging/format.py +4 -0
- aury/boot/infrastructure/__init__.py +4 -8
- aury/boot/infrastructure/channel/__init__.py +9 -8
- aury/boot/infrastructure/channel/backends/__init__.py +2 -6
- aury/boot/infrastructure/channel/backends/broadcaster.py +141 -0
- aury/boot/infrastructure/channel/base.py +11 -4
- aury/boot/infrastructure/channel/manager.py +25 -24
- aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
- aury/boot/infrastructure/events/__init__.py +4 -6
- aury/boot/infrastructure/events/backends/__init__.py +2 -4
- aury/boot/infrastructure/events/backends/broadcaster.py +189 -0
- aury/boot/infrastructure/events/base.py +9 -4
- aury/boot/infrastructure/events/manager.py +24 -20
- aury/boot/infrastructure/monitoring/__init__.py +210 -6
- aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
- aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
- aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
- aury/boot/infrastructure/monitoring/alerting/manager.py +430 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
- aury/boot/infrastructure/monitoring/alerting/rules.py +179 -0
- aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
- aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
- aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
- aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
- aury/boot/infrastructure/monitoring/tracing/processor.py +357 -0
- aury/boot/infrastructure/monitoring/tracing/provider.py +322 -0
- aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/METADATA +14 -1
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/RECORD +50 -33
- aury/boot/infrastructure/channel/backends/memory.py +0 -126
- aury/boot/infrastructure/channel/backends/redis.py +0 -130
- aury/boot/infrastructure/events/backends/memory.py +0 -86
- aury/boot/infrastructure/events/backends/redis.py +0 -169
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""自定义 SpanProcessor,用于检测慢 span 和异常 span 并触发告警。"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from collections.abc import Awaitable, Callable
|
|
7
|
+
import fnmatch
|
|
8
|
+
import re
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
11
|
+
from aury.boot.common.logging import logger
|
|
12
|
+
|
|
13
|
+
# OTel 可选依赖
|
|
14
|
+
try:
|
|
15
|
+
from opentelemetry.trace import SpanKind as OTelSpanKind
|
|
16
|
+
from opentelemetry.trace import StatusCode
|
|
17
|
+
except ImportError:
|
|
18
|
+
OTelSpanKind = None # type: ignore[assignment, misc]
|
|
19
|
+
StatusCode = None # type: ignore[assignment, misc]
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from asyncio import Task
|
|
23
|
+
|
|
24
|
+
from opentelemetry.sdk.trace import ReadableSpan, Span, SpanProcessor
|
|
25
|
+
|
|
26
|
+
# 告警回调类型: async def callback(event_type, message, **metadata)
|
|
27
|
+
AlertCallback = Callable[[str, str], Awaitable[None]] | Callable[..., Awaitable[None]]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AlertingSpanProcessor:
    """Span processor that turns slow or failed spans into alerts.

    Every finished span is checked for:
    - slowness: duration at or above the threshold configured for its kind;
    - failure: status ``ERROR`` on an HTTP server span whose HTTP status is
      not in the 4xx range.

    Alerts are delivered through an injected async callback, so this class
    does not depend on the alerting package. ``on_end`` itself is
    synchronous; the callback coroutine is scheduled on the running event
    loop and executes in the background.
    """

    # Strong references to in-flight alert tasks so they cannot be garbage
    # collected before they finish. Class-level, hence shared by all
    # processor instances.
    _background_tasks: set["Task"] = set()

    def __init__(
        self,
        *,
        slow_request_threshold: float = 1.0,
        slow_sql_threshold: float = 0.5,
        alert_on_slow_request: bool = True,
        alert_on_slow_sql: bool = True,
        alert_on_error: bool = True,
        alert_callback: AlertCallback | None = None,
        slow_request_exclude_paths: list[str] | None = None,
    ) -> None:
        """Store the alerting configuration.

        Args:
            slow_request_threshold: Slow threshold, in seconds, for every
                span kind except ``"database"``.
            slow_sql_threshold: Slow threshold, in seconds, for
                ``"database"`` spans.
            alert_on_slow_request: Whether slow non-database spans alert.
            alert_on_slow_sql: Whether slow database spans alert.
            alert_on_error: Whether failed HTTP server spans alert.
            alert_callback: Async callable invoked as
                ``callback(event_type, message, **metadata)``. When it is
                ``None`` no alert is ever emitted.
            slow_request_exclude_paths: ``fnmatch``-style patterns (``*``
                wildcard) for paths that must never raise a slow alert,
                e.g. SSE/WebSocket long-lived connections.
        """
        self._slow_request_threshold = slow_request_threshold
        self._slow_sql_threshold = slow_sql_threshold
        self._alert_on_slow_request = alert_on_slow_request
        self._alert_on_slow_sql = alert_on_slow_sql
        self._alert_on_error = alert_on_error
        self._alert_callback = alert_callback

        # fnmatch.translate() yields a fully anchored regex, so a pattern
        # has to describe the whole path.
        self._exclude_regexes: list[re.Pattern] = [
            re.compile(fnmatch.translate(pattern))
            for pattern in (slow_request_exclude_paths or ())
        ]

    def on_start(self, span: "Span", parent_context: object = None) -> None:
        """Called when a span starts; nothing to do."""

    def on_end(self, span: "ReadableSpan") -> None:
        """Inspect a finished span and emit slow / error alerts.

        The slow check and the error check are independent: a span on an
        excluded path skips only the slow alert and can still raise an
        error alert.
        """
        if StatusCode is None:
            # OpenTelemetry is not installed; nothing can be classified.
            return

        name = span.name
        # Span timestamps are in nanoseconds.
        duration_s = ((span.end_time or 0) - (span.start_time or 0)) / 1e9
        status = span.status
        trace_id = format(span.context.trace_id, "032x") if span.context else ""
        attributes = dict(span.attributes) if span.attributes else {}
        span_kind = _get_span_kind(span)

        # --- slow span ---------------------------------------------------
        threshold = self._get_slow_threshold(span_kind)
        if (
            self._should_alert_slow(span_kind)
            and threshold > 0
            and duration_s >= threshold
            and not self._is_path_excluded(name, attributes)
        ):
            self._emit_slow_alert(
                name=name,
                duration=duration_s,
                trace_id=trace_id,
                span_kind=span_kind,
                attributes=attributes,
                threshold=threshold,
            )

        # --- failed span -------------------------------------------------
        # Only HTTP server spans alert, so one failing request does not
        # produce one alert per nested span.
        if not (
            self._alert_on_error
            and status
            and status.status_code == StatusCode.ERROR
            and span_kind == "http"
        ):
            return

        # 4xx responses (401/403/404, ...) are business errors, not system
        # failures; only non-4xx errors alert.
        if 400 <= self._http_status_code(attributes) < 500:
            return

        exception_info = _extract_exception_info(span)
        self._emit_error_alert(
            name=name,
            duration=duration_s,
            trace_id=trace_id,
            span_kind=span_kind,
            error_message=exception_info.get("message") or status.description or "Unknown error",
            attributes=attributes,
            exception_info=exception_info,
        )

    def shutdown(self) -> None:
        """Shut the processor down; there is nothing to release."""

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Flush pending work; nothing is buffered, so always ``True``."""
        return True

    @staticmethod
    def _http_status_code(attributes: dict) -> int:
        """Return the span's HTTP status as an int, or 0 when unavailable.

        Reads the legacy ``http.status_code`` attribute first and falls back
        to the stable ``http.response.status_code``. Values that cannot be
        converted to ``int`` count as 0 so that a malformed attribute never
        raises out of ``on_end``.
        """
        raw = attributes.get("http.status_code")
        if raw is None:
            raw = attributes.get("http.response.status_code", 0)
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 0

    def _get_slow_threshold(self, span_kind: str) -> float:
        """Return the slow threshold (seconds) that applies to ``span_kind``."""
        if span_kind == "database":
            return self._slow_sql_threshold
        # Every other kind shares the HTTP request threshold.
        return self._slow_request_threshold

    def _should_alert_slow(self, span_kind: str) -> bool:
        """Return whether slow alerts are enabled for ``span_kind``."""
        if span_kind == "database":
            return self._alert_on_slow_sql
        # Every other kind is governed by the HTTP request switch.
        return self._alert_on_slow_request

    def _is_path_excluded(self, name: str, attributes: dict) -> bool:
        """Return whether the span's path matches an exclude pattern.

        The path is ``http.route``, else ``http.target``, else the span
        name. Because patterns are anchored, the value is matched both as-is
        and with any ``?query`` suffix removed, so ``/health`` also excludes
        ``/health?probe=1``.
        """
        if not self._exclude_regexes:
            return False

        path = str(
            attributes.get("http.route")
            or attributes.get("http.target")
            or name
        )
        candidates = [path]
        without_query = path.split("?", 1)[0]
        if without_query != path:
            candidates.append(without_query)

        return any(
            regex.match(candidate)
            for regex in self._exclude_regexes
            for candidate in candidates
        )

    def _emit_slow_alert(
        self,
        name: str,
        duration: float,
        trace_id: str,
        span_kind: str,
        attributes: dict,
        threshold: float,
    ) -> None:
        """Schedule a ``warning`` alert for a slow span."""
        callback = self._alert_callback
        if not callback:
            return

        self._schedule_alert(
            name,
            "慢 span",
            lambda: callback(
                _get_event_type_for_slow(span_kind),
                f"慢 {span_kind}: {name}",
                severity="warning",
                trace_id=trace_id,
                source=span_kind,
                duration=duration,
                threshold=threshold,
                **_extract_alert_context(attributes),
            ),
        )

    def _emit_error_alert(
        self,
        name: str,
        duration: float,
        trace_id: str,
        span_kind: str,
        error_message: str,
        attributes: dict,
        exception_info: dict | None = None,
    ) -> None:
        """Schedule an ``error`` alert for a failed span."""
        callback = self._alert_callback
        if not callback:
            return

        # Enrich the span context with the recorded exception, when present.
        extra_context = _extract_alert_context(attributes)
        if exception_info:
            if exception_info.get("type"):
                extra_context["error_type"] = exception_info["type"]
            if exception_info.get("stacktrace"):
                extra_context["stacktrace"] = exception_info["stacktrace"]

        self._schedule_alert(
            name,
            "异常 span",
            lambda: callback(
                "exception",
                f"异常: {error_message}",
                severity="error",
                trace_id=trace_id,
                source=span_kind,
                duration=duration,
                error_message=error_message,
                **extra_context,
            ),
        )

    def _schedule_alert(
        self,
        name: str,
        label: str,
        make_coro: Callable[[], Awaitable[None]],
    ) -> None:
        """Run the alert coroutine as a background task on the running loop.

        The loop is looked up *before* the coroutine is created, so a span
        ending outside an event loop does not leave an un-awaited coroutine
        behind. Alerting is best effort: a failure to build or schedule the
        coroutine is logged and never propagates into the code that ended
        the span.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            logger.debug(f"无法发送{label} 告警(无事件循环): {name}")
            return

        try:
            task = loop.create_task(make_coro())
        except Exception as exc:
            logger.warning(f"告警任务调度失败: {name}: {exc!r}")
            return

        self._background_tasks.add(task)
        task.add_done_callback(self._finalize_alert_task)

    @classmethod
    def _finalize_alert_task(cls, task: "Task") -> None:
        """Drop the finished task and log the exception it raised, if any."""
        cls._background_tasks.discard(task)
        if task.cancelled():
            return
        # Retrieving the exception also stops asyncio from reporting
        # "Task exception was never retrieved" later on.
        exc = task.exception()
        if exc is not None:
            logger.warning(f"告警回调执行失败: {exc!r}")
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _get_span_kind(span: "ReadableSpan") -> str:
    """Classify a finished span into the coarse kind label used for alerting.

    Returns one of ``"http"`` (SERVER spans), ``"database"``,
    ``"http_client"``, ``"client"``, ``"internal"``, ``"producer"``,
    ``"consumer"`` or ``"unknown"``. ``"unknown"`` is also returned when
    OpenTelemetry is not installed.

    CLIENT spans are refined by their attributes. Both the legacy names
    (``db.system``, ``http.url``, ``http.target``) and the stable
    semantic-convention names (``db.system.name``, ``url.full``,
    ``http.request.method``) are recognised, so spans produced by newer
    instrumentations are not lumped into the generic ``"client"`` kind.
    """
    if OTelSpanKind is None:
        return "unknown"

    kind = span.kind

    if kind == OTelSpanKind.CLIENT:
        # Outgoing call: tell database access and HTTP calls apart.
        attributes = span.attributes or {}
        if "db.system" in attributes or "db.system.name" in attributes:
            return "database"
        http_markers = ("http.url", "http.target", "url.full", "http.request.method")
        if any(marker in attributes for marker in http_markers):
            return "http_client"
        return "client"

    if kind == OTelSpanKind.SERVER:
        # Incoming request.
        return "http"
    if kind == OTelSpanKind.INTERNAL:
        return "internal"
    if kind == OTelSpanKind.PRODUCER:
        return "producer"
    if kind == OTelSpanKind.CONSUMER:
        return "consumer"

    return "unknown"
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _get_event_type_for_slow(span_kind: str) -> str:
|
|
287
|
+
"""根据 span 类型获取慢操作的告警事件类型。"""
|
|
288
|
+
mapping = {
|
|
289
|
+
"http": "slow_request",
|
|
290
|
+
"database": "slow_sql",
|
|
291
|
+
"http_client": "slow_request",
|
|
292
|
+
"internal": "slow_request", # internal span 也用 slow_request 类型
|
|
293
|
+
}
|
|
294
|
+
return mapping.get(span_kind, "custom")
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _extract_exception_info(span: "ReadableSpan") -> dict:
|
|
298
|
+
"""从 span events 中提取异常信息。
|
|
299
|
+
|
|
300
|
+
OTEL 会将异常作为 span event 记录,包含:
|
|
301
|
+
- exception.type: 异常类型
|
|
302
|
+
- exception.message: 异常消息
|
|
303
|
+
- exception.stacktrace: 堆栈信息
|
|
304
|
+
"""
|
|
305
|
+
info = {}
|
|
306
|
+
|
|
307
|
+
if not span.events:
|
|
308
|
+
return info
|
|
309
|
+
|
|
310
|
+
for event in span.events:
|
|
311
|
+
if event.name == "exception":
|
|
312
|
+
attrs = dict(event.attributes) if event.attributes else {}
|
|
313
|
+
if "exception.type" in attrs:
|
|
314
|
+
info["type"] = str(attrs["exception.type"])
|
|
315
|
+
if "exception.message" in attrs:
|
|
316
|
+
info["message"] = str(attrs["exception.message"])
|
|
317
|
+
if "exception.stacktrace" in attrs:
|
|
318
|
+
info["stacktrace"] = str(attrs["exception.stacktrace"])
|
|
319
|
+
break # 只取第一个异常事件
|
|
320
|
+
|
|
321
|
+
return info
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _extract_alert_context(attributes: dict) -> dict:
|
|
325
|
+
"""从 span 属性中提取告警上下文。"""
|
|
326
|
+
context = {}
|
|
327
|
+
|
|
328
|
+
# HTTP 相关
|
|
329
|
+
if "http.method" in attributes:
|
|
330
|
+
context["method"] = attributes["http.method"]
|
|
331
|
+
if "http.route" in attributes:
|
|
332
|
+
context["route"] = attributes["http.route"]
|
|
333
|
+
if "http.target" in attributes:
|
|
334
|
+
context["endpoint"] = attributes["http.target"]
|
|
335
|
+
if "http.url" in attributes:
|
|
336
|
+
context["url"] = attributes["http.url"]
|
|
337
|
+
if "http.status_code" in attributes:
|
|
338
|
+
context["status_code"] = attributes["http.status_code"]
|
|
339
|
+
|
|
340
|
+
# 数据库相关
|
|
341
|
+
if "db.system" in attributes:
|
|
342
|
+
context["db_system"] = attributes["db.system"]
|
|
343
|
+
if "db.statement" in attributes:
|
|
344
|
+
context["sql"] = _normalize_sql(str(attributes["db.statement"]))
|
|
345
|
+
|
|
346
|
+
return context
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _normalize_sql(sql: str) -> str:
|
|
350
|
+
"""清理 SQL 多余空白。"""
|
|
351
|
+
import re
|
|
352
|
+
# 将多个空白字符(包括换行、制表符)合并为单个空格
|
|
353
|
+
sql = re.sub(r'\s+', ' ', sql)
|
|
354
|
+
return sql.strip()
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
__all__ = ["AlertingSpanProcessor"]
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""OpenTelemetry TracerProvider 配置和初始化。"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from aury.boot.common.logging import logger
|
|
9
|
+
|
|
10
|
+
from .processor import AlertingSpanProcessor
|
|
11
|
+
|
|
12
|
+
# OTel 核心可选依赖
|
|
13
|
+
try:
|
|
14
|
+
from opentelemetry import trace as otel_trace
|
|
15
|
+
from opentelemetry.sdk.resources import Resource
|
|
16
|
+
from opentelemetry.sdk.trace import TracerProvider as OTelTracerProvider
|
|
17
|
+
except ImportError:
|
|
18
|
+
otel_trace = None # type: ignore[assignment]
|
|
19
|
+
Resource = None # type: ignore[assignment, misc]
|
|
20
|
+
OTelTracerProvider = None # type: ignore[assignment, misc]
|
|
21
|
+
|
|
22
|
+
# OTel Traces 导出可选依赖
|
|
23
|
+
try:
|
|
24
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
25
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
26
|
+
except ImportError:
|
|
27
|
+
OTLPSpanExporter = None # type: ignore[assignment, misc]
|
|
28
|
+
BatchSpanProcessor = None # type: ignore[assignment, misc]
|
|
29
|
+
|
|
30
|
+
# OTel Metrics 导出可选依赖
|
|
31
|
+
try:
|
|
32
|
+
from opentelemetry import metrics as otel_metrics
|
|
33
|
+
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
|
34
|
+
from opentelemetry.sdk.metrics import MeterProvider
|
|
35
|
+
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
|
36
|
+
except ImportError:
|
|
37
|
+
otel_metrics = None # type: ignore[assignment]
|
|
38
|
+
OTLPMetricExporter = None # type: ignore[assignment, misc]
|
|
39
|
+
MeterProvider = None # type: ignore[assignment, misc]
|
|
40
|
+
PeriodicExportingMetricReader = None # type: ignore[assignment, misc]
|
|
41
|
+
|
|
42
|
+
# OTel Instrumentation 可选依赖
|
|
43
|
+
try:
|
|
44
|
+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
|
45
|
+
except ImportError:
|
|
46
|
+
FastAPIInstrumentor = None # type: ignore[assignment, misc]
|
|
47
|
+
|
|
48
|
+
try:
|
|
49
|
+
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
|
50
|
+
except ImportError:
|
|
51
|
+
SQLAlchemyInstrumentor = None # type: ignore[assignment, misc]
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
|
55
|
+
except ImportError:
|
|
56
|
+
HTTPXClientInstrumentor = None # type: ignore[assignment, misc]
|
|
57
|
+
|
|
58
|
+
if TYPE_CHECKING:
|
|
59
|
+
from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class TelemetryConfig:
    """Telemetry configuration consumed by ``TelemetryProvider``."""

    # Service identity; written to the OpenTelemetry Resource as
    # service.name, service.version and deployment.environment.
    service_name: str = "aury-service"
    service_version: str = ""
    environment: str = "development"

    # Which automatic instrumentations to enable.
    instrument_fastapi: bool = True
    instrument_sqlalchemy: bool = True
    instrument_httpx: bool = True

    # Alerting; these values are passed to AlertingSpanProcessor when
    # alert_enabled is true.
    alert_enabled: bool = True
    slow_request_threshold: float = 1.0  # slow threshold for HTTP requests, in seconds
    slow_sql_threshold: float = 0.5  # slow threshold for SQL queries, in seconds
    alert_on_slow_request: bool = True  # whether slow HTTP requests raise an alert
    alert_on_slow_sql: bool = True  # whether slow SQL raises an alert
    alert_on_error: bool = True
    alert_callback: Any = None  # alert callback function
    slow_request_exclude_paths: list[str] = field(default_factory=list)  # paths excluded from slow-request alerts

    # OTLP traces export; the exporter is only configured when the endpoint is set.
    traces_endpoint: str | None = None
    traces_headers: dict[str, str] = field(default_factory=dict)

    # OTLP logs export.
    # NOTE(review): not read by TelemetryProvider in this module - presumably
    # consumed by the logging integration; confirm.
    logs_endpoint: str | None = None
    logs_headers: dict[str, str] = field(default_factory=dict)

    # OTLP metrics export; the exporter is only configured when the endpoint is set.
    metrics_endpoint: str | None = None
    metrics_headers: dict[str, str] = field(default_factory=dict)

    # Sampling.
    # NOTE(review): not read by TelemetryProvider in this module, so no
    # sampler is configured from it here - confirm whether that is intended.
    sampling_rate: float = 1.0  # 1.0 = 100%
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class TelemetryProvider:
    """Configure and own the OpenTelemetry providers of one service.

    Wraps creation of the ``TracerProvider`` (span processors, OTLP trace
    export), the optional ``MeterProvider`` (OTLP metric export) and the
    automatic instrumentations. Every OpenTelemetry package is optional: a
    missing package disables the matching feature instead of raising.
    """

    def __init__(self, config: TelemetryConfig) -> None:
        """Create an uninitialised provider.

        Args:
            config: Telemetry configuration.
        """
        self._config = config
        self._provider: "TracerProvider | None" = None
        # Kept so that shutdown() can also stop the periodic metric reader.
        self._meter_provider: Any = None
        self._initialized = False

    def initialize(self) -> bool:
        """Set up OpenTelemetry according to the configuration.

        Safe to call repeatedly; once initialised it returns ``True``
        without doing anything. When initialisation fails, whatever was
        already created is shut down again so that a retry starts clean.

        Returns:
            bool: Whether OpenTelemetry is initialised.
        """
        if self._initialized:
            return True

        if OTelTracerProvider is None:
            logger.warning("OpenTelemetry 初始化失败(缺少依赖)")
            return False

        try:
            self._provider = OTelTracerProvider(resource=self._build_resource())

            # Span processors (alerting, OTLP export) and the metrics exporter.
            self._setup_processors()

            # Register as the global tracer provider.
            otel_trace.set_tracer_provider(self._provider)

            self._setup_instrumentations()

            self._initialized = True
            logger.info(
                f"OpenTelemetry 初始化完成: "
                f"service={self._config.service_name}, "
                f"alert_enabled={self._config.alert_enabled}"
            )
            return True
        except Exception as e:
            logger.error(f"OpenTelemetry 初始化失败: {e}")
            # Do not keep a half-configured provider (and its export
            # threads) around.
            self._close_providers()
            return False

    def shutdown(self) -> None:
        """Shut down the tracer and meter providers and reset the state.

        After this call ``is_initialized`` is ``False`` and ``provider`` is
        ``None``, so ``initialize()`` can be called again.
        """
        if self._provider is None and self._meter_provider is None:
            self._initialized = False
            return

        if self._close_providers():
            logger.info("OpenTelemetry 已关闭")

    def _close_providers(self) -> bool:
        """Shut down every owned provider; return whether all succeeded."""
        targets = [
            target
            for target in (self._provider, self._meter_provider)
            if target is not None
        ]
        # Reset first, so that a failing shutdown cannot leave a dead
        # provider registered on this object.
        self._provider = None
        self._meter_provider = None
        self._initialized = False

        all_ok = True
        for target in targets:
            try:
                target.shutdown()
            except Exception as e:
                logger.warning(f"OpenTelemetry 关闭失败: {e}")
                all_ok = False
        return all_ok

    def _build_resource(self):
        """Create the Resource describing this service (shared by traces and metrics)."""
        return Resource.create({
            "service.name": self._config.service_name,
            "service.version": self._config.service_version,
            "deployment.environment": self._config.environment,
        })

    def _setup_processors(self) -> None:
        """Attach span processors and configure the OTLP exporters."""
        if not self._provider:
            return

        # Alerting processor.
        if self._config.alert_enabled:
            alerting_processor = AlertingSpanProcessor(
                slow_request_threshold=self._config.slow_request_threshold,
                slow_sql_threshold=self._config.slow_sql_threshold,
                alert_on_slow_request=self._config.alert_on_slow_request,
                alert_on_slow_sql=self._config.alert_on_slow_sql,
                alert_on_error=self._config.alert_on_error,
                alert_callback=self._config.alert_callback,
                slow_request_exclude_paths=self._config.slow_request_exclude_paths or None,
            )
            self._provider.add_span_processor(alerting_processor)
            logger.debug("已添加 AlertingSpanProcessor")

        # OTLP traces exporter.
        if self._config.traces_endpoint:
            self._setup_traces_exporter()

        # OTLP metrics exporter.
        if self._config.metrics_endpoint:
            self._setup_metrics_exporter()

    def _setup_traces_exporter(self) -> None:
        """Attach a batching OTLP trace exporter; failures are only logged."""
        if not self._provider or not self._config.traces_endpoint:
            return

        if OTLPSpanExporter is None:
            logger.warning("Traces OTLP 导出器未安装,跳过配置")
            return

        try:
            exporter = OTLPSpanExporter(
                endpoint=self._config.traces_endpoint,
                headers=self._config.traces_headers or None,
            )
            self._provider.add_span_processor(BatchSpanProcessor(exporter))
            logger.info(f"Traces OTLP 导出器已配置: {self._config.traces_endpoint}")
        except Exception as e:
            logger.warning(f"Traces OTLP 导出器配置失败: {e}")

    def _setup_metrics_exporter(self) -> None:
        """Create the global MeterProvider with an OTLP exporter; failures are only logged."""
        if not self._config.metrics_endpoint:
            return

        if OTLPMetricExporter is None:
            logger.warning("Metrics OTLP 导出器未安装,跳过配置")
            return

        try:
            exporter = OTLPMetricExporter(
                endpoint=self._config.metrics_endpoint,
                headers=self._config.metrics_headers or None,
            )
            reader = PeriodicExportingMetricReader(exporter)
            meter_provider = MeterProvider(
                resource=self._build_resource(),
                metric_readers=[reader],
            )
            otel_metrics.set_meter_provider(meter_provider)
            # Remember it so that shutdown() can stop it.
            self._meter_provider = meter_provider

            logger.info(f"Metrics OTLP 导出器已配置: {self._config.metrics_endpoint}")
        except Exception as e:
            logger.warning(f"Metrics OTLP 导出器配置失败: {e}")

    def _setup_instrumentations(self) -> None:
        """Enable the automatic instrumentations that need no extra object.

        Note:
        - FastAPI needs the app instance; call ``instrument_fastapi_app()``.
        - SQLAlchemy needs an engine, so it is only reported as pending
          here; per the original note it is handled by DatabaseComponent
          once the engine exists.
        """
        enabled = []
        pending = []

        if self._config.instrument_sqlalchemy:
            pending.append("SQLAlchemy")

        if self._config.instrument_httpx:
            if self._instrument_httpx():
                enabled.append("HTTPX")

        if enabled:
            logger.info(f"Instrumentation 已启用: {', '.join(enabled)}")
        if pending:
            logger.debug(f"Instrumentation 待启用(需要 engine): {', '.join(pending)}")

    def instrument_fastapi_app(self, app) -> None:
        """Instrument an already created FastAPI application.

        The instrumentation is bound to this object's tracer provider when
        there is one, so it does not depend on the global provider having
        been accepted (OpenTelemetry refuses to override a global provider
        that is already set). Without a provider the instrumentor falls
        back to the global one.

        Args:
            app: FastAPI application instance.
        """
        if FastAPIInstrumentor is None:
            logger.debug("FastAPI instrumentation 未安装")
            return

        if not self._config.instrument_fastapi:
            return

        try:
            FastAPIInstrumentor.instrument_app(app, tracer_provider=self._provider)
            logger.info("FastAPI instrumentation 已启用")
        except Exception as e:
            logger.warning(f"FastAPI instrumentation 配置失败: {e}")

    def _instrument_httpx(self) -> bool:
        """Enable HTTPX client instrumentation; return whether it succeeded."""
        if HTTPXClientInstrumentor is None:
            logger.debug("HTTPX instrumentation 未安装")
            return False

        try:
            # Bind to our own provider for the same reason as the FastAPI
            # instrumentation.
            HTTPXClientInstrumentor().instrument(tracer_provider=self._provider)
            return True
        except Exception as e:
            logger.warning(f"HTTPX instrumentation 配置失败: {e}")
            return False

    def add_processor(self, processor: "SpanProcessor") -> None:
        """Attach a custom span processor.

        Does nothing when no tracer provider exists (before ``initialize()``
        or after ``shutdown()``).

        Args:
            processor: SpanProcessor instance.
        """
        if self._provider:
            self._provider.add_span_processor(processor)

    @property
    def provider(self) -> "TracerProvider | None":
        """The owned TracerProvider, or ``None`` when there is none."""
        return self._provider

    @property
    def is_initialized(self) -> bool:
        """Whether ``initialize()`` has completed successfully."""
        return self._initialized
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
__all__ = ["TelemetryConfig", "TelemetryProvider"]
|