aury-boot 0.0.29__py3-none-any.whl → 0.0.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/app/base.py +126 -2
- aury/boot/application/app/components.py +224 -1
- aury/boot/application/config/settings.py +195 -3
- aury/boot/application/constants/components.py +3 -0
- aury/boot/application/middleware/logging.py +45 -6
- aury/boot/commands/docs.py +40 -0
- aury/boot/commands/init.py +2 -0
- aury/boot/commands/templates/project/AGENTS.md.tpl +5 -0
- aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
- aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +61 -0
- aury/boot/common/logging/context.py +17 -1
- aury/boot/common/logging/format.py +4 -0
- aury/boot/infrastructure/channel/base.py +6 -2
- aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
- aury/boot/infrastructure/monitoring/__init__.py +210 -6
- aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
- aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
- aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
- aury/boot/infrastructure/monitoring/alerting/manager.py +428 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
- aury/boot/infrastructure/monitoring/alerting/rules.py +163 -0
- aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
- aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
- aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
- aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
- aury/boot/infrastructure/monitoring/tracing/processor.py +327 -0
- aury/boot/infrastructure/monitoring/tracing/provider.py +320 -0
- aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/METADATA +14 -1
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/RECORD +38 -19
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# 告警系统
|
|
2
|
+
|
|
3
|
+
本文档介绍 {project_name} 项目中的告警系统配置和使用方法。
|
|
4
|
+
|
|
5
|
+
## 快速开始(简版配置)
|
|
6
|
+
|
|
7
|
+
所有告警发送到同一个飞书群:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# .env
|
|
11
|
+
ALERT__ENABLED=true
|
|
12
|
+
ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
13
|
+
ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
框架会自动创建默认规则,检测慢请求、慢 SQL、异常并发送告警。
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## 完整版配置(分群告警)
|
|
21
|
+
|
|
22
|
+
不同类型的告警发送到不同的飞书群,避免一个群接收所有消息。
|
|
23
|
+
|
|
24
|
+
### 1. 环境变量
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# .env
|
|
28
|
+
|
|
29
|
+
# ============ 告警系统 ============
|
|
30
|
+
ALERT__ENABLED=true
|
|
31
|
+
ALERT__RULES_FILE=alert_rules.yaml
|
|
32
|
+
|
|
33
|
+
# 性能群(慢请求、慢SQL)
|
|
34
|
+
ALERT__NOTIFIERS__PERF_GROUP__TYPE=feishu
|
|
35
|
+
ALERT__NOTIFIERS__PERF_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/perf-xxx
|
|
36
|
+
ALERT__NOTIFIERS__PERF_GROUP__SECRET=your-secret # 可选
|
|
37
|
+
|
|
38
|
+
# 错误群(异常)
|
|
39
|
+
ALERT__NOTIFIERS__ERROR_GROUP__TYPE=feishu
|
|
40
|
+
ALERT__NOTIFIERS__ERROR_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/error-xxx
|
|
41
|
+
|
|
42
|
+
# 运维群(任务失败、超时)
|
|
43
|
+
ALERT__NOTIFIERS__OPS_GROUP__TYPE=feishu
|
|
44
|
+
ALERT__NOTIFIERS__OPS_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/ops-xxx
|
|
45
|
+
|
|
46
|
+
# 慢操作阈值
|
|
47
|
+
ALERT__SLOW_REQUEST_THRESHOLD=1.0
|
|
48
|
+
ALERT__SLOW_SQL_THRESHOLD=0.5
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### 2. 规则文件
|
|
52
|
+
|
|
53
|
+
生成规则模板:`aury docs alert-rules`
|
|
54
|
+
|
|
55
|
+
```yaml
|
|
56
|
+
# alert_rules.yaml
|
|
57
|
+
defaults:
|
|
58
|
+
slow_request_threshold: 1.0
|
|
59
|
+
slow_sql_threshold: 0.5
|
|
60
|
+
aggregate_window: 10
|
|
61
|
+
suppress_seconds: 300
|
|
62
|
+
|
|
63
|
+
rules:
|
|
64
|
+
# 慢请求 → 性能群
|
|
65
|
+
- name: slow_request
|
|
66
|
+
event_types: [slow_request]
|
|
67
|
+
aggregate_threshold: 5
|
|
68
|
+
notifiers: [perf_group]
|
|
69
|
+
|
|
70
|
+
# 慢 SQL → 性能群
|
|
71
|
+
- name: slow_sql
|
|
72
|
+
event_types: [slow_sql]
|
|
73
|
+
aggregate_threshold: 10
|
|
74
|
+
notifiers: [perf_group]
|
|
75
|
+
|
|
76
|
+
# 异常 → 错误群(立即告警)
|
|
77
|
+
- name: exception
|
|
78
|
+
event_types: [exception]
|
|
79
|
+
aggregate_threshold: 1
|
|
80
|
+
suppress_seconds: 60
|
|
81
|
+
notifiers: [error_group]
|
|
82
|
+
|
|
83
|
+
# 任务失败/超时 → 运维群
|
|
84
|
+
- name: task_issues
|
|
85
|
+
event_types: [task_failure, task_timeout]
|
|
86
|
+
aggregate_threshold: 1
|
|
87
|
+
notifiers: [ops_group]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### 3. 效果
|
|
91
|
+
|
|
92
|
+
| 事件类型 | 目标群 | 触发条件 |
|
|
93
|
+
|---------|--------|----------|
|
|
94
|
+
| 慢请求(>1s) | 性能群 | 10秒内累计5次 |
|
|
95
|
+
| 慢 SQL(>0.5s) | 性能群 | 10秒内累计10次 |
|
|
96
|
+
| 异常 | 错误群 | 立即告警 |
|
|
97
|
+
| 任务失败/超时 | 运维群 | 立即告警 |
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 代码中手动发送告警
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from aury.boot.infrastructure.monitoring.alerting import emit_alert, AlertEventType, AlertSeverity
|
|
105
|
+
|
|
106
|
+
# 发送自定义告警
|
|
107
|
+
await emit_alert(
|
|
108
|
+
AlertEventType.CUSTOM,
|
|
109
|
+
"订单支付超时",
|
|
110
|
+
severity=AlertSeverity.WARNING,
|
|
111
|
+
order_id="12345",
|
|
112
|
+
user_id="u001",
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
# 发送慢 SQL 告警(通常由框架自动触发)
|
|
116
|
+
await emit_alert(
|
|
117
|
+
AlertEventType.SLOW_SQL,
|
|
118
|
+
"慢查询告警",
|
|
119
|
+
duration=2.5,
|
|
120
|
+
sql="SELECT * FROM orders WHERE ...",
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 通知器类型
|
|
127
|
+
|
|
128
|
+
### 飞书(feishu)
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
ALERT__NOTIFIERS__XXX__TYPE=feishu
|
|
132
|
+
ALERT__NOTIFIERS__XXX__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
|
|
133
|
+
ALERT__NOTIFIERS__XXX__SECRET=xxx # 可选,签名密钥
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### 通用 Webhook
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
ALERT__NOTIFIERS__XXX__TYPE=webhook
|
|
140
|
+
ALERT__NOTIFIERS__XXX__URL=https://your-system.com/alert
|
|
141
|
+
ALERT__NOTIFIERS__XXX__METHOD=POST
|
|
142
|
+
ALERT__NOTIFIERS__XXX__HEADERS='{{"Authorization": "Bearer xxx"}}'
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 自定义通知器
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from aury.boot.infrastructure.monitoring.alerting import AlertNotifier, AlertManager
|
|
149
|
+
|
|
150
|
+
class DingTalkNotifier(AlertNotifier):
|
|
151
|
+
@classmethod
|
|
152
|
+
def from_config(cls, config: dict) -> "DingTalkNotifier":
|
|
153
|
+
return cls(webhook=config["webhook"])
|
|
154
|
+
|
|
155
|
+
async def send(self, notification) -> bool:
|
|
156
|
+
# 实现发送逻辑
|
|
157
|
+
...
|
|
158
|
+
|
|
159
|
+
# 注册
|
|
160
|
+
AlertManager.register_notifier_class("dingtalk", DingTalkNotifier)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
然后在环境变量中使用:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
ALERT__NOTIFIERS__DING__TYPE=dingtalk
|
|
167
|
+
ALERT__NOTIFIERS__DING__WEBHOOK=https://oapi.dingtalk.com/robot/send?access_token=xxx
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## 告警事件类型
|
|
173
|
+
|
|
174
|
+
| 类型 | 说明 | 自动触发 |
|
|
175
|
+
|-----|------|----------|
|
|
176
|
+
| `slow_request` | 慢 HTTP 请求 | 框架自动检测 |
|
|
177
|
+
| `slow_sql` | 慢 SQL 查询 | 框架自动检测 |
|
|
178
|
+
| `exception` | 异常/错误 | 框架自动检测 |
|
|
179
|
+
| `task_failure` | 任务执行失败 | 任务系统触发 |
|
|
180
|
+
| `task_timeout` | 任务执行超时 | 任务系统触发 |
|
|
181
|
+
| `custom` | 自定义告警 | 手动调用 emit_alert |
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## 告警抑制与聚合
|
|
186
|
+
|
|
187
|
+
- **聚合窗口**:在窗口时间内累计触发次数,达到阈值才发送告警
|
|
188
|
+
- **抑制时间**:同一告警在抑制时间内不会重复发送
|
|
189
|
+
- **示例**:`aggregate_window=60, aggregate_threshold=5` 表示 60 秒内触发 5 次才告警
|
|
190
|
+
|
|
191
|
+
这样可以避免告警风暴,同时不遗漏重要问题。
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## 环境变量参考
|
|
196
|
+
|
|
197
|
+
| 变量 | 说明 | 默认值 |
|
|
198
|
+
|------|------|--------|
|
|
199
|
+
| `ALERT__ENABLED` | 是否启用告警 | `false` |
|
|
200
|
+
| `ALERT__RULES_FILE` | 规则文件路径 | - |
|
|
201
|
+
| `ALERT__SLOW_REQUEST_THRESHOLD` | 慢请求阈值(秒) | `1.0` |
|
|
202
|
+
| `ALERT__SLOW_SQL_THRESHOLD` | 慢 SQL 阈值(秒) | `0.5` |
|
|
203
|
+
| `ALERT__ALERT_ON_SLOW_REQUEST` | 是否对慢请求告警 | `true` |
|
|
204
|
+
| `ALERT__ALERT_ON_SLOW_SQL` | 是否对慢 SQL 告警 | `true` |
|
|
205
|
+
| `ALERT__ALERT_ON_ERROR` | 是否对异常告警 | `true` |
|
|
206
|
+
| `ALERT__AGGREGATE_WINDOW` | 聚合窗口(秒) | `10` |
|
|
207
|
+
| `ALERT__SLOW_REQUEST_AGGREGATE` | 慢请求触发阈值(窗口内次数) | `5` |
|
|
208
|
+
| `ALERT__SLOW_SQL_AGGREGATE` | 慢 SQL 触发阈值 | `10` |
|
|
209
|
+
| `ALERT__EXCEPTION_AGGREGATE` | 异常触发阈值 | `1` |
|
|
210
|
+
| `ALERT__SUPPRESS_SECONDS` | 抑制时间(秒) | `300` |
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# 监控与告警配置
|
|
3
|
+
# =============================================================================
|
|
4
|
+
|
|
5
|
+
# ---------- OpenTelemetry 遥测 ----------
|
|
6
|
+
# 启用后自动 instrument FastAPI、SQLAlchemy、httpx
|
|
7
|
+
TELEMETRY__ENABLED=false
|
|
8
|
+
# TELEMETRY__SAMPLING_RATE=1.0
|
|
9
|
+
|
|
10
|
+
# OTLP 导出(可选)
|
|
11
|
+
# TELEMETRY__TRACES_ENDPOINT=http://jaeger:4317
|
|
12
|
+
# TELEMETRY__LOGS_ENDPOINT=http://loki:3100
|
|
13
|
+
# TELEMETRY__METRICS_ENDPOINT=http://prometheus:9090
|
|
14
|
+
|
|
15
|
+
# ---------- 告警系统 ----------
|
|
16
|
+
ALERT__ENABLED=false
|
|
17
|
+
# 慢操作阈值
|
|
18
|
+
ALERT__SLOW_REQUEST_THRESHOLD=1.0
|
|
19
|
+
ALERT__SLOW_SQL_THRESHOLD=0.5
|
|
20
|
+
# 告警开关
|
|
21
|
+
ALERT__ALERT_ON_SLOW_REQUEST=true
|
|
22
|
+
ALERT__ALERT_ON_SLOW_SQL=true
|
|
23
|
+
ALERT__ALERT_ON_ERROR=true
|
|
24
|
+
# 聚合与抑制
|
|
25
|
+
ALERT__AGGREGATE_WINDOW=10
|
|
26
|
+
ALERT__SLOW_REQUEST_AGGREGATE=5
|
|
27
|
+
ALERT__SLOW_SQL_AGGREGATE=10
|
|
28
|
+
ALERT__EXCEPTION_AGGREGATE=1
|
|
29
|
+
ALERT__SUPPRESS_SECONDS=300
|
|
30
|
+
|
|
31
|
+
# ---------- 告警通知器(简版:单群)----------
|
|
32
|
+
# 所有告警发到同一个飞书群,取消注释并填写 Webhook 即可
|
|
33
|
+
# ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
34
|
+
# ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/your-webhook-id
|
|
35
|
+
# ALERT__NOTIFIERS__DEFAULT__SECRET=your-secret
|
|
36
|
+
|
|
37
|
+
# ---------- 告警通知器(完整版:分群告警)----------
|
|
38
|
+
# 不同类型告警发到不同群,需配合 alert_rules.yaml 使用
|
|
39
|
+
# 生成规则模板:aury docs alert-rules
|
|
40
|
+
# ALERT__RULES_FILE=alert_rules.yaml
|
|
41
|
+
|
|
42
|
+
# 性能群(慢请求、慢SQL)
|
|
43
|
+
# ALERT__NOTIFIERS__PERF_GROUP__TYPE=feishu
|
|
44
|
+
# ALERT__NOTIFIERS__PERF_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/perf-webhook-id
|
|
45
|
+
# ALERT__NOTIFIERS__PERF_GROUP__SECRET=perf-secret
|
|
46
|
+
|
|
47
|
+
# 错误群(异常)
|
|
48
|
+
# ALERT__NOTIFIERS__ERROR_GROUP__TYPE=feishu
|
|
49
|
+
# ALERT__NOTIFIERS__ERROR_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/error-webhook-id
|
|
50
|
+
# ALERT__NOTIFIERS__ERROR_GROUP__SECRET=error-secret
|
|
51
|
+
|
|
52
|
+
# 运维群(任务失败、超时)
|
|
53
|
+
# ALERT__NOTIFIERS__OPS_GROUP__TYPE=feishu
|
|
54
|
+
# ALERT__NOTIFIERS__OPS_GROUP__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/ops-webhook-id
|
|
55
|
+
# ALERT__NOTIFIERS__OPS_GROUP__SECRET=ops-secret
|
|
56
|
+
|
|
57
|
+
# 通用 Webhook(自定义系统)
|
|
58
|
+
# ALERT__NOTIFIERS__CUSTOM__TYPE=webhook
|
|
59
|
+
# ALERT__NOTIFIERS__CUSTOM__URL=https://your-system.com/api/alert
|
|
60
|
+
# ALERT__NOTIFIERS__CUSTOM__METHOD=POST
|
|
61
|
+
# ALERT__NOTIFIERS__CUSTOM__HEADERS='{{"Authorization": "Bearer your-token"}}'
|
|
@@ -58,8 +58,24 @@ def set_service_context(context: ServiceContext | str) -> None:
|
|
|
58
58
|
def get_trace_id() -> str:
|
|
59
59
|
"""获取当前链路追踪ID。
|
|
60
60
|
|
|
61
|
-
|
|
61
|
+
优先从 OpenTelemetry 获取(如果已启用),否则使用内置 trace_id。
|
|
62
|
+
如果都没有设置,则生成一个新的随机 ID。
|
|
62
63
|
"""
|
|
64
|
+
# 优先从 OTel 获取
|
|
65
|
+
try:
|
|
66
|
+
from opentelemetry import trace
|
|
67
|
+
|
|
68
|
+
span = trace.get_current_span()
|
|
69
|
+
if span and span.is_recording():
|
|
70
|
+
otel_trace_id = span.get_span_context().trace_id
|
|
71
|
+
if otel_trace_id:
|
|
72
|
+
return format(otel_trace_id, "032x")
|
|
73
|
+
except ImportError:
|
|
74
|
+
pass
|
|
75
|
+
except Exception:
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
# 回退到内置实现
|
|
63
79
|
trace_id = _trace_id_var.get()
|
|
64
80
|
if not trace_id:
|
|
65
81
|
trace_id = str(uuid.uuid4())
|
|
@@ -309,7 +309,11 @@ def log_exception(
|
|
|
309
309
|
|
|
310
310
|
__all__ = [
|
|
311
311
|
"create_console_sink",
|
|
312
|
+
"format_exception_compact",
|
|
312
313
|
"format_exception_java_style",
|
|
313
314
|
"format_message",
|
|
314
315
|
"log_exception",
|
|
315
316
|
]
|
|
317
|
+
|
|
318
|
+
# 别名导出(保持内部使用 _ 前缀)
|
|
319
|
+
format_exception_compact = _format_exception_compact
|
|
@@ -31,7 +31,10 @@ class ChannelMessage:
|
|
|
31
31
|
timestamp: datetime = field(default_factory=datetime.now)
|
|
32
32
|
|
|
33
33
|
def to_sse(self) -> str:
|
|
34
|
-
"""转换为 SSE 格式。
|
|
34
|
+
"""转换为 SSE 格式。
|
|
35
|
+
|
|
36
|
+
SSE 规范要求每个消息以双换行符结束。
|
|
37
|
+
"""
|
|
35
38
|
lines = []
|
|
36
39
|
if self.id:
|
|
37
40
|
lines.append(f"id: {self.id}")
|
|
@@ -42,7 +45,8 @@ class ChannelMessage:
|
|
|
42
45
|
for line in data_str.split("\n"):
|
|
43
46
|
lines.append(f"data: {line}")
|
|
44
47
|
lines.append("") # 空行结束
|
|
45
|
-
|
|
48
|
+
# SSE 规范要求消息以双换行符结束
|
|
49
|
+
return "\n".join(lines) + "\n"
|
|
46
50
|
|
|
47
51
|
|
|
48
52
|
class IChannel(ABC):
|
|
@@ -12,8 +12,8 @@ import time
|
|
|
12
12
|
|
|
13
13
|
from aury.boot.common.logging import logger
|
|
14
14
|
|
|
15
|
-
#
|
|
16
|
-
|
|
15
|
+
# 默认慢查询阈值(秒)
|
|
16
|
+
DEFAULT_SLOW_QUERY_THRESHOLD = 1.0
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def cache_query(
|
|
@@ -86,7 +86,7 @@ def cache_query(
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
def monitor_query(
|
|
89
|
-
slow_threshold: float =
|
|
89
|
+
slow_threshold: float = DEFAULT_SLOW_QUERY_THRESHOLD,
|
|
90
90
|
enable_explain: bool = False,
|
|
91
91
|
) -> Callable:
|
|
92
92
|
"""查询性能监控装饰器。
|
|
@@ -104,7 +104,6 @@ def monitor_query(
|
|
|
104
104
|
async def list(self, **filters):
|
|
105
105
|
return await super().list(**filters)
|
|
106
106
|
"""
|
|
107
|
-
|
|
108
107
|
def decorator(func: Callable) -> Callable:
|
|
109
108
|
@wraps(func)
|
|
110
109
|
async def wrapper(self, *args, **kwargs):
|
|
@@ -154,7 +153,6 @@ def monitor_query(
|
|
|
154
153
|
|
|
155
154
|
return decorator
|
|
156
155
|
|
|
157
|
-
|
|
158
156
|
__all__ = [
|
|
159
157
|
"cache_query",
|
|
160
158
|
"monitor_query",
|
|
@@ -98,15 +98,18 @@ class SlowMethodDetectorComponent(MonitorComponent):
|
|
|
98
98
|
"""慢方法检测组件。
|
|
99
99
|
|
|
100
100
|
检测执行时间超过阈值的方法并记录警告。
|
|
101
|
+
支持可选的告警通知(飞书/Webhook)。
|
|
101
102
|
"""
|
|
102
103
|
|
|
103
|
-
def __init__(self, threshold: float) -> None:
|
|
104
|
+
def __init__(self, threshold: float, *, alert: bool = False) -> None:
|
|
104
105
|
"""初始化慢方法检测组件。
|
|
105
106
|
|
|
106
107
|
Args:
|
|
107
108
|
threshold: 慢方法阈值(秒)
|
|
109
|
+
alert: 是否发送告警通知
|
|
108
110
|
"""
|
|
109
111
|
self._threshold = threshold
|
|
112
|
+
self._alert = alert
|
|
110
113
|
|
|
111
114
|
async def process(self, context: MonitorContext) -> None:
|
|
112
115
|
"""检测慢方法。
|
|
@@ -119,6 +122,156 @@ class SlowMethodDetectorComponent(MonitorComponent):
|
|
|
119
122
|
f"慢方法检测: {context.func_name} 执行时间 {context.duration:.3f}s "
|
|
120
123
|
f"(阈值: {self._threshold}s)"
|
|
121
124
|
)
|
|
125
|
+
|
|
126
|
+
# 发送告警通知
|
|
127
|
+
if self._alert:
|
|
128
|
+
await self._emit_alert(context)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def _emit_alert(self, context: MonitorContext) -> None:
|
|
132
|
+
"""发送慢方法告警。"""
|
|
133
|
+
try:
|
|
134
|
+
from aury.boot.infrastructure.monitoring.alerting import (
|
|
135
|
+
AlertEventType,
|
|
136
|
+
AlertSeverity,
|
|
137
|
+
emit_alert,
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
await emit_alert(
|
|
141
|
+
AlertEventType.CUSTOM,
|
|
142
|
+
f"慢方法: {context.func_name}",
|
|
143
|
+
severity=AlertSeverity.WARNING,
|
|
144
|
+
source="service",
|
|
145
|
+
duration=context.duration,
|
|
146
|
+
threshold=self._threshold,
|
|
147
|
+
service=context.service_name,
|
|
148
|
+
)
|
|
149
|
+
except ImportError:
|
|
150
|
+
pass # alerting 模块未加载
|
|
151
|
+
except Exception as e:
|
|
152
|
+
logger.debug(f"发送慢方法告警失败: {e}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# =============================================================================
|
|
156
|
+
# HTTP 请求监控
|
|
157
|
+
# =============================================================================
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def monitor_http_request(
|
|
161
|
+
*,
|
|
162
|
+
method: str,
|
|
163
|
+
path: str,
|
|
164
|
+
duration: float,
|
|
165
|
+
status_code: int,
|
|
166
|
+
threshold: float = 1.0,
|
|
167
|
+
alert: bool = False,
|
|
168
|
+
trace_id: str = "",
|
|
169
|
+
exception: Exception | None = None,
|
|
170
|
+
) -> None:
|
|
171
|
+
"""监控 HTTP 请求。
|
|
172
|
+
|
|
173
|
+
由 middleware 调用,统一处理 HTTP 请求的慢请求检测和告警。
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
method: HTTP 方法
|
|
177
|
+
path: 请求路径
|
|
178
|
+
duration: 执行时间(秒)
|
|
179
|
+
status_code: 响应状态码
|
|
180
|
+
threshold: 慢请求阈值(秒)
|
|
181
|
+
alert: 是否发送告警
|
|
182
|
+
trace_id: 追踪 ID
|
|
183
|
+
exception: 异常对象(如果有)
|
|
184
|
+
"""
|
|
185
|
+
# 慢请求检测和告警
|
|
186
|
+
if duration >= threshold:
|
|
187
|
+
if alert:
|
|
188
|
+
await _emit_http_slow_alert(
|
|
189
|
+
method=method,
|
|
190
|
+
path=path,
|
|
191
|
+
duration=duration,
|
|
192
|
+
threshold=threshold,
|
|
193
|
+
trace_id=trace_id,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
# 异常告警
|
|
197
|
+
if exception is not None and alert:
|
|
198
|
+
await _emit_http_exception_alert(
|
|
199
|
+
method=method,
|
|
200
|
+
path=path,
|
|
201
|
+
duration=duration,
|
|
202
|
+
exception=exception,
|
|
203
|
+
trace_id=trace_id,
|
|
204
|
+
)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
async def _emit_http_slow_alert(
|
|
208
|
+
method: str,
|
|
209
|
+
path: str,
|
|
210
|
+
duration: float,
|
|
211
|
+
threshold: float,
|
|
212
|
+
trace_id: str,
|
|
213
|
+
) -> None:
|
|
214
|
+
"""发送 HTTP 慢请求告警。"""
|
|
215
|
+
try:
|
|
216
|
+
from aury.boot.infrastructure.monitoring.alerting import (
|
|
217
|
+
AlertEventType,
|
|
218
|
+
AlertSeverity,
|
|
219
|
+
emit_alert,
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
await emit_alert(
|
|
223
|
+
AlertEventType.SLOW_REQUEST,
|
|
224
|
+
f"慢请求: {method} {path}",
|
|
225
|
+
severity=AlertSeverity.WARNING,
|
|
226
|
+
trace_id=trace_id,
|
|
227
|
+
source="api",
|
|
228
|
+
duration=duration,
|
|
229
|
+
threshold=threshold,
|
|
230
|
+
endpoint=path,
|
|
231
|
+
method=method,
|
|
232
|
+
)
|
|
233
|
+
except ImportError:
|
|
234
|
+
pass
|
|
235
|
+
except Exception as e:
|
|
236
|
+
logger.debug(f"发送慢请求告警失败: {e}")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
async def _emit_http_exception_alert(
|
|
240
|
+
method: str,
|
|
241
|
+
path: str,
|
|
242
|
+
duration: float,
|
|
243
|
+
exception: Exception,
|
|
244
|
+
trace_id: str,
|
|
245
|
+
) -> None:
|
|
246
|
+
"""发送 HTTP 异常告警。"""
|
|
247
|
+
try:
|
|
248
|
+
from aury.boot.infrastructure.monitoring.alerting import (
|
|
249
|
+
AlertEventType,
|
|
250
|
+
AlertSeverity,
|
|
251
|
+
emit_alert,
|
|
252
|
+
)
|
|
253
|
+
|
|
254
|
+
await emit_alert(
|
|
255
|
+
AlertEventType.EXCEPTION,
|
|
256
|
+
f"请求异常: {method} {path} - {type(exception).__name__}: {exception}",
|
|
257
|
+
severity=AlertSeverity.ERROR,
|
|
258
|
+
trace_id=trace_id,
|
|
259
|
+
source="api",
|
|
260
|
+
duration=duration,
|
|
261
|
+
endpoint=path,
|
|
262
|
+
method=method,
|
|
263
|
+
error_type=type(exception).__name__,
|
|
264
|
+
error_message=str(exception),
|
|
265
|
+
)
|
|
266
|
+
except ImportError:
|
|
267
|
+
pass
|
|
268
|
+
except Exception as e:
|
|
269
|
+
logger.debug(f"发送异常告警失败: {e}")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# =============================================================================
|
|
273
|
+
# Service 方法监控组件
|
|
274
|
+
# =============================================================================
|
|
122
275
|
|
|
123
276
|
|
|
124
277
|
class StandardMetricsReporterComponent(MonitorComponent):
|
|
@@ -172,8 +325,17 @@ class ErrorReporterComponent(MonitorComponent):
|
|
|
172
325
|
"""错误报告组件。
|
|
173
326
|
|
|
174
327
|
报告方法执行失败的错误信息。
|
|
328
|
+
支持可选的告警通知(飞书/Webhook)。
|
|
175
329
|
"""
|
|
176
330
|
|
|
331
|
+
def __init__(self, *, alert: bool = False) -> None:
|
|
332
|
+
"""初始化错误报告组件。
|
|
333
|
+
|
|
334
|
+
Args:
|
|
335
|
+
alert: 是否发送告警通知
|
|
336
|
+
"""
|
|
337
|
+
self._alert = alert
|
|
338
|
+
|
|
177
339
|
async def process(self, context: MonitorContext) -> None:
|
|
178
340
|
"""报告错误信息。
|
|
179
341
|
|
|
@@ -186,6 +348,34 @@ class ErrorReporterComponent(MonitorComponent):
|
|
|
186
348
|
f"执行时间: {context.duration:.3f}s | "
|
|
187
349
|
f"异常: {type(context.exception).__name__}: {context.exception}"
|
|
188
350
|
)
|
|
351
|
+
|
|
352
|
+
# 发送告警通知
|
|
353
|
+
if self._alert:
|
|
354
|
+
await self._emit_alert(context)
|
|
355
|
+
|
|
356
|
+
async def _emit_alert(self, context: MonitorContext) -> None:
|
|
357
|
+
"""发送异常告警。"""
|
|
358
|
+
try:
|
|
359
|
+
from aury.boot.infrastructure.monitoring.alerting import (
|
|
360
|
+
AlertEventType,
|
|
361
|
+
AlertSeverity,
|
|
362
|
+
emit_alert,
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
await emit_alert(
|
|
366
|
+
AlertEventType.EXCEPTION,
|
|
367
|
+
f"方法异常: {context.func_name}",
|
|
368
|
+
severity=AlertSeverity.ERROR,
|
|
369
|
+
source="service",
|
|
370
|
+
duration=context.duration,
|
|
371
|
+
service=context.service_name,
|
|
372
|
+
exception_type=type(context.exception).__name__,
|
|
373
|
+
exception_message=str(context.exception),
|
|
374
|
+
)
|
|
375
|
+
except ImportError:
|
|
376
|
+
pass # alerting 模块未加载
|
|
377
|
+
except Exception as e:
|
|
378
|
+
logger.debug(f"发送异常告警失败: {e}")
|
|
189
379
|
|
|
190
380
|
|
|
191
381
|
class MonitorPipeline:
|
|
@@ -278,19 +468,22 @@ class MonitorPipelineBuilder:
|
|
|
278
468
|
def with_slow_detector(
|
|
279
469
|
self,
|
|
280
470
|
threshold: float = 1.0,
|
|
471
|
+
*,
|
|
472
|
+
alert: bool = False,
|
|
281
473
|
detector: SlowMethodDetectorComponent | None = None,
|
|
282
474
|
) -> MonitorPipelineBuilder:
|
|
283
475
|
"""添加慢方法检测组件。
|
|
284
476
|
|
|
285
477
|
Args:
|
|
286
478
|
threshold: 慢方法阈值(秒)
|
|
479
|
+
alert: 是否发送告警通知
|
|
287
480
|
detector: 慢方法检测组件,如果为 None 则创建新实例
|
|
288
481
|
|
|
289
482
|
Returns:
|
|
290
483
|
MonitorPipelineBuilder: 构建器实例(支持链式调用)
|
|
291
484
|
"""
|
|
292
485
|
if detector is None:
|
|
293
|
-
detector = SlowMethodDetectorComponent(threshold)
|
|
486
|
+
detector = SlowMethodDetectorComponent(threshold, alert=alert)
|
|
294
487
|
self._components.append(detector)
|
|
295
488
|
return self
|
|
296
489
|
|
|
@@ -318,18 +511,21 @@ class MonitorPipelineBuilder:
|
|
|
318
511
|
|
|
319
512
|
def with_error_reporter(
|
|
320
513
|
self,
|
|
514
|
+
*,
|
|
515
|
+
alert: bool = False,
|
|
321
516
|
reporter: ErrorReporterComponent | None = None,
|
|
322
517
|
) -> MonitorPipelineBuilder:
|
|
323
518
|
"""添加错误报告组件。
|
|
324
519
|
|
|
325
520
|
Args:
|
|
521
|
+
alert: 是否发送告警通知
|
|
326
522
|
reporter: 错误报告组件,如果为 None 则创建新实例
|
|
327
523
|
|
|
328
524
|
Returns:
|
|
329
525
|
MonitorPipelineBuilder: 构建器实例(支持链式调用)
|
|
330
526
|
"""
|
|
331
527
|
if reporter is None:
|
|
332
|
-
reporter = ErrorReporterComponent()
|
|
528
|
+
reporter = ErrorReporterComponent(alert=alert)
|
|
333
529
|
self._components.append(reporter)
|
|
334
530
|
return self
|
|
335
531
|
|
|
@@ -370,6 +566,7 @@ def monitor(
|
|
|
370
566
|
slow_threshold: float = 1.0,
|
|
371
567
|
metrics: bool = True,
|
|
372
568
|
prometheus_format: bool = False,
|
|
569
|
+
alert: bool = False,
|
|
373
570
|
pipeline: MonitorPipeline | None = None,
|
|
374
571
|
components: list[MonitorComponent] | None = None,
|
|
375
572
|
pipeline_builder: Callable[[], MonitorPipeline] | None = None,
|
|
@@ -377,13 +574,14 @@ def monitor(
|
|
|
377
574
|
"""服务层性能监控装饰器。
|
|
378
575
|
|
|
379
576
|
监控服务方法的执行时间和调用次数。
|
|
380
|
-
|
|
577
|
+
支持慢方法警告、Prometheus 格式导出和告警通知。
|
|
381
578
|
支持自定义监控管道和组件。
|
|
382
579
|
|
|
383
580
|
Args:
|
|
384
581
|
slow_threshold: 慢方法阈值(秒),默认 1.0 秒
|
|
385
582
|
metrics: 是否记录指标(执行时间、调用次数),默认 True
|
|
386
583
|
prometheus_format: 是否使用 Prometheus 格式记录指标,默认 False
|
|
584
|
+
alert: 是否发送告警通知(飞书/Webhook),默认 False
|
|
387
585
|
pipeline: 自定义监控管道,如果提供则忽略其他参数
|
|
388
586
|
components: 自定义组件列表,如果提供则使用这些组件构建管道
|
|
389
587
|
pipeline_builder: 自定义管道构建函数,如果提供则使用此函数构建管道
|
|
@@ -395,6 +593,12 @@ def monitor(
|
|
|
395
593
|
async def create_user(self, data: dict):
|
|
396
594
|
return await self.user_repo.create(data)
|
|
397
595
|
|
|
596
|
+
# 启用告警通知
|
|
597
|
+
class PaymentService(BaseService):
|
|
598
|
+
@monitor(slow_threshold=0.5, alert=True)
|
|
599
|
+
async def process_payment(self, order_id: str):
|
|
600
|
+
...
|
|
601
|
+
|
|
398
602
|
# 使用自定义组件
|
|
399
603
|
custom_component = MyCustomMonitorComponent()
|
|
400
604
|
@monitor(components=[custom_component])
|
|
@@ -441,12 +645,12 @@ def monitor(
|
|
|
441
645
|
if metrics:
|
|
442
646
|
builder.with_call_counter()
|
|
443
647
|
|
|
444
|
-
builder.with_slow_detector(threshold=slow_threshold)
|
|
648
|
+
builder.with_slow_detector(threshold=slow_threshold, alert=alert)
|
|
445
649
|
|
|
446
650
|
if metrics:
|
|
447
651
|
builder.with_metrics_reporter(prometheus_format=prometheus_format)
|
|
448
652
|
|
|
449
|
-
builder.with_error_reporter()
|
|
653
|
+
builder.with_error_reporter(alert=alert)
|
|
450
654
|
|
|
451
655
|
monitor_pipeline = builder.build()
|
|
452
656
|
|