aury-boot 0.0.29__py3-none-any.whl → 0.0.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. aury/boot/_version.py +2 -2
  2. aury/boot/application/app/base.py +126 -2
  3. aury/boot/application/app/components.py +224 -1
  4. aury/boot/application/config/settings.py +195 -3
  5. aury/boot/application/constants/components.py +3 -0
  6. aury/boot/application/middleware/logging.py +45 -6
  7. aury/boot/commands/docs.py +40 -0
  8. aury/boot/commands/init.py +2 -0
  9. aury/boot/commands/templates/project/AGENTS.md.tpl +5 -0
  10. aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
  11. aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
  12. aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
  13. aury/boot/commands/templates/project/env_templates/monitoring.tpl +61 -0
  14. aury/boot/common/logging/context.py +17 -1
  15. aury/boot/common/logging/format.py +4 -0
  16. aury/boot/infrastructure/channel/base.py +6 -2
  17. aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
  18. aury/boot/infrastructure/monitoring/__init__.py +210 -6
  19. aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
  20. aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
  21. aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
  22. aury/boot/infrastructure/monitoring/alerting/manager.py +428 -0
  23. aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
  24. aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
  25. aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
  26. aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
  27. aury/boot/infrastructure/monitoring/alerting/rules.py +163 -0
  28. aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
  29. aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
  30. aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
  31. aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
  32. aury/boot/infrastructure/monitoring/tracing/processor.py +327 -0
  33. aury/boot/infrastructure/monitoring/tracing/provider.py +320 -0
  34. aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
  35. {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/METADATA +14 -1
  36. {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/RECORD +38 -19
  37. {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/WHEEL +0 -0
  38. {aury_boot-0.0.29.dist-info → aury_boot-0.0.30.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,50 @@
1
+ """告警系统模块。
2
+
3
+ 提供企业级告警通知功能:
4
+ - 慢请求、慢SQL、异常自动告警
5
+ - 累计触发和抑制机制
6
+ - 可扩展的通知渠道(内置飞书、Webhook)
7
+
8
+ 快速开始:
9
+ 1. 配置环境变量
10
+ ALERT_ENABLED=true
11
+ ALERT_NOTIFIER_FEISHU_WEBHOOK=https://open.feishu.cn/...
12
+
13
+ 2. 在应用启动时初始化(FoundationApp 自动处理)
14
+
15
+ 3. 可选:添加规则文件 alert_rules.yaml
16
+
17
+ 使用便捷函数发送自定义告警:
18
+ from aury.boot.infrastructure.monitoring.alerting import AlertEventType, AlertSeverity, emit_alert
19
+
20
+ await emit_alert(
21
+ AlertEventType.CUSTOM,
22
+ "自定义告警消息",
23
+ severity=AlertSeverity.WARNING,
24
+ my_data="xxx",
25
+ )
26
+ """
27
+
28
+ from .aggregator import AlertAggregator
29
+ from .events import AlertEvent, AlertEventType, AlertNotification, AlertSeverity
30
+ from .manager import AlertManager, emit_alert
31
+ from .notifiers import AlertNotifier, FeishuNotifier, WebhookNotifier
32
+ from .rules import AlertRule, load_rules_from_dict
33
+
34
+ __all__ = [
35
+ # 核心类
36
+ "AlertAggregator",
37
+ "AlertEvent",
38
+ "AlertEventType",
39
+ "AlertManager",
40
+ "AlertNotification",
41
+ "AlertRule",
42
+ "AlertSeverity",
43
+ # 通知器
44
+ "AlertNotifier",
45
+ "FeishuNotifier",
46
+ "WebhookNotifier",
47
+ # 便捷函数
48
+ "emit_alert",
49
+ "load_rules_from_dict",
50
+ ]
@@ -0,0 +1,193 @@
1
+ """告警聚合器。
2
+
3
+ 实现累计触发和抑制逻辑:
4
+ - 滑动窗口计数:在窗口时间内达到阈值才触发
5
+ - 抑制机制:相同告警在抑制时间内不重复发送
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections import deque
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime, timedelta
13
+ import threading
14
+ from typing import TYPE_CHECKING
15
+
16
+ if TYPE_CHECKING:
17
+ from .events import AlertEvent
18
+
19
+
20
@dataclass
class AggregationState:
    """Aggregation state tracked for a single event fingerprint."""

    fingerprint: str

    # Timestamps of the events currently inside the sliding window (oldest first).
    event_timestamps: deque[datetime] = field(default_factory=deque)

    # Time the last alert was actually emitted; None until the first alert fires.
    last_alert_time: datetime | None = None

    # Trace ids of the most recent events (bounded to the latest 5).
    trace_ids: deque[str] = field(default_factory=lambda: deque(maxlen=5))

    # Number of events in the window as of the last update (used in notifications).
    window_count: int = 0


class AlertAggregator:
    """Alert aggregator.

    Applies two layers of control per event fingerprint:
    1. Cumulative trigger: an alert fires only once the number of events
       inside the sliding window reaches the threshold.
    2. Suppression: after an alert fires, further alerts for the same
       fingerprint are held back for the suppression period.

    Example:
        aggregator = AlertAggregator(
            window_seconds=60,     # 1 minute window
            threshold=5,           # fire after 5 events
            suppress_seconds=300,  # suppress for 5 minutes
        )

        if aggregator.should_alert(event):
            # send the alert
            ...
    """

    def __init__(
        self,
        window_seconds: int = 60,
        threshold: int = 1,
        suppress_seconds: int = 300,
    ) -> None:
        """Initialize the aggregator.

        Args:
            window_seconds: Sliding window size in seconds.
            threshold: Number of events within the window required to trigger.
            suppress_seconds: Suppression period after an alert, in seconds.
        """
        self.window_seconds = window_seconds
        self.threshold = threshold
        self.suppress_seconds = suppress_seconds

        self._states: dict[str, AggregationState] = {}
        self._lock = threading.Lock()

    def _prune_window(self, state: AggregationState, now: datetime) -> None:
        """Drop timestamps of *state* that fell out of the window ending at *now*.

        The caller must hold ``self._lock``.
        """
        window_start = now - timedelta(seconds=self.window_seconds)
        timestamps = state.event_timestamps
        while timestamps and timestamps[0] < window_start:
            timestamps.popleft()

    def should_alert(self, event: "AlertEvent") -> bool:
        """Record *event* and decide whether an alert should be sent.

        The event's own ``timestamp`` is used as "now" for both the window
        and the suppression check.

        Args:
            event: The alert event (its ``fingerprint``, ``timestamp`` and
                ``trace_id`` attributes are read).

        Returns:
            bool: True if an alert should be sent for this event.
        """
        fingerprint = event.fingerprint
        now = event.timestamp

        with self._lock:
            # Get or create the state for this fingerprint.
            state = self._states.get(fingerprint)
            if state is None:
                state = self._states[fingerprint] = AggregationState(fingerprint=fingerprint)

            # Expire old events, then record the current one.
            self._prune_window(state, now)
            state.event_timestamps.append(now)
            state.trace_ids.append(event.trace_id)
            state.window_count = len(state.event_timestamps)

            # Not enough events in the window yet.
            if state.window_count < self.threshold:
                return False

            # Still inside the suppression period of the previous alert.
            if state.last_alert_time is not None:
                suppress_until = state.last_alert_time + timedelta(seconds=self.suppress_seconds)
                if now < suppress_until:
                    return False

            # Fire the alert and remember when it happened.
            state.last_alert_time = now
            return True

    def get_state(self, fingerprint: str) -> AggregationState | None:
        """Return the aggregation state for a fingerprint.

        The returned object is the live state held by the aggregator, not a copy.

        Args:
            fingerprint: Event fingerprint.

        Returns:
            The aggregation state, or None if the fingerprint is unknown.
        """
        with self._lock:
            return self._states.get(fingerprint)

    def get_aggregation_info(self, event: "AlertEvent") -> dict:
        """Return aggregation info for *event* (used to build notifications).

        Args:
            event: The alert event.

        Returns:
            A dict with ``count`` (events in the window) and ``trace_ids``
            (most recent trace ids). Falls back to the event itself when no
            state exists for its fingerprint.
        """
        # Snapshot under the lock so a concurrent should_alert() cannot mutate
        # the trace_ids deque while it is being copied.
        with self._lock:
            state = self._states.get(event.fingerprint)
            if state is not None:
                return {
                    "count": state.window_count,
                    "trace_ids": list(state.trace_ids),
                }

        return {
            "count": 1,
            "trace_ids": [event.trace_id],
        }

    def reset(self, fingerprint: str | None = None) -> None:
        """Reset aggregation state.

        Args:
            fingerprint: Fingerprint to reset; a falsy value (None or an empty
                string) resets every fingerprint.
        """
        with self._lock:
            if fingerprint:
                self._states.pop(fingerprint, None)
            else:
                self._states.clear()

    def cleanup_expired(self) -> int:
        """Remove states that no longer carry any useful information.

        A state is removed when its window holds no live events and it either
        never alerted or its last alert is older than
        ``window_seconds + suppress_seconds``.

        Timestamps are normally pruned only when a new event for the same
        fingerprint arrives, and that event is appended right away, so the
        window of an idle fingerprint would never become empty by itself.
        The window is therefore pruned here against the current time first;
        otherwise idle fingerprints would never be cleaned up.

        Returns:
            The number of states removed.
        """
        retention = timedelta(seconds=self.window_seconds + self.suppress_seconds)

        with self._lock:
            expired: list[str] = []
            for fp, state in self._states.items():
                timestamps = state.event_timestamps
                if timestamps:
                    # Match the stored timestamps' tzinfo so naive and aware
                    # datetimes are never compared with each other.
                    self._prune_window(state, datetime.now(timestamps[-1].tzinfo))
                    state.window_count = len(timestamps)
                if timestamps:
                    # Still has live events inside the window.
                    continue

                last_alert = state.last_alert_time
                if last_alert is None or last_alert < datetime.now(last_alert.tzinfo) - retention:
                    expired.append(fp)

            for fp in expired:
                del self._states[fp]

        return len(expired)
188
+
189
+
190
+ __all__ = [
191
+ "AggregationState",
192
+ "AlertAggregator",
193
+ ]
@@ -0,0 +1,141 @@
1
+ """告警事件定义。
2
+
3
+ 定义告警事件类型、严重级别和事件数据结构。
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from enum import Enum
11
+ from typing import Any
12
+ import hashlib
13
+
14
+
15
class AlertEventType(str, Enum):
    """Kinds of events that can raise an alert."""

    # A request that was flagged as slow.
    SLOW_REQUEST = "slow_request"
    # A SQL statement that was flagged as slow.
    SLOW_SQL = "slow_sql"
    # An exception.
    EXCEPTION = "exception"
    # A task that failed.
    TASK_FAILURE = "task_failure"
    # A task that timed out.
    TASK_TIMEOUT = "task_timeout"
    # A user-defined alert.
    CUSTOM = "custom"
24
+
25
+
26
class AlertSeverity(str, Enum):
    """Severity level attached to an alert."""

    # Members are string-valued, so they compare equal to their plain
    # lowercase names.
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
33
+
34
+
35
@dataclass
class AlertEvent:
    """A single alert event together with its context."""

    event_type: AlertEventType
    severity: AlertSeverity
    message: str
    trace_id: str

    # Origin of the event, e.g. api / scheduler / task.
    source: str = "unknown"
    # Name of the service that produced the event.
    service_name: str = ""
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict[str, Any] = field(default_factory=dict)

    # Explicit fingerprint override used for aggregation / de-duplication;
    # when unset (or empty) the fingerprint is derived from the event fields.
    _fingerprint: str | None = field(default=None, repr=False)

    @property
    def fingerprint(self) -> str:
        """Return the fingerprint used to group events for aggregation.

        An explicitly assigned, non-empty fingerprint wins. Otherwise the
        fingerprint is the first 16 hex characters of the MD5 digest of the
        non-empty parts among: event type, source, and the ``endpoint``,
        ``task_name`` and ``error_type`` metadata entries, joined with ``:``.
        """
        override = self._fingerprint
        if override:
            return override

        meta = self.metadata
        candidates = (
            self.event_type.value,
            self.source,
            meta.get("endpoint", ""),
            meta.get("task_name", ""),
            meta.get("error_type", ""),
        )
        # Empty parts are skipped entirely rather than joined as "".
        joined = ":".join(map(str, filter(None, candidates)))
        digest = hashlib.md5(joined.encode()).hexdigest()
        return digest[:16]

    @fingerprint.setter
    def fingerprint(self, value: str) -> None:
        self._fingerprint = value

    def to_dict(self) -> dict[str, Any]:
        """Return the event as a plain dict.

        Enum fields are emitted as their values and the timestamp in ISO
        format; ``metadata`` is the same dict object held by the event.
        """
        payload: dict[str, Any] = dict(
            event_type=self.event_type.value,
            severity=self.severity.value,
            message=self.message,
            trace_id=self.trace_id,
            source=self.source,
            service_name=self.service_name,
            timestamp=self.timestamp.isoformat(),
            metadata=self.metadata,
            fingerprint=self.fingerprint,
        )
        return payload
92
+
93
+
94
@dataclass
class AlertNotification:
    """Alert notification: the payload handed to a notifier.

    Carries the alert information after aggregation.
    """

    title: str
    message: str
    severity: AlertSeverity
    event_type: AlertEventType
    source: str
    service_name: str

    # --- aggregation info ---
    # Number of events folded into this notification.
    count: int = 1
    first_timestamp: datetime = field(default_factory=datetime.now)
    last_timestamp: datetime = field(default_factory=datetime.now)

    # Trace ids related to the notification (only the most recent few are kept).
    trace_ids: list[str] = field(default_factory=list)

    # Extra metadata.
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the notification as a plain dict.

        Enum fields are emitted as their values and both timestamps in ISO
        format; ``trace_ids`` and ``metadata`` are passed through as-is.
        """
        payload: dict[str, Any] = dict(
            title=self.title,
            message=self.message,
            severity=self.severity.value,
            event_type=self.event_type.value,
            source=self.source,
            service_name=self.service_name,
            count=self.count,
            first_timestamp=self.first_timestamp.isoformat(),
            last_timestamp=self.last_timestamp.isoformat(),
            trace_ids=self.trace_ids,
            metadata=self.metadata,
        )
        return payload
134
+
135
+
136
+ __all__ = [
137
+ "AlertEvent",
138
+ "AlertEventType",
139
+ "AlertNotification",
140
+ "AlertSeverity",
141
+ ]