aury-boot 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/__init__.py +2 -4
- aury/boot/application/app/base.py +126 -2
- aury/boot/application/app/components.py +226 -1
- aury/boot/application/config/settings.py +201 -3
- aury/boot/application/constants/components.py +3 -0
- aury/boot/application/middleware/logging.py +45 -6
- aury/boot/commands/docs.py +40 -0
- aury/boot/commands/init.py +2 -0
- aury/boot/commands/templates/project/AGENTS.md.tpl +59 -0
- aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
- aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
- aury/boot/commands/templates/project/env_templates/messaging.tpl +21 -13
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +63 -0
- aury/boot/common/logging/context.py +17 -1
- aury/boot/common/logging/format.py +4 -0
- aury/boot/infrastructure/__init__.py +4 -8
- aury/boot/infrastructure/channel/__init__.py +9 -8
- aury/boot/infrastructure/channel/backends/__init__.py +2 -6
- aury/boot/infrastructure/channel/backends/broadcaster.py +141 -0
- aury/boot/infrastructure/channel/base.py +11 -4
- aury/boot/infrastructure/channel/manager.py +25 -24
- aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
- aury/boot/infrastructure/events/__init__.py +4 -6
- aury/boot/infrastructure/events/backends/__init__.py +2 -4
- aury/boot/infrastructure/events/backends/broadcaster.py +189 -0
- aury/boot/infrastructure/events/base.py +9 -4
- aury/boot/infrastructure/events/manager.py +24 -20
- aury/boot/infrastructure/monitoring/__init__.py +210 -6
- aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
- aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
- aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
- aury/boot/infrastructure/monitoring/alerting/manager.py +430 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
- aury/boot/infrastructure/monitoring/alerting/rules.py +179 -0
- aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
- aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
- aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
- aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
- aury/boot/infrastructure/monitoring/tracing/processor.py +357 -0
- aury/boot/infrastructure/monitoring/tracing/provider.py +322 -0
- aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/METADATA +14 -1
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/RECORD +50 -33
- aury/boot/infrastructure/channel/backends/memory.py +0 -126
- aury/boot/infrastructure/channel/backends/redis.py +0 -130
- aury/boot/infrastructure/events/backends/memory.py +0 -86
- aury/boot/infrastructure/events/backends/redis.py +0 -169
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/entry_points.txt +0 -0
|
@@ -98,15 +98,18 @@ class SlowMethodDetectorComponent(MonitorComponent):
|
|
|
98
98
|
"""慢方法检测组件。
|
|
99
99
|
|
|
100
100
|
检测执行时间超过阈值的方法并记录警告。
|
|
101
|
+
支持可选的告警通知(飞书/Webhook)。
|
|
101
102
|
"""
|
|
102
103
|
|
|
103
|
-
def __init__(self, threshold: float) -> None:
|
|
104
|
+
def __init__(self, threshold: float, *, alert: bool = False) -> None:
|
|
104
105
|
"""初始化慢方法检测组件。
|
|
105
106
|
|
|
106
107
|
Args:
|
|
107
108
|
threshold: 慢方法阈值(秒)
|
|
109
|
+
alert: 是否发送告警通知
|
|
108
110
|
"""
|
|
109
111
|
self._threshold = threshold
|
|
112
|
+
self._alert = alert
|
|
110
113
|
|
|
111
114
|
async def process(self, context: MonitorContext) -> None:
|
|
112
115
|
"""检测慢方法。
|
|
@@ -119,6 +122,156 @@ class SlowMethodDetectorComponent(MonitorComponent):
|
|
|
119
122
|
f"慢方法检测: {context.func_name} 执行时间 {context.duration:.3f}s "
|
|
120
123
|
f"(阈值: {self._threshold}s)"
|
|
121
124
|
)
|
|
125
|
+
|
|
126
|
+
# 发送告警通知
|
|
127
|
+
if self._alert:
|
|
128
|
+
await self._emit_alert(context)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def _emit_alert(self, context: MonitorContext) -> None:
    """Send a slow-method alert for the monitored call.

    Best-effort: if the alerting package cannot be imported the call is a
    no-op, and any other failure is only logged at debug level.

    Args:
        context: Monitor context of the call; its ``func_name``,
            ``duration`` and ``service_name`` are forwarded to the alert.
    """
    try:
        from aury.boot.infrastructure.monitoring.alerting import (
            AlertEventType,
            AlertSeverity,
            emit_alert,
        )

        event_type = AlertEventType.CUSTOM
        title = f"慢方法: {context.func_name}"
        fields = {
            "severity": AlertSeverity.WARNING,
            "source": "service",
            "duration": context.duration,
            "threshold": self._threshold,
            "service": context.service_name,
        }
        await emit_alert(event_type, title, **fields)
    except ImportError:
        # The alerting module is not available; skip silently.
        pass
    except Exception as exc:
        logger.debug(f"发送慢方法告警失败: {exc}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# =============================================================================
|
|
156
|
+
# HTTP 请求监控
|
|
157
|
+
# =============================================================================
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def monitor_http_request(
    *,
    method: str,
    path: str,
    duration: float,
    status_code: int,
    threshold: float = 1.0,
    alert: bool = False,
    trace_id: str = "",
    exception: Exception | None = None,
) -> None:
    """Monitor one HTTP request and raise alerts when enabled.

    Emits a slow-request alert when ``duration`` reaches ``threshold`` and an
    exception alert when ``exception`` is given. Both are sent only when
    ``alert`` is true; otherwise nothing is emitted.

    Args:
        method: HTTP method.
        path: Request path.
        duration: Execution time, in seconds.
        status_code: Response status code (accepted but not used in this
            function body).
        threshold: Slow-request threshold, in seconds.
        alert: Whether alerts should be sent.
        trace_id: Trace ID forwarded to the alerts.
        exception: Exception raised while handling the request, if any.
    """
    # Arguments shared by both alert helpers.
    shared = {
        "method": method,
        "path": path,
        "duration": duration,
        "trace_id": trace_id,
    }

    # Slow-request detection and alert.
    is_slow = duration >= threshold
    if is_slow and alert:
        await _emit_http_slow_alert(threshold=threshold, **shared)

    # Exception alert.
    if alert and exception is not None:
        await _emit_http_exception_alert(exception=exception, **shared)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
async def _emit_http_slow_alert(
    method: str,
    path: str,
    duration: float,
    threshold: float,
    trace_id: str,
) -> None:
    """Send a slow HTTP request alert.

    Best-effort: if the alerting package cannot be imported the call is a
    no-op, and any other failure is only logged at debug level.

    Args:
        method: HTTP method.
        path: Request path; also forwarded as ``endpoint``.
        duration: Execution time, in seconds.
        threshold: Slow-request threshold, in seconds.
        trace_id: Trace ID attached to the alert.
    """
    try:
        from aury.boot.infrastructure.monitoring.alerting import (
            AlertEventType,
            AlertSeverity,
            emit_alert,
        )

        event_type = AlertEventType.SLOW_REQUEST
        title = f"慢请求: {method} {path}"
        fields = {
            "severity": AlertSeverity.WARNING,
            "trace_id": trace_id,
            "source": "api",
            "duration": duration,
            "threshold": threshold,
            "endpoint": path,
            "method": method,
        }
        await emit_alert(event_type, title, **fields)
    except ImportError:
        # The alerting module is not available; skip silently.
        pass
    except Exception as exc:
        logger.debug(f"发送慢请求告警失败: {exc}")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
async def _emit_http_exception_alert(
    method: str,
    path: str,
    duration: float,
    exception: Exception,
    trace_id: str,
) -> None:
    """Send an HTTP request exception alert.

    Best-effort: if the alerting package cannot be imported the call is a
    no-op, and any other failure is only logged at debug level.

    Args:
        method: HTTP method.
        path: Request path; also forwarded as ``endpoint``.
        duration: Execution time, in seconds.
        exception: The exception raised while handling the request.
        trace_id: Trace ID attached to the alert.
    """
    try:
        from aury.boot.infrastructure.monitoring.alerting import (
            AlertEventType,
            AlertSeverity,
            emit_alert,
        )

        event_type = AlertEventType.EXCEPTION
        error_type = type(exception).__name__
        error_message = str(exception)
        title = f"请求异常: {method} {path} - {error_type}: {error_message}"
        fields = {
            "severity": AlertSeverity.ERROR,
            "trace_id": trace_id,
            "source": "api",
            "duration": duration,
            "endpoint": path,
            "method": method,
            "error_type": error_type,
            "error_message": error_message,
        }
        await emit_alert(event_type, title, **fields)
    except ImportError:
        # The alerting module is not available; skip silently.
        pass
    except Exception as exc:
        logger.debug(f"发送异常告警失败: {exc}")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# =============================================================================
|
|
273
|
+
# Service 方法监控组件
|
|
274
|
+
# =============================================================================
|
|
122
275
|
|
|
123
276
|
|
|
124
277
|
class StandardMetricsReporterComponent(MonitorComponent):
|
|
@@ -172,8 +325,17 @@ class ErrorReporterComponent(MonitorComponent):
|
|
|
172
325
|
"""错误报告组件。
|
|
173
326
|
|
|
174
327
|
报告方法执行失败的错误信息。
|
|
328
|
+
支持可选的告警通知(飞书/Webhook)。
|
|
175
329
|
"""
|
|
176
330
|
|
|
331
|
+
def __init__(self, *, alert: bool = False) -> None:
    """Initialize the error reporter component.

    Args:
        alert: Whether to send an alert notification.
    """
    self._alert = alert
|
|
338
|
+
|
|
177
339
|
async def process(self, context: MonitorContext) -> None:
|
|
178
340
|
"""报告错误信息。
|
|
179
341
|
|
|
@@ -186,6 +348,34 @@ class ErrorReporterComponent(MonitorComponent):
|
|
|
186
348
|
f"执行时间: {context.duration:.3f}s | "
|
|
187
349
|
f"异常: {type(context.exception).__name__}: {context.exception}"
|
|
188
350
|
)
|
|
351
|
+
|
|
352
|
+
# 发送告警通知
|
|
353
|
+
if self._alert:
|
|
354
|
+
await self._emit_alert(context)
|
|
355
|
+
|
|
356
|
+
async def _emit_alert(self, context: MonitorContext) -> None:
    """Send an exception alert for the monitored call.

    Best-effort: if the alerting package cannot be imported the call is a
    no-op, and any other failure is only logged at debug level.

    Args:
        context: Monitor context of the failed call; its ``func_name``,
            ``duration``, ``service_name`` and ``exception`` are forwarded.
    """
    try:
        from aury.boot.infrastructure.monitoring.alerting import (
            AlertEventType,
            AlertSeverity,
            emit_alert,
        )

        event_type = AlertEventType.EXCEPTION
        title = f"方法异常: {context.func_name}"
        # NOTE(review): the HTTP exception alert passes error_type /
        # error_message, and AlertEvent.fingerprint reads
        # metadata["error_type"]; the keys are named exception_* here.
        # Confirm whether the difference is intentional.
        fields = {
            "severity": AlertSeverity.ERROR,
            "source": "service",
            "duration": context.duration,
            "service": context.service_name,
            "exception_type": type(context.exception).__name__,
            "exception_message": str(context.exception),
        }
        await emit_alert(event_type, title, **fields)
    except ImportError:
        # The alerting module is not available; skip silently.
        pass
    except Exception as exc:
        logger.debug(f"发送异常告警失败: {exc}")
|
|
189
379
|
|
|
190
380
|
|
|
191
381
|
class MonitorPipeline:
|
|
@@ -278,19 +468,22 @@ class MonitorPipelineBuilder:
|
|
|
278
468
|
def with_slow_detector(
    self,
    threshold: float = 1.0,
    *,
    alert: bool = False,
    detector: SlowMethodDetectorComponent | None = None,
) -> MonitorPipelineBuilder:
    """Add a slow-method detector component to the pipeline.

    Args:
        threshold: Slow-method threshold, in seconds. Only used when a new
            detector is created.
        alert: Whether to send alert notifications. Only used when a new
            detector is created.
        detector: Detector component to add; a new instance is created
            when this is None.

    Returns:
        MonitorPipelineBuilder: This builder, to allow call chaining.
    """
    component = (
        detector
        if detector is not None
        else SlowMethodDetectorComponent(threshold, alert=alert)
    )
    self._components.append(component)
    return self
|
|
296
489
|
|
|
@@ -318,18 +511,21 @@ class MonitorPipelineBuilder:
|
|
|
318
511
|
|
|
319
512
|
def with_error_reporter(
    self,
    *,
    alert: bool = False,
    reporter: ErrorReporterComponent | None = None,
) -> MonitorPipelineBuilder:
    """Add an error reporter component to the pipeline.

    Args:
        alert: Whether to send alert notifications. Only used when a new
            reporter is created.
        reporter: Reporter component to add; a new instance is created
            when this is None.

    Returns:
        MonitorPipelineBuilder: This builder, to allow call chaining.
    """
    component = (
        reporter if reporter is not None else ErrorReporterComponent(alert=alert)
    )
    self._components.append(component)
    return self
|
|
335
531
|
|
|
@@ -370,6 +566,7 @@ def monitor(
|
|
|
370
566
|
slow_threshold: float = 1.0,
|
|
371
567
|
metrics: bool = True,
|
|
372
568
|
prometheus_format: bool = False,
|
|
569
|
+
alert: bool = False,
|
|
373
570
|
pipeline: MonitorPipeline | None = None,
|
|
374
571
|
components: list[MonitorComponent] | None = None,
|
|
375
572
|
pipeline_builder: Callable[[], MonitorPipeline] | None = None,
|
|
@@ -377,13 +574,14 @@ def monitor(
|
|
|
377
574
|
"""服务层性能监控装饰器。
|
|
378
575
|
|
|
379
576
|
监控服务方法的执行时间和调用次数。
|
|
380
|
-
|
|
577
|
+
支持慢方法警告、Prometheus 格式导出和告警通知。
|
|
381
578
|
支持自定义监控管道和组件。
|
|
382
579
|
|
|
383
580
|
Args:
|
|
384
581
|
slow_threshold: 慢方法阈值(秒),默认 1.0 秒
|
|
385
582
|
metrics: 是否记录指标(执行时间、调用次数),默认 True
|
|
386
583
|
prometheus_format: 是否使用 Prometheus 格式记录指标,默认 False
|
|
584
|
+
alert: 是否发送告警通知(飞书/Webhook),默认 False
|
|
387
585
|
pipeline: 自定义监控管道,如果提供则忽略其他参数
|
|
388
586
|
components: 自定义组件列表,如果提供则使用这些组件构建管道
|
|
389
587
|
pipeline_builder: 自定义管道构建函数,如果提供则使用此函数构建管道
|
|
@@ -395,6 +593,12 @@ def monitor(
|
|
|
395
593
|
async def create_user(self, data: dict):
|
|
396
594
|
return await self.user_repo.create(data)
|
|
397
595
|
|
|
596
|
+
# 启用告警通知
|
|
597
|
+
class PaymentService(BaseService):
|
|
598
|
+
@monitor(slow_threshold=0.5, alert=True)
|
|
599
|
+
async def process_payment(self, order_id: str):
|
|
600
|
+
...
|
|
601
|
+
|
|
398
602
|
# 使用自定义组件
|
|
399
603
|
custom_component = MyCustomMonitorComponent()
|
|
400
604
|
@monitor(components=[custom_component])
|
|
@@ -441,12 +645,12 @@ def monitor(
|
|
|
441
645
|
if metrics:
|
|
442
646
|
builder.with_call_counter()
|
|
443
647
|
|
|
444
|
-
builder.with_slow_detector(threshold=slow_threshold)
|
|
648
|
+
builder.with_slow_detector(threshold=slow_threshold, alert=alert)
|
|
445
649
|
|
|
446
650
|
if metrics:
|
|
447
651
|
builder.with_metrics_reporter(prometheus_format=prometheus_format)
|
|
448
652
|
|
|
449
|
-
builder.with_error_reporter()
|
|
653
|
+
builder.with_error_reporter(alert=alert)
|
|
450
654
|
|
|
451
655
|
monitor_pipeline = builder.build()
|
|
452
656
|
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""告警系统模块。
|
|
2
|
+
|
|
3
|
+
提供企业级告警通知功能:
|
|
4
|
+
- 慢请求、慢SQL、异常自动告警
|
|
5
|
+
- 累计触发和抑制机制
|
|
6
|
+
- 可扩展的通知渠道(内置飞书、Webhook)
|
|
7
|
+
|
|
8
|
+
快速开始:
|
|
9
|
+
1. 配置环境变量
|
|
10
|
+
ALERT_ENABLED=true
|
|
11
|
+
ALERT_NOTIFIER_FEISHU_WEBHOOK=https://open.feishu.cn/...
|
|
12
|
+
|
|
13
|
+
2. 在应用启动时初始化(FoundationApp 自动处理)
|
|
14
|
+
|
|
15
|
+
3. 可选:添加规则文件 alert_rules.yaml
|
|
16
|
+
|
|
17
|
+
使用便捷函数发送自定义告警:
|
|
18
|
+
from aury.boot.infrastructure.monitoring.alerting import AlertEventType, AlertSeverity, emit_alert
|
|
19
|
+
|
|
20
|
+
await emit_alert(
|
|
21
|
+
AlertEventType.CUSTOM,
|
|
22
|
+
"自定义告警消息",
|
|
23
|
+
severity=AlertSeverity.WARNING,
|
|
24
|
+
my_data="xxx",
|
|
25
|
+
)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from .aggregator import AlertAggregator
|
|
29
|
+
from .events import AlertEvent, AlertEventType, AlertNotification, AlertSeverity
|
|
30
|
+
from .manager import AlertManager, emit_alert
|
|
31
|
+
from .notifiers import AlertNotifier, FeishuNotifier, WebhookNotifier
|
|
32
|
+
from .rules import AlertRule, load_rules_from_dict
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
# 核心类
|
|
36
|
+
"AlertAggregator",
|
|
37
|
+
"AlertEvent",
|
|
38
|
+
"AlertEventType",
|
|
39
|
+
"AlertManager",
|
|
40
|
+
"AlertNotification",
|
|
41
|
+
"AlertRule",
|
|
42
|
+
"AlertSeverity",
|
|
43
|
+
# 通知器
|
|
44
|
+
"AlertNotifier",
|
|
45
|
+
"FeishuNotifier",
|
|
46
|
+
"WebhookNotifier",
|
|
47
|
+
# 便捷函数
|
|
48
|
+
"emit_alert",
|
|
49
|
+
"load_rules_from_dict",
|
|
50
|
+
]
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""告警聚合器。
|
|
2
|
+
|
|
3
|
+
实现累计触发和抑制逻辑:
|
|
4
|
+
- 滑动窗口计数:在窗口时间内达到阈值才触发
|
|
5
|
+
- 抑制机制:相同告警在抑制时间内不重复发送
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections import deque
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from datetime import datetime, timedelta
|
|
13
|
+
import threading
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from .events import AlertEvent
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class AggregationState:
    """Aggregation state tracked for a single fingerprint."""

    fingerprint: str

    # Timestamps of the events inside the sliding window
    event_timestamps: deque[datetime] = field(default_factory=deque)

    # Time the last alert was sent
    last_alert_time: datetime | None = None

    # Aggregated trace_id list (keeps at most 5)
    trace_ids: deque[str] = field(default_factory=lambda: deque(maxlen=5))

    # Total number of events in the window (used for notifications)
    window_count: int = 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class AlertAggregator:
    """Alert aggregator.

    Applies two layers of control:
    1. Cumulative trigger: an alert fires only once the number of events in
       the sliding window reaches the threshold.
    2. Suppression: after firing, the same fingerprint does not fire again
       until the suppression time has elapsed.

    Example:
        aggregator = AlertAggregator(
            window_seconds=60,      # 1 minute window
            threshold=5,            # fire on 5 events
            suppress_seconds=300,   # suppress for 5 minutes
        )

        if aggregator.should_alert(event):
            # send the alert
            ...
    """

    def __init__(
        self,
        window_seconds: int = 60,
        threshold: int = 1,
        suppress_seconds: int = 300,
    ) -> None:
        """Initialize the aggregator.

        Args:
            window_seconds: Sliding window size, in seconds.
            threshold: Number of events in the window required to fire.
            suppress_seconds: Alert suppression time, in seconds.
        """
        self.window_seconds = window_seconds
        self.threshold = threshold
        self.suppress_seconds = suppress_seconds

        self._states: dict[str, AggregationState] = {}
        self._lock = threading.Lock()

    def should_alert(self, event: "AlertEvent") -> bool:
        """Record the event and decide whether an alert should fire.

        The event's own ``timestamp`` is used as "now" for both the window
        and the suppression checks.

        Args:
            event: Alert event.

        Returns:
            bool: True if an alert should be sent.
        """
        fingerprint = event.fingerprint
        now = event.timestamp

        with self._lock:
            # Get or create the state for this fingerprint.
            if fingerprint not in self._states:
                self._states[fingerprint] = AggregationState(fingerprint=fingerprint)

            state = self._states[fingerprint]

            # Drop timestamps that have left the window.
            window_start = now - timedelta(seconds=self.window_seconds)
            while state.event_timestamps and state.event_timestamps[0] < window_start:
                state.event_timestamps.popleft()

            # Record the current event.
            state.event_timestamps.append(now)
            state.trace_ids.append(event.trace_id)
            state.window_count = len(state.event_timestamps)

            # Not enough events in the window yet.
            if state.window_count < self.threshold:
                return False

            # Still inside the suppression period of the previous alert.
            if state.last_alert_time:
                suppress_until = state.last_alert_time + timedelta(seconds=self.suppress_seconds)
                if now < suppress_until:
                    return False

            # Fire, and remember when we did.
            state.last_alert_time = now
            return True

    def get_state(self, fingerprint: str) -> AggregationState | None:
        """Return the aggregation state of a fingerprint.

        Args:
            fingerprint: Event fingerprint.

        Returns:
            The aggregation state, or None if the fingerprint is unknown.
        """
        with self._lock:
            return self._states.get(fingerprint)

    def get_aggregation_info(self, event: "AlertEvent") -> dict:
        """Return aggregation info for an event (used in notifications).

        Args:
            event: Alert event.

        Returns:
            A dict with ``count`` and ``trace_ids``. When no state exists
            for the event's fingerprint, the event alone is reported.
        """
        state = self.get_state(event.fingerprint)
        if not state:
            return {
                "count": 1,
                "trace_ids": [event.trace_id],
            }

        return {
            "count": state.window_count,
            "trace_ids": list(state.trace_ids),
        }

    def reset(self, fingerprint: str | None = None) -> None:
        """Reset aggregation state.

        Args:
            fingerprint: Fingerprint to reset; all states are reset when
                this is None.
        """
        with self._lock:
            # Compare against None explicitly so that an empty-string
            # fingerprint does not wipe every state.
            if fingerprint is not None:
                self._states.pop(fingerprint, None)
            else:
                self._states.clear()

    def cleanup_expired(self) -> int:
        """Remove states that are no longer needed.

        A state is removed when every recorded timestamp has left the
        sliding window (or none is recorded) and it has either never fired
        or its last alert is older than window + suppression time.

        ``should_alert`` always appends the current event after pruning, so
        a stored deque is never empty; expiry therefore has to be judged
        against the current time here rather than by emptiness.

        Returns:
            The number of states removed.
        """
        # NOTE(review): uses naive local time, matching the default of
        # AlertEvent.timestamp; timezone-aware event timestamps would not be
        # comparable here -- confirm callers never supply them.
        now = datetime.now()
        window_start = now - timedelta(seconds=self.window_seconds)
        expire_time = now - timedelta(seconds=self.window_seconds + self.suppress_seconds)

        with self._lock:
            to_remove = [
                fp
                for fp, state in self._states.items()
                if all(ts < window_start for ts in state.event_timestamps)
                and (not state.last_alert_time or state.last_alert_time < expire_time)
            ]

            for fp in to_remove:
                del self._states[fp]

        return len(to_remove)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
__all__ = [
|
|
191
|
+
"AggregationState",
|
|
192
|
+
"AlertAggregator",
|
|
193
|
+
]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""告警事件定义。
|
|
2
|
+
|
|
3
|
+
定义告警事件类型、严重级别和事件数据结构。
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from typing import Any
|
|
12
|
+
import hashlib
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AlertEventType(str, Enum):
    """Alert event types."""

    SLOW_REQUEST = "slow_request"  # slow request
    SLOW_SQL = "slow_sql"  # slow SQL
    EXCEPTION = "exception"  # exception
    TASK_FAILURE = "task_failure"  # task failure
    TASK_TIMEOUT = "task_timeout"  # task timeout
    CUSTOM = "custom"  # custom
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AlertSeverity(str, Enum):
    """Alert severity levels."""

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class AlertEvent:
    """Alert event.

    Carries the context of a single alert-worthy occurrence.
    """

    event_type: AlertEventType
    severity: AlertSeverity
    message: str
    trace_id: str

    source: str = "unknown"  # e.g. api / service / scheduler / task
    service_name: str = ""  # service name
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict[str, Any] = field(default_factory=dict)

    # Explicit fingerprint override used for aggregation / de-duplication
    # (events with the same fingerprint are aggregated together)
    _fingerprint: str | None = field(default=None, repr=False)

    @property
    def fingerprint(self) -> str:
        """Return the event fingerprint.

        Events with the same fingerprint are aggregated together. An
        explicitly assigned (non-empty) fingerprint wins; otherwise one is
        derived from the event type, the source and the ``endpoint``,
        ``task_name`` and ``error_type`` metadata entries, skipping empty
        parts.
        """
        if self._fingerprint:
            return self._fingerprint

        # Default fingerprint: type + source + key metadata
        key_parts = [
            self.event_type.value,
            self.source,
            self.metadata.get("endpoint", ""),
            self.metadata.get("task_name", ""),
            self.metadata.get("error_type", ""),
        ]
        key_str = ":".join(str(p) for p in key_parts if p)
        # MD5 is only a compact grouping key here, not a security primitive;
        # usedforsecurity=False keeps it usable on FIPS-restricted builds
        # without changing the digest.
        return hashlib.md5(key_str.encode(), usedforsecurity=False).hexdigest()[:16]

    @fingerprint.setter
    def fingerprint(self, value: str) -> None:
        self._fingerprint = value

    def to_dict(self) -> dict[str, Any]:
        """Convert the event to a dict.

        Enum fields are emitted as their values and the timestamp as an
        ISO 8601 string.
        """
        return {
            "event_type": self.event_type.value,
            "severity": self.severity.value,
            "message": self.message,
            "trace_id": self.trace_id,
            "source": self.source,
            "service_name": self.service_name,
            "timestamp": self.timestamp.isoformat(),
            "metadata": self.metadata,
            "fingerprint": self.fingerprint,
        }
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class AlertNotification:
    """Alert notification (the data handed to a notifier).

    Holds the alert information after aggregation.
    """

    title: str
    message: str
    severity: AlertSeverity
    event_type: AlertEventType
    source: str
    service_name: str

    # Aggregation info
    count: int = 1  # number of aggregated events
    first_timestamp: datetime = field(default_factory=datetime.now)
    last_timestamp: datetime = field(default_factory=datetime.now)

    # Associated trace_id list
    trace_ids: list[str] = field(default_factory=list)

    # Extra metadata
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert the notification to a dict.

        Enum fields are emitted as their values and both timestamps as
        ISO 8601 strings.
        """
        first_seen = self.first_timestamp.isoformat()
        last_seen = self.last_timestamp.isoformat()
        payload: dict[str, Any] = dict(
            title=self.title,
            message=self.message,
            severity=self.severity.value,
            event_type=self.event_type.value,
            source=self.source,
            service_name=self.service_name,
            count=self.count,
            first_timestamp=first_seen,
            last_timestamp=last_seen,
            trace_ids=self.trace_ids,
            metadata=self.metadata,
        )
        return payload
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
__all__ = [
|
|
137
|
+
"AlertEvent",
|
|
138
|
+
"AlertEventType",
|
|
139
|
+
"AlertNotification",
|
|
140
|
+
"AlertSeverity",
|
|
141
|
+
]
|