aury-boot 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. aury/boot/_version.py +2 -2
  2. aury/boot/application/__init__.py +2 -4
  3. aury/boot/application/app/base.py +126 -2
  4. aury/boot/application/app/components.py +226 -1
  5. aury/boot/application/config/settings.py +201 -3
  6. aury/boot/application/constants/components.py +3 -0
  7. aury/boot/application/middleware/logging.py +45 -6
  8. aury/boot/commands/docs.py +40 -0
  9. aury/boot/commands/init.py +2 -0
  10. aury/boot/commands/templates/project/AGENTS.md.tpl +59 -0
  11. aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
  12. aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
  13. aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
  14. aury/boot/commands/templates/project/env_templates/messaging.tpl +21 -13
  15. aury/boot/commands/templates/project/env_templates/monitoring.tpl +63 -0
  16. aury/boot/common/logging/context.py +17 -1
  17. aury/boot/common/logging/format.py +4 -0
  18. aury/boot/infrastructure/__init__.py +4 -8
  19. aury/boot/infrastructure/channel/__init__.py +9 -8
  20. aury/boot/infrastructure/channel/backends/__init__.py +2 -6
  21. aury/boot/infrastructure/channel/backends/broadcaster.py +141 -0
  22. aury/boot/infrastructure/channel/base.py +11 -4
  23. aury/boot/infrastructure/channel/manager.py +25 -24
  24. aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
  25. aury/boot/infrastructure/events/__init__.py +4 -6
  26. aury/boot/infrastructure/events/backends/__init__.py +2 -4
  27. aury/boot/infrastructure/events/backends/broadcaster.py +189 -0
  28. aury/boot/infrastructure/events/base.py +9 -4
  29. aury/boot/infrastructure/events/manager.py +24 -20
  30. aury/boot/infrastructure/monitoring/__init__.py +210 -6
  31. aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
  32. aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
  33. aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
  34. aury/boot/infrastructure/monitoring/alerting/manager.py +430 -0
  35. aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
  36. aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
  37. aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
  38. aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
  39. aury/boot/infrastructure/monitoring/alerting/rules.py +179 -0
  40. aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
  41. aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
  42. aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
  43. aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
  44. aury/boot/infrastructure/monitoring/tracing/processor.py +357 -0
  45. aury/boot/infrastructure/monitoring/tracing/provider.py +322 -0
  46. aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
  47. {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/METADATA +14 -1
  48. {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/RECORD +50 -33
  49. aury/boot/infrastructure/channel/backends/memory.py +0 -126
  50. aury/boot/infrastructure/channel/backends/redis.py +0 -130
  51. aury/boot/infrastructure/events/backends/memory.py +0 -86
  52. aury/boot/infrastructure/events/backends/redis.py +0 -169
  53. {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/WHEEL +0 -0
  54. {aury_boot-0.0.29.dist-info → aury_boot-0.0.31.dist-info}/entry_points.txt +0 -0
@@ -15,7 +15,11 @@ from dotenv import load_dotenv
15
15
  from pydantic import BaseModel, Field
16
16
  from pydantic_settings import BaseSettings, SettingsConfigDict
17
17
 
18
- from .multi_instance import MultiInstanceConfigLoader, MultiInstanceSettings
18
+ from .multi_instance import (
19
+ MultiInstanceConfigLoader,
20
+ MultiInstanceSettings,
21
+ parse_multi_instance_env,
22
+ )
19
23
 
20
24
 
21
25
  def _load_env_file(env_file: str | Path) -> bool:
@@ -218,6 +222,10 @@ class DatabaseSettings(BaseModel):
218
222
  default=True,
219
223
  description="是否在获取连接前进行 PING"
220
224
  )
225
+ slow_query_threshold: float = Field(
226
+ default=1.0,
227
+ description="慢查询阈值(秒),超过此时间的查询会记录警告日志"
228
+ )
221
229
 
222
230
 
223
231
  class CacheSettings(BaseModel):
@@ -407,7 +415,7 @@ class ServiceSettings(BaseModel):
407
415
  """服务配置。
408
416
 
409
417
  环境变量格式: SERVICE__{FIELD}
410
- 示例: SERVICE__NAME, SERVICE__TYPE
418
+ 示例: SERVICE__NAME, SERVICE__TYPE, SERVICE__ENVIRONMENT
411
419
 
412
420
  服务类型说明:
413
421
  - api: 运行 API 服务(SCHEDULER__ENABLED 决定是否同时运行调度器)
@@ -418,7 +426,15 @@ class ServiceSettings(BaseModel):
418
426
 
419
427
  name: str = Field(
420
428
  default="app",
421
- description="服务名称,用于日志目录区分"
429
+ description="服务名称,用于日志目录区分、链路追踪标识"
430
+ )
431
+ version: str = Field(
432
+ default="",
433
+ description="服务版本(用于链路追踪和监控)"
434
+ )
435
+ environment: str = Field(
436
+ default="development",
437
+ description="部署环境 (development/staging/production)"
422
438
  )
423
439
  service_type: str = Field(
424
440
  default="api",
@@ -555,6 +571,183 @@ class MessageQueueSettings(BaseModel):
555
571
  )
556
572
 
557
573
 
574
+ class TelemetrySettings(BaseModel):
575
+ """OpenTelemetry 配置。
576
+
577
+ 环境变量格式: TELEMETRY__{FIELD}
578
+ 示例: TELEMETRY__ENABLED, TELEMETRY__SLOW_THRESHOLD
579
+
580
+ 功能说明:
581
+ - 启用后自动 instrument FastAPI、SQLAlchemy、httpx
582
+ - get_trace_id() 会优先使用 OTel trace_id
583
+ - 可配置 AlertingSpanProcessor 自动检测慢请求/异常并触发告警
584
+ - 可选配置 OTLP 导出到 Jaeger/Tempo/Collector
585
+
586
+ 注意:service_name/version/environment 从 ServiceSettings 获取。
587
+ """
588
+
589
+ enabled: bool = Field(
590
+ default=False,
591
+ description="是否启用 OpenTelemetry"
592
+ )
593
+
594
+ # Instrumentation 开关
595
+ instrument_fastapi: bool = Field(
596
+ default=True,
597
+ description="是否自动 instrument FastAPI"
598
+ )
599
+ instrument_sqlalchemy: bool = Field(
600
+ default=True,
601
+ description="是否自动 instrument SQLAlchemy"
602
+ )
603
+ instrument_httpx: bool = Field(
604
+ default=True,
605
+ description="是否自动 instrument httpx"
606
+ )
607
+
608
+
609
+ # OTLP Traces 导出配置
610
+ traces_endpoint: str | None = Field(
611
+ default=None,
612
+ description="Traces 导出端点(如 http://jaeger:4317)"
613
+ )
614
+ traces_headers: dict[str, str] = Field(
615
+ default_factory=dict,
616
+ description="Traces 导出请求头"
617
+ )
618
+
619
+ # OTLP Logs 导出配置
620
+ logs_endpoint: str | None = Field(
621
+ default=None,
622
+ description="Logs 导出端点(如 http://loki:3100)"
623
+ )
624
+ logs_headers: dict[str, str] = Field(
625
+ default_factory=dict,
626
+ description="Logs 导出请求头"
627
+ )
628
+
629
+ # OTLP Metrics 导出配置
630
+ metrics_endpoint: str | None = Field(
631
+ default=None,
632
+ description="Metrics 导出端点(如 http://prometheus:9090)"
633
+ )
634
+ metrics_headers: dict[str, str] = Field(
635
+ default_factory=dict,
636
+ description="Metrics 导出请求头"
637
+ )
638
+
639
+ # 采样配置
640
+ sampling_rate: float = Field(
641
+ default=1.0,
642
+ description="采样率 (0.0-1.0),1.0 表示 100%"
643
+ )
644
+
645
+
646
+ class AlertSettings(BaseModel):
647
+ """告警系统配置。
648
+
649
+ 环境变量格式: ALERT__{FIELD}
650
+ 示例: ALERT__ENABLED, ALERT__RULES_FILE
651
+
652
+ 通知器配置(作为子配置,动态字段):
653
+ ALERT__NOTIFIERS__FEISHU__TYPE=feishu
654
+ ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://open.feishu.cn/...
655
+ ALERT__NOTIFIERS__FEISHU__SECRET=xxx
656
+
657
+ ALERT__NOTIFIERS__OPS__TYPE=webhook
658
+ ALERT__NOTIFIERS__OPS__URL=https://my-system.com/alert
659
+
660
+ 自定义通知器通过 AlertManager.register_notifier() 注册。
661
+ """
662
+
663
+ # 通知器配置缓存(动态字段)
664
+ _notifiers: dict[str, dict[str, Any]] | None = None
665
+
666
+ enabled: bool = Field(
667
+ default=False,
668
+ description="是否启用告警系统"
669
+ )
670
+ rules_file: str | None = Field(
671
+ default=None,
672
+ description="告警规则文件路径(YAML 格式),如 alert_rules.yaml"
673
+ )
674
+
675
+ # 慢操作阈值
676
+ slow_request_threshold: float = Field(
677
+ default=1.0,
678
+ description="慢请求阈值(秒)"
679
+ )
680
+ slow_sql_threshold: float = Field(
681
+ default=0.5,
682
+ description="慢 SQL 阈值(秒)"
683
+ )
684
+
685
+ # 告警开关
686
+ alert_on_slow_request: bool = Field(
687
+ default=True,
688
+ description="是否对慢 HTTP 请求发送告警"
689
+ )
690
+ alert_on_slow_sql: bool = Field(
691
+ default=True,
692
+ description="是否对慢 SQL 发送告警"
693
+ )
694
+ alert_on_error: bool = Field(
695
+ default=True,
696
+ description="是否对异常发送告警(默认只对 5xx 告警,4xx 业务异常不告警)"
697
+ )
698
+
699
+ # 慢请求路径排除配置
700
+ slow_request_exclude_paths: list[str] = Field(
701
+ default_factory=list,
702
+ description="排除慢请求告警的路径列表(支持 * 通配符),如 SSE/WebSocket 长连接接口"
703
+ )
704
+
705
+ # 默认累计触发配置
706
+ aggregate_window: int = Field(
707
+ default=10,
708
+ description="聚合窗口(秒)"
709
+ )
710
+ slow_request_aggregate: int = Field(
711
+ default=5,
712
+ description="慢请求触发阈值(窗口内次数)"
713
+ )
714
+ slow_sql_aggregate: int = Field(
715
+ default=10,
716
+ description="慢 SQL 触发阈值(窗口内次数)"
717
+ )
718
+ exception_aggregate: int = Field(
719
+ default=1,
720
+ description="异常触发阈值(通常为 1,立即告警)"
721
+ )
722
+
723
+ # 抑制配置
724
+ suppress_seconds: int = Field(
725
+ default=10,
726
+ description="告警抑制时间(秒),相同告警在此时间内不重复发送"
727
+ )
728
+
729
+ def get_notifiers(self) -> dict[str, dict[str, Any]]:
730
+ """获取所有告警通知器实例配置。
731
+
732
+ 从环境变量解析 ALERT__NOTIFIERS__{INSTANCE}__{FIELD} 格式的配置。
733
+ 支持动态字段,不同类型通知器可有不同字段。
734
+
735
+ Returns:
736
+ dict[str, dict[str, Any]]: 实例名 -> 配置字典
737
+
738
+ 示例:
739
+ ALERT__NOTIFIERS__FEISHU__TYPE=feishu
740
+ ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://...
741
+
742
+ 返回: {"feishu": {"type": "feishu", "webhook": "https://..."}}
743
+
744
+ 自定义通知器通过 AlertManager.register_notifier() 注册。
745
+ """
746
+ if self._notifiers is None:
747
+ self._notifiers = parse_multi_instance_env("ALERT__NOTIFIERS")
748
+ return self._notifiers
749
+
750
+
558
751
  class MigrationSettings(BaseModel):
559
752
  """数据库迁移配置。
560
753
 
@@ -824,6 +1017,10 @@ class BaseConfig(BaseSettings):
824
1017
  # RPC 服务配置(当前服务注册)
825
1018
  rpc_service: RPCServiceSettings = Field(default_factory=RPCServiceSettings)
826
1019
 
1020
+ # ========== 监控告警 ==========
1021
+ telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
1022
+ alert: AlertSettings = Field(default_factory=AlertSettings)
1023
+
827
1024
  model_config = SettingsConfigDict(
828
1025
  case_sensitive=False,
829
1026
  extra="ignore",
@@ -953,6 +1150,7 @@ __all__ = [
953
1150
  # 配置类
954
1151
  "AdminAuthSettings",
955
1152
  "AdminConsoleSettings",
1153
+ "AlertSettings",
956
1154
  "BaseConfig",
957
1155
  "CORSSettings",
958
1156
  # 多实例配置类
@@ -50,6 +50,9 @@ class ComponentName(str, Enum):
50
50
  # 管理后台(可选扩展)
51
51
  ADMIN_CONSOLE = "admin_console"
52
52
 
53
+ # 遥测组件 (OpenTelemetry)
54
+ TELEMETRY = "telemetry"
55
+
53
56
 
54
57
  __all__ = [
55
58
  "ComponentName",
@@ -16,8 +16,33 @@ from starlette.middleware.base import BaseHTTPMiddleware
16
16
  from starlette.requests import Request
17
17
  from starlette.responses import Response
18
18
 
19
- from aury.boot.application.errors import global_exception_handler
20
- from aury.boot.common.logging import logger, set_trace_id
19
+ from aury.boot.application.errors.chain import global_exception_handler
20
+ from aury.boot.common.logging import get_trace_id, logger, set_trace_id
21
+
22
+
23
+ def _record_exception_to_span(exc: Exception) -> None:
24
+ """将异常记录到当前 OTEL span(使用与 loguru 一致的格式)。"""
25
+ try:
26
+ from opentelemetry import trace
27
+
28
+ from aury.boot.common.logging.format import format_exception_compact
29
+
30
+ span = trace.get_current_span()
31
+ if span and span.is_recording():
32
+ # 使用与 loguru 一致的堆栈格式(包含代码行和局部变量)
33
+ formatted_tb = format_exception_compact(
34
+ type(exc), exc, exc.__traceback__
35
+ )
36
+
37
+ # 记录异常,并将格式化堆栈放入 attributes
38
+ span.record_exception(exc, attributes={
39
+ "exception.stacktrace": formatted_tb,
40
+ })
41
+ span.set_status(trace.Status(trace.StatusCode.ERROR, str(exc)))
42
+ except ImportError:
43
+ pass # OTEL 未安装
44
+ except Exception:
45
+ pass # 忽略记录错误
21
46
 
22
47
 
23
48
  def log_request[T](func: Callable[..., T]) -> Callable[..., T]:
@@ -112,22 +137,33 @@ def _should_log_body(content_type: str | None) -> bool:
112
137
 
113
138
 
114
139
  class RequestLoggingMiddleware(BaseHTTPMiddleware):
115
- """请求日志中间件(支持链路追踪)。
140
+ """请求日志中间件(支持链路追踪和告警)。
116
141
 
117
142
  自动记录所有HTTP请求的详细信息,包括:
118
143
  - 请求方法、路径、查询参数、请求体
119
144
  - 客户端IP、User-Agent
120
145
  - 响应状态码、耗时、响应体
121
146
  - 链路追踪 ID(X-Trace-ID / X-Request-ID)
147
+ - 慢请求和异常告警(如果启用告警系统)
122
148
 
123
149
  注意:文件上传、二进制数据等不会记录 body 内容。
124
150
 
125
151
  使用示例:
126
152
  from aury.boot.application.middleware.logging import RequestLoggingMiddleware
127
153
 
128
- app.add_middleware(RequestLoggingMiddleware)
154
+ app.add_middleware(RequestLoggingMiddleware, slow_request_threshold=1.0)
129
155
  """
130
156
 
157
+ def __init__(self, app, slow_request_threshold: float = 1.0) -> None:
158
+ """初始化中间件。
159
+
160
+ Args:
161
+ app: ASGI 应用
162
+ slow_request_threshold: 慢请求阈值(秒),默认 1.0
163
+ """
164
+ super().__init__(app)
165
+ self.slow_request_threshold = slow_request_threshold
166
+
131
167
  async def dispatch(self, request: Request, call_next) -> Response:
132
168
  """处理请求并记录日志。"""
133
169
  start_time = time.time()
@@ -193,10 +229,10 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
193
229
  )
194
230
 
195
231
  # 慢请求警告
196
- if duration > 1.0:
232
+ if duration > self.slow_request_threshold:
197
233
  logger.warning(
198
234
  f"慢请求: {request.method} {request.url.path} | "
199
- f"耗时: {duration:.3f}s (超过1秒) | "
235
+ f"耗时: {duration:.3f}s (阈值: {self.slow_request_threshold}s) | "
200
236
  f"Trace-ID: {trace_id}"
201
237
  )
202
238
 
@@ -210,6 +246,9 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
210
246
  f"耗时: {duration:.3f}s | Trace-ID: {trace_id}"
211
247
  )
212
248
 
249
+ # 将异常记录到当前 OTEL span(以便告警系统提取)
250
+ _record_exception_to_span(exc)
251
+
213
252
  # 使用全局异常处理器生成响应,而不是直接抛出异常
214
253
  # BaseHTTPMiddleware 中直接 raise 会绕过 FastAPI 的异常处理器
215
254
  response = await global_exception_handler(request, exc)
@@ -349,6 +349,45 @@ def generate_env_example(
349
349
  raise typer.Exit(1)
350
350
 
351
351
 
352
+ @app.command(name="alert-rules")
353
+ def generate_alert_rules(
354
+ project_dir: Path = typer.Argument(
355
+ Path("."),
356
+ help="项目目录路径",
357
+ exists=True,
358
+ file_okay=False,
359
+ dir_okay=True,
360
+ resolve_path=True,
361
+ ),
362
+ force: bool = typer.Option(
363
+ False,
364
+ "--force",
365
+ "-f",
366
+ help="强制覆盖已存在的文件",
367
+ ),
368
+ dry_run: bool = typer.Option(
369
+ False,
370
+ "--dry-run",
371
+ "-n",
372
+ help="预览模式,不实际写入文件",
373
+ ),
374
+ ) -> None:
375
+ """生成/更新 alert_rules.yaml 告警规则模板。"""
376
+ context = _detect_project_info(project_dir)
377
+
378
+ console.print(f"[cyan]📢 检测到项目: {context['project_name']}[/cyan]")
379
+
380
+ try:
381
+ # 使用模板文件
382
+ template_path = TEMPLATES_DIR / "alert_rules.example.yaml.tpl"
383
+ content = template_path.read_text(encoding="utf-8")
384
+ output_path = project_dir / "alert_rules.example.yaml"
385
+ _write_file(output_path, content, force=force, dry_run=dry_run)
386
+ except Exception as e:
387
+ console.print(f"[red]❌ 生成失败: {e}[/red]")
388
+ raise typer.Exit(1)
389
+
390
+
352
391
  @app.command(name="all")
353
392
  def generate_all_docs(
354
393
  project_dir: Path = typer.Argument(
@@ -382,6 +421,7 @@ def generate_all_docs(
382
421
  root_docs: list[tuple[str, str, str]] = [
383
422
  ("AGENTS.md.tpl", "AGENTS.md", "AI 编程助手上下文"),
384
423
  ("env.example.tpl", ".env.example", "环境变量示例"),
424
+ ("alert_rules.example.yaml.tpl", "alert_rules.example.yaml", "告警规则示例"),
385
425
  ]
386
426
 
387
427
  # aury_docs/ 开发文档
@@ -159,6 +159,7 @@ TEMPLATE_FILE_MAP = {
159
159
  "AGENTS.md": "AGENTS.md.tpl",
160
160
  "conftest.py": "conftest.py.tpl",
161
161
  "admin_console/__init__.py": "admin_console_init.py.tpl",
162
+ "alert_rules.example.yaml": "alert_rules.example.yaml.tpl",
162
163
  }
163
164
 
164
165
  # env 模板拼接顺序
@@ -172,6 +173,7 @@ ENV_TEMPLATE_ORDER = [
172
173
  "scheduler.tpl",
173
174
  "messaging.tpl",
174
175
  "storage.tpl",
176
+ "monitoring.tpl",
175
177
  "third_party.tpl",
176
178
  "rpc.tpl",
177
179
  ]
@@ -112,6 +112,11 @@ mypy {package_name}/
112
112
  - **[aury_docs/14-mq.md](./aury_docs/14-mq.md)** - 消息队列
113
113
  - **[aury_docs/15-events.md](./aury_docs/15-events.md)** - 事件总线
114
114
 
115
+ ### 监控与告警
116
+
117
+ - **[aury_docs/17-alerting.md](./aury_docs/17-alerting.md)** - 告警系统(慢请求/慢SQL/异常 → 飞书)
118
+ - **[alert_rules.example.yaml](./alert_rules.example.yaml)** - 告警规则示例(复制为 alert_rules.yaml 使用)
119
+
115
120
  ### 第三方集成
116
121
 
117
122
  - **[aury_docs/16-adapter.md](./aury_docs/16-adapter.md)** - 第三方接口适配器(Mock/真实切换)
@@ -122,6 +127,60 @@ mypy {package_name}/
122
127
  - **[aury_docs/99-cli.md](./aury_docs/99-cli.md)** - CLI 命令参考
123
128
  - **[.env.example](./.env.example)** - 所有可用环境变量
124
129
 
130
+ ## 配置结构
131
+
132
+ 框架使用 `BaseConfig` 统一管理配置,环境变量通过 `__` 分隔符映射到嵌套配置:
133
+
134
+ ```python
135
+ # 配置结构(BaseConfig)
136
+ class BaseConfig(BaseSettings):
137
+ # 基础服务
138
+ server: ServerSettings # SERVER__*
139
+ cors: CORSSettings # CORS__*
140
+ log: LogSettings # LOG__*
141
+ health_check: HealthCheckSettings # HEALTH_CHECK__*
142
+ admin: AdminConsoleSettings # ADMIN__*
143
+
144
+ # 数据与缓存
145
+ database: DatabaseSettings # DATABASE__*
146
+ cache: CacheSettings # CACHE__*
147
+ channel: ChannelSettings # CHANNEL__*
148
+ storage: StorageSettings # STORAGE__*
149
+ migration: MigrationSettings # MIGRATION__*
150
+
151
+ # 服务编排
152
+ service: ServiceSettings # SERVICE__*
153
+ scheduler: SchedulerSettings # SCHEDULER__*
154
+
155
+ # 异步与事件
156
+ task: TaskSettings # TASK__*
157
+ event: EventSettings # EVENT__*
158
+
159
+ # 微服务通信
160
+ rpc_client: RPCClientSettings # RPC_CLIENT__*
161
+ rpc_service: RPCServiceSettings # RPC_SERVICE__*
162
+
163
+ # 监控告警
164
+ telemetry: TelemetrySettings # TELEMETRY__*
165
+ alert: AlertSettings # ALERT__*
166
+
167
+ model_config = SettingsConfigDict(
168
+ env_nested_delimiter="__", # 环境变量分隔符
169
+ )
170
+ ```
171
+
172
+ **环境变量命名规则**:`{SECTION}__{FIELD}`
173
+
174
+ ```bash
175
+ # 示例
176
+ DATABASE__URL=postgresql://...
177
+ DATABASE__POOL_SIZE=10
178
+ CACHE__CACHE_TYPE=redis
179
+ CACHE__URL=redis://localhost:6379
180
+ ALERT__ENABLED=true
181
+ ALERT__SLOW_REQUEST_THRESHOLD=1.0
182
+ ```
183
+
125
184
  ## 代码规范
126
185
 
127
186
  > 项目所有业务配置请通过应用 `settings`/配置对象获取,**不要**直接使用 `os.environ` 在业务代码中读环境变量。
@@ -0,0 +1,85 @@
1
+ # 告警规则配置文件
2
+ #
3
+ # 使用方法:
4
+ # 1. 将本文件复制为 alert_rules.yaml,并在 .env 中配置 ALERT__RULES_FILE=alert_rules.yaml
5
+ # 2. 配置通知器(见下方示例)
6
+ # 3. 根据需要修改规则
7
+ #
8
+ # 简版配置(所有告警发到同一个群):
9
+ # ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
10
+ # ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
11
+ #
12
+ # 完整版配置(分群告警):
13
+ # ALERT__NOTIFIERS__PERF_GROUP__TYPE=feishu
14
+ # ALERT__NOTIFIERS__PERF_GROUP__WEBHOOK=https://open.feishu.cn/.../perf-xxx
15
+ # ALERT__NOTIFIERS__ERROR_GROUP__TYPE=feishu
16
+ # ALERT__NOTIFIERS__ERROR_GROUP__WEBHOOK=https://open.feishu.cn/.../error-xxx
17
+ # ALERT__NOTIFIERS__OPS_GROUP__TYPE=feishu
18
+ # ALERT__NOTIFIERS__OPS_GROUP__WEBHOOK=https://open.feishu.cn/.../ops-xxx
19
+
20
+ defaults:
21
+ slow_request_threshold: 1.0
22
+ slow_sql_threshold: 0.5
23
+ aggregate_window: 10
24
+ suppress_seconds: 300
25
+
26
+ rules:
27
+ # ============ 简版:所有告警发到 default 群 ============
28
+ # 如果使用分群告警,请注释掉这部分,使用下方的完整版
29
+
30
+ - name: slow_request
31
+ event_types: [slow_request]
32
+ aggregate_threshold: 5
33
+ notifiers: [default]
34
+
35
+ - name: slow_sql
36
+ event_types: [slow_sql]
37
+ aggregate_threshold: 10
38
+ notifiers: [default]
39
+
40
+ - name: exception
41
+ event_types: [exception]
42
+ aggregate_threshold: 1
43
+ suppress_seconds: 60
44
+ notifiers: [default]
45
+
46
+ - name: task_issues
47
+ event_types: [task_failure, task_timeout]
48
+ aggregate_threshold: 1
49
+ notifiers: [default]
50
+
51
+ # ============ 完整版:分群告警 ============
52
+ # 取消注释并配置对应的通知器即可使用
53
+
54
+ # 慢请求 → 性能群
55
+ # - name: slow_request
56
+ # event_types: [slow_request]
57
+ # aggregate_threshold: 5
58
+ # notifiers: [perf_group]
59
+
60
+ # 慢 SQL → 性能群
61
+ # - name: slow_sql
62
+ # event_types: [slow_sql]
63
+ # aggregate_threshold: 10
64
+ # notifiers: [perf_group]
65
+
66
+ # 异常 → 错误群(立即告警)
67
+ # - name: exception
68
+ # event_types: [exception]
69
+ # aggregate_threshold: 1
70
+ # suppress_seconds: 60
71
+ # notifiers: [error_group]
72
+
73
+ # 任务失败/超时 → 运维群
74
+ # - name: task_issues
75
+ # event_types: [task_failure, task_timeout]
76
+ # aggregate_threshold: 1
77
+ # notifiers: [ops_group]
78
+
79
+ # 关键接口更严格的阈值(示例)
80
+ # - name: critical_api
81
+ # event_types: [slow_request]
82
+ # path_pattern: "/api/v1/payments/*"
83
+ # threshold: 0.5
84
+ # aggregate_threshold: 1
85
+ # notifiers: [error_group]
@@ -57,3 +57,6 @@ CLI 命令参考请查看 [99-cli.md](./99-cli.md)。
57
57
  - [13-channel.md](./13-channel.md) - 流式通道(SSE)
58
58
  - [14-mq.md](./14-mq.md) - 消息队列
59
59
  - [15-events.md](./15-events.md) - 事件总线
60
+
61
+ ### 监控与告警
62
+ - [17-alerting.md](./17-alerting.md) - 告警系统(慢请求/慢SQL/异常 → 飞书)