aury-boot 0.0.28__py3-none-any.whl → 0.0.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. aury/boot/_version.py +2 -2
  2. aury/boot/application/app/base.py +126 -2
  3. aury/boot/application/app/components.py +224 -1
  4. aury/boot/application/config/settings.py +195 -3
  5. aury/boot/application/constants/components.py +3 -0
  6. aury/boot/application/middleware/logging.py +45 -6
  7. aury/boot/commands/docs.py +40 -0
  8. aury/boot/commands/init.py +2 -0
  9. aury/boot/commands/templates/project/AGENTS.md.tpl +16 -1
  10. aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
  11. aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
  12. aury/boot/commands/templates/project/aury_docs/03-service.md.tpl +60 -0
  13. aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
  14. aury/boot/commands/templates/project/env_templates/monitoring.tpl +61 -0
  15. aury/boot/common/logging/context.py +17 -1
  16. aury/boot/common/logging/format.py +4 -0
  17. aury/boot/domain/transaction/__init__.py +57 -0
  18. aury/boot/infrastructure/channel/base.py +6 -2
  19. aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
  20. aury/boot/infrastructure/monitoring/__init__.py +210 -6
  21. aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
  22. aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
  23. aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
  24. aury/boot/infrastructure/monitoring/alerting/manager.py +428 -0
  25. aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
  26. aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
  27. aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
  28. aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
  29. aury/boot/infrastructure/monitoring/alerting/rules.py +163 -0
  30. aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
  31. aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
  32. aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
  33. aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
  34. aury/boot/infrastructure/monitoring/tracing/processor.py +327 -0
  35. aury/boot/infrastructure/monitoring/tracing/provider.py +320 -0
  36. aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
  37. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/METADATA +14 -1
  38. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/RECORD +40 -21
  39. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/WHEEL +0 -0
  40. {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/entry_points.txt +0 -0
@@ -15,7 +15,11 @@ from dotenv import load_dotenv
15
15
  from pydantic import BaseModel, Field
16
16
  from pydantic_settings import BaseSettings, SettingsConfigDict
17
17
 
18
- from .multi_instance import MultiInstanceConfigLoader, MultiInstanceSettings
18
+ from .multi_instance import (
19
+ MultiInstanceConfigLoader,
20
+ MultiInstanceSettings,
21
+ parse_multi_instance_env,
22
+ )
19
23
 
20
24
 
21
25
  def _load_env_file(env_file: str | Path) -> bool:
@@ -218,6 +222,10 @@ class DatabaseSettings(BaseModel):
218
222
  default=True,
219
223
  description="是否在获取连接前进行 PING"
220
224
  )
225
+ slow_query_threshold: float = Field(
226
+ default=1.0,
227
+ description="慢查询阈值(秒),超过此时间的查询会记录警告日志"
228
+ )
221
229
 
222
230
 
223
231
  class CacheSettings(BaseModel):
@@ -407,7 +415,7 @@ class ServiceSettings(BaseModel):
407
415
  """服务配置。
408
416
 
409
417
  环境变量格式: SERVICE__{FIELD}
410
- 示例: SERVICE__NAME, SERVICE__TYPE
418
+ 示例: SERVICE__NAME, SERVICE__TYPE, SERVICE__ENVIRONMENT
411
419
 
412
420
  服务类型说明:
413
421
  - api: 运行 API 服务(SCHEDULER__ENABLED 决定是否同时运行调度器)
@@ -418,7 +426,15 @@ class ServiceSettings(BaseModel):
418
426
 
419
427
  name: str = Field(
420
428
  default="app",
421
- description="服务名称,用于日志目录区分"
429
+ description="服务名称,用于日志目录区分、链路追踪标识"
430
+ )
431
+ version: str = Field(
432
+ default="",
433
+ description="服务版本(用于链路追踪和监控)"
434
+ )
435
+ environment: str = Field(
436
+ default="development",
437
+ description="部署环境 (development/staging/production)"
422
438
  )
423
439
  service_type: str = Field(
424
440
  default="api",
@@ -555,6 +571,177 @@ class MessageQueueSettings(BaseModel):
555
571
  )
556
572
 
557
573
 
574
+ class TelemetrySettings(BaseModel):
575
+ """OpenTelemetry 配置。
576
+
577
+ 环境变量格式: TELEMETRY__{FIELD}
578
+ 示例: TELEMETRY__ENABLED, TELEMETRY__SLOW_THRESHOLD
579
+
580
+ 功能说明:
581
+ - 启用后自动 instrument FastAPI、SQLAlchemy、httpx
582
+ - get_trace_id() 会优先使用 OTel trace_id
583
+ - 可配置 AlertingSpanProcessor 自动检测慢请求/异常并触发告警
584
+ - 可选配置 OTLP 导出到 Jaeger/Tempo/Collector
585
+
586
+ 注意:service_name/version/environment 从 ServiceSettings 获取。
587
+ """
588
+
589
+ enabled: bool = Field(
590
+ default=False,
591
+ description="是否启用 OpenTelemetry"
592
+ )
593
+
594
+ # Instrumentation 开关
595
+ instrument_fastapi: bool = Field(
596
+ default=True,
597
+ description="是否自动 instrument FastAPI"
598
+ )
599
+ instrument_sqlalchemy: bool = Field(
600
+ default=True,
601
+ description="是否自动 instrument SQLAlchemy"
602
+ )
603
+ instrument_httpx: bool = Field(
604
+ default=True,
605
+ description="是否自动 instrument httpx"
606
+ )
607
+
608
+
609
+ # OTLP Traces 导出配置
610
+ traces_endpoint: str | None = Field(
611
+ default=None,
612
+ description="Traces 导出端点(如 http://jaeger:4317)"
613
+ )
614
+ traces_headers: dict[str, str] = Field(
615
+ default_factory=dict,
616
+ description="Traces 导出请求头"
617
+ )
618
+
619
+ # OTLP Logs 导出配置
620
+ logs_endpoint: str | None = Field(
621
+ default=None,
622
+ description="Logs 导出端点(如 http://loki:3100)"
623
+ )
624
+ logs_headers: dict[str, str] = Field(
625
+ default_factory=dict,
626
+ description="Logs 导出请求头"
627
+ )
628
+
629
+ # OTLP Metrics 导出配置
630
+ metrics_endpoint: str | None = Field(
631
+ default=None,
632
+ description="Metrics 导出端点(如 http://prometheus:9090)"
633
+ )
634
+ metrics_headers: dict[str, str] = Field(
635
+ default_factory=dict,
636
+ description="Metrics 导出请求头"
637
+ )
638
+
639
+ # 采样配置
640
+ sampling_rate: float = Field(
641
+ default=1.0,
642
+ description="采样率 (0.0-1.0),1.0 表示 100%"
643
+ )
644
+
645
+
646
+ class AlertSettings(BaseModel):
647
+ """告警系统配置。
648
+
649
+ 环境变量格式: ALERT__{FIELD}
650
+ 示例: ALERT__ENABLED, ALERT__RULES_FILE
651
+
652
+ 通知器配置(作为子配置,动态字段):
653
+ ALERT__NOTIFIERS__FEISHU__TYPE=feishu
654
+ ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://open.feishu.cn/...
655
+ ALERT__NOTIFIERS__FEISHU__SECRET=xxx
656
+
657
+ ALERT__NOTIFIERS__OPS__TYPE=webhook
658
+ ALERT__NOTIFIERS__OPS__URL=https://my-system.com/alert
659
+
660
+ 自定义通知器通过 AlertManager.register_notifier() 注册。
661
+ """
662
+
663
+ # 通知器配置缓存(动态字段)
664
+ _notifiers: dict[str, dict[str, Any]] | None = None
665
+
666
+ enabled: bool = Field(
667
+ default=False,
668
+ description="是否启用告警系统"
669
+ )
670
+ rules_file: str | None = Field(
671
+ default=None,
672
+ description="告警规则文件路径(YAML 格式),如 alert_rules.yaml"
673
+ )
674
+
675
+ # 慢操作阈值
676
+ slow_request_threshold: float = Field(
677
+ default=1.0,
678
+ description="慢请求阈值(秒)"
679
+ )
680
+ slow_sql_threshold: float = Field(
681
+ default=0.5,
682
+ description="慢 SQL 阈值(秒)"
683
+ )
684
+
685
+ # 告警开关
686
+ alert_on_slow_request: bool = Field(
687
+ default=True,
688
+ description="是否对慢 HTTP 请求发送告警"
689
+ )
690
+ alert_on_slow_sql: bool = Field(
691
+ default=True,
692
+ description="是否对慢 SQL 发送告警"
693
+ )
694
+ alert_on_error: bool = Field(
695
+ default=True,
696
+ description="是否对异常发送告警(默认只对 5xx 告警,4xx 业务异常不告警)"
697
+ )
698
+
699
+ # 默认累计触发配置
700
+ aggregate_window: int = Field(
701
+ default=10,
702
+ description="聚合窗口(秒)"
703
+ )
704
+ slow_request_aggregate: int = Field(
705
+ default=5,
706
+ description="慢请求触发阈值(窗口内次数)"
707
+ )
708
+ slow_sql_aggregate: int = Field(
709
+ default=10,
710
+ description="慢 SQL 触发阈值(窗口内次数)"
711
+ )
712
+ exception_aggregate: int = Field(
713
+ default=1,
714
+ description="异常触发阈值(通常为 1,立即告警)"
715
+ )
716
+
717
+ # 抑制配置
718
+ suppress_seconds: int = Field(
719
+ default=10,
720
+ description="告警抑制时间(秒),相同告警在此时间内不重复发送"
721
+ )
722
+
723
+ def get_notifiers(self) -> dict[str, dict[str, Any]]:
724
+ """获取所有告警通知器实例配置。
725
+
726
+ 从环境变量解析 ALERT__NOTIFIERS__{INSTANCE}__{FIELD} 格式的配置。
727
+ 支持动态字段,不同类型通知器可有不同字段。
728
+
729
+ Returns:
730
+ dict[str, dict[str, Any]]: 实例名 -> 配置字典
731
+
732
+ 示例:
733
+ ALERT__NOTIFIERS__FEISHU__TYPE=feishu
734
+ ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://...
735
+
736
+ 返回: {"feishu": {"type": "feishu", "webhook": "https://..."}}
737
+
738
+ 自定义通知器通过 AlertManager.register_notifier() 注册。
739
+ """
740
+ if self._notifiers is None:
741
+ self._notifiers = parse_multi_instance_env("ALERT__NOTIFIERS")
742
+ return self._notifiers
743
+
744
+
558
745
  class MigrationSettings(BaseModel):
559
746
  """数据库迁移配置。
560
747
 
@@ -824,6 +1011,10 @@ class BaseConfig(BaseSettings):
824
1011
  # RPC 服务配置(当前服务注册)
825
1012
  rpc_service: RPCServiceSettings = Field(default_factory=RPCServiceSettings)
826
1013
 
1014
+ # ========== 监控告警 ==========
1015
+ telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
1016
+ alert: AlertSettings = Field(default_factory=AlertSettings)
1017
+
827
1018
  model_config = SettingsConfigDict(
828
1019
  case_sensitive=False,
829
1020
  extra="ignore",
@@ -953,6 +1144,7 @@ __all__ = [
953
1144
  # 配置类
954
1145
  "AdminAuthSettings",
955
1146
  "AdminConsoleSettings",
1147
+ "AlertSettings",
956
1148
  "BaseConfig",
957
1149
  "CORSSettings",
958
1150
  # 多实例配置类
@@ -50,6 +50,9 @@ class ComponentName(str, Enum):
50
50
  # 管理后台(可选扩展)
51
51
  ADMIN_CONSOLE = "admin_console"
52
52
 
53
+ # 遥测组件 (OpenTelemetry)
54
+ TELEMETRY = "telemetry"
55
+
53
56
 
54
57
  __all__ = [
55
58
  "ComponentName",
@@ -16,8 +16,33 @@ from starlette.middleware.base import BaseHTTPMiddleware
16
16
  from starlette.requests import Request
17
17
  from starlette.responses import Response
18
18
 
19
- from aury.boot.application.errors import global_exception_handler
20
- from aury.boot.common.logging import logger, set_trace_id
19
+ from aury.boot.application.errors.chain import global_exception_handler
20
+ from aury.boot.common.logging import get_trace_id, logger, set_trace_id
21
+
22
+
23
+ def _record_exception_to_span(exc: Exception) -> None:
24
+ """将异常记录到当前 OTEL span(使用与 loguru 一致的格式)。"""
25
+ try:
26
+ from opentelemetry import trace
27
+
28
+ from aury.boot.common.logging.format import format_exception_compact
29
+
30
+ span = trace.get_current_span()
31
+ if span and span.is_recording():
32
+ # 使用与 loguru 一致的堆栈格式(包含代码行和局部变量)
33
+ formatted_tb = format_exception_compact(
34
+ type(exc), exc, exc.__traceback__
35
+ )
36
+
37
+ # 记录异常,并将格式化堆栈放入 attributes
38
+ span.record_exception(exc, attributes={
39
+ "exception.stacktrace": formatted_tb,
40
+ })
41
+ span.set_status(trace.Status(trace.StatusCode.ERROR, str(exc)))
42
+ except ImportError:
43
+ pass # OTEL 未安装
44
+ except Exception:
45
+ pass # 忽略记录错误
21
46
 
22
47
 
23
48
  def log_request[T](func: Callable[..., T]) -> Callable[..., T]:
@@ -112,22 +137,33 @@ def _should_log_body(content_type: str | None) -> bool:
112
137
 
113
138
 
114
139
  class RequestLoggingMiddleware(BaseHTTPMiddleware):
115
- """请求日志中间件(支持链路追踪)。
140
+ """请求日志中间件(支持链路追踪和告警)。
116
141
 
117
142
  自动记录所有HTTP请求的详细信息,包括:
118
143
  - 请求方法、路径、查询参数、请求体
119
144
  - 客户端IP、User-Agent
120
145
  - 响应状态码、耗时、响应体
121
146
  - 链路追踪 ID(X-Trace-ID / X-Request-ID)
147
+ - 慢请求和异常告警(如果启用告警系统)
122
148
 
123
149
  注意:文件上传、二进制数据等不会记录 body 内容。
124
150
 
125
151
  使用示例:
126
152
  from aury.boot.application.middleware.logging import RequestLoggingMiddleware
127
153
 
128
- app.add_middleware(RequestLoggingMiddleware)
154
+ app.add_middleware(RequestLoggingMiddleware, slow_request_threshold=1.0)
129
155
  """
130
156
 
157
+ def __init__(self, app, slow_request_threshold: float = 1.0) -> None:
158
+ """初始化中间件。
159
+
160
+ Args:
161
+ app: ASGI 应用
162
+ slow_request_threshold: 慢请求阈值(秒),默认 1.0
163
+ """
164
+ super().__init__(app)
165
+ self.slow_request_threshold = slow_request_threshold
166
+
131
167
  async def dispatch(self, request: Request, call_next) -> Response:
132
168
  """处理请求并记录日志。"""
133
169
  start_time = time.time()
@@ -193,10 +229,10 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
193
229
  )
194
230
 
195
231
  # 慢请求警告
196
- if duration > 1.0:
232
+ if duration > self.slow_request_threshold:
197
233
  logger.warning(
198
234
  f"慢请求: {request.method} {request.url.path} | "
199
- f"耗时: {duration:.3f}s (超过1秒) | "
235
+ f"耗时: {duration:.3f}s (阈值: {self.slow_request_threshold}s) | "
200
236
  f"Trace-ID: {trace_id}"
201
237
  )
202
238
 
@@ -210,6 +246,9 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
210
246
  f"耗时: {duration:.3f}s | Trace-ID: {trace_id}"
211
247
  )
212
248
 
249
+ # 将异常记录到当前 OTEL span(以便告警系统提取)
250
+ _record_exception_to_span(exc)
251
+
213
252
  # 使用全局异常处理器生成响应,而不是直接抛出异常
214
253
  # BaseHTTPMiddleware 中直接 raise 会绕过 FastAPI 的异常处理器
215
254
  response = await global_exception_handler(request, exc)
@@ -349,6 +349,45 @@ def generate_env_example(
349
349
  raise typer.Exit(1)
350
350
 
351
351
 
352
+ @app.command(name="alert-rules")
353
+ def generate_alert_rules(
354
+ project_dir: Path = typer.Argument(
355
+ Path("."),
356
+ help="项目目录路径",
357
+ exists=True,
358
+ file_okay=False,
359
+ dir_okay=True,
360
+ resolve_path=True,
361
+ ),
362
+ force: bool = typer.Option(
363
+ False,
364
+ "--force",
365
+ "-f",
366
+ help="强制覆盖已存在的文件",
367
+ ),
368
+ dry_run: bool = typer.Option(
369
+ False,
370
+ "--dry-run",
371
+ "-n",
372
+ help="预览模式,不实际写入文件",
373
+ ),
374
+ ) -> None:
375
+ """生成/更新 alert_rules.yaml 告警规则模板。"""
376
+ context = _detect_project_info(project_dir)
377
+
378
+ console.print(f"[cyan]📢 检测到项目: {context['project_name']}[/cyan]")
379
+
380
+ try:
381
+ # 使用模板文件
382
+ template_path = TEMPLATES_DIR / "alert_rules.example.yaml.tpl"
383
+ content = template_path.read_text(encoding="utf-8")
384
+ output_path = project_dir / "alert_rules.example.yaml"
385
+ _write_file(output_path, content, force=force, dry_run=dry_run)
386
+ except Exception as e:
387
+ console.print(f"[red]❌ 生成失败: {e}[/red]")
388
+ raise typer.Exit(1)
389
+
390
+
352
391
  @app.command(name="all")
353
392
  def generate_all_docs(
354
393
  project_dir: Path = typer.Argument(
@@ -382,6 +421,7 @@ def generate_all_docs(
382
421
  root_docs: list[tuple[str, str, str]] = [
383
422
  ("AGENTS.md.tpl", "AGENTS.md", "AI 编程助手上下文"),
384
423
  ("env.example.tpl", ".env.example", "环境变量示例"),
424
+ ("alert_rules.example.yaml.tpl", "alert_rules.example.yaml", "告警规则示例"),
385
425
  ]
386
426
 
387
427
  # aury_docs/ 开发文档
@@ -159,6 +159,7 @@ TEMPLATE_FILE_MAP = {
159
159
  "AGENTS.md": "AGENTS.md.tpl",
160
160
  "conftest.py": "conftest.py.tpl",
161
161
  "admin_console/__init__.py": "admin_console_init.py.tpl",
162
+ "alert_rules.example.yaml": "alert_rules.example.yaml.tpl",
162
163
  }
163
164
 
164
165
  # env 模板拼接顺序
@@ -172,6 +173,7 @@ ENV_TEMPLATE_ORDER = [
172
173
  "scheduler.tpl",
173
174
  "messaging.tpl",
174
175
  "storage.tpl",
176
+ "monitoring.tpl",
175
177
  "third_party.tpl",
176
178
  "rpc.tpl",
177
179
  ]
@@ -112,6 +112,11 @@ mypy {package_name}/
112
112
  - **[aury_docs/14-mq.md](./aury_docs/14-mq.md)** - 消息队列
113
113
  - **[aury_docs/15-events.md](./aury_docs/15-events.md)** - 事件总线
114
114
 
115
+ ### 监控与告警
116
+
117
+ - **[aury_docs/17-alerting.md](./aury_docs/17-alerting.md)** - 告警系统(慢请求/慢SQL/异常 → 飞书)
118
+ - **[alert_rules.example.yaml](./alert_rules.example.yaml)** - 告警规则示例(复制为 alert_rules.yaml 使用)
119
+
115
120
  ### 第三方集成
116
121
 
117
122
  - **[aury_docs/16-adapter.md](./aury_docs/16-adapter.md)** - 第三方接口适配器(Mock/真实切换)
@@ -164,15 +169,25 @@ class User(Base): ...
164
169
  - 写操作**必须**使用 `@transactional` 装饰器
165
170
  - 只读操作可以不加事务装饰器
166
171
  - 跨 Service 调用通过共享 session 实现事务共享
172
+ - **后台任务必须**使用 `@isolated_task` 装饰器
167
173
 
168
174
  ```python
169
- from aury.boot.domain.transaction import transactional
175
+ from aury.boot.domain.transaction import isolated_task, transactional, transactional_context
170
176
 
171
177
  class UserService(BaseService):
172
178
  @transactional
173
179
  async def create(self, data: UserCreate) -> User:
174
180
  # 自动事务管理
175
181
  return await self.repo.create(data.model_dump())
182
+
183
+
184
+ # 后台任务必须加 @isolated_task,否则事务不会提交
185
+ @isolated_task
186
+ async def background_upload(space_id: int, url: str):
187
+ async with db.session() as session:
188
+ async with transactional_context(session):
189
+ repo = SpaceRepository(session, Space)
190
+ await repo.update(...)
176
191
  ```
177
192
 
178
193
  ### Manager API 规范
@@ -0,0 +1,85 @@
1
+ # 告警规则配置文件
2
+ #
3
+ # 使用方法:
4
+ # 1. 将本文件复制为 alert_rules.yaml,并在 .env 中配置 ALERT__RULES_FILE=alert_rules.yaml
5
+ # 2. 配置通知器(见下方示例)
6
+ # 3. 根据需要修改规则
7
+ #
8
+ # 简版配置(所有告警发到同一个群):
9
+ # ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
10
+ # ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
11
+ #
12
+ # 完整版配置(分群告警):
13
+ # ALERT__NOTIFIERS__PERF_GROUP__TYPE=feishu
14
+ # ALERT__NOTIFIERS__PERF_GROUP__WEBHOOK=https://open.feishu.cn/.../perf-xxx
15
+ # ALERT__NOTIFIERS__ERROR_GROUP__TYPE=feishu
16
+ # ALERT__NOTIFIERS__ERROR_GROUP__WEBHOOK=https://open.feishu.cn/.../error-xxx
17
+ # ALERT__NOTIFIERS__OPS_GROUP__TYPE=feishu
18
+ # ALERT__NOTIFIERS__OPS_GROUP__WEBHOOK=https://open.feishu.cn/.../ops-xxx
19
+
20
+ defaults:
21
+ slow_request_threshold: 1.0
22
+ slow_sql_threshold: 0.5
23
+ aggregate_window: 10
24
+ suppress_seconds: 300
25
+
26
+ rules:
27
+ # ============ 简版:所有告警发到 default 群 ============
28
+ # 如果使用分群告警,请注释掉这部分,使用下方的完整版
29
+
30
+ - name: slow_request
31
+ event_types: [slow_request]
32
+ aggregate_threshold: 5
33
+ notifiers: [default]
34
+
35
+ - name: slow_sql
36
+ event_types: [slow_sql]
37
+ aggregate_threshold: 10
38
+ notifiers: [default]
39
+
40
+ - name: exception
41
+ event_types: [exception]
42
+ aggregate_threshold: 1
43
+ suppress_seconds: 60
44
+ notifiers: [default]
45
+
46
+ - name: task_issues
47
+ event_types: [task_failure, task_timeout]
48
+ aggregate_threshold: 1
49
+ notifiers: [default]
50
+
51
+ # ============ 完整版:分群告警 ============
52
+ # 取消注释并配置对应的通知器即可使用
53
+
54
+ # 慢请求 → 性能群
55
+ # - name: slow_request
56
+ # event_types: [slow_request]
57
+ # aggregate_threshold: 5
58
+ # notifiers: [perf_group]
59
+
60
+ # 慢 SQL → 性能群
61
+ # - name: slow_sql
62
+ # event_types: [slow_sql]
63
+ # aggregate_threshold: 10
64
+ # notifiers: [perf_group]
65
+
66
+ # 异常 → 错误群(立即告警)
67
+ # - name: exception
68
+ # event_types: [exception]
69
+ # aggregate_threshold: 1
70
+ # suppress_seconds: 60
71
+ # notifiers: [error_group]
72
+
73
+ # 任务失败/超时 → 运维群
74
+ # - name: task_issues
75
+ # event_types: [task_failure, task_timeout]
76
+ # aggregate_threshold: 1
77
+ # notifiers: [ops_group]
78
+
79
+ # 关键接口更严格的阈值(示例)
80
+ # - name: critical_api
81
+ # event_types: [slow_request]
82
+ # path_pattern: "/api/v1/payments/*"
83
+ # threshold: 0.5
84
+ # aggregate_threshold: 1
85
+ # notifiers: [error_group]
@@ -57,3 +57,6 @@ CLI 命令参考请查看 [99-cli.md](./99-cli.md)。
57
57
  - [13-channel.md](./13-channel.md) - 流式通道(SSE)
58
58
  - [14-mq.md](./14-mq.md) - 消息队列
59
59
  - [15-events.md](./15-events.md) - 事件总线
60
+
61
+ ### 监控与告警
62
+ - [17-alerting.md](./17-alerting.md) - 告警系统(慢请求/慢SQL/异常 → 飞书)
@@ -396,3 +396,63 @@ DATABASE_ISOLATION_LEVEL=REPEATABLE READ
396
396
  - 大多数场景:`READ COMMITTED`(平衡性能和一致性)
397
397
  - 报表/统计查询:`REPEATABLE READ`(保证读取一致性)
398
398
  - 金融交易:`SERIALIZABLE`(最强一致性,性能较低)
399
+
400
+ ### 3.12 后台任务事务隔离(重要)
401
+
402
+ 在 `@transactional` 装饰的 Service 方法中 spawn 后台任务时,**必须**使用 `@isolated_task` 或 `isolated_context`,否则事务不会提交。
403
+
404
+ **问题背景**:
405
+ `asyncio.create_task()` 会继承父协程的 `contextvars`。如果父协程在 `@transactional` 中,子任务会继承事务深度标记,导致:
406
+ - `auto_commit` 失效
407
+ - `transactional_context` 也不会提交
408
+ - session 关闭时数据被 rollback
409
+
410
+ **解决方案 1:装饰器(推荐)**
411
+
412
+ ```python
413
+ import asyncio
414
+ from aury.boot.domain.transaction import isolated_task, transactional_context
415
+ from aury.boot.infrastructure.database import DatabaseManager
416
+
417
+ db = DatabaseManager.get_instance()
418
+
419
+
420
+ @isolated_task
421
+ async def upload_cover(space_id: int, cover_url: str):
422
+ """后台任务:上传封面。"""
423
+ async with db.session() as session:
424
+ async with transactional_context(session):
425
+ repo = SpaceRepository(session, Space)
426
+ space = await repo.get(space_id)
427
+ if space:
428
+ await repo.update(space, {{"cover": cover_url}})
429
+ # 现在会正常 commit
430
+
431
+
432
+ class SpaceService(BaseService):
433
+ @transactional
434
+ async def create(self, data: SpaceCreate) -> Space:
435
+ space = await self.repo.create(data.model_dump())
436
+
437
+ # spawn 后台任务
438
+ asyncio.create_task(upload_cover(space.id, data.cover_url))
439
+
440
+ return space
441
+ ```
442
+
443
+ **解决方案 2:上下文管理器**
444
+
445
+ ```python
446
+ from aury.boot.domain.transaction import isolated_context
447
+
448
+ async def background_job():
449
+ async with isolated_context():
450
+ async with db.session() as session:
451
+ async with transactional_context(session):
452
+ # 正常的事务处理
453
+ ...
454
+ ```
455
+
456
+ **注意事项**:
457
+ - 后台任务必须新开 session(`db.session()`),不能复用主请求的 `self.session`
458
+ - 后台任务的事务与主请求独立,主请求回滚不影响后台任务