aury-boot 0.0.28__py3-none-any.whl → 0.0.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aury/boot/_version.py +2 -2
- aury/boot/application/app/base.py +126 -2
- aury/boot/application/app/components.py +224 -1
- aury/boot/application/config/settings.py +195 -3
- aury/boot/application/constants/components.py +3 -0
- aury/boot/application/middleware/logging.py +45 -6
- aury/boot/commands/docs.py +40 -0
- aury/boot/commands/init.py +2 -0
- aury/boot/commands/templates/project/AGENTS.md.tpl +16 -1
- aury/boot/commands/templates/project/alert_rules.example.yaml.tpl +85 -0
- aury/boot/commands/templates/project/aury_docs/00-overview.md.tpl +3 -0
- aury/boot/commands/templates/project/aury_docs/03-service.md.tpl +60 -0
- aury/boot/commands/templates/project/aury_docs/17-alerting.md.tpl +210 -0
- aury/boot/commands/templates/project/env_templates/monitoring.tpl +61 -0
- aury/boot/common/logging/context.py +17 -1
- aury/boot/common/logging/format.py +4 -0
- aury/boot/domain/transaction/__init__.py +57 -0
- aury/boot/infrastructure/channel/base.py +6 -2
- aury/boot/infrastructure/database/query_tools/__init__.py +3 -5
- aury/boot/infrastructure/monitoring/__init__.py +210 -6
- aury/boot/infrastructure/monitoring/alerting/__init__.py +50 -0
- aury/boot/infrastructure/monitoring/alerting/aggregator.py +193 -0
- aury/boot/infrastructure/monitoring/alerting/events.py +141 -0
- aury/boot/infrastructure/monitoring/alerting/manager.py +428 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/__init__.py +16 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/base.py +60 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/feishu.py +209 -0
- aury/boot/infrastructure/monitoring/alerting/notifiers/webhook.py +110 -0
- aury/boot/infrastructure/monitoring/alerting/rules.py +163 -0
- aury/boot/infrastructure/monitoring/health/__init__.py +231 -0
- aury/boot/infrastructure/monitoring/tracing/__init__.py +55 -0
- aury/boot/infrastructure/monitoring/tracing/context.py +43 -0
- aury/boot/infrastructure/monitoring/tracing/logging.py +73 -0
- aury/boot/infrastructure/monitoring/tracing/processor.py +327 -0
- aury/boot/infrastructure/monitoring/tracing/provider.py +320 -0
- aury/boot/infrastructure/monitoring/tracing/tracing.py +235 -0
- {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/METADATA +14 -1
- {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/RECORD +40 -21
- {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/WHEEL +0 -0
- {aury_boot-0.0.28.dist-info → aury_boot-0.0.30.dist-info}/entry_points.txt +0 -0
|
@@ -15,7 +15,11 @@ from dotenv import load_dotenv
|
|
|
15
15
|
from pydantic import BaseModel, Field
|
|
16
16
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
17
17
|
|
|
18
|
-
from .multi_instance import
|
|
18
|
+
from .multi_instance import (
|
|
19
|
+
MultiInstanceConfigLoader,
|
|
20
|
+
MultiInstanceSettings,
|
|
21
|
+
parse_multi_instance_env,
|
|
22
|
+
)
|
|
19
23
|
|
|
20
24
|
|
|
21
25
|
def _load_env_file(env_file: str | Path) -> bool:
|
|
@@ -218,6 +222,10 @@ class DatabaseSettings(BaseModel):
|
|
|
218
222
|
default=True,
|
|
219
223
|
description="是否在获取连接前进行 PING"
|
|
220
224
|
)
|
|
225
|
+
slow_query_threshold: float = Field(
|
|
226
|
+
default=1.0,
|
|
227
|
+
description="慢查询阈值(秒),超过此时间的查询会记录警告日志"
|
|
228
|
+
)
|
|
221
229
|
|
|
222
230
|
|
|
223
231
|
class CacheSettings(BaseModel):
|
|
@@ -407,7 +415,7 @@ class ServiceSettings(BaseModel):
|
|
|
407
415
|
"""服务配置。
|
|
408
416
|
|
|
409
417
|
环境变量格式: SERVICE__{FIELD}
|
|
410
|
-
示例: SERVICE__NAME, SERVICE__TYPE
|
|
418
|
+
示例: SERVICE__NAME, SERVICE__TYPE, SERVICE__ENVIRONMENT
|
|
411
419
|
|
|
412
420
|
服务类型说明:
|
|
413
421
|
- api: 运行 API 服务(SCHEDULER__ENABLED 决定是否同时运行调度器)
|
|
@@ -418,7 +426,15 @@ class ServiceSettings(BaseModel):
|
|
|
418
426
|
|
|
419
427
|
name: str = Field(
|
|
420
428
|
default="app",
|
|
421
|
-
description="
|
|
429
|
+
description="服务名称,用于日志目录区分、链路追踪标识"
|
|
430
|
+
)
|
|
431
|
+
version: str = Field(
|
|
432
|
+
default="",
|
|
433
|
+
description="服务版本(用于链路追踪和监控)"
|
|
434
|
+
)
|
|
435
|
+
environment: str = Field(
|
|
436
|
+
default="development",
|
|
437
|
+
description="部署环境 (development/staging/production)"
|
|
422
438
|
)
|
|
423
439
|
service_type: str = Field(
|
|
424
440
|
default="api",
|
|
@@ -555,6 +571,177 @@ class MessageQueueSettings(BaseModel):
|
|
|
555
571
|
)
|
|
556
572
|
|
|
557
573
|
|
|
574
|
+
class TelemetrySettings(BaseModel):
|
|
575
|
+
"""OpenTelemetry 配置。
|
|
576
|
+
|
|
577
|
+
环境变量格式: TELEMETRY__{FIELD}
|
|
578
|
+
示例: TELEMETRY__ENABLED, TELEMETRY__SAMPLING_RATE
|
|
579
|
+
|
|
580
|
+
功能说明:
|
|
581
|
+
- 启用后自动 instrument FastAPI、SQLAlchemy、httpx
|
|
582
|
+
- get_trace_id() 会优先使用 OTel trace_id
|
|
583
|
+
- 可配置 AlertingSpanProcessor 自动检测慢请求/异常并触发告警
|
|
584
|
+
- 可选配置 OTLP 导出到 Jaeger/Tempo/Collector
|
|
585
|
+
|
|
586
|
+
注意:service_name/version/environment 从 ServiceSettings 获取。
|
|
587
|
+
"""
|
|
588
|
+
|
|
589
|
+
enabled: bool = Field(
|
|
590
|
+
default=False,
|
|
591
|
+
description="是否启用 OpenTelemetry"
|
|
592
|
+
)
|
|
593
|
+
|
|
594
|
+
# Instrumentation 开关
|
|
595
|
+
instrument_fastapi: bool = Field(
|
|
596
|
+
default=True,
|
|
597
|
+
description="是否自动 instrument FastAPI"
|
|
598
|
+
)
|
|
599
|
+
instrument_sqlalchemy: bool = Field(
|
|
600
|
+
default=True,
|
|
601
|
+
description="是否自动 instrument SQLAlchemy"
|
|
602
|
+
)
|
|
603
|
+
instrument_httpx: bool = Field(
|
|
604
|
+
default=True,
|
|
605
|
+
description="是否自动 instrument httpx"
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
# OTLP Traces 导出配置
|
|
610
|
+
traces_endpoint: str | None = Field(
|
|
611
|
+
default=None,
|
|
612
|
+
description="Traces 导出端点(如 http://jaeger:4317)"
|
|
613
|
+
)
|
|
614
|
+
traces_headers: dict[str, str] = Field(
|
|
615
|
+
default_factory=dict,
|
|
616
|
+
description="Traces 导出请求头"
|
|
617
|
+
)
|
|
618
|
+
|
|
619
|
+
# OTLP Logs 导出配置
|
|
620
|
+
logs_endpoint: str | None = Field(
|
|
621
|
+
default=None,
|
|
622
|
+
description="Logs 导出端点(如 http://loki:3100)"
|
|
623
|
+
)
|
|
624
|
+
logs_headers: dict[str, str] = Field(
|
|
625
|
+
default_factory=dict,
|
|
626
|
+
description="Logs 导出请求头"
|
|
627
|
+
)
|
|
628
|
+
|
|
629
|
+
# OTLP Metrics 导出配置
|
|
630
|
+
metrics_endpoint: str | None = Field(
|
|
631
|
+
default=None,
|
|
632
|
+
description="Metrics 导出端点(如 http://prometheus:9090)"
|
|
633
|
+
)
|
|
634
|
+
metrics_headers: dict[str, str] = Field(
|
|
635
|
+
default_factory=dict,
|
|
636
|
+
description="Metrics 导出请求头"
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
# 采样配置
|
|
640
|
+
sampling_rate: float = Field(
|
|
641
|
+
default=1.0,
|
|
642
|
+
description="采样率 (0.0-1.0),1.0 表示 100%"
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
class AlertSettings(BaseModel):
|
|
647
|
+
"""告警系统配置。
|
|
648
|
+
|
|
649
|
+
环境变量格式: ALERT__{FIELD}
|
|
650
|
+
示例: ALERT__ENABLED, ALERT__RULES_FILE
|
|
651
|
+
|
|
652
|
+
通知器配置(作为子配置,动态字段):
|
|
653
|
+
ALERT__NOTIFIERS__FEISHU__TYPE=feishu
|
|
654
|
+
ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://open.feishu.cn/...
|
|
655
|
+
ALERT__NOTIFIERS__FEISHU__SECRET=xxx
|
|
656
|
+
|
|
657
|
+
ALERT__NOTIFIERS__OPS__TYPE=webhook
|
|
658
|
+
ALERT__NOTIFIERS__OPS__URL=https://my-system.com/alert
|
|
659
|
+
|
|
660
|
+
自定义通知器通过 AlertManager.register_notifier() 注册。
|
|
661
|
+
"""
|
|
662
|
+
|
|
663
|
+
# 通知器配置缓存(动态字段)
|
|
664
|
+
_notifiers: dict[str, dict[str, Any]] | None = None
|
|
665
|
+
|
|
666
|
+
enabled: bool = Field(
|
|
667
|
+
default=False,
|
|
668
|
+
description="是否启用告警系统"
|
|
669
|
+
)
|
|
670
|
+
rules_file: str | None = Field(
|
|
671
|
+
default=None,
|
|
672
|
+
description="告警规则文件路径(YAML 格式),如 alert_rules.yaml"
|
|
673
|
+
)
|
|
674
|
+
|
|
675
|
+
# 慢操作阈值
|
|
676
|
+
slow_request_threshold: float = Field(
|
|
677
|
+
default=1.0,
|
|
678
|
+
description="慢请求阈值(秒)"
|
|
679
|
+
)
|
|
680
|
+
slow_sql_threshold: float = Field(
|
|
681
|
+
default=0.5,
|
|
682
|
+
description="慢 SQL 阈值(秒)"
|
|
683
|
+
)
|
|
684
|
+
|
|
685
|
+
# 告警开关
|
|
686
|
+
alert_on_slow_request: bool = Field(
|
|
687
|
+
default=True,
|
|
688
|
+
description="是否对慢 HTTP 请求发送告警"
|
|
689
|
+
)
|
|
690
|
+
alert_on_slow_sql: bool = Field(
|
|
691
|
+
default=True,
|
|
692
|
+
description="是否对慢 SQL 发送告警"
|
|
693
|
+
)
|
|
694
|
+
alert_on_error: bool = Field(
|
|
695
|
+
default=True,
|
|
696
|
+
description="是否对异常发送告警(默认只对 5xx 告警,4xx 业务异常不告警)"
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
# 默认累计触发配置
|
|
700
|
+
aggregate_window: int = Field(
|
|
701
|
+
default=10,
|
|
702
|
+
description="聚合窗口(秒)"
|
|
703
|
+
)
|
|
704
|
+
slow_request_aggregate: int = Field(
|
|
705
|
+
default=5,
|
|
706
|
+
description="慢请求触发阈值(窗口内次数)"
|
|
707
|
+
)
|
|
708
|
+
slow_sql_aggregate: int = Field(
|
|
709
|
+
default=10,
|
|
710
|
+
description="慢 SQL 触发阈值(窗口内次数)"
|
|
711
|
+
)
|
|
712
|
+
exception_aggregate: int = Field(
|
|
713
|
+
default=1,
|
|
714
|
+
description="异常触发阈值(通常为 1,立即告警)"
|
|
715
|
+
)
|
|
716
|
+
|
|
717
|
+
# 抑制配置
|
|
718
|
+
suppress_seconds: int = Field(
|
|
719
|
+
default=10,
|
|
720
|
+
description="告警抑制时间(秒),相同告警在此时间内不重复发送"
|
|
721
|
+
)
|
|
722
|
+
|
|
723
|
+
def get_notifiers(self) -> dict[str, dict[str, Any]]:
|
|
724
|
+
"""获取所有告警通知器实例配置。
|
|
725
|
+
|
|
726
|
+
从环境变量解析 ALERT__NOTIFIERS__{INSTANCE}__{FIELD} 格式的配置。
|
|
727
|
+
支持动态字段,不同类型通知器可有不同字段。
|
|
728
|
+
|
|
729
|
+
Returns:
|
|
730
|
+
dict[str, dict[str, Any]]: 实例名 -> 配置字典
|
|
731
|
+
|
|
732
|
+
示例:
|
|
733
|
+
ALERT__NOTIFIERS__FEISHU__TYPE=feishu
|
|
734
|
+
ALERT__NOTIFIERS__FEISHU__WEBHOOK=https://...
|
|
735
|
+
|
|
736
|
+
返回: {"feishu": {"type": "feishu", "webhook": "https://..."}}
|
|
737
|
+
|
|
738
|
+
自定义通知器通过 AlertManager.register_notifier() 注册。
|
|
739
|
+
"""
|
|
740
|
+
if self._notifiers is None:
|
|
741
|
+
self._notifiers = parse_multi_instance_env("ALERT__NOTIFIERS")
|
|
742
|
+
return self._notifiers
|
|
743
|
+
|
|
744
|
+
|
|
558
745
|
class MigrationSettings(BaseModel):
|
|
559
746
|
"""数据库迁移配置。
|
|
560
747
|
|
|
@@ -824,6 +1011,10 @@ class BaseConfig(BaseSettings):
|
|
|
824
1011
|
# RPC 服务配置(当前服务注册)
|
|
825
1012
|
rpc_service: RPCServiceSettings = Field(default_factory=RPCServiceSettings)
|
|
826
1013
|
|
|
1014
|
+
# ========== 监控告警 ==========
|
|
1015
|
+
telemetry: TelemetrySettings = Field(default_factory=TelemetrySettings)
|
|
1016
|
+
alert: AlertSettings = Field(default_factory=AlertSettings)
|
|
1017
|
+
|
|
827
1018
|
model_config = SettingsConfigDict(
|
|
828
1019
|
case_sensitive=False,
|
|
829
1020
|
extra="ignore",
|
|
@@ -953,6 +1144,7 @@ __all__ = [
|
|
|
953
1144
|
# 配置类
|
|
954
1145
|
"AdminAuthSettings",
|
|
955
1146
|
"AdminConsoleSettings",
|
|
1147
|
+
"AlertSettings",
|
|
956
1148
|
"BaseConfig",
|
|
957
1149
|
"CORSSettings",
|
|
958
1150
|
# 多实例配置类
|
|
@@ -16,8 +16,33 @@ from starlette.middleware.base import BaseHTTPMiddleware
|
|
|
16
16
|
from starlette.requests import Request
|
|
17
17
|
from starlette.responses import Response
|
|
18
18
|
|
|
19
|
-
from aury.boot.application.errors import global_exception_handler
|
|
20
|
-
from aury.boot.common.logging import logger, set_trace_id
|
|
19
|
+
from aury.boot.application.errors.chain import global_exception_handler
|
|
20
|
+
from aury.boot.common.logging import get_trace_id, logger, set_trace_id
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _record_exception_to_span(exc: Exception) -> None:
|
|
24
|
+
"""将异常记录到当前 OTEL span(使用与 loguru 一致的格式)。"""
|
|
25
|
+
try:
|
|
26
|
+
from opentelemetry import trace
|
|
27
|
+
|
|
28
|
+
from aury.boot.common.logging.format import format_exception_compact
|
|
29
|
+
|
|
30
|
+
span = trace.get_current_span()
|
|
31
|
+
if span and span.is_recording():
|
|
32
|
+
# 使用与 loguru 一致的堆栈格式(包含代码行和局部变量)
|
|
33
|
+
formatted_tb = format_exception_compact(
|
|
34
|
+
type(exc), exc, exc.__traceback__
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# 记录异常,并将格式化堆栈放入 attributes
|
|
38
|
+
span.record_exception(exc, attributes={
|
|
39
|
+
"exception.stacktrace": formatted_tb,
|
|
40
|
+
})
|
|
41
|
+
span.set_status(trace.Status(trace.StatusCode.ERROR, str(exc)))
|
|
42
|
+
except ImportError:
|
|
43
|
+
pass # OTEL 未安装
|
|
44
|
+
except Exception:
|
|
45
|
+
pass # 忽略记录错误
|
|
21
46
|
|
|
22
47
|
|
|
23
48
|
def log_request[T](func: Callable[..., T]) -> Callable[..., T]:
|
|
@@ -112,22 +137,33 @@ def _should_log_body(content_type: str | None) -> bool:
|
|
|
112
137
|
|
|
113
138
|
|
|
114
139
|
class RequestLoggingMiddleware(BaseHTTPMiddleware):
|
|
115
|
-
"""
|
|
140
|
+
"""请求日志中间件(支持链路追踪和告警)。
|
|
116
141
|
|
|
117
142
|
自动记录所有HTTP请求的详细信息,包括:
|
|
118
143
|
- 请求方法、路径、查询参数、请求体
|
|
119
144
|
- 客户端IP、User-Agent
|
|
120
145
|
- 响应状态码、耗时、响应体
|
|
121
146
|
- 链路追踪 ID(X-Trace-ID / X-Request-ID)
|
|
147
|
+
- 慢请求和异常告警(如果启用告警系统)
|
|
122
148
|
|
|
123
149
|
注意:文件上传、二进制数据等不会记录 body 内容。
|
|
124
150
|
|
|
125
151
|
使用示例:
|
|
126
152
|
from aury.boot.application.middleware.logging import RequestLoggingMiddleware
|
|
127
153
|
|
|
128
|
-
app.add_middleware(RequestLoggingMiddleware)
|
|
154
|
+
app.add_middleware(RequestLoggingMiddleware, slow_request_threshold=1.0)
|
|
129
155
|
"""
|
|
130
156
|
|
|
157
|
+
def __init__(self, app, slow_request_threshold: float = 1.0) -> None:
|
|
158
|
+
"""初始化中间件。
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
app: ASGI 应用
|
|
162
|
+
slow_request_threshold: 慢请求阈值(秒),默认 1.0
|
|
163
|
+
"""
|
|
164
|
+
super().__init__(app)
|
|
165
|
+
self.slow_request_threshold = slow_request_threshold
|
|
166
|
+
|
|
131
167
|
async def dispatch(self, request: Request, call_next) -> Response:
|
|
132
168
|
"""处理请求并记录日志。"""
|
|
133
169
|
start_time = time.time()
|
|
@@ -193,10 +229,10 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
|
|
|
193
229
|
)
|
|
194
230
|
|
|
195
231
|
# 慢请求警告
|
|
196
|
-
if duration >
|
|
232
|
+
if duration > self.slow_request_threshold:
|
|
197
233
|
logger.warning(
|
|
198
234
|
f"慢请求: {request.method} {request.url.path} | "
|
|
199
|
-
f"耗时: {duration:.3f}s (
|
|
235
|
+
f"耗时: {duration:.3f}s (阈值: {self.slow_request_threshold}s) | "
|
|
200
236
|
f"Trace-ID: {trace_id}"
|
|
201
237
|
)
|
|
202
238
|
|
|
@@ -210,6 +246,9 @@ class RequestLoggingMiddleware(BaseHTTPMiddleware):
|
|
|
210
246
|
f"耗时: {duration:.3f}s | Trace-ID: {trace_id}"
|
|
211
247
|
)
|
|
212
248
|
|
|
249
|
+
# 将异常记录到当前 OTEL span(以便告警系统提取)
|
|
250
|
+
_record_exception_to_span(exc)
|
|
251
|
+
|
|
213
252
|
# 使用全局异常处理器生成响应,而不是直接抛出异常
|
|
214
253
|
# BaseHTTPMiddleware 中直接 raise 会绕过 FastAPI 的异常处理器
|
|
215
254
|
response = await global_exception_handler(request, exc)
|
aury/boot/commands/docs.py
CHANGED
|
@@ -349,6 +349,45 @@ def generate_env_example(
|
|
|
349
349
|
raise typer.Exit(1)
|
|
350
350
|
|
|
351
351
|
|
|
352
|
+
@app.command(name="alert-rules")
|
|
353
|
+
def generate_alert_rules(
|
|
354
|
+
project_dir: Path = typer.Argument(
|
|
355
|
+
Path("."),
|
|
356
|
+
help="项目目录路径",
|
|
357
|
+
exists=True,
|
|
358
|
+
file_okay=False,
|
|
359
|
+
dir_okay=True,
|
|
360
|
+
resolve_path=True,
|
|
361
|
+
),
|
|
362
|
+
force: bool = typer.Option(
|
|
363
|
+
False,
|
|
364
|
+
"--force",
|
|
365
|
+
"-f",
|
|
366
|
+
help="强制覆盖已存在的文件",
|
|
367
|
+
),
|
|
368
|
+
dry_run: bool = typer.Option(
|
|
369
|
+
False,
|
|
370
|
+
"--dry-run",
|
|
371
|
+
"-n",
|
|
372
|
+
help="预览模式,不实际写入文件",
|
|
373
|
+
),
|
|
374
|
+
) -> None:
|
|
375
|
+
"""生成/更新 alert_rules.yaml 告警规则模板。"""
|
|
376
|
+
context = _detect_project_info(project_dir)
|
|
377
|
+
|
|
378
|
+
console.print(f"[cyan]📢 检测到项目: {context['project_name']}[/cyan]")
|
|
379
|
+
|
|
380
|
+
try:
|
|
381
|
+
# 使用模板文件
|
|
382
|
+
template_path = TEMPLATES_DIR / "alert_rules.example.yaml.tpl"
|
|
383
|
+
content = template_path.read_text(encoding="utf-8")
|
|
384
|
+
output_path = project_dir / "alert_rules.example.yaml"
|
|
385
|
+
_write_file(output_path, content, force=force, dry_run=dry_run)
|
|
386
|
+
except Exception as e:
|
|
387
|
+
console.print(f"[red]❌ 生成失败: {e}[/red]")
|
|
388
|
+
raise typer.Exit(1)
|
|
389
|
+
|
|
390
|
+
|
|
352
391
|
@app.command(name="all")
|
|
353
392
|
def generate_all_docs(
|
|
354
393
|
project_dir: Path = typer.Argument(
|
|
@@ -382,6 +421,7 @@ def generate_all_docs(
|
|
|
382
421
|
root_docs: list[tuple[str, str, str]] = [
|
|
383
422
|
("AGENTS.md.tpl", "AGENTS.md", "AI 编程助手上下文"),
|
|
384
423
|
("env.example.tpl", ".env.example", "环境变量示例"),
|
|
424
|
+
("alert_rules.example.yaml.tpl", "alert_rules.example.yaml", "告警规则示例"),
|
|
385
425
|
]
|
|
386
426
|
|
|
387
427
|
# aury_docs/ 开发文档
|
aury/boot/commands/init.py
CHANGED
|
@@ -159,6 +159,7 @@ TEMPLATE_FILE_MAP = {
|
|
|
159
159
|
"AGENTS.md": "AGENTS.md.tpl",
|
|
160
160
|
"conftest.py": "conftest.py.tpl",
|
|
161
161
|
"admin_console/__init__.py": "admin_console_init.py.tpl",
|
|
162
|
+
"alert_rules.example.yaml": "alert_rules.example.yaml.tpl",
|
|
162
163
|
}
|
|
163
164
|
|
|
164
165
|
# env 模板拼接顺序
|
|
@@ -172,6 +173,7 @@ ENV_TEMPLATE_ORDER = [
|
|
|
172
173
|
"scheduler.tpl",
|
|
173
174
|
"messaging.tpl",
|
|
174
175
|
"storage.tpl",
|
|
176
|
+
"monitoring.tpl",
|
|
175
177
|
"third_party.tpl",
|
|
176
178
|
"rpc.tpl",
|
|
177
179
|
]
|
|
@@ -112,6 +112,11 @@ mypy {package_name}/
|
|
|
112
112
|
- **[aury_docs/14-mq.md](./aury_docs/14-mq.md)** - 消息队列
|
|
113
113
|
- **[aury_docs/15-events.md](./aury_docs/15-events.md)** - 事件总线
|
|
114
114
|
|
|
115
|
+
### 监控与告警
|
|
116
|
+
|
|
117
|
+
- **[aury_docs/17-alerting.md](./aury_docs/17-alerting.md)** - 告警系统(慢请求/慢SQL/异常 → 飞书)
|
|
118
|
+
- **[alert_rules.example.yaml](./alert_rules.example.yaml)** - 告警规则示例(复制为 alert_rules.yaml 使用)
|
|
119
|
+
|
|
115
120
|
### 第三方集成
|
|
116
121
|
|
|
117
122
|
- **[aury_docs/16-adapter.md](./aury_docs/16-adapter.md)** - 第三方接口适配器(Mock/真实切换)
|
|
@@ -164,15 +169,25 @@ class User(Base): ...
|
|
|
164
169
|
- 写操作**必须**使用 `@transactional` 装饰器
|
|
165
170
|
- 只读操作可以不加事务装饰器
|
|
166
171
|
- 跨 Service 调用通过共享 session 实现事务共享
|
|
172
|
+
- **后台任务必须**使用 `@isolated_task` 装饰器
|
|
167
173
|
|
|
168
174
|
```python
|
|
169
|
-
from aury.boot.domain.transaction import transactional
|
|
175
|
+
from aury.boot.domain.transaction import isolated_task, transactional, transactional_context
|
|
170
176
|
|
|
171
177
|
class UserService(BaseService):
|
|
172
178
|
@transactional
|
|
173
179
|
async def create(self, data: UserCreate) -> User:
|
|
174
180
|
# 自动事务管理
|
|
175
181
|
return await self.repo.create(data.model_dump())
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# 后台任务必须加 @isolated_task,否则事务不会提交
|
|
185
|
+
@isolated_task
|
|
186
|
+
async def background_upload(space_id: int, url: str):
|
|
187
|
+
async with db.session() as session:
|
|
188
|
+
async with transactional_context(session):
|
|
189
|
+
repo = SpaceRepository(session, Space)
|
|
190
|
+
await repo.update(...)
|
|
176
191
|
```
|
|
177
192
|
|
|
178
193
|
### Manager API 规范
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# 告警规则配置文件
|
|
2
|
+
#
|
|
3
|
+
# 使用方法:
|
|
4
|
+
# 1. 在 .env 中配置 ALERT__RULES_FILE=alert_rules.yaml
|
|
5
|
+
# 2. 配置通知器(见下方示例)
|
|
6
|
+
# 3. 根据需要修改规则
|
|
7
|
+
#
|
|
8
|
+
# 简版配置(所有告警发到同一个群):
|
|
9
|
+
# ALERT__NOTIFIERS__DEFAULT__TYPE=feishu
|
|
10
|
+
# ALERT__NOTIFIERS__DEFAULT__WEBHOOK=https://open.feishu.cn/open-apis/bot/v2/hook/xxx
|
|
11
|
+
#
|
|
12
|
+
# 完整版配置(分群告警):
|
|
13
|
+
# ALERT__NOTIFIERS__PERF_GROUP__TYPE=feishu
|
|
14
|
+
# ALERT__NOTIFIERS__PERF_GROUP__WEBHOOK=https://open.feishu.cn/.../perf-xxx
|
|
15
|
+
# ALERT__NOTIFIERS__ERROR_GROUP__TYPE=feishu
|
|
16
|
+
# ALERT__NOTIFIERS__ERROR_GROUP__WEBHOOK=https://open.feishu.cn/.../error-xxx
|
|
17
|
+
# ALERT__NOTIFIERS__OPS_GROUP__TYPE=feishu
|
|
18
|
+
# ALERT__NOTIFIERS__OPS_GROUP__WEBHOOK=https://open.feishu.cn/.../ops-xxx
|
|
19
|
+
|
|
20
|
+
defaults:
|
|
21
|
+
slow_request_threshold: 1.0
|
|
22
|
+
slow_sql_threshold: 0.5
|
|
23
|
+
aggregate_window: 10
|
|
24
|
+
suppress_seconds: 300
|
|
25
|
+
|
|
26
|
+
rules:
|
|
27
|
+
# ============ 简版:所有告警发到 default 群 ============
|
|
28
|
+
# 如果使用分群告警,请注释掉这部分,使用下方的完整版
|
|
29
|
+
|
|
30
|
+
- name: slow_request
|
|
31
|
+
event_types: [slow_request]
|
|
32
|
+
aggregate_threshold: 5
|
|
33
|
+
notifiers: [default]
|
|
34
|
+
|
|
35
|
+
- name: slow_sql
|
|
36
|
+
event_types: [slow_sql]
|
|
37
|
+
aggregate_threshold: 10
|
|
38
|
+
notifiers: [default]
|
|
39
|
+
|
|
40
|
+
- name: exception
|
|
41
|
+
event_types: [exception]
|
|
42
|
+
aggregate_threshold: 1
|
|
43
|
+
suppress_seconds: 60
|
|
44
|
+
notifiers: [default]
|
|
45
|
+
|
|
46
|
+
- name: task_issues
|
|
47
|
+
event_types: [task_failure, task_timeout]
|
|
48
|
+
aggregate_threshold: 1
|
|
49
|
+
notifiers: [default]
|
|
50
|
+
|
|
51
|
+
# ============ 完整版:分群告警 ============
|
|
52
|
+
# 取消注释并配置对应的通知器即可使用
|
|
53
|
+
|
|
54
|
+
# 慢请求 → 性能群
|
|
55
|
+
# - name: slow_request
|
|
56
|
+
# event_types: [slow_request]
|
|
57
|
+
# aggregate_threshold: 5
|
|
58
|
+
# notifiers: [perf_group]
|
|
59
|
+
|
|
60
|
+
# 慢 SQL → 性能群
|
|
61
|
+
# - name: slow_sql
|
|
62
|
+
# event_types: [slow_sql]
|
|
63
|
+
# aggregate_threshold: 10
|
|
64
|
+
# notifiers: [perf_group]
|
|
65
|
+
|
|
66
|
+
# 异常 → 错误群(立即告警)
|
|
67
|
+
# - name: exception
|
|
68
|
+
# event_types: [exception]
|
|
69
|
+
# aggregate_threshold: 1
|
|
70
|
+
# suppress_seconds: 60
|
|
71
|
+
# notifiers: [error_group]
|
|
72
|
+
|
|
73
|
+
# 任务失败/超时 → 运维群
|
|
74
|
+
# - name: task_issues
|
|
75
|
+
# event_types: [task_failure, task_timeout]
|
|
76
|
+
# aggregate_threshold: 1
|
|
77
|
+
# notifiers: [ops_group]
|
|
78
|
+
|
|
79
|
+
# 关键接口更严格的阈值(示例)
|
|
80
|
+
# - name: critical_api
|
|
81
|
+
# event_types: [slow_request]
|
|
82
|
+
# path_pattern: "/api/v1/payments/*"
|
|
83
|
+
# threshold: 0.5
|
|
84
|
+
# aggregate_threshold: 1
|
|
85
|
+
# notifiers: [error_group]
|
|
@@ -396,3 +396,63 @@ DATABASE_ISOLATION_LEVEL=REPEATABLE READ
|
|
|
396
396
|
- 大多数场景:`READ COMMITTED`(平衡性能和一致性)
|
|
397
397
|
- 报表/统计查询:`REPEATABLE READ`(保证读取一致性)
|
|
398
398
|
- 金融交易:`SERIALIZABLE`(最强一致性,性能较低)
|
|
399
|
+
|
|
400
|
+
### 3.12 后台任务事务隔离(重要)
|
|
401
|
+
|
|
402
|
+
在 `@transactional` 装饰的 Service 方法中 spawn 后台任务时,**必须**使用 `@isolated_task` 或 `isolated_context`,否则事务不会提交。
|
|
403
|
+
|
|
404
|
+
**问题背景**:
|
|
405
|
+
`asyncio.create_task()` 会继承父协程的 `contextvars`。如果父协程在 `@transactional` 中,子任务会继承事务深度标记,导致:
|
|
406
|
+
- `auto_commit` 失效
|
|
407
|
+
- `transactional_context` 也不会提交
|
|
408
|
+
- session 关闭时数据被 rollback
|
|
409
|
+
|
|
410
|
+
**解决方案 1:装饰器(推荐)**
|
|
411
|
+
|
|
412
|
+
```python
|
|
413
|
+
import asyncio
|
|
414
|
+
from aury.boot.domain.transaction import isolated_task, transactional_context
|
|
415
|
+
from aury.boot.infrastructure.database import DatabaseManager
|
|
416
|
+
|
|
417
|
+
db = DatabaseManager.get_instance()
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
@isolated_task
|
|
421
|
+
async def upload_cover(space_id: int, cover_url: str):
|
|
422
|
+
"""后台任务:上传封面。"""
|
|
423
|
+
async with db.session() as session:
|
|
424
|
+
async with transactional_context(session):
|
|
425
|
+
repo = SpaceRepository(session, Space)
|
|
426
|
+
space = await repo.get(space_id)
|
|
427
|
+
if space:
|
|
428
|
+
await repo.update(space, {{"cover": cover_url}})
|
|
429
|
+
# 现在会正常 commit
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
class SpaceService(BaseService):
|
|
433
|
+
@transactional
|
|
434
|
+
async def create(self, data: SpaceCreate) -> Space:
|
|
435
|
+
space = await self.repo.create(data.model_dump())
|
|
436
|
+
|
|
437
|
+
# spawn 后台任务
|
|
438
|
+
asyncio.create_task(upload_cover(space.id, data.cover_url))
|
|
439
|
+
|
|
440
|
+
return space
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
**解决方案 2:上下文管理器**
|
|
444
|
+
|
|
445
|
+
```python
|
|
446
|
+
from aury.boot.domain.transaction import isolated_context, transactional_context
|
|
447
|
+
|
|
448
|
+
async def background_job():
|
|
449
|
+
async with isolated_context():
|
|
450
|
+
async with db.session() as session:
|
|
451
|
+
async with transactional_context(session):
|
|
452
|
+
# 正常的事务处理
|
|
453
|
+
...
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
**注意事项**:
|
|
457
|
+
- 后台任务必须新开 session(`db.session()`),不能复用主请求的 `self.session`
|
|
458
|
+
- 后台任务的事务与主请求独立,主请求回滚不影响后台任务
|