agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
import structlog
|
|
7
|
+
|
|
8
|
+
from src.serving.api.alerts import (
|
|
9
|
+
DEFAULT_ALERTS_CONFIG_PATH,
|
|
10
|
+
AlertConfig,
|
|
11
|
+
AlertDispatcher,
|
|
12
|
+
AlertEscalationStep,
|
|
13
|
+
AlertFlapDetection,
|
|
14
|
+
AlertRule,
|
|
15
|
+
create_alert,
|
|
16
|
+
deactivate_alert,
|
|
17
|
+
ensure_alert_dispatcher,
|
|
18
|
+
ensure_alert_history_table,
|
|
19
|
+
get_alert,
|
|
20
|
+
get_alert_config_path,
|
|
21
|
+
get_alert_history,
|
|
22
|
+
list_alerts,
|
|
23
|
+
load_alerts,
|
|
24
|
+
save_alerts,
|
|
25
|
+
update_alert,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Shared structlog logger. The alert modules import this compatibility module
# lazily and log through ``compat.logger``.
logger = structlog.get_logger()

# Public surface of the compatibility module: every name re-exported from
# ``src.serving.api.alerts`` plus ``datetime``, ``httpx`` and ``logger``, which
# the alert modules read as attributes of this module (``compat.datetime``,
# ``compat.httpx``, ``compat.logger``).
__all__ = [
    "DEFAULT_ALERTS_CONFIG_PATH",
    "AlertConfig",
    "AlertDispatcher",
    "AlertEscalationStep",
    "AlertFlapDetection",
    "AlertRule",
    "create_alert",
    "datetime",  # looked up as compat.datetime by the alert modules
    "deactivate_alert",
    "ensure_alert_dispatcher",
    "ensure_alert_history_table",
    "get_alert",
    "get_alert_config_path",
    "get_alert_history",
    "httpx",  # looked up as compat.httpx when delivering webhooks
    "list_alerts",
    "load_alerts",
    "logger",  # looked up as compat.logger for warnings
    "save_alerts",
    "update_alert",
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from .dispatcher import (
|
|
2
|
+
DEFAULT_ALERTS_CONFIG_PATH,
|
|
3
|
+
AlertConfig,
|
|
4
|
+
AlertDispatcher,
|
|
5
|
+
AlertEscalationStep,
|
|
6
|
+
AlertFlapDetection,
|
|
7
|
+
AlertRule,
|
|
8
|
+
create_alert,
|
|
9
|
+
deactivate_alert,
|
|
10
|
+
ensure_alert_dispatcher,
|
|
11
|
+
get_alert,
|
|
12
|
+
get_alert_config_path,
|
|
13
|
+
list_alerts,
|
|
14
|
+
load_alerts,
|
|
15
|
+
save_alerts,
|
|
16
|
+
update_alert,
|
|
17
|
+
)
|
|
18
|
+
from .history import ensure_alert_history_table, get_alert_history
|
|
19
|
+
|
|
20
|
+
# Names exported by the alerts package: the rule models and CRUD helpers from
# ``.dispatcher`` together with the two history helpers from ``.history``.
__all__ = [
    "DEFAULT_ALERTS_CONFIG_PATH",
    "AlertConfig",
    "AlertDispatcher",
    "AlertEscalationStep",
    "AlertFlapDetection",
    "AlertRule",
    "create_alert",
    "deactivate_alert",
    "ensure_alert_dispatcher",
    "ensure_alert_history_table",  # from .history
    "get_alert",
    "get_alert_config_path",
    "get_alert_history",  # from .history
    "list_alerts",
    "load_alerts",
    "save_alerts",
    "update_alert",
]
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import secrets
|
|
7
|
+
import uuid
|
|
8
|
+
from datetime import UTC, datetime, timedelta
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Literal
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, Field, model_validator
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import yaml # type: ignore[import-untyped]
|
|
16
|
+
except ImportError: # pragma: no cover
|
|
17
|
+
yaml = None
|
|
18
|
+
|
|
19
|
+
# Location of the alert rules file; the AGENTFLOW_ALERTS_FILE environment
# variable, read once at import time, overrides the built-in default.
DEFAULT_ALERTS_CONFIG_PATH = Path(
    os.environ.get("AGENTFLOW_ALERTS_FILE", "config/alerts.yaml")
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# One rung of an alert's escalation ladder: a step becomes due once the alert
# has been firing for at least ``after_minutes`` and its ``level`` is above the
# level already notified; its notification is posted to ``webhook_url``.
class AlertEscalationStep(BaseModel):
    level: int = Field(ge=1)  # escalation tier, validated to be 1 or higher
    after_minutes: int = Field(ge=0)  # minutes of continuous firing before the step is due
    webhook_url: str  # destination for this step's notification
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Flap-detection settings for a rule. When enabled, the dispatcher suppresses
# the alert once more than ``max_changes`` condition flips have been recorded
# within the last ``window_minutes``.
class AlertFlapDetection(BaseModel):
    enabled: bool = False  # flap detection is opt-in
    window_minutes: int = Field(default=5, ge=1)  # how far back flips are counted
    max_changes: int = Field(default=3, ge=1)  # flips tolerated inside the window
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# A single alert rule as stored in the alerts file: the user-supplied
# definition followed by the runtime state the dispatcher keeps between polls.
class AlertRule(BaseModel):
    # --- definition ---
    id: str
    name: str
    tenant: str
    metric: str
    window: str
    condition: Literal["above", "below", "change_pct"]
    threshold: float
    webhook_url: str  # kept in sync with the first escalation step by the validator
    secret: str  # signs outgoing webhook bodies
    cooldown_minutes: int = 30
    active: bool = True  # cleared by deactivate_alert; inactive rules are hidden from lookups
    created_at: datetime
    updated_at: datetime
    last_triggered_at: datetime | None = None
    escalation: list[AlertEscalationStep] = Field(default_factory=list)
    flap_detection: AlertFlapDetection = Field(default_factory=AlertFlapDetection)
    # --- runtime state ---
    state: Literal["ok", "firing", "sustained", "resolved", "suppressed"] = "ok"
    fired_at: datetime | None = None
    resolved_at: datetime | None = None
    last_escalation_level: int = 0
    state_changes: list[datetime] = Field(default_factory=list)  # recent condition flips
    last_condition_triggered: bool = False

    @model_validator(mode="after")
    def _normalize_escalation(self) -> AlertRule:
        """Guarantee the ladder starts with a level-1, zero-delay step.

        Steps are ordered by ``(after_minutes, level)``. If the earliest step
        is not at minute 0 (or there are no steps) a primary step pointing at
        ``webhook_url`` is prepended; otherwise the minute-0 step is replaced
        by that primary step unless it already is level 1 with the same URL.
        """
        primary = AlertEscalationStep(
            level=1,
            after_minutes=0,
            webhook_url=self.webhook_url,
        )
        ladder = sorted(self.escalation, key=lambda step: (step.after_minutes, step.level))
        if ladder and ladder[0].after_minutes == 0:
            head = ladder[0]
            # Keep the existing object when it already matches the primary step.
            if head.level != 1 or head.webhook_url != self.webhook_url:
                ladder[0] = primary
        else:
            ladder.insert(0, primary)
        self.escalation = ladder
        self.webhook_url = self.escalation[0].webhook_url
        return self
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Root document of the alerts file: a single ``alerts`` list of rules.
class AlertConfig(BaseModel):
    alerts: list[AlertRule] = Field(default_factory=list)  # empty when the key is absent
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_alert_config_path(app) -> Path:
    """Return the alerts file path for *app*.

    A truthy ``app.state.alert_config_path`` wins; otherwise the module-level
    ``DEFAULT_ALERTS_CONFIG_PATH`` is returned.
    """
    override = getattr(app.state, "alert_config_path", None)
    if not override:
        return DEFAULT_ALERTS_CONFIG_PATH
    return Path(override)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def load_alerts(path: Path) -> list[AlertRule]:
    """Read and validate every alert rule stored at *path*.

    Args:
        path: Location of the alerts file.

    Returns:
        The validated rules, or an empty list when the file is missing or
        contains only whitespace.

    Raises:
        Whatever the parser or ``AlertConfig.model_validate`` raises for a
        malformed document.
    """
    # Read first and handle absence afterwards: an exists() check followed by
    # read_text() can still fail if the file is removed in between.
    try:
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return []
    if not raw.strip():
        return []
    # PyYAML is optional; without it the file is parsed as JSON.
    data = yaml.safe_load(raw) if yaml is not None else json.loads(raw)
    # A document that parses to an empty value is treated as "no alerts".
    return AlertConfig.model_validate(data or {}).alerts
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def save_alerts(path: Path, alerts: list[AlertRule]) -> None:
    """Serialize *alerts* to *path*, creating parent directories as needed.

    The rules are wrapped in an ``AlertConfig`` document and written as YAML
    when PyYAML is importable, otherwise as indented JSON. The file is written
    as UTF-8 with ``\\n`` line endings.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    document = AlertConfig(alerts=alerts).model_dump(mode="json")
    if yaml is None:
        serialized = json.dumps(document, indent=2)
    else:
        # sort_keys=False keeps the model's field order in the file.
        serialized = yaml.safe_dump(document, sort_keys=False)
    path.write_text(serialized, encoding="utf-8", newline="\n")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def create_alert(
    path: Path,
    *,
    name: str,
    tenant: str,
    metric: str,
    window: str,
    condition: Literal["above", "below", "change_pct"],
    threshold: float,
    webhook_url: str,
    cooldown_minutes: int,
) -> AlertRule:
    """Create a rule, append it to the alerts file at *path* and return it.

    The rule receives a random UUID as its id, a freshly generated URL-safe
    secret, and identical ``created_at`` / ``updated_at`` timestamps in UTC.
    """
    # The clock is read through the compatibility module rather than directly.
    from src.serving.api import alert_dispatcher as compat

    stored = load_alerts(path)
    timestamp = compat.datetime.now(UTC)
    new_rule = AlertRule(
        id=str(uuid.uuid4()),
        name=name,
        tenant=tenant,
        metric=metric,
        window=window,
        condition=condition,
        threshold=threshold,
        webhook_url=webhook_url,
        secret=secrets.token_urlsafe(32),
        cooldown_minutes=cooldown_minutes,
        created_at=timestamp,
        updated_at=timestamp,
    )
    stored.append(new_rule)
    save_alerts(path, stored)
    return new_rule
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def list_alerts(path: Path, tenant: str) -> list[AlertRule]:
    """Return the active rules stored at *path* that belong to *tenant*."""
    stored = load_alerts(path)
    return [rule for rule in stored if rule.active and rule.tenant == tenant]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def get_alert(path: Path, alert_id: str, tenant: str) -> AlertRule | None:
    """Return the active rule with *alert_id* owned by *tenant*, or ``None``."""
    candidates = (
        rule
        for rule in load_alerts(path)
        if rule.id == alert_id and rule.tenant == tenant and rule.active
    )
    # First match wins, exactly like an early return from a loop.
    return next(candidates, None)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def update_alert(path: Path, alert_id: str, tenant: str, updates: dict) -> AlertRule | None:
    """Apply *updates* to one active rule and persist the result.

    The first active rule matching *alert_id* and *tenant* is dumped to a dict,
    overlaid with *updates*, stamped with a new ``updated_at`` and re-validated
    before the whole file is rewritten.

    Returns:
        The re-validated rule, or ``None`` when no active rule matches.
    """
    from src.serving.api import alert_dispatcher as compat

    stored = load_alerts(path)
    for position, current in enumerate(stored):
        if current.id == alert_id and current.tenant == tenant and current.active:
            fields = current.model_dump(mode="python")
            fields.update(updates)
            # Set after the overlay so callers cannot pin the timestamp.
            fields["updated_at"] = compat.datetime.now(UTC)
            replacement = AlertRule.model_validate(fields)
            stored[position] = replacement
            save_alerts(path, stored)
            return replacement
    return None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def deactivate_alert(path: Path, alert_id: str, tenant: str) -> bool:
    """Mark one active rule as inactive instead of deleting it.

    Returns:
        ``True`` when a matching active rule was found and the file rewritten,
        ``False`` otherwise (the file is then left untouched).
    """
    from src.serving.api import alert_dispatcher as compat

    stored = load_alerts(path)
    for position, current in enumerate(stored):
        if current.id == alert_id and current.tenant == tenant and current.active:
            fields = current.model_dump(mode="python")
            fields["active"] = False
            fields["updated_at"] = compat.datetime.now(UTC)
            stored[position] = AlertRule.model_validate(fields)
            # Only the first match is deactivated.
            save_alerts(path, stored)
            return True
    return False
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def ensure_alert_dispatcher(app) -> AlertDispatcher:
    """Return the dispatcher stored on ``app.state``, creating it on first use."""
    existing = getattr(app.state, "alert_dispatcher", None)
    if existing is not None:
        return existing
    created = AlertDispatcher(app)
    app.state.alert_dispatcher = created
    return created
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def cooldown_elapsed(alert: AlertRule, now: datetime) -> bool:
    """Tell whether *alert* may notify again at *now*.

    A rule that has never triggered is always eligible; otherwise at least
    ``cooldown_minutes`` must have passed since ``last_triggered_at``.
    """
    last_fired = alert.last_triggered_at
    if last_fired is None:
        return True
    return now - last_fired >= timedelta(minutes=alert.cooldown_minutes)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def next_escalation_step(
    alert: AlertRule,
    now: datetime,
) -> AlertEscalationStep | None:
    """Pick the escalation step that should be notified at *now*, if any.

    Returns ``None`` when the alert is not firing. Otherwise the last step in
    ladder order whose level is above ``last_escalation_level`` and whose
    ``after_minutes`` has been reached is returned. When nothing is due and the
    ladder has exactly one step that was already notified, that step is
    returned again once the cooldown has elapsed.
    """
    fired_at = alert.fired_at
    if fired_at is None:
        return None

    minutes_firing = (now - fired_at).total_seconds() / 60
    latest_due = None
    for step in alert.escalation:
        if step.level > alert.last_escalation_level and minutes_firing >= step.after_minutes:
            latest_due = step
    if latest_due is not None:
        return latest_due

    # Single-step ladders repeat their only notification after each cooldown.
    ladder = alert.escalation
    already_notified = len(ladder) == 1 and alert.last_escalation_level == ladder[0].level
    if already_notified and cooldown_elapsed(alert, now):
        return ladder[0]
    return None
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class AlertDispatcher:
|
|
232
|
+
def __init__(self, app, poll_interval_seconds: float = 60.0) -> None:
|
|
233
|
+
self.app = app
|
|
234
|
+
self.poll_interval_seconds = poll_interval_seconds
|
|
235
|
+
self.backoff_seconds = [1.0, 5.0, 25.0]
|
|
236
|
+
self._task: asyncio.Task | None = None
|
|
237
|
+
|
|
238
|
+
def start(self) -> None:
|
|
239
|
+
if self._task is not None and not self._task.done():
|
|
240
|
+
return
|
|
241
|
+
self._task = asyncio.create_task(self.run())
|
|
242
|
+
|
|
243
|
+
async def stop(self) -> None:
|
|
244
|
+
if self._task is None or self._task.done():
|
|
245
|
+
return
|
|
246
|
+
self._task.cancel()
|
|
247
|
+
try:
|
|
248
|
+
await self._task
|
|
249
|
+
except asyncio.CancelledError:
|
|
250
|
+
pass
|
|
251
|
+
|
|
252
|
+
async def run(self) -> None:
|
|
253
|
+
from src.serving.api import alert_dispatcher as compat
|
|
254
|
+
|
|
255
|
+
while True:
|
|
256
|
+
try:
|
|
257
|
+
await self.dispatch_alerts()
|
|
258
|
+
except Exception as exc:
|
|
259
|
+
compat.logger.warning("alert_dispatcher_error", error=str(exc))
|
|
260
|
+
await asyncio.sleep(self.poll_interval_seconds)
|
|
261
|
+
|
|
262
|
+
async def dispatch_alerts(self) -> int:
|
|
263
|
+
from src.serving.api import alert_dispatcher as compat
|
|
264
|
+
|
|
265
|
+
from .escalation import dispatch_alert
|
|
266
|
+
|
|
267
|
+
path = get_alert_config_path(self.app)
|
|
268
|
+
alerts = load_alerts(path)
|
|
269
|
+
now = compat.datetime.now(UTC)
|
|
270
|
+
triggered = 0
|
|
271
|
+
changed = False
|
|
272
|
+
for index, alert in enumerate(alerts):
|
|
273
|
+
if not alert.active:
|
|
274
|
+
continue
|
|
275
|
+
updated_alert, alert_changed, alert_triggered = await dispatch_alert(self, alert, now)
|
|
276
|
+
alerts[index] = updated_alert
|
|
277
|
+
triggered += alert_triggered
|
|
278
|
+
changed = changed or alert_changed
|
|
279
|
+
if changed:
|
|
280
|
+
save_alerts(path, alerts)
|
|
281
|
+
return triggered
|
|
282
|
+
|
|
283
|
+
async def send_test_alert(self, alert: AlertRule) -> dict:
|
|
284
|
+
from src.serving.api import alert_dispatcher as compat
|
|
285
|
+
|
|
286
|
+
from .escalation import deliver
|
|
287
|
+
|
|
288
|
+
payload = {
|
|
289
|
+
"alert_id": alert.id,
|
|
290
|
+
"alert_name": alert.name,
|
|
291
|
+
"metric": alert.metric,
|
|
292
|
+
"threshold": alert.threshold,
|
|
293
|
+
"condition": alert.condition,
|
|
294
|
+
"window": alert.window,
|
|
295
|
+
"triggered_at": compat.datetime.now(UTC).isoformat(),
|
|
296
|
+
"tenant": alert.tenant,
|
|
297
|
+
"test": True,
|
|
298
|
+
}
|
|
299
|
+
return await deliver(self, alert, payload, event_type="alert.test")
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from src.serving.api.webhook_dispatcher import _event_body, _signature
|
|
9
|
+
|
|
10
|
+
from .evaluator import evaluate_rule
|
|
11
|
+
from .history import ensure_alert_history_table, log_alert_history
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from .dispatcher import AlertDispatcher, AlertRule
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def dispatch_alert(
    dispatcher: AlertDispatcher,
    alert: AlertRule,
    now: datetime,
) -> tuple[AlertRule, bool, int]:
    """Evaluate one rule at *now* and send the notification its state calls for.

    The rule is mutated in place: flap bookkeeping first, then exactly one of
    the firing / sustained / resolved / idle transitions.

    Returns:
        ``(alert, changed, notifications)`` where ``changed`` tells the caller
        whether the rule needs to be persisted and ``notifications`` is the
        number of deliveries counted for this cycle.
    """
    from src.serving.api import alert_dispatcher as compat

    from .dispatcher import next_escalation_step

    evaluation = evaluate_rule(dispatcher, alert, now)
    is_triggered = bool(evaluation["triggered"])
    flap = alert.flap_detection
    dirty = False
    sent = 0

    # Drop condition flips that have aged out of the flap window.
    if flap.enabled:
        horizon = timedelta(minutes=flap.window_minutes)
        recent = [stamp for stamp in alert.state_changes if now - stamp <= horizon]
        if len(recent) != len(alert.state_changes):
            dirty = True
        alert.state_changes = recent

    # Lift suppression once flap detection is off or the flips fit the budget.
    within_budget = not flap.enabled or len(alert.state_changes) <= flap.max_changes
    if alert.state == "suppressed" and within_budget:
        alert.state = "ok"
        alert.fired_at = None
        alert.resolved_at = None if is_triggered else now
        alert.last_escalation_level = 0
        alert.last_condition_triggered = False
        dirty = True

    # Record a flip of the condition and suppress the rule when it flaps.
    if flap.enabled and is_triggered != alert.last_condition_triggered:
        alert.state_changes.append(now)
        dirty = True
        if len(alert.state_changes) > flap.max_changes:
            alert.state = "suppressed"
            alert.fired_at = None
            alert.resolved_at = None if is_triggered else now
            alert.last_escalation_level = 0
            alert.last_condition_triggered = is_triggered
            alert.updated_at = now
            compat.logger.warning(
                "alert_flapping_suppressed",
                alert_id=alert.id,
                alert_name=alert.name,
                changes=len(alert.state_changes),
                window_minutes=flap.window_minutes,
            )
            return alert, True, 0

    # Still suppressed: only track the condition, send nothing.
    if alert.state == "suppressed":
        alert.last_condition_triggered = is_triggered
        alert.updated_at = now
        return alert, True, 0

    if is_triggered:
        if alert.fired_at is None:
            # First poll on which the condition holds: open the incident.
            alert.fired_at = now
            alert.resolved_at = None
            alert.state = "firing"
            alert.last_escalation_level = 1
            firing_payload = {
                "alert_id": alert.id,
                "alert_name": alert.name,
                "status": "firing",
                "metric": alert.metric,
                "current_value": evaluation["current_value"],
                "threshold": alert.threshold,
                "condition": alert.condition,
                "window": alert.window,
                "triggered_at": now.isoformat(),
                "fired_at": now.isoformat(),
                "level": 1,
                "tenant": alert.tenant,
            }
            for optional_key in ("previous_value", "change_pct"):
                if evaluation[optional_key] is not None:
                    firing_payload[optional_key] = evaluation[optional_key]
            await deliver(
                dispatcher,
                alert,
                firing_payload,
                event_type="alert.triggered",
                current_value=evaluation["current_value"],
                previous_value=evaluation["previous_value"],
                change_pct=evaluation["change_pct"],
                webhook_url=alert.escalation[0].webhook_url,
            )
            alert.last_triggered_at = now
            alert.last_condition_triggered = True
            alert.updated_at = now
            return alert, True, 1

        # Incident already open: escalate or repeat when a step is due.
        due_step = next_escalation_step(alert, now)
        if due_step is not None:
            minutes_open = max(0, int((now - alert.fired_at).total_seconds() // 60))
            sustained_payload = {
                "alert_id": alert.id,
                "alert_name": alert.name,
                "status": "sustained",
                "metric": alert.metric,
                "current_value": evaluation["current_value"],
                "threshold": alert.threshold,
                "condition": alert.condition,
                "window": alert.window,
                "triggered_at": alert.fired_at.isoformat(),
                "fired_at": alert.fired_at.isoformat(),
                "level": due_step.level,
                "duration_minutes": minutes_open,
                "tenant": alert.tenant,
            }
            for optional_key in ("previous_value", "change_pct"):
                if evaluation[optional_key] is not None:
                    sustained_payload[optional_key] = evaluation[optional_key]
            # Decided before last_escalation_level is raised below.
            if due_step.level > alert.last_escalation_level:
                event_name = "alert.escalated"
            else:
                event_name = "alert.sustained"
            await deliver(
                dispatcher,
                alert,
                sustained_payload,
                event_type=event_name,
                current_value=evaluation["current_value"],
                previous_value=evaluation["previous_value"],
                change_pct=evaluation["change_pct"],
                webhook_url=due_step.webhook_url,
            )
            alert.last_triggered_at = now
            alert.last_escalation_level = max(alert.last_escalation_level, due_step.level)
            alert.updated_at = now
            sent += 1
            dirty = True
        # NOTE(review): nesting reconstructed from a de-indented listing; the two
        # assignments below are assumed to run whether or not a step was due —
        # confirm against the original file.
        alert.state = "sustained"
        alert.last_condition_triggered = True
        return alert, dirty, sent

    if alert.fired_at is not None:
        # Condition cleared while an incident was open: notify and close it.
        minutes_open = max(0, int((now - alert.fired_at).total_seconds() // 60))
        resolved_payload = {
            "alert_id": alert.id,
            "alert_name": alert.name,
            "status": "resolved",
            "metric": alert.metric,
            "resolved_value": evaluation["current_value"],
            "fired_at": alert.fired_at.isoformat(),
            "resolved_at": now.isoformat(),
            "duration_minutes": minutes_open,
            "tenant": alert.tenant,
        }
        # Every distinct URL that was notified up to the reached level.
        reached_level = max(1, alert.last_escalation_level)
        targets: list[str] = []
        for step in alert.escalation:
            if step.level <= reached_level and step.webhook_url not in targets:
                targets.append(step.webhook_url)
        for webhook_url in targets or [alert.webhook_url]:
            await deliver(
                dispatcher,
                alert,
                resolved_payload,
                event_type="alert.resolved",
                current_value=evaluation["current_value"],
                previous_value=evaluation["previous_value"],
                change_pct=evaluation["change_pct"],
                webhook_url=webhook_url,
            )
            # NOTE(review): assumed to count once per delivery (inside the
            # loop) — confirm against the original file.
            sent += 1
        alert.state = "resolved"
        alert.resolved_at = now
        alert.fired_at = None
        alert.last_escalation_level = 0
        alert.last_triggered_at = now
        alert.last_condition_triggered = False
        alert.updated_at = now
        return alert, True, sent

    # Idle: a rule resolved on an earlier cycle settles back to "ok".
    if alert.state == "resolved":
        alert.state = "ok"
        alert.updated_at = now
        dirty = True
    alert.last_condition_triggered = False
    return alert, dirty, sent
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
async def deliver(
    dispatcher: AlertDispatcher,
    alert: AlertRule,
    payload: dict,
    *,
    event_type: str,
    current_value: float | None = None,
    previous_value: float | None = None,
    change_pct: float | None = None,
    webhook_url: str | None = None,
) -> dict:
    """POST a signed alert event and record the outcome in the alert history.

    Up to three attempts are made. Timeouts, transport errors and 5xx
    responses are retried after the dispatcher's backoff delay; any response
    below 500 ends the loop, and only 2xx counts as success.

    Args:
        dispatcher: Supplies the app (for the history connection) and backoff delays.
        alert: Rule whose secret signs the body and whose URL is the fallback target.
        payload: Event body, serialized once and reused for every attempt.
        event_type: Value of the ``X-AgentFlow-Event`` header.
        current_value: Metric value stored with the history row.
        previous_value: Previous metric value stored with the history row.
        change_pct: Percentage change stored with the history row.
        webhook_url: Target URL; falls back to ``alert.webhook_url`` when falsy.

    Returns:
        A dict with ``delivery_id``, ``alert_id``, ``event_type``, ``success``,
        ``status_code``, ``error`` and ``attempts``.
    """
    from src.serving.api import alert_dispatcher as compat

    conn = dispatcher.app.state.query_engine._conn
    ensure_alert_history_table(conn)
    delivery_id = str(uuid.uuid4())
    body = _event_body(payload)
    headers = {
        "Content-Type": "application/json",
        "X-AgentFlow-Event": event_type,
        "X-AgentFlow-Signature": _signature(alert.secret, body),
        "X-AgentFlow-Delivery": delivery_id,
    }
    target_url = webhook_url or alert.webhook_url
    max_attempts = 3
    attempts = 0
    success = False
    status_code: int | None = None
    error: str | None = None

    async with compat.httpx.AsyncClient(timeout=5.0) as client:
        for attempt in range(1, max_attempts + 1):
            attempts = attempt
            error = None
            try:
                response = await client.post(target_url, content=body, headers=headers)
            except (compat.httpx.TimeoutException, compat.httpx.TransportError) as exc:
                status_code = None
                success = False
                error = str(exc)
            else:
                status_code = response.status_code
                success = 200 <= status_code < 300
                # Anything below 500 is final; only server errors are retried.
                if status_code < 500:
                    break

            if attempt < max_attempts:
                # Clamp the index so a short backoff list reuses its last delay.
                backoff_index = min(attempt - 1, len(dispatcher.backoff_seconds) - 1)
                await asyncio.sleep(dispatcher.backoff_seconds[backoff_index])

    # NOTE(review): assumed to run after the HTTP client is closed; the
    # de-indented listing does not show the nesting — confirm.
    log_alert_history(
        conn,
        delivery_id=delivery_id,
        alert=alert,
        metric=alert.metric,
        current_value=current_value,
        previous_value=previous_value,
        change_pct=change_pct,
        threshold=alert.threshold,
        condition=alert.condition,
        window=alert.window,
        event_type=event_type,
        status_code=status_code,
        success=success,
        error=error,
        payload=payload,
    )
    return {
        "delivery_id": delivery_id,
        "alert_id": alert.id,
        "event_type": event_type,
        "success": success,
        "status_code": status_code,
        "error": error,
        "attempts": attempts,
    }
|