agentflow-runtime 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
  2. agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
  3. agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
  4. agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
  5. src/__init__.py +0 -0
  6. src/constants.py +3 -0
  7. src/ingestion/__init__.py +0 -0
  8. src/ingestion/cdc/__init__.py +5 -0
  9. src/ingestion/cdc/normalizer.py +186 -0
  10. src/ingestion/connectors/__init__.py +0 -0
  11. src/ingestion/connectors/mysql_cdc.py +63 -0
  12. src/ingestion/connectors/postgres_cdc.py +68 -0
  13. src/ingestion/producers/__init__.py +0 -0
  14. src/ingestion/producers/event_producer.py +237 -0
  15. src/ingestion/schemas/__init__.py +0 -0
  16. src/ingestion/schemas/events.py +147 -0
  17. src/ingestion/tenant_router.py +80 -0
  18. src/logger.py +41 -0
  19. src/orchestration/__init__.py +0 -0
  20. src/orchestration/dags/__init__.py +0 -0
  21. src/orchestration/dags/daily_batch.py +201 -0
  22. src/processing/__init__.py +0 -0
  23. src/processing/event_replayer.py +250 -0
  24. src/processing/flink_jobs/Dockerfile +55 -0
  25. src/processing/flink_jobs/__init__.py +0 -0
  26. src/processing/flink_jobs/checkpointing.py +32 -0
  27. src/processing/flink_jobs/session_aggregation.py +212 -0
  28. src/processing/flink_jobs/session_aggregator.py +199 -0
  29. src/processing/flink_jobs/stream_processor.py +316 -0
  30. src/processing/iceberg_sink.py +348 -0
  31. src/processing/local_pipeline.py +452 -0
  32. src/processing/outbox.py +273 -0
  33. src/processing/tracing.py +36 -0
  34. src/processing/transformations/__init__.py +0 -0
  35. src/processing/transformations/enrichment.py +125 -0
  36. src/quality/__init__.py +0 -0
  37. src/quality/monitors/__init__.py +0 -0
  38. src/quality/monitors/freshness_monitor.py +166 -0
  39. src/quality/monitors/metrics_collector.py +367 -0
  40. src/quality/validators/__init__.py +0 -0
  41. src/quality/validators/schema_validator.py +119 -0
  42. src/quality/validators/semantic_validator.py +202 -0
  43. src/serving/__init__.py +0 -0
  44. src/serving/api/__init__.py +0 -0
  45. src/serving/api/alert_dispatcher.py +51 -0
  46. src/serving/api/alerts/__init__.py +38 -0
  47. src/serving/api/alerts/dispatcher.py +299 -0
  48. src/serving/api/alerts/escalation.py +290 -0
  49. src/serving/api/alerts/evaluator.py +81 -0
  50. src/serving/api/alerts/history.py +115 -0
  51. src/serving/api/analytics.py +543 -0
  52. src/serving/api/auth/__init__.py +46 -0
  53. src/serving/api/auth/key_rotation.py +400 -0
  54. src/serving/api/auth/manager.py +406 -0
  55. src/serving/api/auth/middleware.py +331 -0
  56. src/serving/api/main.py +390 -0
  57. src/serving/api/middleware/logging.py +41 -0
  58. src/serving/api/middleware/tracing.py +51 -0
  59. src/serving/api/rate_limiter.py +76 -0
  60. src/serving/api/routers/__init__.py +0 -0
  61. src/serving/api/routers/admin.py +150 -0
  62. src/serving/api/routers/admin_ui.py +93 -0
  63. src/serving/api/routers/agent_query.py +639 -0
  64. src/serving/api/routers/alerts.py +134 -0
  65. src/serving/api/routers/batch.py +231 -0
  66. src/serving/api/routers/contracts.py +98 -0
  67. src/serving/api/routers/deadletter.py +337 -0
  68. src/serving/api/routers/lineage.py +218 -0
  69. src/serving/api/routers/search.py +103 -0
  70. src/serving/api/routers/slo.py +231 -0
  71. src/serving/api/routers/stream.py +141 -0
  72. src/serving/api/routers/webhooks.py +93 -0
  73. src/serving/api/security.py +83 -0
  74. src/serving/api/telemetry.py +66 -0
  75. src/serving/api/templates/admin.html +214 -0
  76. src/serving/api/versioning.py +328 -0
  77. src/serving/api/webhook_dispatcher.py +423 -0
  78. src/serving/backends/__init__.py +117 -0
  79. src/serving/backends/clickhouse_backend.py +310 -0
  80. src/serving/backends/duckdb_backend.py +268 -0
  81. src/serving/cache.py +169 -0
  82. src/serving/db_pool.py +105 -0
  83. src/serving/masking.py +122 -0
  84. src/serving/semantic_layer/__init__.py +0 -0
  85. src/serving/semantic_layer/catalog.py +177 -0
  86. src/serving/semantic_layer/contract_registry.py +258 -0
  87. src/serving/semantic_layer/entity_type_registry.py +107 -0
  88. src/serving/semantic_layer/nl_engine.py +189 -0
  89. src/serving/semantic_layer/query/__init__.py +3 -0
  90. src/serving/semantic_layer/query/contracts.py +47 -0
  91. src/serving/semantic_layer/query/engine.py +81 -0
  92. src/serving/semantic_layer/query/entity_queries.py +221 -0
  93. src/serving/semantic_layer/query/metric_queries.py +84 -0
  94. src/serving/semantic_layer/query/nl_queries.py +305 -0
  95. src/serving/semantic_layer/query/sql_builder.py +113 -0
  96. src/serving/semantic_layer/query/sql_guard.py +3 -0
  97. src/serving/semantic_layer/query_engine.py +5 -0
  98. src/serving/semantic_layer/schema_evolution.py +175 -0
  99. src/serving/semantic_layer/search_index.py +337 -0
  100. src/serving/semantic_layer/sql_guard.py +56 -0
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ import httpx
6
+ import structlog
7
+
8
+ from src.serving.api.alerts import (
9
+ DEFAULT_ALERTS_CONFIG_PATH,
10
+ AlertConfig,
11
+ AlertDispatcher,
12
+ AlertEscalationStep,
13
+ AlertFlapDetection,
14
+ AlertRule,
15
+ create_alert,
16
+ deactivate_alert,
17
+ ensure_alert_dispatcher,
18
+ ensure_alert_history_table,
19
+ get_alert,
20
+ get_alert_config_path,
21
+ get_alert_history,
22
+ list_alerts,
23
+ load_alerts,
24
+ save_alerts,
25
+ update_alert,
26
+ )
27
+
28
# Shared structlog logger. The alerts package reaches it through this module
# (as ``alert_dispatcher.logger``) instead of creating its own.
logger = structlog.get_logger()

# Besides the alert API re-exported from ``src.serving.api.alerts``, this list
# also exposes ``datetime``, ``httpx`` and ``logger``: the alerts package looks
# those three names up on this module at call time.
# NOTE(review): presumably so tests can patch them in one place -- confirm.
__all__ = [
    "DEFAULT_ALERTS_CONFIG_PATH",
    "AlertConfig",
    "AlertDispatcher",
    "AlertEscalationStep",
    "AlertFlapDetection",
    "AlertRule",
    "create_alert",
    "datetime",
    "deactivate_alert",
    "ensure_alert_dispatcher",
    "ensure_alert_history_table",
    "get_alert",
    "get_alert_config_path",
    "get_alert_history",
    "httpx",
    "list_alerts",
    "load_alerts",
    "logger",
    "save_alerts",
    "update_alert",
]
@@ -0,0 +1,38 @@
1
+ from .dispatcher import (
2
+ DEFAULT_ALERTS_CONFIG_PATH,
3
+ AlertConfig,
4
+ AlertDispatcher,
5
+ AlertEscalationStep,
6
+ AlertFlapDetection,
7
+ AlertRule,
8
+ create_alert,
9
+ deactivate_alert,
10
+ ensure_alert_dispatcher,
11
+ get_alert,
12
+ get_alert_config_path,
13
+ list_alerts,
14
+ load_alerts,
15
+ save_alerts,
16
+ update_alert,
17
+ )
18
+ from .history import ensure_alert_history_table, get_alert_history
19
+
20
# Public surface of the alerts package: the rule models and CRUD helpers come
# from ``.dispatcher``, the two history helpers from ``.history``.
__all__ = [
    "DEFAULT_ALERTS_CONFIG_PATH",
    "AlertConfig",
    "AlertDispatcher",
    "AlertEscalationStep",
    "AlertFlapDetection",
    "AlertRule",
    "create_alert",
    "deactivate_alert",
    "ensure_alert_dispatcher",
    "ensure_alert_history_table",
    "get_alert",
    "get_alert_config_path",
    "get_alert_history",
    "list_alerts",
    "load_alerts",
    "save_alerts",
    "update_alert",
]
@@ -0,0 +1,299 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import secrets
7
+ import uuid
8
+ from datetime import UTC, datetime, timedelta
9
+ from pathlib import Path
10
+ from typing import Literal
11
+
12
+ from pydantic import BaseModel, Field, model_validator
13
+
14
+ try:
15
+ import yaml # type: ignore[import-untyped]
16
+ except ImportError: # pragma: no cover
17
+ yaml = None
18
+
19
# Default location of the alert rules file. Resolved once, at import time, from
# the AGENTFLOW_ALERTS_FILE environment variable; falls back to
# "config/alerts.yaml" when the variable is unset.
DEFAULT_ALERTS_CONFIG_PATH = Path(os.getenv("AGENTFLOW_ALERTS_FILE", "config/alerts.yaml"))
20
+
21
+
22
class AlertEscalationStep(BaseModel):
    """One step of an alert's escalation ladder."""

    # Escalation level of this step; validated to be >= 1.
    level: int = Field(ge=1)
    # Minutes since the alert fired after which this step becomes due (>= 0).
    after_minutes: int = Field(ge=0)
    # Endpoint that receives the notification for this step.
    webhook_url: str
26
+
27
+
28
class AlertFlapDetection(BaseModel):
    """Settings for suppressing a rule whose condition keeps toggling."""

    # Flap detection is off unless explicitly enabled.
    enabled: bool = False
    # Look-back window, in minutes, over which condition changes are counted.
    window_minutes: int = Field(default=5, ge=1)
    # The rule is suppressed once more than this many changes fall in the window.
    max_changes: int = Field(default=3, ge=1)
32
+
33
+
34
class AlertRule(BaseModel):
    """A stored alert rule: its definition plus its evaluation bookkeeping.

    Field order is significant because it is the order used when the rule is
    dumped back to the alerts file.
    """

    id: str
    name: str
    tenant: str
    metric: str
    window: str
    condition: Literal["above", "below", "change_pct"]
    threshold: float
    webhook_url: str
    secret: str
    cooldown_minutes: int = 30
    active: bool = True
    created_at: datetime
    updated_at: datetime
    last_triggered_at: datetime | None = None
    escalation: list[AlertEscalationStep] = Field(default_factory=list)
    flap_detection: AlertFlapDetection = Field(default_factory=AlertFlapDetection)
    state: Literal["ok", "firing", "sustained", "resolved", "suppressed"] = "ok"
    fired_at: datetime | None = None
    resolved_at: datetime | None = None
    last_escalation_level: int = 0
    state_changes: list[datetime] = Field(default_factory=list)
    last_condition_triggered: bool = False

    @model_validator(mode="after")
    def _normalize_escalation(self) -> AlertRule:
        """Sort the escalation steps and pin the first one to ``webhook_url``.

        After validation ``escalation[0]`` is always a level-1 step that is
        due at minute 0 and targets the rule's own ``webhook_url``.
        """
        ordered = sorted(
            self.escalation,
            key=lambda item: (item.after_minutes, item.level),
        )
        primary = AlertEscalationStep(
            level=1,
            after_minutes=0,
            webhook_url=self.webhook_url,
        )
        if ordered and ordered[0].after_minutes == 0:
            head = ordered[0]
            already_primary = head.level == 1 and head.webhook_url == self.webhook_url
            # Keep the existing step object when it already is the primary
            # step; otherwise overwrite it with the canonical one.
            ordered[0] = head if already_primary else primary
        else:
            # No immediate step configured: prepend the canonical one.
            ordered.insert(0, primary)
        self.escalation = ordered
        self.webhook_url = self.escalation[0].webhook_url
        return self
78
+
79
+
80
class AlertConfig(BaseModel):
    """Top-level shape of the alerts file: a single ``alerts`` list."""

    # Every rule stored in the file; defaults to an empty list.
    alerts: list[AlertRule] = Field(default_factory=list)
82
+
83
+
84
def get_alert_config_path(app) -> Path:
    """Return the alerts file path to use for *app*.

    ``app.state.alert_config_path`` wins when it is set to a truthy value;
    otherwise ``DEFAULT_ALERTS_CONFIG_PATH`` is returned.
    """
    override = getattr(app.state, "alert_config_path", None)
    if not override:
        return DEFAULT_ALERTS_CONFIG_PATH
    return Path(override)
87
+
88
+
89
def load_alerts(path: Path) -> list[AlertRule]:
    """Read every alert rule stored at *path*.

    A missing file, or one holding only whitespace, yields an empty list.
    The text is parsed as YAML when PyYAML could be imported and as JSON
    otherwise, then validated through ``AlertConfig``.
    """
    if not path.exists():
        return []
    text = path.read_text(encoding="utf-8")
    if text.strip() == "":
        return []
    if yaml is not None:
        parsed = yaml.safe_load(text)
    else:
        parsed = json.loads(text)
    # An empty document parses to None; validate it as an empty config.
    return AlertConfig.model_validate(parsed or {}).alerts
98
+
99
+
100
def save_alerts(path: Path, alerts: list[AlertRule]) -> None:
    """Write *alerts* to *path*, replacing whatever the file held before.

    Missing parent directories are created. The rules are serialized as YAML
    (keys kept in model order) when PyYAML could be imported, otherwise as
    indented JSON. The file is written as UTF-8 with ``\\n`` line endings.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    document = AlertConfig(alerts=alerts).model_dump(mode="json")
    if yaml is None:
        serialized = json.dumps(document, indent=2)
    else:
        serialized = yaml.safe_dump(document, sort_keys=False)
    path.write_text(serialized, encoding="utf-8", newline="\n")
109
+
110
+
111
def create_alert(
    path: Path,
    *,
    name: str,
    tenant: str,
    metric: str,
    window: str,
    condition: Literal["above", "below", "change_pct"],
    threshold: float,
    webhook_url: str,
    cooldown_minutes: int,
) -> AlertRule:
    """Create a new rule, append it to the file at *path* and return it.

    The rule receives a random UUID as ``id`` and a freshly generated
    URL-safe ``secret``; ``created_at`` and ``updated_at`` share one UTC
    timestamp.
    """
    # The clock is looked up on the compat module at call time.
    from src.serving.api import alert_dispatcher as compat

    existing = load_alerts(path)
    timestamp = compat.datetime.now(UTC)
    new_rule = AlertRule(
        id=str(uuid.uuid4()),
        name=name,
        tenant=tenant,
        metric=metric,
        window=window,
        condition=condition,
        threshold=threshold,
        webhook_url=webhook_url,
        secret=secrets.token_urlsafe(32),
        cooldown_minutes=cooldown_minutes,
        created_at=timestamp,
        updated_at=timestamp,
    )
    save_alerts(path, [*existing, new_rule])
    return new_rule
144
+
145
+
146
def list_alerts(path: Path, tenant: str) -> list[AlertRule]:
    """Return the active rules stored at *path* that belong to *tenant*, in file order."""
    matches: list[AlertRule] = []
    for rule in load_alerts(path):
        if rule.active and rule.tenant == tenant:
            matches.append(rule)
    return matches
148
+
149
+
150
def get_alert(path: Path, alert_id: str, tenant: str) -> AlertRule | None:
    """Return the active rule with *alert_id* owned by *tenant*, or ``None``.

    Inactive rules and rules belonging to another tenant are never returned.
    """
    candidates = (
        rule
        for rule in load_alerts(path)
        if rule.id == alert_id and rule.tenant == tenant and rule.active
    )
    return next(candidates, None)
155
+
156
+
157
def update_alert(path: Path, alert_id: str, tenant: str, updates: dict) -> AlertRule | None:
    """Apply *updates* to one active rule and persist the result.

    The first active rule matching *alert_id* and *tenant* is dumped to a
    dict, overlaid with *updates*, stamped with a new ``updated_at`` and
    re-validated before the whole file is rewritten.

    Returns:
        The updated rule, or ``None`` when no active rule matched.
    """
    # The clock is looked up on the compat module at call time.
    from src.serving.api import alert_dispatcher as compat

    rules = load_alerts(path)
    for position, current in enumerate(rules):
        is_target = current.id == alert_id and current.tenant == tenant and current.active
        if not is_target:
            continue
        # Later entries win: caller updates override stored values, and
        # ``updated_at`` always reflects this call.
        fields = {
            **current.model_dump(mode="python"),
            **updates,
            "updated_at": compat.datetime.now(UTC),
        }
        replacement = AlertRule.model_validate(fields)
        rules[position] = replacement
        save_alerts(path, rules)
        return replacement
    return None
172
+
173
+
174
def deactivate_alert(path: Path, alert_id: str, tenant: str) -> bool:
    """Mark one active rule as inactive (the rule stays in the file).

    Returns:
        ``True`` when an active rule matching *alert_id* and *tenant* was
        found and the file was rewritten, ``False`` otherwise.
    """
    # The clock is looked up on the compat module at call time.
    from src.serving.api import alert_dispatcher as compat

    rules = load_alerts(path)
    for position, current in enumerate(rules):
        if current.id == alert_id and current.tenant == tenant and current.active:
            fields = current.model_dump(mode="python")
            fields["active"] = False
            fields["updated_at"] = compat.datetime.now(UTC)
            rules[position] = AlertRule.model_validate(fields)
            save_alerts(path, rules)
            return True
    return False
191
+
192
+
193
def ensure_alert_dispatcher(app) -> AlertDispatcher:
    """Return the dispatcher stored on ``app.state``, creating it on first use."""
    existing = getattr(app.state, "alert_dispatcher", None)
    if existing is not None:
        return existing
    created = AlertDispatcher(app)
    app.state.alert_dispatcher = created
    return created
199
+
200
+
201
def cooldown_elapsed(alert: AlertRule, now: datetime) -> bool:
    """Tell whether the rule's cooldown has passed since its last notification.

    A rule that has never triggered is always considered out of cooldown.
    """
    last_sent = alert.last_triggered_at
    if last_sent is None:
        return True
    required = timedelta(minutes=alert.cooldown_minutes)
    return required <= now - last_sent
206
+
207
+
208
def next_escalation_step(
    alert: AlertRule,
    now: datetime,
) -> AlertEscalationStep | None:
    """Pick the escalation step to notify for a rule that is still firing.

    Returns ``None`` when the rule is not firing or nothing is due. A step is
    due when its level is above ``last_escalation_level`` and the rule has
    been firing for at least ``after_minutes``; of several due steps the one
    latest in ``alert.escalation`` wins. A rule with a single step repeats
    that step once its cooldown has elapsed.
    """
    if alert.fired_at is None:
        return None

    minutes_firing = (now - alert.fired_at).total_seconds() / 60
    already_sent = alert.last_escalation_level
    # Walk backwards so the first hit is the last due step in list order.
    for step in reversed(alert.escalation):
        if step.level > already_sent and minutes_firing >= step.after_minutes:
            return step

    steps = alert.escalation
    if len(steps) != 1:
        return None
    only_step = steps[0]
    if alert.last_escalation_level == only_step.level and cooldown_elapsed(alert, now):
        return only_step
    return None
229
+
230
+
231
class AlertDispatcher:
    """Background poller that evaluates alert rules and sends notifications.

    One polling pass loads the rules file, runs every active rule through
    ``dispatch_alert`` and writes the file back when any rule changed.
    """

    def __init__(self, app, poll_interval_seconds: float = 60.0) -> None:
        """Bind the dispatcher to *app*; no task is started here.

        Args:
            app: Application object whose ``state`` carries the configuration.
            poll_interval_seconds: Pause between two polling passes.
        """
        self.app = app
        self.poll_interval_seconds = poll_interval_seconds
        # Delays, in seconds, between webhook delivery retries.
        self.backoff_seconds = [1.0, 5.0, 25.0]
        self._task: asyncio.Task | None = None

    def start(self) -> None:
        """Schedule the polling loop unless it is already running.

        Must be called while an event loop is running, because it uses
        ``asyncio.create_task``.
        """
        if self._task is not None and not self._task.done():
            return
        self._task = asyncio.create_task(self.run())

    async def stop(self) -> None:
        """Cancel the polling task, if any, and wait for it to finish."""
        if self._task is None or self._task.done():
            return
        self._task.cancel()
        try:
            await self._task
        except asyncio.CancelledError:
            # Expected outcome of the cancel() above.
            pass

    async def run(self) -> None:
        """Poll forever: one dispatch pass, then sleep for the poll interval.

        A failing pass is logged and does not end the loop; only cancellation
        stops it.
        """
        from src.serving.api import alert_dispatcher as compat

        while True:
            try:
                await self.dispatch_alerts()
            except Exception as exc:
                compat.logger.warning("alert_dispatcher_error", error=str(exc))
            await asyncio.sleep(self.poll_interval_seconds)

    async def dispatch_alerts(self) -> int:
        """Run one polling pass over all active rules.

        Each rule is handled in isolation: when evaluating or delivering one
        rule raises, the error is logged, that rule is left as it was loaded,
        and the pass continues with the next rule. State of the rules that
        completed is written back even if the pass ends early (for example
        on cancellation), so notifications already sent are not repeated on
        the next pass.

        Returns:
            The number of notifications sent during this pass.
        """
        from src.serving.api import alert_dispatcher as compat

        from .escalation import dispatch_alert

        path = get_alert_config_path(self.app)
        alerts = load_alerts(path)
        now = compat.datetime.now(UTC)
        triggered = 0
        changed = False
        try:
            for index, alert in enumerate(alerts):
                if not alert.active:
                    continue
                try:
                    # dispatch_alert mutates the rule it is given; working on
                    # a copy keeps a half-updated rule out of the saved file
                    # when the call fails part-way.
                    updated_alert, alert_changed, alert_triggered = await dispatch_alert(
                        self,
                        alert.model_copy(deep=True),
                        now,
                    )
                except Exception as exc:
                    compat.logger.warning(
                        "alert_dispatcher_error",
                        alert_id=alert.id,
                        error=str(exc),
                    )
                    continue
                alerts[index] = updated_alert
                triggered += alert_triggered
                changed = changed or alert_changed
        finally:
            if changed:
                save_alerts(path, alerts)
        return triggered

    async def send_test_alert(self, alert: AlertRule) -> dict:
        """Deliver a one-off test notification for *alert*.

        The payload carries ``"test": True`` and is sent with the event type
        ``alert.test``. The rule itself is not modified here.

        Returns:
            The delivery summary dict produced by ``deliver``.
        """
        from src.serving.api import alert_dispatcher as compat

        from .escalation import deliver

        payload = {
            "alert_id": alert.id,
            "alert_name": alert.name,
            "metric": alert.metric,
            "threshold": alert.threshold,
            "condition": alert.condition,
            "window": alert.window,
            "triggered_at": compat.datetime.now(UTC).isoformat(),
            "tenant": alert.tenant,
            "test": True,
        }
        return await deliver(self, alert, payload, event_type="alert.test")
@@ -0,0 +1,290 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import uuid
5
+ from datetime import datetime, timedelta
6
+ from typing import TYPE_CHECKING
7
+
8
+ from src.serving.api.webhook_dispatcher import _event_body, _signature
9
+
10
+ from .evaluator import evaluate_rule
11
+ from .history import ensure_alert_history_table, log_alert_history
12
+
13
+ if TYPE_CHECKING:
14
+ from .dispatcher import AlertDispatcher, AlertRule
15
+
16
+
17
async def dispatch_alert(
    dispatcher: AlertDispatcher,
    alert: AlertRule,
    now: datetime,
) -> tuple[AlertRule, bool, int]:
    """Evaluate one rule at *now* and advance its notification state.

    The rule object passed in is mutated in place and is also returned.

    Returns:
        ``(alert, changed, sent)`` where *changed* tells the caller that the
        rule should be persisted and *sent* counts the notifications handed
        to ``deliver`` during this call.
    """
    from src.serving.api import alert_dispatcher as compat

    from .dispatcher import next_escalation_step

    evaluation = evaluate_rule(dispatcher, alert, now)
    current_triggered = bool(evaluation["triggered"])
    alert_changed = False
    triggered = 0

    # Flap detection: forget recorded condition changes older than the window.
    if alert.flap_detection.enabled:
        previous_count = len(alert.state_changes)
        flap_window = timedelta(minutes=alert.flap_detection.window_minutes)
        alert.state_changes = [
            state_change
            for state_change in alert.state_changes
            if now - state_change <= flap_window
        ]
        alert_changed = alert_changed or len(alert.state_changes) != previous_count

    # Lift a suppression once flap detection is off or the number of recent
    # changes is back within the limit; the rule restarts from "ok".
    if alert.state == "suppressed" and (
        not alert.flap_detection.enabled
        or len(alert.state_changes) <= alert.flap_detection.max_changes
    ):
        alert.state = "ok"
        alert.fired_at = None
        alert.resolved_at = now if not current_triggered else None
        alert.last_escalation_level = 0
        alert.last_condition_triggered = False
        alert_changed = True

    # Record a condition flip and suppress the rule when it flips too often.
    state_changed = current_triggered != alert.last_condition_triggered
    if state_changed and alert.flap_detection.enabled:
        alert.state_changes.append(now)
        alert_changed = True
        if len(alert.state_changes) > alert.flap_detection.max_changes:
            alert.state = "suppressed"
            alert.fired_at = None
            alert.resolved_at = now if not current_triggered else None
            alert.last_escalation_level = 0
            alert.last_condition_triggered = current_triggered
            alert.updated_at = now
            compat.logger.warning(
                "alert_flapping_suppressed",
                alert_id=alert.id,
                alert_name=alert.name,
                changes=len(alert.state_changes),
                window_minutes=alert.flap_detection.window_minutes,
            )
            return alert, True, 0

    # Still suppressed: keep tracking the condition but send nothing.
    if alert.state == "suppressed":
        alert.last_condition_triggered = current_triggered
        alert.updated_at = now
        return alert, True, 0

    # Condition newly met: send the level-1 notification to the first
    # escalation step's webhook.
    if current_triggered and alert.fired_at is None:
        alert.fired_at = now
        alert.resolved_at = None
        alert.state = "firing"
        alert.last_escalation_level = 1
        payload = {
            "alert_id": alert.id,
            "alert_name": alert.name,
            "status": "firing",
            "metric": alert.metric,
            "current_value": evaluation["current_value"],
            "threshold": alert.threshold,
            "condition": alert.condition,
            "window": alert.window,
            "triggered_at": now.isoformat(),
            "fired_at": now.isoformat(),
            "level": 1,
            "tenant": alert.tenant,
        }
        # Optional comparison figures are included only when available.
        if evaluation["previous_value"] is not None:
            payload["previous_value"] = evaluation["previous_value"]
        if evaluation["change_pct"] is not None:
            payload["change_pct"] = evaluation["change_pct"]
        await deliver(
            dispatcher,
            alert,
            payload,
            event_type="alert.triggered",
            current_value=evaluation["current_value"],
            previous_value=evaluation["previous_value"],
            change_pct=evaluation["change_pct"],
            webhook_url=alert.escalation[0].webhook_url,
        )
        alert.last_triggered_at = now
        alert.last_condition_triggered = True
        alert.updated_at = now
        return alert, True, 1

    # Condition still met: notify only when an escalation step (or a repeat
    # of the single step) is due.
    if current_triggered and alert.fired_at is not None:
        next_step = next_escalation_step(alert, now)
        if next_step is not None:
            # Whole minutes since the rule fired, never negative.
            duration_minutes = max(0, int((now - alert.fired_at).total_seconds() // 60))
            payload = {
                "alert_id": alert.id,
                "alert_name": alert.name,
                "status": "sustained",
                "metric": alert.metric,
                "current_value": evaluation["current_value"],
                "threshold": alert.threshold,
                "condition": alert.condition,
                "window": alert.window,
                "triggered_at": alert.fired_at.isoformat(),
                "fired_at": alert.fired_at.isoformat(),
                "level": next_step.level,
                "duration_minutes": duration_minutes,
                "tenant": alert.tenant,
            }
            if evaluation["previous_value"] is not None:
                payload["previous_value"] = evaluation["previous_value"]
            if evaluation["change_pct"] is not None:
                payload["change_pct"] = evaluation["change_pct"]
            await deliver(
                dispatcher,
                alert,
                payload,
                # A step above the last notified level is an escalation;
                # otherwise this is a repeat at the same level.
                event_type=(
                    "alert.escalated"
                    if next_step.level > alert.last_escalation_level
                    else "alert.sustained"
                ),
                current_value=evaluation["current_value"],
                previous_value=evaluation["previous_value"],
                change_pct=evaluation["change_pct"],
                webhook_url=next_step.webhook_url,
            )
            alert.last_triggered_at = now
            alert.last_escalation_level = max(
                alert.last_escalation_level,
                next_step.level,
            )
            alert.updated_at = now
            triggered += 1
            alert_changed = True
        alert.state = "sustained"
        alert.last_condition_triggered = True
        return alert, alert_changed, triggered

    # Condition cleared while firing: send "resolved" to every distinct
    # webhook of the steps up to the highest level reached (at least level 1).
    if not current_triggered and alert.fired_at is not None:
        duration_minutes = max(0, int((now - alert.fired_at).total_seconds() // 60))
        payload = {
            "alert_id": alert.id,
            "alert_name": alert.name,
            "status": "resolved",
            "metric": alert.metric,
            "resolved_value": evaluation["current_value"],
            "fired_at": alert.fired_at.isoformat(),
            "resolved_at": now.isoformat(),
            "duration_minutes": duration_minutes,
            "tenant": alert.tenant,
        }
        notified_urls: list[str] = []
        for step in alert.escalation:
            if step.level > max(1, alert.last_escalation_level):
                continue
            if step.webhook_url not in notified_urls:
                notified_urls.append(step.webhook_url)
        # Fall back to the rule's own webhook when no step qualified.
        for webhook_url in notified_urls or [alert.webhook_url]:
            await deliver(
                dispatcher,
                alert,
                payload,
                event_type="alert.resolved",
                current_value=evaluation["current_value"],
                previous_value=evaluation["previous_value"],
                change_pct=evaluation["change_pct"],
                webhook_url=webhook_url,
            )
            triggered += 1
        alert.state = "resolved"
        alert.resolved_at = now
        alert.fired_at = None
        alert.last_escalation_level = 0
        alert.last_triggered_at = now
        alert.last_condition_triggered = False
        alert.updated_at = now
        return alert, True, triggered

    # Nothing firing: a rule resolved on an earlier pass settles back to "ok".
    if alert.state == "resolved":
        alert.state = "ok"
        alert.updated_at = now
        alert_changed = True
    alert.last_condition_triggered = False
    return alert, alert_changed, triggered
210
+
211
+
212
async def deliver(
    dispatcher: AlertDispatcher,
    alert: AlertRule,
    payload: dict,
    *,
    event_type: str,
    current_value: float | None = None,
    previous_value: float | None = None,
    change_pct: float | None = None,
    webhook_url: str | None = None,
) -> dict:
    """POST *payload* to a webhook and record the outcome in the alert history.

    The request goes to *webhook_url*, or to ``alert.webhook_url`` when none
    is given, with a signature header computed from ``alert.secret``. Up to
    three attempts are made: a response below 500 ends the loop, while a 5xx
    response or an httpx timeout/transport error is retried after a pause
    taken from ``dispatcher.backoff_seconds``.

    Returns:
        A summary dict with ``delivery_id``, ``alert_id``, ``event_type``,
        ``success``, ``status_code``, ``error`` and ``attempts``.
    """
    from src.serving.api import alert_dispatcher as compat

    conn = dispatcher.app.state.query_engine._conn
    ensure_alert_history_table(conn)

    delivery_id = str(uuid.uuid4())
    body = _event_body(payload)
    headers = {
        "Content-Type": "application/json",
        "X-AgentFlow-Event": event_type,
        "X-AgentFlow-Signature": _signature(alert.secret, body),
        "X-AgentFlow-Delivery": delivery_id,
    }
    target_url = webhook_url or alert.webhook_url
    max_attempts = 3

    attempts = 0
    success = False
    status_code: int | None = None
    error: str | None = None

    async with compat.httpx.AsyncClient(timeout=5.0) as client:
        for attempt in range(1, max_attempts + 1):
            attempts, error = attempt, None
            try:
                response = await client.post(target_url, content=body, headers=headers)
            except (compat.httpx.TimeoutException, compat.httpx.TransportError) as exc:
                status_code, success, error = None, False, str(exc)
            else:
                status_code = response.status_code
                success = 200 <= status_code < 300
                # Anything below 500 is final; only server errors are retried.
                if status_code < 500:
                    break

            if attempt < max_attempts:
                schedule = dispatcher.backoff_seconds
                # Clamp the index so a short schedule reuses its last delay.
                await asyncio.sleep(schedule[min(attempt - 1, len(schedule) - 1)])

    log_alert_history(
        conn,
        delivery_id=delivery_id,
        alert=alert,
        metric=alert.metric,
        current_value=current_value,
        previous_value=previous_value,
        change_pct=change_pct,
        threshold=alert.threshold,
        condition=alert.condition,
        window=alert.window,
        event_type=event_type,
        status_code=status_code,
        success=success,
        error=error,
        payload=payload,
    )
    return {
        "delivery_id": delivery_id,
        "alert_id": alert.id,
        "event_type": event_type,
        "success": success,
        "status_code": status_code,
        "error": error,
        "attempts": attempts,
    }