truthound-dashboard 1.3.0-py3-none-any.whl → 1.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
- truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,2896 @@
|
|
|
1
|
+
"""Storage backends for escalation state.
|
|
2
|
+
|
|
3
|
+
This module provides storage backends for persisting escalation
|
|
4
|
+
policies and incidents.
|
|
5
|
+
|
|
6
|
+
Storage Backends:
|
|
7
|
+
- InMemoryEscalationStore: Simple in-memory storage
|
|
8
|
+
- SQLiteEscalationStore: Persistent SQLite storage
|
|
9
|
+
- RedisEscalationStore: Redis-based storage for distributed deployments
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import random
|
|
18
|
+
import sqlite3
|
|
19
|
+
import threading
|
|
20
|
+
import time
|
|
21
|
+
import uuid
|
|
22
|
+
from abc import ABC, abstractmethod
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from datetime import datetime, timedelta
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import TYPE_CHECKING, Any
|
|
27
|
+
|
|
28
|
+
from .models import EscalationIncident, EscalationPolicy, EscalationState
|
|
29
|
+
|
|
30
|
+
# Optional Redis dependency
|
|
31
|
+
try:
|
|
32
|
+
import redis
|
|
33
|
+
import redis.asyncio
|
|
34
|
+
|
|
35
|
+
REDIS_AVAILABLE = True
|
|
36
|
+
except ImportError:
|
|
37
|
+
REDIS_AVAILABLE = False
|
|
38
|
+
redis = None # type: ignore[assignment]
|
|
39
|
+
|
|
40
|
+
if TYPE_CHECKING:
|
|
41
|
+
import redis as redis_sync
|
|
42
|
+
import redis.asyncio as redis_async
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BaseEscalationStore(ABC):
|
|
46
|
+
"""Abstract base class for escalation storage."""
|
|
47
|
+
|
|
48
|
+
# Policy operations
|
|
49
|
+
@abstractmethod
|
|
50
|
+
def save_policy(self, policy: EscalationPolicy) -> str:
|
|
51
|
+
"""Save or update a policy."""
|
|
52
|
+
...
|
|
53
|
+
|
|
54
|
+
@abstractmethod
|
|
55
|
+
def get_policy(self, policy_id: str) -> EscalationPolicy | None:
|
|
56
|
+
"""Get policy by ID."""
|
|
57
|
+
...
|
|
58
|
+
|
|
59
|
+
@abstractmethod
|
|
60
|
+
def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
|
|
61
|
+
"""Get policy by name."""
|
|
62
|
+
...
|
|
63
|
+
|
|
64
|
+
@abstractmethod
|
|
65
|
+
def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
|
|
66
|
+
"""List all policies."""
|
|
67
|
+
...
|
|
68
|
+
|
|
69
|
+
@abstractmethod
|
|
70
|
+
def delete_policy(self, policy_id: str) -> bool:
|
|
71
|
+
"""Delete a policy."""
|
|
72
|
+
...
|
|
73
|
+
|
|
74
|
+
# Incident operations
|
|
75
|
+
@abstractmethod
|
|
76
|
+
def save_incident(self, incident: EscalationIncident) -> str:
|
|
77
|
+
"""Save or update an incident."""
|
|
78
|
+
...
|
|
79
|
+
|
|
80
|
+
@abstractmethod
|
|
81
|
+
def get_incident(self, incident_id: str) -> EscalationIncident | None:
|
|
82
|
+
"""Get incident by ID."""
|
|
83
|
+
...
|
|
84
|
+
|
|
85
|
+
@abstractmethod
|
|
86
|
+
def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
|
|
87
|
+
"""Get incident by external reference."""
|
|
88
|
+
...
|
|
89
|
+
|
|
90
|
+
@abstractmethod
|
|
91
|
+
def list_incidents(
|
|
92
|
+
self,
|
|
93
|
+
policy_id: str | None = None,
|
|
94
|
+
states: list[EscalationState] | None = None,
|
|
95
|
+
) -> list[EscalationIncident]:
|
|
96
|
+
"""List incidents with optional filters."""
|
|
97
|
+
...
|
|
98
|
+
|
|
99
|
+
@abstractmethod
|
|
100
|
+
def get_pending_escalations(self) -> list[EscalationIncident]:
|
|
101
|
+
"""Get incidents due for escalation."""
|
|
102
|
+
...
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class InMemoryEscalationStore(BaseEscalationStore):
|
|
106
|
+
"""In-memory escalation storage.
|
|
107
|
+
|
|
108
|
+
Simple thread-safe storage suitable for development
|
|
109
|
+
and testing.
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
def __init__(self) -> None:
|
|
113
|
+
"""Initialize in-memory store."""
|
|
114
|
+
self._policies: dict[str, EscalationPolicy] = {}
|
|
115
|
+
self._incidents: dict[str, EscalationIncident] = {}
|
|
116
|
+
self._policy_counter = 0
|
|
117
|
+
self._incident_counter = 0
|
|
118
|
+
self._lock = threading.RLock()
|
|
119
|
+
|
|
120
|
+
def _generate_policy_id(self) -> str:
|
|
121
|
+
"""Generate unique policy ID."""
|
|
122
|
+
self._policy_counter += 1
|
|
123
|
+
return f"policy-{self._policy_counter}"
|
|
124
|
+
|
|
125
|
+
def _generate_incident_id(self) -> str:
|
|
126
|
+
"""Generate unique incident ID."""
|
|
127
|
+
self._incident_counter += 1
|
|
128
|
+
return f"incident-{self._incident_counter}"
|
|
129
|
+
|
|
130
|
+
def save_policy(self, policy: EscalationPolicy) -> str:
|
|
131
|
+
"""Save or update a policy."""
|
|
132
|
+
with self._lock:
|
|
133
|
+
if not policy.id:
|
|
134
|
+
policy.id = self._generate_policy_id()
|
|
135
|
+
self._policies[policy.id] = policy
|
|
136
|
+
return policy.id
|
|
137
|
+
|
|
138
|
+
def get_policy(self, policy_id: str) -> EscalationPolicy | None:
|
|
139
|
+
"""Get policy by ID."""
|
|
140
|
+
with self._lock:
|
|
141
|
+
return self._policies.get(policy_id)
|
|
142
|
+
|
|
143
|
+
def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
|
|
144
|
+
"""Get policy by name."""
|
|
145
|
+
with self._lock:
|
|
146
|
+
for policy in self._policies.values():
|
|
147
|
+
if policy.name == name:
|
|
148
|
+
return policy
|
|
149
|
+
return None
|
|
150
|
+
|
|
151
|
+
def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
|
|
152
|
+
"""List all policies."""
|
|
153
|
+
with self._lock:
|
|
154
|
+
policies = list(self._policies.values())
|
|
155
|
+
if active_only:
|
|
156
|
+
policies = [p for p in policies if p.is_active]
|
|
157
|
+
return policies
|
|
158
|
+
|
|
159
|
+
def delete_policy(self, policy_id: str) -> bool:
|
|
160
|
+
"""Delete a policy."""
|
|
161
|
+
with self._lock:
|
|
162
|
+
if policy_id in self._policies:
|
|
163
|
+
del self._policies[policy_id]
|
|
164
|
+
return True
|
|
165
|
+
return False
|
|
166
|
+
|
|
167
|
+
def save_incident(self, incident: EscalationIncident) -> str:
|
|
168
|
+
"""Save or update an incident."""
|
|
169
|
+
with self._lock:
|
|
170
|
+
if not incident.id:
|
|
171
|
+
incident.id = self._generate_incident_id()
|
|
172
|
+
incident.updated_at = datetime.utcnow()
|
|
173
|
+
self._incidents[incident.id] = incident
|
|
174
|
+
return incident.id
|
|
175
|
+
|
|
176
|
+
def get_incident(self, incident_id: str) -> EscalationIncident | None:
|
|
177
|
+
"""Get incident by ID."""
|
|
178
|
+
with self._lock:
|
|
179
|
+
return self._incidents.get(incident_id)
|
|
180
|
+
|
|
181
|
+
def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
|
|
182
|
+
"""Get incident by external reference."""
|
|
183
|
+
with self._lock:
|
|
184
|
+
for incident in self._incidents.values():
|
|
185
|
+
if incident.incident_ref == incident_ref:
|
|
186
|
+
return incident
|
|
187
|
+
return None
|
|
188
|
+
|
|
189
|
+
def list_incidents(
|
|
190
|
+
self,
|
|
191
|
+
policy_id: str | None = None,
|
|
192
|
+
states: list[EscalationState] | None = None,
|
|
193
|
+
) -> list[EscalationIncident]:
|
|
194
|
+
"""List incidents with optional filters."""
|
|
195
|
+
with self._lock:
|
|
196
|
+
incidents = list(self._incidents.values())
|
|
197
|
+
|
|
198
|
+
if policy_id:
|
|
199
|
+
incidents = [i for i in incidents if i.policy_id == policy_id]
|
|
200
|
+
|
|
201
|
+
if states:
|
|
202
|
+
incidents = [i for i in incidents if i.state in states]
|
|
203
|
+
|
|
204
|
+
return incidents
|
|
205
|
+
|
|
206
|
+
def get_pending_escalations(self) -> list[EscalationIncident]:
|
|
207
|
+
"""Get incidents due for escalation."""
|
|
208
|
+
now = datetime.utcnow()
|
|
209
|
+
active_states = [
|
|
210
|
+
EscalationState.TRIGGERED,
|
|
211
|
+
EscalationState.ESCALATED,
|
|
212
|
+
]
|
|
213
|
+
|
|
214
|
+
with self._lock:
|
|
215
|
+
return [
|
|
216
|
+
i for i in self._incidents.values()
|
|
217
|
+
if i.state in active_states
|
|
218
|
+
and i.next_escalation_at
|
|
219
|
+
and i.next_escalation_at <= now
|
|
220
|
+
]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class SQLiteEscalationStore(BaseEscalationStore):
|
|
224
|
+
"""SQLite-based persistent escalation storage."""
|
|
225
|
+
|
|
226
|
+
def __init__(self, db_path: str | Path = "escalation.db") -> None:
|
|
227
|
+
"""Initialize SQLite store."""
|
|
228
|
+
self.db_path = Path(db_path)
|
|
229
|
+
self._local = threading.local()
|
|
230
|
+
self._init_db()
|
|
231
|
+
|
|
232
|
+
def _get_connection(self) -> sqlite3.Connection:
|
|
233
|
+
"""Get thread-local database connection."""
|
|
234
|
+
if not hasattr(self._local, "connection"):
|
|
235
|
+
self._local.connection = sqlite3.connect(
|
|
236
|
+
str(self.db_path),
|
|
237
|
+
check_same_thread=False,
|
|
238
|
+
)
|
|
239
|
+
self._local.connection.row_factory = sqlite3.Row
|
|
240
|
+
return self._local.connection
|
|
241
|
+
|
|
242
|
+
def _init_db(self) -> None:
|
|
243
|
+
"""Initialize database schema."""
|
|
244
|
+
conn = self._get_connection()
|
|
245
|
+
|
|
246
|
+
# Policies table
|
|
247
|
+
conn.execute("""
|
|
248
|
+
CREATE TABLE IF NOT EXISTS escalation_policies (
|
|
249
|
+
id TEXT PRIMARY KEY,
|
|
250
|
+
name TEXT NOT NULL UNIQUE,
|
|
251
|
+
description TEXT,
|
|
252
|
+
levels TEXT NOT NULL,
|
|
253
|
+
auto_resolve_on_success INTEGER NOT NULL DEFAULT 1,
|
|
254
|
+
max_escalations INTEGER NOT NULL DEFAULT 3,
|
|
255
|
+
is_active INTEGER NOT NULL DEFAULT 1,
|
|
256
|
+
created_at TEXT NOT NULL,
|
|
257
|
+
updated_at TEXT NOT NULL
|
|
258
|
+
)
|
|
259
|
+
""")
|
|
260
|
+
|
|
261
|
+
# Incidents table
|
|
262
|
+
conn.execute("""
|
|
263
|
+
CREATE TABLE IF NOT EXISTS escalation_incidents (
|
|
264
|
+
id TEXT PRIMARY KEY,
|
|
265
|
+
policy_id TEXT NOT NULL,
|
|
266
|
+
incident_ref TEXT NOT NULL UNIQUE,
|
|
267
|
+
state TEXT NOT NULL,
|
|
268
|
+
current_level INTEGER NOT NULL DEFAULT 0,
|
|
269
|
+
context TEXT,
|
|
270
|
+
acknowledged_by TEXT,
|
|
271
|
+
acknowledged_at TEXT,
|
|
272
|
+
resolved_by TEXT,
|
|
273
|
+
resolved_at TEXT,
|
|
274
|
+
created_at TEXT NOT NULL,
|
|
275
|
+
updated_at TEXT NOT NULL,
|
|
276
|
+
next_escalation_at TEXT,
|
|
277
|
+
escalation_count INTEGER NOT NULL DEFAULT 0,
|
|
278
|
+
events TEXT,
|
|
279
|
+
FOREIGN KEY (policy_id) REFERENCES escalation_policies(id)
|
|
280
|
+
)
|
|
281
|
+
""")
|
|
282
|
+
|
|
283
|
+
conn.execute("""
|
|
284
|
+
CREATE INDEX IF NOT EXISTS idx_incident_state
|
|
285
|
+
ON escalation_incidents(state)
|
|
286
|
+
""")
|
|
287
|
+
conn.execute("""
|
|
288
|
+
CREATE INDEX IF NOT EXISTS idx_incident_next_escalation
|
|
289
|
+
ON escalation_incidents(next_escalation_at)
|
|
290
|
+
""")
|
|
291
|
+
|
|
292
|
+
conn.commit()
|
|
293
|
+
|
|
294
|
+
def save_policy(self, policy: EscalationPolicy) -> str:
|
|
295
|
+
"""Save or update a policy."""
|
|
296
|
+
conn = self._get_connection()
|
|
297
|
+
now = datetime.utcnow().isoformat()
|
|
298
|
+
|
|
299
|
+
if not policy.id:
|
|
300
|
+
import uuid
|
|
301
|
+
policy.id = str(uuid.uuid4())
|
|
302
|
+
|
|
303
|
+
levels_json = json.dumps([l.to_dict() for l in policy.levels])
|
|
304
|
+
|
|
305
|
+
conn.execute(
|
|
306
|
+
"""
|
|
307
|
+
INSERT OR REPLACE INTO escalation_policies
|
|
308
|
+
(id, name, description, levels, auto_resolve_on_success,
|
|
309
|
+
max_escalations, is_active, created_at, updated_at)
|
|
310
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
311
|
+
""",
|
|
312
|
+
(
|
|
313
|
+
policy.id,
|
|
314
|
+
policy.name,
|
|
315
|
+
policy.description,
|
|
316
|
+
levels_json,
|
|
317
|
+
1 if policy.auto_resolve_on_success else 0,
|
|
318
|
+
policy.max_escalations,
|
|
319
|
+
1 if policy.is_active else 0,
|
|
320
|
+
now,
|
|
321
|
+
now,
|
|
322
|
+
),
|
|
323
|
+
)
|
|
324
|
+
conn.commit()
|
|
325
|
+
return policy.id
|
|
326
|
+
|
|
327
|
+
def get_policy(self, policy_id: str) -> EscalationPolicy | None:
|
|
328
|
+
"""Get policy by ID."""
|
|
329
|
+
conn = self._get_connection()
|
|
330
|
+
cursor = conn.execute(
|
|
331
|
+
"SELECT * FROM escalation_policies WHERE id = ?",
|
|
332
|
+
(policy_id,),
|
|
333
|
+
)
|
|
334
|
+
row = cursor.fetchone()
|
|
335
|
+
return self._row_to_policy(row) if row else None
|
|
336
|
+
|
|
337
|
+
def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
|
|
338
|
+
"""Get policy by name."""
|
|
339
|
+
conn = self._get_connection()
|
|
340
|
+
cursor = conn.execute(
|
|
341
|
+
"SELECT * FROM escalation_policies WHERE name = ?",
|
|
342
|
+
(name,),
|
|
343
|
+
)
|
|
344
|
+
row = cursor.fetchone()
|
|
345
|
+
return self._row_to_policy(row) if row else None
|
|
346
|
+
|
|
347
|
+
def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
|
|
348
|
+
"""List all policies."""
|
|
349
|
+
conn = self._get_connection()
|
|
350
|
+
if active_only:
|
|
351
|
+
cursor = conn.execute(
|
|
352
|
+
"SELECT * FROM escalation_policies WHERE is_active = 1"
|
|
353
|
+
)
|
|
354
|
+
else:
|
|
355
|
+
cursor = conn.execute("SELECT * FROM escalation_policies")
|
|
356
|
+
|
|
357
|
+
return [self._row_to_policy(row) for row in cursor.fetchall()]
|
|
358
|
+
|
|
359
|
+
def delete_policy(self, policy_id: str) -> bool:
|
|
360
|
+
"""Delete a policy."""
|
|
361
|
+
conn = self._get_connection()
|
|
362
|
+
cursor = conn.execute(
|
|
363
|
+
"DELETE FROM escalation_policies WHERE id = ?",
|
|
364
|
+
(policy_id,),
|
|
365
|
+
)
|
|
366
|
+
conn.commit()
|
|
367
|
+
return cursor.rowcount > 0
|
|
368
|
+
|
|
369
|
+
def _row_to_policy(self, row: sqlite3.Row) -> EscalationPolicy:
|
|
370
|
+
"""Convert database row to policy."""
|
|
371
|
+
from .models import EscalationLevel
|
|
372
|
+
|
|
373
|
+
levels_data = json.loads(row["levels"])
|
|
374
|
+
levels = [EscalationLevel.from_dict(l) for l in levels_data]
|
|
375
|
+
|
|
376
|
+
return EscalationPolicy(
|
|
377
|
+
id=row["id"],
|
|
378
|
+
name=row["name"],
|
|
379
|
+
description=row["description"] or "",
|
|
380
|
+
levels=levels,
|
|
381
|
+
auto_resolve_on_success=bool(row["auto_resolve_on_success"]),
|
|
382
|
+
max_escalations=row["max_escalations"],
|
|
383
|
+
is_active=bool(row["is_active"]),
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
def save_incident(self, incident: EscalationIncident) -> str:
|
|
387
|
+
"""Save or update an incident."""
|
|
388
|
+
conn = self._get_connection()
|
|
389
|
+
now = datetime.utcnow().isoformat()
|
|
390
|
+
|
|
391
|
+
if not incident.id:
|
|
392
|
+
import uuid
|
|
393
|
+
incident.id = str(uuid.uuid4())
|
|
394
|
+
|
|
395
|
+
incident.updated_at = datetime.utcnow()
|
|
396
|
+
|
|
397
|
+
conn.execute(
|
|
398
|
+
"""
|
|
399
|
+
INSERT OR REPLACE INTO escalation_incidents
|
|
400
|
+
(id, policy_id, incident_ref, state, current_level, context,
|
|
401
|
+
acknowledged_by, acknowledged_at, resolved_by, resolved_at,
|
|
402
|
+
created_at, updated_at, next_escalation_at, escalation_count, events)
|
|
403
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
404
|
+
""",
|
|
405
|
+
(
|
|
406
|
+
incident.id,
|
|
407
|
+
incident.policy_id,
|
|
408
|
+
incident.incident_ref,
|
|
409
|
+
incident.state.value,
|
|
410
|
+
incident.current_level,
|
|
411
|
+
json.dumps(incident.context),
|
|
412
|
+
incident.acknowledged_by,
|
|
413
|
+
incident.acknowledged_at.isoformat() if incident.acknowledged_at else None,
|
|
414
|
+
incident.resolved_by,
|
|
415
|
+
incident.resolved_at.isoformat() if incident.resolved_at else None,
|
|
416
|
+
incident.created_at.isoformat(),
|
|
417
|
+
incident.updated_at.isoformat(),
|
|
418
|
+
incident.next_escalation_at.isoformat() if incident.next_escalation_at else None,
|
|
419
|
+
incident.escalation_count,
|
|
420
|
+
json.dumps([e.to_dict() for e in incident.events]),
|
|
421
|
+
),
|
|
422
|
+
)
|
|
423
|
+
conn.commit()
|
|
424
|
+
return incident.id
|
|
425
|
+
|
|
426
|
+
def get_incident(self, incident_id: str) -> EscalationIncident | None:
|
|
427
|
+
"""Get incident by ID."""
|
|
428
|
+
conn = self._get_connection()
|
|
429
|
+
cursor = conn.execute(
|
|
430
|
+
"SELECT * FROM escalation_incidents WHERE id = ?",
|
|
431
|
+
(incident_id,),
|
|
432
|
+
)
|
|
433
|
+
row = cursor.fetchone()
|
|
434
|
+
return self._row_to_incident(row) if row else None
|
|
435
|
+
|
|
436
|
+
def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
|
|
437
|
+
"""Get incident by external reference."""
|
|
438
|
+
conn = self._get_connection()
|
|
439
|
+
cursor = conn.execute(
|
|
440
|
+
"SELECT * FROM escalation_incidents WHERE incident_ref = ?",
|
|
441
|
+
(incident_ref,),
|
|
442
|
+
)
|
|
443
|
+
row = cursor.fetchone()
|
|
444
|
+
return self._row_to_incident(row) if row else None
|
|
445
|
+
|
|
446
|
+
def list_incidents(
|
|
447
|
+
self,
|
|
448
|
+
policy_id: str | None = None,
|
|
449
|
+
states: list[EscalationState] | None = None,
|
|
450
|
+
) -> list[EscalationIncident]:
|
|
451
|
+
"""List incidents with optional filters."""
|
|
452
|
+
conn = self._get_connection()
|
|
453
|
+
|
|
454
|
+
query = "SELECT * FROM escalation_incidents WHERE 1=1"
|
|
455
|
+
params: list[Any] = []
|
|
456
|
+
|
|
457
|
+
if policy_id:
|
|
458
|
+
query += " AND policy_id = ?"
|
|
459
|
+
params.append(policy_id)
|
|
460
|
+
|
|
461
|
+
if states:
|
|
462
|
+
placeholders = ",".join("?" * len(states))
|
|
463
|
+
query += f" AND state IN ({placeholders})"
|
|
464
|
+
params.extend(s.value for s in states)
|
|
465
|
+
|
|
466
|
+
cursor = conn.execute(query, params)
|
|
467
|
+
return [self._row_to_incident(row) for row in cursor.fetchall()]
|
|
468
|
+
|
|
469
|
+
def get_pending_escalations(self) -> list[EscalationIncident]:
|
|
470
|
+
"""Get incidents due for escalation."""
|
|
471
|
+
now = datetime.utcnow().isoformat()
|
|
472
|
+
conn = self._get_connection()
|
|
473
|
+
cursor = conn.execute(
|
|
474
|
+
"""
|
|
475
|
+
SELECT * FROM escalation_incidents
|
|
476
|
+
WHERE state IN (?, ?)
|
|
477
|
+
AND next_escalation_at IS NOT NULL
|
|
478
|
+
AND next_escalation_at <= ?
|
|
479
|
+
""",
|
|
480
|
+
(EscalationState.TRIGGERED.value, EscalationState.ESCALATED.value, now),
|
|
481
|
+
)
|
|
482
|
+
return [self._row_to_incident(row) for row in cursor.fetchall()]
|
|
483
|
+
|
|
484
|
+
def _row_to_incident(self, row: sqlite3.Row) -> EscalationIncident:
|
|
485
|
+
"""Convert database row to incident."""
|
|
486
|
+
from .models import EscalationEvent
|
|
487
|
+
|
|
488
|
+
events_data = json.loads(row["events"]) if row["events"] else []
|
|
489
|
+
events = [EscalationEvent.from_dict(e) for e in events_data]
|
|
490
|
+
|
|
491
|
+
return EscalationIncident(
|
|
492
|
+
id=row["id"],
|
|
493
|
+
policy_id=row["policy_id"],
|
|
494
|
+
incident_ref=row["incident_ref"],
|
|
495
|
+
state=EscalationState(row["state"]),
|
|
496
|
+
current_level=row["current_level"],
|
|
497
|
+
context=json.loads(row["context"]) if row["context"] else {},
|
|
498
|
+
acknowledged_by=row["acknowledged_by"],
|
|
499
|
+
acknowledged_at=datetime.fromisoformat(row["acknowledged_at"]) if row["acknowledged_at"] else None,
|
|
500
|
+
resolved_by=row["resolved_by"],
|
|
501
|
+
resolved_at=datetime.fromisoformat(row["resolved_at"]) if row["resolved_at"] else None,
|
|
502
|
+
created_at=datetime.fromisoformat(row["created_at"]),
|
|
503
|
+
updated_at=datetime.fromisoformat(row["updated_at"]),
|
|
504
|
+
next_escalation_at=datetime.fromisoformat(row["next_escalation_at"]) if row["next_escalation_at"] else None,
|
|
505
|
+
escalation_count=row["escalation_count"],
|
|
506
|
+
events=events,
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
def close(self) -> None:
|
|
510
|
+
"""Close database connection."""
|
|
511
|
+
if hasattr(self._local, "connection"):
|
|
512
|
+
self._local.connection.close()
|
|
513
|
+
del self._local.connection
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
# ============================================================================
|
|
517
|
+
# Redis Escalation Store
|
|
518
|
+
# ============================================================================
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
@dataclass
|
|
522
|
+
class EscalationMetrics:
|
|
523
|
+
"""Metrics for escalation store operations.
|
|
524
|
+
|
|
525
|
+
Attributes:
|
|
526
|
+
policy_saves: Number of policy save operations.
|
|
527
|
+
policy_gets: Number of policy get operations.
|
|
528
|
+
policy_deletes: Number of policy delete operations.
|
|
529
|
+
incident_saves: Number of incident save operations.
|
|
530
|
+
incident_gets: Number of incident get operations.
|
|
531
|
+
state_transitions: Number of state transitions.
|
|
532
|
+
errors: Number of Redis errors encountered.
|
|
533
|
+
fallbacks: Number of times fallback to InMemory was used.
|
|
534
|
+
reconnections: Number of successful reconnections.
|
|
535
|
+
pubsub_publishes: Number of Pub/Sub messages published.
|
|
536
|
+
avg_latency_ms: Average operation latency in milliseconds.
|
|
537
|
+
"""
|
|
538
|
+
|
|
539
|
+
policy_saves: int = 0
|
|
540
|
+
policy_gets: int = 0
|
|
541
|
+
policy_deletes: int = 0
|
|
542
|
+
incident_saves: int = 0
|
|
543
|
+
incident_gets: int = 0
|
|
544
|
+
state_transitions: int = 0
|
|
545
|
+
errors: int = 0
|
|
546
|
+
fallbacks: int = 0
|
|
547
|
+
reconnections: int = 0
|
|
548
|
+
pubsub_publishes: int = 0
|
|
549
|
+
total_operations: int = 0
|
|
550
|
+
total_latency_ms: float = 0.0
|
|
551
|
+
|
|
552
|
+
@property
|
|
553
|
+
def avg_latency_ms(self) -> float:
|
|
554
|
+
"""Calculate average operation latency."""
|
|
555
|
+
if self.total_operations == 0:
|
|
556
|
+
return 0.0
|
|
557
|
+
return self.total_latency_ms / self.total_operations
|
|
558
|
+
|
|
559
|
+
def record_latency(self, latency_ms: float) -> None:
|
|
560
|
+
"""Record an operation's latency."""
|
|
561
|
+
self.total_operations += 1
|
|
562
|
+
self.total_latency_ms += latency_ms
|
|
563
|
+
|
|
564
|
+
def to_dict(self) -> dict[str, Any]:
|
|
565
|
+
"""Convert metrics to dictionary."""
|
|
566
|
+
return {
|
|
567
|
+
"policy_saves": self.policy_saves,
|
|
568
|
+
"policy_gets": self.policy_gets,
|
|
569
|
+
"policy_deletes": self.policy_deletes,
|
|
570
|
+
"incident_saves": self.incident_saves,
|
|
571
|
+
"incident_gets": self.incident_gets,
|
|
572
|
+
"state_transitions": self.state_transitions,
|
|
573
|
+
"errors": self.errors,
|
|
574
|
+
"fallbacks": self.fallbacks,
|
|
575
|
+
"reconnections": self.reconnections,
|
|
576
|
+
"pubsub_publishes": self.pubsub_publishes,
|
|
577
|
+
"total_operations": self.total_operations,
|
|
578
|
+
"avg_latency_ms": round(self.avg_latency_ms, 3),
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
class RedisEscalationStore(BaseEscalationStore):
|
|
583
|
+
"""Production-ready Redis-based escalation store for distributed deployments.
|
|
584
|
+
|
|
585
|
+
Uses Redis for robust distributed escalation state management with:
|
|
586
|
+
- Connection pool management with configurable pool size
|
|
587
|
+
- Automatic reconnection with exponential backoff
|
|
588
|
+
- Proper JSON serialization/deserialization of incident objects
|
|
589
|
+
- Transaction support for atomic state updates (MULTI/EXEC and Lua scripts)
|
|
590
|
+
- Pub/Sub for real-time incident updates
|
|
591
|
+
- TTL management for completed/resolved incidents (auto-cleanup)
|
|
592
|
+
- Index structures for efficient queries (by state, policy_id, created_at)
|
|
593
|
+
- Graceful degradation (fallback to InMemory on Redis failure)
|
|
594
|
+
- Health check endpoint support
|
|
595
|
+
- Comprehensive metrics (operations, latency, errors)
|
|
596
|
+
|
|
597
|
+
Configuration via environment variables:
|
|
598
|
+
TRUTHOUND_ESCALATION_REDIS_URL: Redis connection URL (default: redis://localhost:6379/0)
|
|
599
|
+
TRUTHOUND_ESCALATION_REDIS_PREFIX: Key prefix (default: truthound:escalation:)
|
|
600
|
+
TRUTHOUND_ESCALATION_REDIS_POOL_SIZE: Connection pool size (default: 10)
|
|
601
|
+
TRUTHOUND_ESCALATION_REDIS_SOCKET_TIMEOUT: Socket timeout (default: 5.0)
|
|
602
|
+
TRUTHOUND_ESCALATION_REDIS_CONNECT_TIMEOUT: Connection timeout (default: 5.0)
|
|
603
|
+
TRUTHOUND_ESCALATION_REDIS_MAX_RETRIES: Max retry attempts (default: 3)
|
|
604
|
+
TRUTHOUND_ESCALATION_REDIS_RETRY_BASE_DELAY: Base delay for exponential backoff (default: 1.0)
|
|
605
|
+
TRUTHOUND_ESCALATION_REDIS_RESOLVED_TTL: TTL in seconds for resolved incidents (default: 86400 = 24h)
|
|
606
|
+
TRUTHOUND_ESCALATION_FALLBACK_ENABLED: Enable fallback to InMemory (default: true)
|
|
607
|
+
TRUTHOUND_ESCALATION_PUBSUB_ENABLED: Enable Pub/Sub notifications (default: true)
|
|
608
|
+
|
|
609
|
+
Example:
|
|
610
|
+
# Basic usage
|
|
611
|
+
store = RedisEscalationStore()
|
|
612
|
+
|
|
613
|
+
# Custom configuration
|
|
614
|
+
store = RedisEscalationStore(
|
|
615
|
+
redis_url="redis://myredis:6379/1",
|
|
616
|
+
max_connections=20,
|
|
617
|
+
resolved_ttl=3600, # 1 hour
|
|
618
|
+
enable_fallback=True,
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
# With context manager
|
|
622
|
+
async with RedisEscalationStore() as store:
|
|
623
|
+
policy_id = await store.save_policy_async(policy)
|
|
624
|
+
incident_id = await store.save_incident_async(incident)
|
|
625
|
+
|
|
626
|
+
Note: Requires the 'redis' optional dependency.
|
|
627
|
+
Install with: pip install truthound-dashboard[redis]
|
|
628
|
+
"""
|
|
629
|
+
|
|
630
|
+
# Redis key patterns
|
|
631
|
+
KEY_POLICY = "policy:{policy_id}"
|
|
632
|
+
KEY_POLICY_INDEX = "policies:all"
|
|
633
|
+
KEY_POLICY_BY_NAME = "policies:name:{name}"
|
|
634
|
+
KEY_POLICY_ACTIVE = "policies:active"
|
|
635
|
+
|
|
636
|
+
KEY_INCIDENT = "incident:{incident_id}"
|
|
637
|
+
KEY_INCIDENT_INDEX = "incidents:all"
|
|
638
|
+
KEY_INCIDENT_BY_REF = "incidents:ref:{incident_ref}"
|
|
639
|
+
KEY_INCIDENT_BY_POLICY = "incidents:policy:{policy_id}"
|
|
640
|
+
KEY_INCIDENT_BY_STATE = "incidents:state:{state}"
|
|
641
|
+
KEY_INCIDENT_BY_CREATED = "incidents:created" # Sorted set
|
|
642
|
+
KEY_INCIDENT_PENDING = "incidents:pending_escalation" # Sorted set by next_escalation_at
|
|
643
|
+
|
|
644
|
+
# Pub/Sub channels
|
|
645
|
+
CHANNEL_INCIDENT_UPDATE = "escalation:incidents:updates"
|
|
646
|
+
CHANNEL_POLICY_UPDATE = "escalation:policies:updates"
|
|
647
|
+
|
|
648
|
+
# Lua script for atomic incident state transition
|
|
649
|
+
LUA_STATE_TRANSITION = """
|
|
650
|
+
local incident_key = KEYS[1]
|
|
651
|
+
local old_state_key = KEYS[2]
|
|
652
|
+
local new_state_key = KEYS[3]
|
|
653
|
+
local pending_key = KEYS[4]
|
|
654
|
+
local incident_id = ARGV[1]
|
|
655
|
+
local new_state = ARGV[2]
|
|
656
|
+
local updated_data = ARGV[3]
|
|
657
|
+
local next_escalation_score = ARGV[4]
|
|
658
|
+
|
|
659
|
+
-- Get current incident
|
|
660
|
+
local current = redis.call('GET', incident_key)
|
|
661
|
+
if not current then
|
|
662
|
+
return {err = 'incident_not_found'}
|
|
663
|
+
end
|
|
664
|
+
|
|
665
|
+
-- Update incident data
|
|
666
|
+
redis.call('SET', incident_key, updated_data)
|
|
667
|
+
|
|
668
|
+
-- Update state indices
|
|
669
|
+
redis.call('SREM', old_state_key, incident_id)
|
|
670
|
+
redis.call('SADD', new_state_key, incident_id)
|
|
671
|
+
|
|
672
|
+
-- Update pending escalation sorted set
|
|
673
|
+
if new_state == 'resolved' or new_state == 'acknowledged' then
|
|
674
|
+
redis.call('ZREM', pending_key, incident_id)
|
|
675
|
+
elseif next_escalation_score ~= '' then
|
|
676
|
+
redis.call('ZADD', pending_key, next_escalation_score, incident_id)
|
|
677
|
+
end
|
|
678
|
+
|
|
679
|
+
return 'OK'
|
|
680
|
+
"""
|
|
681
|
+
|
|
682
|
+
def __init__(
|
|
683
|
+
self,
|
|
684
|
+
redis_url: str | None = None,
|
|
685
|
+
key_prefix: str | None = None,
|
|
686
|
+
max_connections: int | None = None,
|
|
687
|
+
socket_timeout: float | None = None,
|
|
688
|
+
socket_connect_timeout: float | None = None,
|
|
689
|
+
max_retries: int | None = None,
|
|
690
|
+
retry_base_delay: float | None = None,
|
|
691
|
+
resolved_ttl: int | None = None,
|
|
692
|
+
enable_fallback: bool | None = None,
|
|
693
|
+
enable_pubsub: bool | None = None,
|
|
694
|
+
logger: Any | None = None,
|
|
695
|
+
) -> None:
|
|
696
|
+
"""Initialize Redis escalation store.
|
|
697
|
+
|
|
698
|
+
All parameters can be configured via environment variables if not
|
|
699
|
+
explicitly provided.
|
|
700
|
+
|
|
701
|
+
Args:
|
|
702
|
+
redis_url: Redis connection URL.
|
|
703
|
+
key_prefix: Prefix for all Redis keys.
|
|
704
|
+
max_connections: Maximum connections in the pool.
|
|
705
|
+
socket_timeout: Socket timeout in seconds.
|
|
706
|
+
socket_connect_timeout: Connection timeout in seconds.
|
|
707
|
+
max_retries: Maximum retry attempts for reconnection.
|
|
708
|
+
retry_base_delay: Base delay for exponential backoff.
|
|
709
|
+
resolved_ttl: TTL in seconds for resolved/completed incidents.
|
|
710
|
+
enable_fallback: Enable fallback to InMemory on Redis failure.
|
|
711
|
+
enable_pubsub: Enable Pub/Sub notifications for state changes.
|
|
712
|
+
logger: Custom logger instance.
|
|
713
|
+
|
|
714
|
+
Raises:
|
|
715
|
+
ImportError: If redis package is not installed.
|
|
716
|
+
"""
|
|
717
|
+
if not REDIS_AVAILABLE:
|
|
718
|
+
raise ImportError(
|
|
719
|
+
"Redis support requires the 'redis' package. "
|
|
720
|
+
"Install with: pip install truthound-dashboard[redis] "
|
|
721
|
+
"or pip install redis"
|
|
722
|
+
)
|
|
723
|
+
|
|
724
|
+
# Configuration from environment or parameters
|
|
725
|
+
self.redis_url = redis_url or os.getenv(
|
|
726
|
+
"TRUTHOUND_ESCALATION_REDIS_URL", "redis://localhost:6379/0"
|
|
727
|
+
)
|
|
728
|
+
self.key_prefix = key_prefix or os.getenv(
|
|
729
|
+
"TRUTHOUND_ESCALATION_REDIS_PREFIX", "truthound:escalation:"
|
|
730
|
+
)
|
|
731
|
+
self.max_connections = max_connections or int(
|
|
732
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_POOL_SIZE", "10")
|
|
733
|
+
)
|
|
734
|
+
self.socket_timeout = socket_timeout or float(
|
|
735
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_SOCKET_TIMEOUT", "5.0")
|
|
736
|
+
)
|
|
737
|
+
self.socket_connect_timeout = socket_connect_timeout or float(
|
|
738
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_CONNECT_TIMEOUT", "5.0")
|
|
739
|
+
)
|
|
740
|
+
self.max_retries = max_retries or int(
|
|
741
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_MAX_RETRIES", "3")
|
|
742
|
+
)
|
|
743
|
+
self.retry_base_delay = retry_base_delay or float(
|
|
744
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_RETRY_BASE_DELAY", "1.0")
|
|
745
|
+
)
|
|
746
|
+
self.resolved_ttl = resolved_ttl or int(
|
|
747
|
+
os.getenv("TRUTHOUND_ESCALATION_REDIS_RESOLVED_TTL", "86400")
|
|
748
|
+
)
|
|
749
|
+
|
|
750
|
+
fallback_env = os.getenv("TRUTHOUND_ESCALATION_FALLBACK_ENABLED", "true")
|
|
751
|
+
self.enable_fallback = (
|
|
752
|
+
enable_fallback
|
|
753
|
+
if enable_fallback is not None
|
|
754
|
+
else fallback_env.lower() == "true"
|
|
755
|
+
)
|
|
756
|
+
|
|
757
|
+
pubsub_env = os.getenv("TRUTHOUND_ESCALATION_PUBSUB_ENABLED", "true")
|
|
758
|
+
self.enable_pubsub = (
|
|
759
|
+
enable_pubsub
|
|
760
|
+
if enable_pubsub is not None
|
|
761
|
+
else pubsub_env.lower() == "true"
|
|
762
|
+
)
|
|
763
|
+
|
|
764
|
+
# Logger setup
|
|
765
|
+
self._logger = logger or logging.getLogger(__name__)
|
|
766
|
+
|
|
767
|
+
# Connection pool for sync client
|
|
768
|
+
self._pool: "redis.ConnectionPool | None" = None
|
|
769
|
+
self._client: "redis.Redis | None" = None
|
|
770
|
+
|
|
771
|
+
# Connection pool for async client
|
|
772
|
+
self._async_pool: "redis.asyncio.ConnectionPool | None" = None
|
|
773
|
+
self._async_client: "redis.asyncio.Redis | None" = None
|
|
774
|
+
|
|
775
|
+
# Locks for thread-safe initialization
|
|
776
|
+
self._lock = threading.Lock()
|
|
777
|
+
self._async_lock: Any = None # Created lazily for asyncio
|
|
778
|
+
|
|
779
|
+
# Fallback store for graceful degradation
|
|
780
|
+
self._fallback_store: InMemoryEscalationStore | None = None
|
|
781
|
+
self._using_fallback = False
|
|
782
|
+
|
|
783
|
+
# Connection state tracking
|
|
784
|
+
self._connected = False
|
|
785
|
+
self._retry_count = 0
|
|
786
|
+
self._last_error: Exception | None = None
|
|
787
|
+
self._last_error_time: float | None = None
|
|
788
|
+
|
|
789
|
+
# Metrics
|
|
790
|
+
self._metrics = EscalationMetrics()
|
|
791
|
+
|
|
792
|
+
# Lua script SHA (registered on first use)
|
|
793
|
+
self._state_transition_sha: str | None = None
|
|
794
|
+
|
|
795
|
+
def _get_key(self, pattern: str, **kwargs: str) -> str:
|
|
796
|
+
"""Get full Redis key from pattern.
|
|
797
|
+
|
|
798
|
+
Args:
|
|
799
|
+
pattern: Key pattern with placeholders.
|
|
800
|
+
**kwargs: Values to substitute in pattern.
|
|
801
|
+
|
|
802
|
+
Returns:
|
|
803
|
+
Full Redis key with prefix.
|
|
804
|
+
"""
|
|
805
|
+
key = pattern.format(**kwargs) if kwargs else pattern
|
|
806
|
+
return f"{self.key_prefix}{key}"
|
|
807
|
+
|
|
808
|
+
def _create_pool(self) -> "redis.ConnectionPool":
|
|
809
|
+
"""Create a connection pool for sync client.
|
|
810
|
+
|
|
811
|
+
Returns:
|
|
812
|
+
Configured connection pool.
|
|
813
|
+
"""
|
|
814
|
+
return redis.ConnectionPool.from_url(
|
|
815
|
+
self.redis_url,
|
|
816
|
+
max_connections=self.max_connections,
|
|
817
|
+
socket_timeout=self.socket_timeout,
|
|
818
|
+
socket_connect_timeout=self.socket_connect_timeout,
|
|
819
|
+
retry_on_timeout=True,
|
|
820
|
+
decode_responses=True,
|
|
821
|
+
)
|
|
822
|
+
|
|
823
|
+
async def _create_async_pool(self) -> "redis.asyncio.ConnectionPool":
|
|
824
|
+
"""Create a connection pool for async client.
|
|
825
|
+
|
|
826
|
+
Returns:
|
|
827
|
+
Configured async connection pool.
|
|
828
|
+
"""
|
|
829
|
+
return redis.asyncio.ConnectionPool.from_url(
|
|
830
|
+
self.redis_url,
|
|
831
|
+
max_connections=self.max_connections,
|
|
832
|
+
socket_timeout=self.socket_timeout,
|
|
833
|
+
socket_connect_timeout=self.socket_connect_timeout,
|
|
834
|
+
retry_on_timeout=True,
|
|
835
|
+
decode_responses=True,
|
|
836
|
+
)
|
|
837
|
+
|
|
838
|
+
def _get_fallback_store(self) -> InMemoryEscalationStore:
|
|
839
|
+
"""Get or create fallback in-memory store.
|
|
840
|
+
|
|
841
|
+
Returns:
|
|
842
|
+
InMemoryEscalationStore instance.
|
|
843
|
+
"""
|
|
844
|
+
if self._fallback_store is None:
|
|
845
|
+
self._fallback_store = InMemoryEscalationStore()
|
|
846
|
+
return self._fallback_store
|
|
847
|
+
|
|
848
|
+
def _calculate_backoff_delay(self) -> float:
|
|
849
|
+
"""Calculate exponential backoff delay.
|
|
850
|
+
|
|
851
|
+
Returns:
|
|
852
|
+
Delay in seconds.
|
|
853
|
+
"""
|
|
854
|
+
# Exponential backoff with jitter
|
|
855
|
+
delay = self.retry_base_delay * (2**self._retry_count)
|
|
856
|
+
# Add jitter (up to 25% of delay)
|
|
857
|
+
jitter = delay * random.uniform(0, 0.25)
|
|
858
|
+
return min(delay + jitter, 60.0) # Cap at 60 seconds
|
|
859
|
+
|
|
860
|
+
def _handle_redis_error(self, error: Exception, operation: str) -> None:
|
|
861
|
+
"""Handle Redis errors with logging and metrics.
|
|
862
|
+
|
|
863
|
+
Args:
|
|
864
|
+
error: The exception that occurred.
|
|
865
|
+
operation: Name of the operation that failed.
|
|
866
|
+
"""
|
|
867
|
+
self._metrics.errors += 1
|
|
868
|
+
self._last_error = error
|
|
869
|
+
self._last_error_time = time.time()
|
|
870
|
+
self._connected = False
|
|
871
|
+
|
|
872
|
+
self._logger.error(
|
|
873
|
+
f"Redis error during {operation}: {error}",
|
|
874
|
+
extra={
|
|
875
|
+
"operation": operation,
|
|
876
|
+
"error_type": type(error).__name__,
|
|
877
|
+
"retry_count": self._retry_count,
|
|
878
|
+
},
|
|
879
|
+
)
|
|
880
|
+
|
|
881
|
+
def _try_reconnect_sync(self) -> bool:
|
|
882
|
+
"""Attempt to reconnect to Redis synchronously.
|
|
883
|
+
|
|
884
|
+
Returns:
|
|
885
|
+
True if reconnection successful, False otherwise.
|
|
886
|
+
"""
|
|
887
|
+
if self._retry_count >= self.max_retries:
|
|
888
|
+
self._logger.warning(
|
|
889
|
+
f"Max retries ({self.max_retries}) reached, using fallback"
|
|
890
|
+
)
|
|
891
|
+
return False
|
|
892
|
+
|
|
893
|
+
delay = self._calculate_backoff_delay()
|
|
894
|
+
self._logger.info(
|
|
895
|
+
f"Attempting Redis reconnection in {delay:.2f}s "
|
|
896
|
+
f"(attempt {self._retry_count + 1}/{self.max_retries})"
|
|
897
|
+
)
|
|
898
|
+
|
|
899
|
+
time.sleep(delay)
|
|
900
|
+
self._retry_count += 1
|
|
901
|
+
|
|
902
|
+
try:
|
|
903
|
+
# Close existing connections
|
|
904
|
+
if self._client:
|
|
905
|
+
try:
|
|
906
|
+
self._client.close()
|
|
907
|
+
except Exception:
|
|
908
|
+
pass
|
|
909
|
+
self._client = None
|
|
910
|
+
|
|
911
|
+
if self._pool:
|
|
912
|
+
try:
|
|
913
|
+
self._pool.disconnect()
|
|
914
|
+
except Exception:
|
|
915
|
+
pass
|
|
916
|
+
self._pool = None
|
|
917
|
+
|
|
918
|
+
# Create new connection
|
|
919
|
+
self._pool = self._create_pool()
|
|
920
|
+
self._client = redis.Redis(connection_pool=self._pool)
|
|
921
|
+
|
|
922
|
+
# Test connection
|
|
923
|
+
if self._client.ping():
|
|
924
|
+
self._connected = True
|
|
925
|
+
self._retry_count = 0
|
|
926
|
+
self._using_fallback = False
|
|
927
|
+
self._metrics.reconnections += 1
|
|
928
|
+
self._logger.info("Redis reconnection successful")
|
|
929
|
+
return True
|
|
930
|
+
except Exception as e:
|
|
931
|
+
self._logger.warning(f"Reconnection attempt failed: {e}")
|
|
932
|
+
|
|
933
|
+
return False
|
|
934
|
+
|
|
935
|
+
async def _try_reconnect_async(self) -> bool:
|
|
936
|
+
"""Attempt to reconnect to Redis asynchronously.
|
|
937
|
+
|
|
938
|
+
Returns:
|
|
939
|
+
True if reconnection successful, False otherwise.
|
|
940
|
+
"""
|
|
941
|
+
import asyncio
|
|
942
|
+
|
|
943
|
+
if self._retry_count >= self.max_retries:
|
|
944
|
+
self._logger.warning(
|
|
945
|
+
f"Max retries ({self.max_retries}) reached, using fallback"
|
|
946
|
+
)
|
|
947
|
+
return False
|
|
948
|
+
|
|
949
|
+
delay = self._calculate_backoff_delay()
|
|
950
|
+
self._logger.info(
|
|
951
|
+
f"Attempting async Redis reconnection in {delay:.2f}s "
|
|
952
|
+
f"(attempt {self._retry_count + 1}/{self.max_retries})"
|
|
953
|
+
)
|
|
954
|
+
|
|
955
|
+
await asyncio.sleep(delay)
|
|
956
|
+
self._retry_count += 1
|
|
957
|
+
|
|
958
|
+
try:
|
|
959
|
+
# Close existing connections
|
|
960
|
+
if self._async_client:
|
|
961
|
+
try:
|
|
962
|
+
await self._async_client.close()
|
|
963
|
+
except Exception:
|
|
964
|
+
pass
|
|
965
|
+
self._async_client = None
|
|
966
|
+
|
|
967
|
+
if self._async_pool:
|
|
968
|
+
try:
|
|
969
|
+
await self._async_pool.disconnect()
|
|
970
|
+
except Exception:
|
|
971
|
+
pass
|
|
972
|
+
self._async_pool = None
|
|
973
|
+
|
|
974
|
+
# Create new connection
|
|
975
|
+
self._async_pool = await self._create_async_pool()
|
|
976
|
+
self._async_client = redis.asyncio.Redis(connection_pool=self._async_pool)
|
|
977
|
+
|
|
978
|
+
# Test connection
|
|
979
|
+
if await self._async_client.ping():
|
|
980
|
+
self._connected = True
|
|
981
|
+
self._retry_count = 0
|
|
982
|
+
self._using_fallback = False
|
|
983
|
+
self._metrics.reconnections += 1
|
|
984
|
+
self._logger.info("Async Redis reconnection successful")
|
|
985
|
+
return True
|
|
986
|
+
except Exception as e:
|
|
987
|
+
self._logger.warning(f"Async reconnection attempt failed: {e}")
|
|
988
|
+
|
|
989
|
+
return False
|
|
990
|
+
|
|
991
|
+
@property
|
|
992
|
+
def client(self) -> "redis.Redis":
|
|
993
|
+
"""Get sync Redis client with connection pooling.
|
|
994
|
+
|
|
995
|
+
Creates the connection pool and client on first access.
|
|
996
|
+
Handles reconnection on failure.
|
|
997
|
+
|
|
998
|
+
Returns:
|
|
999
|
+
Redis client instance.
|
|
1000
|
+
"""
|
|
1001
|
+
if self._client is None or not self._connected:
|
|
1002
|
+
with self._lock:
|
|
1003
|
+
if self._client is None or not self._connected:
|
|
1004
|
+
try:
|
|
1005
|
+
self._pool = self._create_pool()
|
|
1006
|
+
self._client = redis.Redis(connection_pool=self._pool)
|
|
1007
|
+
# Test connection
|
|
1008
|
+
self._client.ping()
|
|
1009
|
+
self._connected = True
|
|
1010
|
+
self._retry_count = 0
|
|
1011
|
+
self._logger.debug("Redis sync client connected")
|
|
1012
|
+
except Exception as e:
|
|
1013
|
+
self._handle_redis_error(e, "client_init")
|
|
1014
|
+
raise
|
|
1015
|
+
return self._client
|
|
1016
|
+
|
|
1017
|
+
async def get_async_client(self) -> "redis.asyncio.Redis":
|
|
1018
|
+
"""Get async Redis client with connection pooling.
|
|
1019
|
+
|
|
1020
|
+
Creates the async connection pool and client on first access.
|
|
1021
|
+
|
|
1022
|
+
Returns:
|
|
1023
|
+
Async Redis client instance.
|
|
1024
|
+
"""
|
|
1025
|
+
import asyncio
|
|
1026
|
+
|
|
1027
|
+
if self._async_lock is None:
|
|
1028
|
+
self._async_lock = asyncio.Lock()
|
|
1029
|
+
|
|
1030
|
+
if self._async_client is None or not self._connected:
|
|
1031
|
+
async with self._async_lock:
|
|
1032
|
+
if self._async_client is None or not self._connected:
|
|
1033
|
+
try:
|
|
1034
|
+
self._async_pool = await self._create_async_pool()
|
|
1035
|
+
self._async_client = redis.asyncio.Redis(
|
|
1036
|
+
connection_pool=self._async_pool
|
|
1037
|
+
)
|
|
1038
|
+
# Test connection
|
|
1039
|
+
await self._async_client.ping()
|
|
1040
|
+
self._connected = True
|
|
1041
|
+
self._retry_count = 0
|
|
1042
|
+
self._logger.debug("Redis async client connected")
|
|
1043
|
+
except Exception as e:
|
|
1044
|
+
self._handle_redis_error(e, "async_client_init")
|
|
1045
|
+
raise
|
|
1046
|
+
return self._async_client
|
|
1047
|
+
|
|
1048
|
+
def _register_lua_scripts(self, client: "redis.Redis") -> None:
|
|
1049
|
+
"""Register Lua scripts with Redis.
|
|
1050
|
+
|
|
1051
|
+
Args:
|
|
1052
|
+
client: Redis client instance.
|
|
1053
|
+
"""
|
|
1054
|
+
if self._state_transition_sha is None:
|
|
1055
|
+
self._state_transition_sha = client.script_load(self.LUA_STATE_TRANSITION)
|
|
1056
|
+
|
|
1057
|
+
async def _register_lua_scripts_async(
|
|
1058
|
+
self, client: "redis.asyncio.Redis"
|
|
1059
|
+
) -> None:
|
|
1060
|
+
"""Register Lua scripts with Redis asynchronously.
|
|
1061
|
+
|
|
1062
|
+
Args:
|
|
1063
|
+
client: Async Redis client instance.
|
|
1064
|
+
"""
|
|
1065
|
+
if self._state_transition_sha is None:
|
|
1066
|
+
self._state_transition_sha = await client.script_load(
|
|
1067
|
+
self.LUA_STATE_TRANSITION
|
|
1068
|
+
)
|
|
1069
|
+
|
|
1070
|
+
def _serialize_policy(self, policy: EscalationPolicy) -> str:
|
|
1071
|
+
"""Serialize policy to JSON string.
|
|
1072
|
+
|
|
1073
|
+
Args:
|
|
1074
|
+
policy: Policy to serialize.
|
|
1075
|
+
|
|
1076
|
+
Returns:
|
|
1077
|
+
JSON string.
|
|
1078
|
+
"""
|
|
1079
|
+
return json.dumps(policy.to_dict())
|
|
1080
|
+
|
|
1081
|
+
def _deserialize_policy(self, data: str) -> EscalationPolicy:
|
|
1082
|
+
"""Deserialize policy from JSON string.
|
|
1083
|
+
|
|
1084
|
+
Args:
|
|
1085
|
+
data: JSON string.
|
|
1086
|
+
|
|
1087
|
+
Returns:
|
|
1088
|
+
EscalationPolicy instance.
|
|
1089
|
+
"""
|
|
1090
|
+
return EscalationPolicy.from_dict(json.loads(data))
|
|
1091
|
+
|
|
1092
|
+
def _serialize_incident(self, incident: EscalationIncident) -> str:
|
|
1093
|
+
"""Serialize incident to JSON string.
|
|
1094
|
+
|
|
1095
|
+
Args:
|
|
1096
|
+
incident: Incident to serialize.
|
|
1097
|
+
|
|
1098
|
+
Returns:
|
|
1099
|
+
JSON string.
|
|
1100
|
+
"""
|
|
1101
|
+
return json.dumps(incident.to_dict())
|
|
1102
|
+
|
|
1103
|
+
def _deserialize_incident(self, data: str) -> EscalationIncident:
|
|
1104
|
+
"""Deserialize incident from JSON string.
|
|
1105
|
+
|
|
1106
|
+
Args:
|
|
1107
|
+
data: JSON string.
|
|
1108
|
+
|
|
1109
|
+
Returns:
|
|
1110
|
+
EscalationIncident instance.
|
|
1111
|
+
"""
|
|
1112
|
+
return EscalationIncident.from_dict(json.loads(data))
|
|
1113
|
+
|
|
1114
|
+
def _publish_incident_update(
|
|
1115
|
+
self,
|
|
1116
|
+
client: "redis.Redis",
|
|
1117
|
+
incident: EscalationIncident,
|
|
1118
|
+
event_type: str,
|
|
1119
|
+
) -> None:
|
|
1120
|
+
"""Publish incident update via Pub/Sub.
|
|
1121
|
+
|
|
1122
|
+
Args:
|
|
1123
|
+
client: Redis client.
|
|
1124
|
+
incident: Updated incident.
|
|
1125
|
+
event_type: Type of event (created, updated, state_changed, etc.).
|
|
1126
|
+
"""
|
|
1127
|
+
if not self.enable_pubsub:
|
|
1128
|
+
return
|
|
1129
|
+
|
|
1130
|
+
try:
|
|
1131
|
+
message = json.dumps({
|
|
1132
|
+
"event_type": event_type,
|
|
1133
|
+
"incident_id": incident.id,
|
|
1134
|
+
"incident_ref": incident.incident_ref,
|
|
1135
|
+
"policy_id": incident.policy_id,
|
|
1136
|
+
"state": incident.state.value,
|
|
1137
|
+
"current_level": incident.current_level,
|
|
1138
|
+
"timestamp": datetime.utcnow().isoformat(),
|
|
1139
|
+
})
|
|
1140
|
+
channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
|
|
1141
|
+
client.publish(channel, message)
|
|
1142
|
+
self._metrics.pubsub_publishes += 1
|
|
1143
|
+
except Exception as e:
|
|
1144
|
+
self._logger.warning(f"Failed to publish incident update: {e}")
|
|
1145
|
+
|
|
1146
|
+
async def _publish_incident_update_async(
|
|
1147
|
+
self,
|
|
1148
|
+
client: "redis.asyncio.Redis",
|
|
1149
|
+
incident: EscalationIncident,
|
|
1150
|
+
event_type: str,
|
|
1151
|
+
) -> None:
|
|
1152
|
+
"""Publish incident update via Pub/Sub asynchronously.
|
|
1153
|
+
|
|
1154
|
+
Args:
|
|
1155
|
+
client: Async Redis client.
|
|
1156
|
+
incident: Updated incident.
|
|
1157
|
+
event_type: Type of event.
|
|
1158
|
+
"""
|
|
1159
|
+
if not self.enable_pubsub:
|
|
1160
|
+
return
|
|
1161
|
+
|
|
1162
|
+
try:
|
|
1163
|
+
message = json.dumps({
|
|
1164
|
+
"event_type": event_type,
|
|
1165
|
+
"incident_id": incident.id,
|
|
1166
|
+
"incident_ref": incident.incident_ref,
|
|
1167
|
+
"policy_id": incident.policy_id,
|
|
1168
|
+
"state": incident.state.value,
|
|
1169
|
+
"current_level": incident.current_level,
|
|
1170
|
+
"timestamp": datetime.utcnow().isoformat(),
|
|
1171
|
+
})
|
|
1172
|
+
channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
|
|
1173
|
+
await client.publish(channel, message)
|
|
1174
|
+
self._metrics.pubsub_publishes += 1
|
|
1175
|
+
except Exception as e:
|
|
1176
|
+
self._logger.warning(f"Failed to publish incident update: {e}")

    # =========================================================================
    # Policy Operations
    # =========================================================================

    def save_policy(self, policy: EscalationPolicy) -> str:
        """Save or update a policy.

        Args:
            policy: Policy to save.

        Returns:
            Policy ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().save_policy(policy)

        try:
            client = self.client

            # Generate ID if not present
            if not policy.id:
                policy.id = str(uuid.uuid4())

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store policy
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy.id)
            pipe.set(policy_key, self._serialize_policy(policy))

            # Update indices
            index_key = self._get_key(self.KEY_POLICY_INDEX)
            pipe.sadd(index_key, policy.id)

            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
            pipe.set(name_key, policy.id)

            active_key = self._get_key(self.KEY_POLICY_ACTIVE)
            if policy.is_active:
                pipe.sadd(active_key, policy.id)
            else:
                pipe.srem(active_key, policy.id)

            pipe.execute()

            self._metrics.policy_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return policy.id

        except Exception as e:
            self._handle_redis_error(e, "save_policy")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                self._logger.warning("Falling back to InMemory store")
                return self._get_fallback_store().save_policy(policy)

            raise
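
    # Usage sketch (illustrative, not part of the package): a save/read round
    # trip, assuming the constructor accepts `redis_url` and the policy model
    # takes `name`/`is_active` keyword arguments:
    #
    #     store = RedisEscalationStore(redis_url="redis://localhost:6379/0")
    #     policy_id = store.save_policy(EscalationPolicy(name="sev1", is_active=True))
    #     assert store.get_policy(policy_id).name == "sev1"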

    async def save_policy_async(self, policy: EscalationPolicy) -> str:
        """Save or update a policy asynchronously.

        Args:
            policy: Policy to save.

        Returns:
            Policy ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().save_policy(policy)

        try:
            client = await self.get_async_client()

            # Generate ID if not present
            if not policy.id:
                policy.id = str(uuid.uuid4())

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store policy
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy.id)
            pipe.set(policy_key, self._serialize_policy(policy))

            # Update indices
            index_key = self._get_key(self.KEY_POLICY_INDEX)
            pipe.sadd(index_key, policy.id)

            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
            pipe.set(name_key, policy.id)

            active_key = self._get_key(self.KEY_POLICY_ACTIVE)
            if policy.is_active:
                pipe.sadd(active_key, policy.id)
            else:
                pipe.srem(active_key, policy.id)

            await pipe.execute()

            self._metrics.policy_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return policy.id

        except Exception as e:
            self._handle_redis_error(e, "save_policy_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().save_policy(policy)

            raise

    def get_policy(self, policy_id: str) -> EscalationPolicy | None:
        """Get policy by ID.

        Args:
            policy_id: Policy ID.

        Returns:
            Policy if found, None otherwise.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_policy(policy_id)

        try:
            client = self.client
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
            data = client.get(policy_key)

            self._metrics.policy_gets += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            if not data:
                return None

            return self._deserialize_policy(data)

        except Exception as e:
            self._handle_redis_error(e, "get_policy")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_policy(policy_id)

            raise

    async def get_policy_async(self, policy_id: str) -> EscalationPolicy | None:
        """Get policy by ID asynchronously.

        Args:
            policy_id: Policy ID.

        Returns:
            Policy if found, None otherwise.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_policy(policy_id)

        try:
            client = await self.get_async_client()
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
            data = await client.get(policy_key)

            self._metrics.policy_gets += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            if not data:
                return None

            return self._deserialize_policy(data)

        except Exception as e:
            self._handle_redis_error(e, "get_policy_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_policy(policy_id)

            raise

    def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
        """Get policy by name.

        Args:
            name: Policy name.

        Returns:
            Policy if found, None otherwise.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_policy_by_name(name)

        try:
            client = self.client
            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=name)
            policy_id = client.get(name_key)

            if not policy_id:
                return None

            return self.get_policy(policy_id)

        except Exception as e:
            self._handle_redis_error(e, "get_policy_by_name")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_policy_by_name(name)

            raise

    async def get_policy_by_name_async(self, name: str) -> EscalationPolicy | None:
        """Get policy by name asynchronously.

        Args:
            name: Policy name.

        Returns:
            Policy if found, None otherwise.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_policy_by_name(name)

        try:
            client = await self.get_async_client()
            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=name)
            policy_id = await client.get(name_key)

            if not policy_id:
                return None

            return await self.get_policy_async(policy_id)

        except Exception as e:
            self._handle_redis_error(e, "get_policy_by_name_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_policy_by_name(name)

            raise

    def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
        """List all policies.

        Args:
            active_only: If True, only return active policies.

        Returns:
            List of policies.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().list_policies(active_only)

        try:
            client = self.client

            if active_only:
                index_key = self._get_key(self.KEY_POLICY_ACTIVE)
            else:
                index_key = self._get_key(self.KEY_POLICY_INDEX)

            policy_ids = client.smembers(index_key)
            policies = []

            for policy_id in policy_ids:
                policy = self.get_policy(policy_id)
                if policy:
                    policies.append(policy)

            return policies

        except Exception as e:
            self._handle_redis_error(e, "list_policies")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().list_policies(active_only)

            raise

    async def list_policies_async(
        self, active_only: bool = True
    ) -> list[EscalationPolicy]:
        """List all policies asynchronously.

        Args:
            active_only: If True, only return active policies.

        Returns:
            List of policies.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().list_policies(active_only)

        try:
            client = await self.get_async_client()

            if active_only:
                index_key = self._get_key(self.KEY_POLICY_ACTIVE)
            else:
                index_key = self._get_key(self.KEY_POLICY_INDEX)

            policy_ids = await client.smembers(index_key)
            policies = []

            for policy_id in policy_ids:
                policy = await self.get_policy_async(policy_id)
                if policy:
                    policies.append(policy)

            return policies

        except Exception as e:
            self._handle_redis_error(e, "list_policies_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().list_policies(active_only)

            raise
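
    # Key layout sketch (illustrative, not part of the package): the policy
    # methods above keep one blob plus three lookup structures in sync. The
    # exact templates live in the KEY_* constants; the shapes below are
    # assumptions:
    #
    #     <prefix>:policy:<id>         string -> serialized policy
    #     <prefix>:policy:index        set    -> all policy IDs
    #     <prefix>:policy:by_name:<n>  string -> policy ID for a name
    #     <prefix>:policy:active       set    -> IDs of active policies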

    def delete_policy(self, policy_id: str) -> bool:
        """Delete a policy.

        Args:
            policy_id: Policy ID to delete.

        Returns:
            True if deleted, False if not found.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().delete_policy(policy_id)

        try:
            client = self.client

            # Get policy first to get name for index cleanup
            policy = self.get_policy(policy_id)
            if not policy:
                return False

            pipe = client.pipeline()

            # Delete policy
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
            pipe.delete(policy_key)

            # Remove from indices
            index_key = self._get_key(self.KEY_POLICY_INDEX)
            pipe.srem(index_key, policy_id)

            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
            pipe.delete(name_key)

            active_key = self._get_key(self.KEY_POLICY_ACTIVE)
            pipe.srem(active_key, policy_id)

            pipe.execute()

            self._metrics.policy_deletes += 1
            return True

        except Exception as e:
            self._handle_redis_error(e, "delete_policy")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().delete_policy(policy_id)

            raise

    async def delete_policy_async(self, policy_id: str) -> bool:
        """Delete a policy asynchronously.

        Args:
            policy_id: Policy ID to delete.

        Returns:
            True if deleted, False if not found.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().delete_policy(policy_id)

        try:
            client = await self.get_async_client()

            # Get policy first to get name for index cleanup
            policy = await self.get_policy_async(policy_id)
            if not policy:
                return False

            pipe = client.pipeline()

            # Delete policy
            policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
            pipe.delete(policy_key)

            # Remove from indices
            index_key = self._get_key(self.KEY_POLICY_INDEX)
            pipe.srem(index_key, policy_id)

            name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
            pipe.delete(name_key)

            active_key = self._get_key(self.KEY_POLICY_ACTIVE)
            pipe.srem(active_key, policy_id)

            await pipe.execute()

            self._metrics.policy_deletes += 1
            return True

        except Exception as e:
            self._handle_redis_error(e, "delete_policy_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().delete_policy(policy_id)

            raise

    # =========================================================================
    # Incident Operations
    # =========================================================================

    def save_incident(self, incident: EscalationIncident) -> str:
        """Save or update an incident.

        Args:
            incident: Incident to save.

        Returns:
            Incident ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().save_incident(incident)

        try:
            client = self.client
            is_new = not incident.id

            # Generate ID if not present
            if not incident.id:
                incident.id = str(uuid.uuid4())

            incident.updated_at = datetime.utcnow()

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store incident
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
            pipe.set(incident_key, self._serialize_incident(incident))

            # Set TTL for resolved incidents
            if incident.state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                pipe.expire(incident_key, self.resolved_ttl)

            # Update indices
            index_key = self._get_key(self.KEY_INCIDENT_INDEX)
            pipe.sadd(index_key, incident.id)

            ref_key = self._get_key(
                self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
            )
            pipe.set(ref_key, incident.id)

            policy_key = self._get_key(
                self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
            )
            pipe.sadd(policy_key, incident.id)

            # Update state index (remove from other states first)
            for state in EscalationState:
                state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
                if state == incident.state:
                    pipe.sadd(state_key, incident.id)
                else:
                    pipe.srem(state_key, incident.id)

            # Update created_at sorted set
            created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
            created_score = incident.created_at.timestamp()
            pipe.zadd(created_key, {incident.id: created_score})

            # Update pending escalation sorted set
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            if incident.state in [EscalationState.TRIGGERED, EscalationState.ESCALATED]:
                if incident.next_escalation_at:
                    score = incident.next_escalation_at.timestamp()
                    pipe.zadd(pending_key, {incident.id: score})
                else:
                    pipe.zrem(pending_key, incident.id)
            else:
                pipe.zrem(pending_key, incident.id)

            pipe.execute()

            # Publish update
            event_type = "created" if is_new else "updated"
            self._publish_incident_update(client, incident, event_type)

            self._metrics.incident_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return incident.id

        except Exception as e:
            self._handle_redis_error(e, "save_incident")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                self._logger.warning("Falling back to InMemory store")
                return self._get_fallback_store().save_incident(incident)

            raise
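
    # Usage sketch (illustrative, not part of the package): save_incident
    # persists the blob and refreshes every secondary index in one pipeline,
    # so callers never touch the indices directly. Assuming the incident
    # model accepts these keyword arguments:
    #
    #     incident = EscalationIncident(
    #         incident_ref="alert-42",
    #         policy_id=policy_id,
    #         state=EscalationState.TRIGGERED,
    #     )
    #     incident_id = store.save_incident(incident)
    #     assert store.get_incident_by_ref("alert-42").id == incident_id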

    async def save_incident_async(self, incident: EscalationIncident) -> str:
        """Save or update an incident asynchronously.

        Args:
            incident: Incident to save.

        Returns:
            Incident ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().save_incident(incident)

        try:
            client = await self.get_async_client()
            is_new = not incident.id

            # Generate ID if not present
            if not incident.id:
                incident.id = str(uuid.uuid4())

            incident.updated_at = datetime.utcnow()

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store incident
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
            pipe.set(incident_key, self._serialize_incident(incident))

            # Set TTL for resolved incidents
            if incident.state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                pipe.expire(incident_key, self.resolved_ttl)

            # Update indices
            index_key = self._get_key(self.KEY_INCIDENT_INDEX)
            pipe.sadd(index_key, incident.id)

            ref_key = self._get_key(
                self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
            )
            pipe.set(ref_key, incident.id)

            policy_key = self._get_key(
                self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
            )
            pipe.sadd(policy_key, incident.id)

            # Update state index
            for state in EscalationState:
                state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
                if state == incident.state:
                    pipe.sadd(state_key, incident.id)
                else:
                    pipe.srem(state_key, incident.id)

            # Update created_at sorted set
            created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
            created_score = incident.created_at.timestamp()
            pipe.zadd(created_key, {incident.id: created_score})

            # Update pending escalation sorted set
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            if incident.state in [EscalationState.TRIGGERED, EscalationState.ESCALATED]:
                if incident.next_escalation_at:
                    score = incident.next_escalation_at.timestamp()
                    pipe.zadd(pending_key, {incident.id: score})
                else:
                    pipe.zrem(pending_key, incident.id)
            else:
                pipe.zrem(pending_key, incident.id)

            await pipe.execute()

            # Publish update
            event_type = "created" if is_new else "updated"
            await self._publish_incident_update_async(client, incident, event_type)

            self._metrics.incident_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return incident.id

        except Exception as e:
            self._handle_redis_error(e, "save_incident_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().save_incident(incident)

            raise

    def get_incident(self, incident_id: str) -> EscalationIncident | None:
        """Get incident by ID.

        Args:
            incident_id: Incident ID.

        Returns:
            Incident if found, None otherwise.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_incident(incident_id)

        try:
            client = self.client
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
            data = client.get(incident_key)

            self._metrics.incident_gets += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            if not data:
                return None

            return self._deserialize_incident(data)

        except Exception as e:
            self._handle_redis_error(e, "get_incident")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_incident(incident_id)

            raise

    async def get_incident_async(self, incident_id: str) -> EscalationIncident | None:
        """Get incident by ID asynchronously.

        Args:
            incident_id: Incident ID.

        Returns:
            Incident if found, None otherwise.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_incident(incident_id)

        try:
            client = await self.get_async_client()
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
            data = await client.get(incident_key)

            self._metrics.incident_gets += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            if not data:
                return None

            return self._deserialize_incident(data)

        except Exception as e:
            self._handle_redis_error(e, "get_incident_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_incident(incident_id)

            raise

    def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
        """Get incident by external reference.

        Args:
            incident_ref: External reference.

        Returns:
            Incident if found, None otherwise.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_incident_by_ref(incident_ref)

        try:
            client = self.client
            ref_key = self._get_key(self.KEY_INCIDENT_BY_REF, incident_ref=incident_ref)
            incident_id = client.get(ref_key)

            if not incident_id:
                return None

            return self.get_incident(incident_id)

        except Exception as e:
            self._handle_redis_error(e, "get_incident_by_ref")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_incident_by_ref(incident_ref)

            raise

    async def get_incident_by_ref_async(
        self, incident_ref: str
    ) -> EscalationIncident | None:
        """Get incident by external reference asynchronously.

        Args:
            incident_ref: External reference.

        Returns:
            Incident if found, None otherwise.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_incident_by_ref(incident_ref)

        try:
            client = await self.get_async_client()
            ref_key = self._get_key(self.KEY_INCIDENT_BY_REF, incident_ref=incident_ref)
            incident_id = await client.get(ref_key)

            if not incident_id:
                return None

            return await self.get_incident_async(incident_id)

        except Exception as e:
            self._handle_redis_error(e, "get_incident_by_ref_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_incident_by_ref(incident_ref)

            raise

    def list_incidents(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """List incidents with optional filters.

        Args:
            policy_id: Filter by policy ID.
            states: Filter by states.

        Returns:
            List of incidents.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().list_incidents(policy_id, states)

        try:
            client = self.client
            incident_ids: set[str] = set()

            # Get IDs based on filters
            if policy_id:
                policy_key = self._get_key(
                    self.KEY_INCIDENT_BY_POLICY, policy_id=policy_id
                )
                incident_ids = client.smembers(policy_key)
            elif states:
                # Get incidents from state indices and union
                for i, state in enumerate(states):
                    state_key = self._get_key(
                        self.KEY_INCIDENT_BY_STATE, state=state.value
                    )
                    state_ids = client.smembers(state_key)
                    if i == 0:
                        incident_ids = state_ids
                    else:
                        incident_ids = incident_ids.union(state_ids)
            else:
                index_key = self._get_key(self.KEY_INCIDENT_INDEX)
                incident_ids = client.smembers(index_key)

            # Fetch incidents
            incidents = []
            for incident_id in incident_ids:
                incident = self.get_incident(incident_id)
                if incident:
                    # Apply additional filters if needed
                    if states and incident.state not in states:
                        continue
                    if policy_id and incident.policy_id != policy_id:
                        continue
                    incidents.append(incident)

            return incidents

        except Exception as e:
            self._handle_redis_error(e, "list_incidents")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().list_incidents(policy_id, states)

            raise

    async def list_incidents_async(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """List incidents with optional filters asynchronously.

        Args:
            policy_id: Filter by policy ID.
            states: Filter by states.

        Returns:
            List of incidents.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().list_incidents(policy_id, states)

        try:
            client = await self.get_async_client()
            incident_ids: set[str] = set()

            # Get IDs based on filters
            if policy_id:
                policy_key = self._get_key(
                    self.KEY_INCIDENT_BY_POLICY, policy_id=policy_id
                )
                incident_ids = await client.smembers(policy_key)
            elif states:
                # Get incidents from state indices and union
                for i, state in enumerate(states):
                    state_key = self._get_key(
                        self.KEY_INCIDENT_BY_STATE, state=state.value
                    )
                    state_ids = await client.smembers(state_key)
                    if i == 0:
                        incident_ids = state_ids
                    else:
                        incident_ids = incident_ids.union(state_ids)
            else:
                index_key = self._get_key(self.KEY_INCIDENT_INDEX)
                incident_ids = await client.smembers(index_key)

            # Fetch incidents
            incidents = []
            for incident_id in incident_ids:
                incident = await self.get_incident_async(incident_id)
                if incident:
                    # Apply additional filters if needed
                    if states and incident.state not in states:
                        continue
                    if policy_id and incident.policy_id != policy_id:
                        continue
                    incidents.append(incident)

            return incidents

        except Exception as e:
            self._handle_redis_error(e, "list_incidents_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().list_incidents(policy_id, states)

            raise
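
    # Usage sketch (illustrative, not part of the package): state filters are
    # unioned, so one call covers every open incident:
    #
    #     open_incidents = store.list_incidents(
    #         states=[EscalationState.TRIGGERED, EscalationState.ESCALATED],
    #     )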

    def get_pending_escalations(self) -> list[EscalationIncident]:
        """Get incidents due for escalation.

        Returns:
            List of incidents due for escalation.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_pending_escalations()

        try:
            client = self.client
            now = datetime.utcnow().timestamp()

            # Get incident IDs from pending sorted set where score <= now
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            incident_ids = client.zrangebyscore(pending_key, "-inf", now)

            incidents = []
            for incident_id in incident_ids:
                incident = self.get_incident(incident_id)
                if incident and incident.state in [
                    EscalationState.TRIGGERED,
                    EscalationState.ESCALATED,
                ]:
                    incidents.append(incident)

            return incidents

        except Exception as e:
            self._handle_redis_error(e, "get_pending_escalations")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_pending_escalations()

            raise

    async def get_pending_escalations_async(self) -> list[EscalationIncident]:
        """Get incidents due for escalation asynchronously.

        Returns:
            List of incidents due for escalation.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().get_pending_escalations()

        try:
            client = await self.get_async_client()
            now = datetime.utcnow().timestamp()

            # Get incident IDs from pending sorted set where score <= now
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            incident_ids = await client.zrangebyscore(pending_key, "-inf", now)

            incidents = []
            for incident_id in incident_ids:
                incident = await self.get_incident_async(incident_id)
                if incident and incident.state in [
                    EscalationState.TRIGGERED,
                    EscalationState.ESCALATED,
                ]:
                    incidents.append(incident)

            return incidents

        except Exception as e:
            self._handle_redis_error(e, "get_pending_escalations_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().get_pending_escalations()

            raise
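
    # Usage sketch (illustrative, not part of the package): a scheduler would
    # typically poll this on an interval. Assuming an `escalate(incident)`
    # callable and a `poll_interval` in seconds, both defined elsewhere:
    #
    #     while True:
    #         for incident in store.get_pending_escalations():
    #             escalate(incident)
    #         time.sleep(poll_interval)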

    # =========================================================================
    # Atomic State Transition
    # =========================================================================

    def transition_state(
        self,
        incident_id: str,
        new_state: EscalationState,
        **updates: Any,
    ) -> EscalationIncident | None:
        """Atomically transition incident state using Lua script.

        This ensures that state transitions are atomic and consistent,
        even under concurrent access.

        Args:
            incident_id: Incident ID.
            new_state: New state to transition to.
            **updates: Additional fields to update on the incident.

        Returns:
            Updated incident if successful, None if not found.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            incident = self._get_fallback_store().get_incident(incident_id)
            if not incident:
                return None
            incident.state = new_state
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)
            self._get_fallback_store().save_incident(incident)
            return incident

        try:
            client = self.client
            self._register_lua_scripts(client)

            # Get current incident
            incident = self.get_incident(incident_id)
            if not incident:
                return None

            old_state = incident.state

            # Update incident
            incident.state = new_state
            incident.updated_at = datetime.utcnow()
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)

            # Prepare keys and args for Lua script
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
            old_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=old_state.value
            )
            new_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=new_state.value
            )
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)

            next_escalation_score = ""
            if incident.next_escalation_at and new_state in [
                EscalationState.TRIGGERED,
                EscalationState.ESCALATED,
            ]:
                next_escalation_score = str(incident.next_escalation_at.timestamp())

            # Execute Lua script
            result = client.evalsha(
                self._state_transition_sha,
                4,  # Number of keys
                incident_key,
                old_state_key,
                new_state_key,
                pending_key,
                incident_id,
                new_state.value,
                self._serialize_incident(incident),
                next_escalation_score,
            )

            if result == "OK":
                # Set TTL for resolved incidents
                if new_state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                    client.expire(incident_key, self.resolved_ttl)

                # Publish state change
                self._publish_incident_update(client, incident, "state_changed")

                self._metrics.state_transitions += 1
                latency_ms = (time.time() - start_time) * 1000
                self._metrics.record_latency(latency_ms)

                return incident

            return None

        except Exception as e:
            self._handle_redis_error(e, "transition_state")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                # Fallback to non-atomic operation
                incident = self._get_fallback_store().get_incident(incident_id)
                if incident:
                    incident.state = new_state
                    for key, value in updates.items():
                        if hasattr(incident, key):
                            setattr(incident, key, value)
                    self._get_fallback_store().save_incident(incident)
                    return incident

            raise

    async def transition_state_async(
        self,
        incident_id: str,
        new_state: EscalationState,
        **updates: Any,
    ) -> EscalationIncident | None:
        """Atomically transition incident state using Lua script asynchronously.

        Args:
            incident_id: Incident ID.
            new_state: New state to transition to.
            **updates: Additional fields to update on the incident.

        Returns:
            Updated incident if successful, None if not found.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            incident = self._get_fallback_store().get_incident(incident_id)
            if not incident:
                return None
            incident.state = new_state
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)
            self._get_fallback_store().save_incident(incident)
            return incident

        try:
            client = await self.get_async_client()
            await self._register_lua_scripts_async(client)

            # Get current incident
            incident = await self.get_incident_async(incident_id)
            if not incident:
                return None

            old_state = incident.state

            # Update incident
            incident.state = new_state
            incident.updated_at = datetime.utcnow()
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)

            # Prepare keys and args for Lua script
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
            old_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=old_state.value
            )
            new_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=new_state.value
            )
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)

            next_escalation_score = ""
            if incident.next_escalation_at and new_state in [
                EscalationState.TRIGGERED,
                EscalationState.ESCALATED,
            ]:
                next_escalation_score = str(incident.next_escalation_at.timestamp())

            # Execute Lua script
            result = await client.evalsha(
                self._state_transition_sha,
                4,  # Number of keys
                incident_key,
                old_state_key,
                new_state_key,
                pending_key,
                incident_id,
                new_state.value,
                self._serialize_incident(incident),
                next_escalation_score,
            )

            if result == "OK":
                # Set TTL for resolved incidents
                if new_state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                    await client.expire(incident_key, self.resolved_ttl)

                # Publish state change
                await self._publish_incident_update_async(
                    client, incident, "state_changed"
                )

                self._metrics.state_transitions += 1
                latency_ms = (time.time() - start_time) * 1000
                self._metrics.record_latency(latency_ms)

                return incident

            return None

        except Exception as e:
            self._handle_redis_error(e, "transition_state_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                incident = self._get_fallback_store().get_incident(incident_id)
                if incident:
                    incident.state = new_state
                    for key, value in updates.items():
                        if hasattr(incident, key):
                            setattr(incident, key, value)
                    self._get_fallback_store().save_incident(incident)
                    return incident

            raise
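
    # Usage sketch (illustrative, not part of the package): because the blob
    # write and both state-set moves happen inside one Lua script, concurrent
    # callers cannot leave an incident in two state sets at once:
    #
    #     updated = store.transition_state(
    #         incident_id,
    #         EscalationState.RESOLVED,
    #         resolved_at=datetime.utcnow(),
    #     )
    #     if updated is None:
    #         ...  # incident not found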

    # =========================================================================
    # Pub/Sub Subscription
    # =========================================================================

    async def subscribe_to_updates(
        self,
    ) -> "redis.asyncio.client.PubSub":
        """Subscribe to incident update channel.

        Returns a Pub/Sub instance that can be used to listen for updates.

        Returns:
            Async Pub/Sub instance subscribed to the incident updates channel.

        Example:
            pubsub = await store.subscribe_to_updates()
            async for message in pubsub.listen():
                if message["type"] == "message":
                    data = json.loads(message["data"])
                    print(f"Incident {data['incident_id']} changed to {data['state']}")
        """
        client = await self.get_async_client()
        pubsub = client.pubsub()
        channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
        await pubsub.subscribe(channel)
        return pubsub

    # =========================================================================
    # Cleanup Operations
    # =========================================================================

    def cleanup_resolved_incidents(self, max_age_seconds: int | None = None) -> int:
        """Clean up old resolved incidents.

        Args:
            max_age_seconds: Maximum age in seconds. Uses resolved_ttl if not provided.

        Returns:
            Number of incidents cleaned up.
        """
        if self._using_fallback and self.enable_fallback:
            # InMemory store doesn't have cleanup
            return 0

        try:
            client = self.client
            max_age = max_age_seconds or self.resolved_ttl
            cutoff = datetime.utcnow() - timedelta(seconds=max_age)
            cutoff_score = cutoff.timestamp()

            # Get resolved incidents; those resolved before the cutoff are purged
            resolved_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=EscalationState.RESOLVED.value
            )
            resolved_ids = client.smembers(resolved_key)

            cleaned = 0
            for incident_id in resolved_ids:
                incident = self.get_incident(incident_id)
                if incident and incident.resolved_at:
                    if incident.resolved_at.timestamp() < cutoff_score:
                        self._delete_incident(client, incident)
                        cleaned += 1

            return cleaned

        except Exception as e:
            self._handle_redis_error(e, "cleanup_resolved_incidents")
            return 0

    async def cleanup_resolved_incidents_async(
        self, max_age_seconds: int | None = None
    ) -> int:
        """Clean up old resolved incidents asynchronously.

        Args:
            max_age_seconds: Maximum age in seconds.

        Returns:
            Number of incidents cleaned up.
        """
        if self._using_fallback and self.enable_fallback:
            return 0

        try:
            client = await self.get_async_client()
            max_age = max_age_seconds or self.resolved_ttl
            cutoff = datetime.utcnow() - timedelta(seconds=max_age)
            cutoff_score = cutoff.timestamp()

            # Get resolved incidents
            resolved_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=EscalationState.RESOLVED.value
            )
            resolved_ids = await client.smembers(resolved_key)

            cleaned = 0
            for incident_id in resolved_ids:
                incident = await self.get_incident_async(incident_id)
                if incident and incident.resolved_at:
                    if incident.resolved_at.timestamp() < cutoff_score:
                        await self._delete_incident_async(client, incident)
                        cleaned += 1

            return cleaned

        except Exception as e:
            self._handle_redis_error(e, "cleanup_resolved_incidents_async")
            return 0
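
    # Usage sketch (illustrative, not part of the package): resolved incidents
    # already receive a Redis TTL in save_incident, so this sweep is a backstop
    # for entries whose TTL was never set. One way to run it periodically:
    #
    #     async def cleanup_loop(store, interval_seconds=3600):
    #         while True:
    #             await store.cleanup_resolved_incidents_async()
    #             await asyncio.sleep(interval_seconds)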

    def _delete_incident(
        self, client: "redis.Redis", incident: EscalationIncident
    ) -> None:
        """Delete incident and all its indices.

        Args:
            client: Redis client.
            incident: Incident to delete.
        """
        pipe = client.pipeline()

        # Delete incident
        incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
        pipe.delete(incident_key)

        # Remove from indices
        index_key = self._get_key(self.KEY_INCIDENT_INDEX)
        pipe.srem(index_key, incident.id)

        ref_key = self._get_key(
            self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
        )
        pipe.delete(ref_key)

        policy_key = self._get_key(
            self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
        )
        pipe.srem(policy_key, incident.id)

        for state in EscalationState:
            state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
            pipe.srem(state_key, incident.id)

        created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
        pipe.zrem(created_key, incident.id)

        pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
        pipe.zrem(pending_key, incident.id)

        pipe.execute()

    async def _delete_incident_async(
        self, client: "redis.asyncio.Redis", incident: EscalationIncident
    ) -> None:
        """Delete incident and all its indices asynchronously.

        Args:
            client: Async Redis client.
            incident: Incident to delete.
        """
        pipe = client.pipeline()

        # Delete incident
        incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
        pipe.delete(incident_key)

        # Remove from indices
        index_key = self._get_key(self.KEY_INCIDENT_INDEX)
        pipe.srem(index_key, incident.id)

        ref_key = self._get_key(
            self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
        )
        pipe.delete(ref_key)

        policy_key = self._get_key(
            self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
        )
        pipe.srem(policy_key, incident.id)

        for state in EscalationState:
            state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
            pipe.srem(state_key, incident.id)

        created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
        pipe.zrem(created_key, incident.id)

        pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
        pipe.zrem(pending_key, incident.id)

        await pipe.execute()
|
|
2594
|
+
|
|
2595
|
+
# =========================================================================
|
|
2596
|
+
# Health Check & Metrics
|
|
2597
|
+
# =========================================================================
|
|
2598
|
+
|
|
2599
|
+
def health_check(self) -> dict[str, Any]:
|
|
2600
|
+
"""Perform health check and return status.
|
|
2601
|
+
|
|
2602
|
+
Returns:
|
|
2603
|
+
Dictionary with health status information.
|
|
2604
|
+
"""
|
|
2605
|
+
result = {
|
|
2606
|
+
"healthy": False,
|
|
2607
|
+
"connected": self._connected,
|
|
2608
|
+
"using_fallback": self._using_fallback,
|
|
2609
|
+
"redis_url": self._mask_url(self.redis_url),
|
|
2610
|
+
"metrics": self._metrics.to_dict(),
|
|
2611
|
+
}
|
|
2612
|
+
|
|
2613
|
+
if self._using_fallback and self.enable_fallback:
|
|
2614
|
+
result["healthy"] = True
|
|
2615
|
+
result["mode"] = "fallback"
|
|
2616
|
+
result["fallback_policies"] = len(
|
|
2617
|
+
self._get_fallback_store().list_policies(active_only=False)
|
|
2618
|
+
)
|
|
2619
|
+
result["fallback_incidents"] = len(
|
|
2620
|
+
self._get_fallback_store().list_incidents()
|
|
2621
|
+
)
|
|
2622
|
+
return result
|
|
2623
|
+
|
|
2624
|
+
try:
|
|
2625
|
+
client = self.client
|
|
2626
|
+
ping_ok = client.ping()
|
|
2627
|
+
|
|
2628
|
+
if ping_ok:
|
|
2629
|
+
result["healthy"] = True
|
|
2630
|
+
result["mode"] = "redis"
|
|
2631
|
+
|
|
2632
|
+
# Get counts
|
|
2633
|
+
index_key = self._get_key(self.KEY_POLICY_INDEX)
|
|
2634
|
+
result["policies"] = client.scard(index_key)
|
|
2635
|
+
|
|
2636
|
+
incident_index_key = self._get_key(self.KEY_INCIDENT_INDEX)
|
|
2637
|
+
result["incidents"] = client.scard(incident_index_key)
|
|
2638
|
+
|
|
2639
|
+
pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
|
|
2640
|
+
result["pending_escalations"] = client.zcard(pending_key)
|
|
2641
|
+
|
|
2642
|
+
# Get Redis info
|
|
2643
|
+
info = client.info(section="server")
|
|
2644
|
+
result["redis_info"] = {
|
|
2645
|
+
"version": info.get("redis_version"),
|
|
2646
|
+
"uptime_seconds": info.get("uptime_in_seconds"),
|
|
2647
|
+
}
|
|
2648
|
+
|
|
2649
|
+
except Exception as e:
|
|
2650
|
+
result["error"] = str(e)
|
|
2651
|
+
if self._last_error_time:
|
|
2652
|
+
result["last_error_time"] = datetime.fromtimestamp(
|
|
2653
|
+
self._last_error_time
|
|
2654
|
+
).isoformat()
|
|
2655
|
+
|
|
2656
|
+
return result
|
|
2657
|
+
|
|
2658
|
+
+    async def health_check_async(self) -> dict[str, Any]:
+        """Perform health check asynchronously.
+
+        Returns:
+            Dictionary with health status information.
+        """
+        result = {
+            "healthy": False,
+            "connected": self._connected,
+            "using_fallback": self._using_fallback,
+            "redis_url": self._mask_url(self.redis_url),
+            "metrics": self._metrics.to_dict(),
+        }
+
+        if self._using_fallback and self.enable_fallback:
+            result["healthy"] = True
+            result["mode"] = "fallback"
+            result["fallback_policies"] = len(
+                self._get_fallback_store().list_policies(active_only=False)
+            )
+            result["fallback_incidents"] = len(
+                self._get_fallback_store().list_incidents()
+            )
+            return result
+
+        try:
+            client = await self.get_async_client()
+            ping_ok = await client.ping()
+
+            if ping_ok:
+                result["healthy"] = True
+                result["mode"] = "redis"
+
+                # Get counts
+                index_key = self._get_key(self.KEY_POLICY_INDEX)
+                result["policies"] = await client.scard(index_key)
+
+                incident_index_key = self._get_key(self.KEY_INCIDENT_INDEX)
+                result["incidents"] = await client.scard(incident_index_key)
+
+                pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
+                result["pending_escalations"] = await client.zcard(pending_key)
+
+                # Get Redis info
+                info = await client.info(section="server")
+                result["redis_info"] = {
+                    "version": info.get("redis_version"),
+                    "uptime_seconds": info.get("uptime_in_seconds"),
+                }
+
+        except Exception as e:
+            result["error"] = str(e)
+            if self._last_error_time:
+                result["last_error_time"] = datetime.fromtimestamp(
+                    self._last_error_time
+                ).isoformat()
+
+        return result
+
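The async variant returns the same result shape; only the client acquisition and the Redis calls are awaited. A sketch of polling it from a coroutine (same constructor assumption as in the sync sketch above):

```python
import asyncio

async def watch_store(store: "RedisEscalationStore") -> None:
    # The dict shape matches the sync health_check() result.
    status = await store.health_check_async()
    print(f"mode={status.get('mode')} healthy={status['healthy']}")

# asyncio.run(watch_store(store))  # with `store` built as in the sync sketch
```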
+    def _mask_url(self, url: str) -> str:
+        """Mask sensitive parts of Redis URL.
+
+        Args:
+            url: Redis URL to mask.
+
+        Returns:
+            Masked URL string.
+        """
+        import re
+
+        # Mask password if present
+        return re.sub(r"://[^:]+:[^@]+@", "://***:***@", url)
+
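The substitution only fires when a `user:password@` pair sits between the scheme and the host, so credential-free URLs pass through unchanged. A quick check of both cases:

```python
import re

def mask(url: str) -> str:
    # Same pattern as _mask_url above.
    return re.sub(r"://[^:]+:[^@]+@", "://***:***@", url)

assert mask("redis://admin:s3cret@cache:6379/0") == "redis://***:***@cache:6379/0"
assert mask("redis://cache:6379/0") == "redis://cache:6379/0"  # no credentials, unchanged
```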
+    def get_metrics(self) -> dict[str, Any]:
+        """Get current metrics.
+
+        Returns:
+            Dictionary with metrics data.
+        """
+        return self._metrics.to_dict()
+
+    def reset_metrics(self) -> None:
+        """Reset all metrics to zero."""
+        self._metrics = EscalationMetrics()
+
+    # =========================================================================
+    # Connection Management
+    # =========================================================================
+
+    def close(self) -> None:
+        """Close all connections and pools."""
+        if self._client is not None:
+            try:
+                self._client.close()
+            except Exception:
+                pass
+            self._client = None
+
+        if self._pool is not None:
+            try:
+                self._pool.disconnect()
+            except Exception:
+                pass
+            self._pool = None
+
+        self._connected = False
+
+    async def close_async(self) -> None:
+        """Close all connections and pools asynchronously."""
+        if self._async_client is not None:
+            try:
+                await self._async_client.close()
+            except Exception:
+                pass
+            self._async_client = None
+
+        if self._async_pool is not None:
+            try:
+                await self._async_pool.disconnect()
+            except Exception:
+                pass
+            self._async_pool = None
+
+        self._connected = False
+
+    def __enter__(self) -> "RedisEscalationStore":
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Context manager exit, closes connections."""
+        self.close()
+
+    async def __aenter__(self) -> "RedisEscalationStore":
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Async context manager exit, closes connections."""
+        await self.close_async()
+
+
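Since `__exit__` and `__aexit__` delegate to `close()` and `close_async()`, the store releases its pools deterministically when used as a context manager, even if the body raises. A usage sketch (constructor keyword assumed, as above):

```python
# Sync: the connection pool is disconnected when the block exits.
with RedisEscalationStore(redis_url="redis://localhost:6379/0") as store:
    print(store.health_check()["healthy"])

# Async equivalent, from inside a coroutine:
#
#     async with RedisEscalationStore(redis_url="redis://...") as store:
#         status = await store.health_check_async()
```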
+# ============================================================================
+# Factory Function
+# ============================================================================
+
+
+class EscalationStoreType:
+    """Store type constants."""
+
+    MEMORY = "memory"
+    SQLITE = "sqlite"
+    REDIS = "redis"
+
+
+def create_escalation_store(
+    store_type: str | None = None,
+    **kwargs: Any,
+) -> BaseEscalationStore:
+    """Factory function to create an appropriate escalation store.
+
+    Selects the store type based on configuration or environment variables.
+
+    Environment variables:
+        TRUTHOUND_ESCALATION_STORE_TYPE: Store type (memory, sqlite, redis)
+        TRUTHOUND_ESCALATION_SQLITE_PATH: SQLite database path
+        TRUTHOUND_ESCALATION_REDIS_URL: Redis connection URL (enables redis)
+
+    Args:
+        store_type: Explicit store type override. If None, auto-detects.
+        **kwargs: Additional arguments passed to the store constructor.
+
+    Returns:
+        Configured BaseEscalationStore instance.
+
+    Example:
+        # Auto-detect based on environment
+        store = create_escalation_store()
+
+        # Explicit type
+        store = create_escalation_store("redis", resolved_ttl=7200)
+
+        # SQLite with custom path
+        store = create_escalation_store("sqlite", db_path="/tmp/escalation.db")
+    """
+    logger = logging.getLogger(__name__)
+
+    # Determine store type
+    if store_type is None:
+        store_type = os.getenv("TRUTHOUND_ESCALATION_STORE_TYPE")
+
+    # Auto-detect if still None
+    if store_type is None:
+        redis_url = os.getenv("TRUTHOUND_ESCALATION_REDIS_URL")
+        if redis_url and REDIS_AVAILABLE:
+            store_type = EscalationStoreType.REDIS
+            logger.info(
+                "Auto-detected Redis store from TRUTHOUND_ESCALATION_REDIS_URL"
+            )
+        elif os.getenv("TRUTHOUND_ESCALATION_SQLITE_PATH"):
+            store_type = EscalationStoreType.SQLITE
+            logger.info(
+                "Auto-detected SQLite store from TRUTHOUND_ESCALATION_SQLITE_PATH"
+            )
+        else:
+            store_type = EscalationStoreType.MEMORY
+            logger.info("Using default InMemory store")
+
+    # Normalize store type
+    store_type = store_type.lower().strip()
+
+    # Create store based on type
+    if store_type == EscalationStoreType.MEMORY:
+        logger.info("Creating InMemory escalation store")
+        return InMemoryEscalationStore()
+
+    elif store_type == EscalationStoreType.SQLITE:
+        db_path = kwargs.pop("db_path", None) or os.getenv(
+            "TRUTHOUND_ESCALATION_SQLITE_PATH", "escalation.db"
+        )
+        logger.info(f"Creating SQLite escalation store at {db_path}")
+        return SQLiteEscalationStore(db_path=db_path)
+
+    elif store_type == EscalationStoreType.REDIS:
+        if not REDIS_AVAILABLE:
+            logger.warning(
+                "Redis not available, falling back to InMemory store. "
+                "Install with: pip install truthound-dashboard[redis]"
+            )
+            return InMemoryEscalationStore()
+
+        logger.info("Creating Redis escalation store")
+        return RedisEscalationStore(**kwargs)
+
+    else:
+        logger.warning(
+            f"Unknown store type '{store_type}', falling back to InMemory store"
+        )
+        return InMemoryEscalationStore()
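Because detection happens on each call, the backend can be switched purely through the environment. A sketch of the three selection paths (the paths and values below are illustrative, not defaults):

```python
import os

# 1. An explicit argument bypasses the environment entirely.
store = create_escalation_store("memory")  # InMemoryEscalationStore

# 2. TRUTHOUND_ESCALATION_STORE_TYPE is consulted when no type is passed...
os.environ["TRUTHOUND_ESCALATION_STORE_TYPE"] = "sqlite"
os.environ["TRUTHOUND_ESCALATION_SQLITE_PATH"] = "/var/lib/truthound/escalation.db"
store = create_escalation_store()  # SQLiteEscalationStore at the path above

# 3. ...and with neither set, TRUTHOUND_ESCALATION_REDIS_URL (plus the
#    redis extra installed) selects Redis; otherwise memory is the default.
```

Note that an unknown store type degrades to the in-memory store with a warning rather than raising, consistent with the fallback-first posture of the Redis store above.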