truthound-dashboard 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
  164. truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
  166. truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,2896 @@
1
+ """Storage backends for escalation state.
2
+
3
+ This module provides storage backends for persisting escalation
4
+ policies and incidents.
5
+
6
+ Storage Backends:
7
+ - InMemoryEscalationStore: Simple in-memory storage
8
+ - SQLiteEscalationStore: Persistent SQLite storage
9
+ - RedisEscalationStore: Redis-based storage for distributed deployments
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ import os
17
+ import random
18
+ import sqlite3
19
+ import threading
20
+ import time
21
+ import uuid
22
+ from abc import ABC, abstractmethod
23
+ from dataclasses import dataclass
24
+ from datetime import datetime, timedelta
25
+ from pathlib import Path
26
+ from typing import TYPE_CHECKING, Any
27
+
28
+ from .models import EscalationIncident, EscalationPolicy, EscalationState
29
+
30
+ # Optional Redis dependency
31
+ try:
32
+ import redis
33
+ import redis.asyncio
34
+
35
+ REDIS_AVAILABLE = True
36
+ except ImportError:
37
+ REDIS_AVAILABLE = False
38
+ redis = None # type: ignore[assignment]
39
+
40
+ if TYPE_CHECKING:
41
+ import redis as redis_sync
42
+ import redis.asyncio as redis_async
43
+
44
+
45
class BaseEscalationStore(ABC):
    """Interface every escalation storage backend must implement.

    Concrete backends persist two record kinds: escalation *policies*
    (the configured level definitions) and escalation *incidents*
    (live state tracked against a policy).
    """

    # ---- policy CRUD -------------------------------------------------

    @abstractmethod
    def save_policy(self, policy: EscalationPolicy) -> str:
        """Persist *policy* (insert or update) and return its ID."""
        ...

    @abstractmethod
    def get_policy(self, policy_id: str) -> EscalationPolicy | None:
        """Return the policy with *policy_id*, or ``None`` if absent."""
        ...

    @abstractmethod
    def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
        """Return the policy named *name*, or ``None`` if absent."""
        ...

    @abstractmethod
    def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
        """Return stored policies, optionally restricted to active ones."""
        ...

    @abstractmethod
    def delete_policy(self, policy_id: str) -> bool:
        """Remove a policy; return ``True`` when something was deleted."""
        ...

    # ---- incident CRUD -----------------------------------------------

    @abstractmethod
    def save_incident(self, incident: EscalationIncident) -> str:
        """Persist *incident* (insert or update) and return its ID."""
        ...

    @abstractmethod
    def get_incident(self, incident_id: str) -> EscalationIncident | None:
        """Return the incident with *incident_id*, or ``None`` if absent."""
        ...

    @abstractmethod
    def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
        """Return the incident with external reference *incident_ref*."""
        ...

    @abstractmethod
    def list_incidents(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """Return incidents, optionally filtered by policy and/or state."""
        ...

    @abstractmethod
    def get_pending_escalations(self) -> list[EscalationIncident]:
        """Return incidents whose next escalation time has arrived."""
        ...
103
+
104
+
105
class InMemoryEscalationStore(BaseEscalationStore):
    """Thread-safe, non-persistent escalation storage.

    Policies and incidents live in plain dictionaries guarded by a
    re-entrant lock; suitable for development and unit tests, not for
    durable or distributed use.
    """

    def __init__(self) -> None:
        """Set up the empty containers and the guarding lock."""
        self._policies: dict[str, EscalationPolicy] = {}
        self._incidents: dict[str, EscalationIncident] = {}
        self._policy_counter = 0
        self._incident_counter = 0
        self._lock = threading.RLock()

    def _generate_policy_id(self) -> str:
        """Return the next sequential policy ID (caller holds the lock)."""
        self._policy_counter += 1
        return f"policy-{self._policy_counter}"

    def _generate_incident_id(self) -> str:
        """Return the next sequential incident ID (caller holds the lock)."""
        self._incident_counter += 1
        return f"incident-{self._incident_counter}"

    def save_policy(self, policy: EscalationPolicy) -> str:
        """Insert or update *policy*, assigning an ID when it has none."""
        with self._lock:
            policy.id = policy.id or self._generate_policy_id()
            self._policies[policy.id] = policy
            return policy.id

    def get_policy(self, policy_id: str) -> EscalationPolicy | None:
        """Return the policy with *policy_id*, or ``None``."""
        with self._lock:
            return self._policies.get(policy_id)

    def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
        """Return the first policy whose name equals *name*, or ``None``."""
        with self._lock:
            return next(
                (p for p in self._policies.values() if p.name == name),
                None,
            )

    def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
        """Return stored policies, optionally restricted to active ones."""
        with self._lock:
            values = self._policies.values()
            if active_only:
                return [p for p in values if p.is_active]
            return list(values)

    def delete_policy(self, policy_id: str) -> bool:
        """Remove a policy; return ``True`` when something was deleted."""
        with self._lock:
            if policy_id not in self._policies:
                return False
            del self._policies[policy_id]
            return True

    def save_incident(self, incident: EscalationIncident) -> str:
        """Insert or update *incident*, stamping ``updated_at``."""
        with self._lock:
            incident.id = incident.id or self._generate_incident_id()
            incident.updated_at = datetime.utcnow()
            self._incidents[incident.id] = incident
            return incident.id

    def get_incident(self, incident_id: str) -> EscalationIncident | None:
        """Return the incident with *incident_id*, or ``None``."""
        with self._lock:
            return self._incidents.get(incident_id)

    def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
        """Return the incident with external ref *incident_ref*, or ``None``."""
        with self._lock:
            return next(
                (
                    inc
                    for inc in self._incidents.values()
                    if inc.incident_ref == incident_ref
                ),
                None,
            )

    def list_incidents(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """Return incidents, optionally filtered by policy and/or state."""
        with self._lock:
            matches = []
            for inc in self._incidents.values():
                if policy_id and inc.policy_id != policy_id:
                    continue
                if states and inc.state not in states:
                    continue
                matches.append(inc)
            return matches

    def get_pending_escalations(self) -> list[EscalationIncident]:
        """Return incidents in an active state whose escalation is due."""
        now = datetime.utcnow()
        due_states = (
            EscalationState.TRIGGERED,
            EscalationState.ESCALATED,
        )

        with self._lock:
            return [
                inc
                for inc in self._incidents.values()
                if inc.state in due_states
                and inc.next_escalation_at is not None
                and inc.next_escalation_at <= now
            ]
221
+
222
+
223
class SQLiteEscalationStore(BaseEscalationStore):
    """SQLite-based persistent escalation storage.

    Each thread gets its own connection via ``threading.local`` because
    SQLite connections are not safely shareable across threads.

    Note: ``close()`` only closes the *calling* thread's connection;
    connections opened by other threads are released when those threads
    (and their thread-local storage) go away.
    """

    def __init__(self, db_path: str | Path = "escalation.db") -> None:
        """Create the store and ensure the schema exists.

        Args:
            db_path: Location of the SQLite database file.
        """
        self.db_path = Path(db_path)
        self._local = threading.local()
        self._init_db()

    def _get_connection(self) -> sqlite3.Connection:
        """Return (creating on first use) this thread's connection."""
        if not hasattr(self._local, "connection"):
            self._local.connection = sqlite3.connect(
                str(self.db_path),
                check_same_thread=False,
            )
            # Row factory lets the _row_to_* helpers index columns by name.
            self._local.connection.row_factory = sqlite3.Row
        return self._local.connection

    def _init_db(self) -> None:
        """Create tables and indices if they do not already exist."""
        conn = self._get_connection()

        # Policies table — levels are stored as a JSON array.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS escalation_policies (
                id TEXT PRIMARY KEY,
                name TEXT NOT NULL UNIQUE,
                description TEXT,
                levels TEXT NOT NULL,
                auto_resolve_on_success INTEGER NOT NULL DEFAULT 1,
                max_escalations INTEGER NOT NULL DEFAULT 3,
                is_active INTEGER NOT NULL DEFAULT 1,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        """)

        # Incidents table — context and events are JSON blobs.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS escalation_incidents (
                id TEXT PRIMARY KEY,
                policy_id TEXT NOT NULL,
                incident_ref TEXT NOT NULL UNIQUE,
                state TEXT NOT NULL,
                current_level INTEGER NOT NULL DEFAULT 0,
                context TEXT,
                acknowledged_by TEXT,
                acknowledged_at TEXT,
                resolved_by TEXT,
                resolved_at TEXT,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                next_escalation_at TEXT,
                escalation_count INTEGER NOT NULL DEFAULT 0,
                events TEXT,
                FOREIGN KEY (policy_id) REFERENCES escalation_policies(id)
            )
        """)

        # Indices for the hot queries: list-by-state and due escalations.
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_incident_state
            ON escalation_incidents(state)
        """)
        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_incident_next_escalation
            ON escalation_incidents(next_escalation_at)
        """)

        conn.commit()

    def save_policy(self, policy: EscalationPolicy) -> str:
        """Save or update a policy, returning its ID.

        Generates a UUID when the policy has no ID yet. On update the
        original ``created_at`` is preserved; only ``updated_at`` is
        refreshed.
        """
        conn = self._get_connection()
        now = datetime.utcnow().isoformat()

        if not policy.id:
            # `uuid` is imported at module level; no local import needed.
            policy.id = str(uuid.uuid4())

        # Preserve the original creation timestamp on updates — a plain
        # INSERT OR REPLACE would otherwise rewrite created_at each save.
        cursor = conn.execute(
            "SELECT created_at FROM escalation_policies WHERE id = ?",
            (policy.id,),
        )
        existing = cursor.fetchone()
        created_at = existing["created_at"] if existing else now

        levels_json = json.dumps([lvl.to_dict() for lvl in policy.levels])

        conn.execute(
            """
            INSERT OR REPLACE INTO escalation_policies
            (id, name, description, levels, auto_resolve_on_success,
             max_escalations, is_active, created_at, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                policy.id,
                policy.name,
                policy.description,
                levels_json,
                1 if policy.auto_resolve_on_success else 0,
                policy.max_escalations,
                1 if policy.is_active else 0,
                created_at,
                now,
            ),
        )
        conn.commit()
        return policy.id

    def get_policy(self, policy_id: str) -> EscalationPolicy | None:
        """Return the policy with *policy_id*, or ``None``."""
        conn = self._get_connection()
        cursor = conn.execute(
            "SELECT * FROM escalation_policies WHERE id = ?",
            (policy_id,),
        )
        row = cursor.fetchone()
        return self._row_to_policy(row) if row else None

    def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
        """Return the policy named *name*, or ``None``."""
        conn = self._get_connection()
        cursor = conn.execute(
            "SELECT * FROM escalation_policies WHERE name = ?",
            (name,),
        )
        row = cursor.fetchone()
        return self._row_to_policy(row) if row else None

    def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
        """Return stored policies, optionally restricted to active ones."""
        conn = self._get_connection()
        if active_only:
            cursor = conn.execute(
                "SELECT * FROM escalation_policies WHERE is_active = 1"
            )
        else:
            cursor = conn.execute("SELECT * FROM escalation_policies")

        return [self._row_to_policy(row) for row in cursor.fetchall()]

    def delete_policy(self, policy_id: str) -> bool:
        """Remove a policy; return ``True`` when a row was deleted."""
        conn = self._get_connection()
        cursor = conn.execute(
            "DELETE FROM escalation_policies WHERE id = ?",
            (policy_id,),
        )
        conn.commit()
        return cursor.rowcount > 0

    def _row_to_policy(self, row: sqlite3.Row) -> EscalationPolicy:
        """Rehydrate an EscalationPolicy from a database row."""
        from .models import EscalationLevel

        levels_data = json.loads(row["levels"])
        levels = [EscalationLevel.from_dict(item) for item in levels_data]

        return EscalationPolicy(
            id=row["id"],
            name=row["name"],
            description=row["description"] or "",
            levels=levels,
            auto_resolve_on_success=bool(row["auto_resolve_on_success"]),
            max_escalations=row["max_escalations"],
            is_active=bool(row["is_active"]),
        )

    def save_incident(self, incident: EscalationIncident) -> str:
        """Save or update an incident, stamping ``updated_at``.

        Generates a UUID when the incident has no ID yet and returns
        the (possibly new) ID.
        """
        conn = self._get_connection()

        if not incident.id:
            # `uuid` is imported at module level; no local import needed.
            incident.id = str(uuid.uuid4())

        incident.updated_at = datetime.utcnow()

        conn.execute(
            """
            INSERT OR REPLACE INTO escalation_incidents
            (id, policy_id, incident_ref, state, current_level, context,
             acknowledged_by, acknowledged_at, resolved_by, resolved_at,
             created_at, updated_at, next_escalation_at, escalation_count, events)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                incident.id,
                incident.policy_id,
                incident.incident_ref,
                incident.state.value,
                incident.current_level,
                json.dumps(incident.context),
                incident.acknowledged_by,
                incident.acknowledged_at.isoformat() if incident.acknowledged_at else None,
                incident.resolved_by,
                incident.resolved_at.isoformat() if incident.resolved_at else None,
                incident.created_at.isoformat(),
                incident.updated_at.isoformat(),
                incident.next_escalation_at.isoformat() if incident.next_escalation_at else None,
                incident.escalation_count,
                json.dumps([e.to_dict() for e in incident.events]),
            ),
        )
        conn.commit()
        return incident.id

    def get_incident(self, incident_id: str) -> EscalationIncident | None:
        """Return the incident with *incident_id*, or ``None``."""
        conn = self._get_connection()
        cursor = conn.execute(
            "SELECT * FROM escalation_incidents WHERE id = ?",
            (incident_id,),
        )
        row = cursor.fetchone()
        return self._row_to_incident(row) if row else None

    def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
        """Return the incident with external ref *incident_ref*, or ``None``."""
        conn = self._get_connection()
        cursor = conn.execute(
            "SELECT * FROM escalation_incidents WHERE incident_ref = ?",
            (incident_ref,),
        )
        row = cursor.fetchone()
        return self._row_to_incident(row) if row else None

    def list_incidents(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """Return incidents, optionally filtered by policy and/or state."""
        conn = self._get_connection()

        # Build the WHERE clause dynamically; values are always bound
        # via placeholders, never interpolated.
        query = "SELECT * FROM escalation_incidents WHERE 1=1"
        params: list[Any] = []

        if policy_id:
            query += " AND policy_id = ?"
            params.append(policy_id)

        if states:
            placeholders = ",".join("?" * len(states))
            query += f" AND state IN ({placeholders})"
            params.extend(s.value for s in states)

        cursor = conn.execute(query, params)
        return [self._row_to_incident(row) for row in cursor.fetchall()]

    def get_pending_escalations(self) -> list[EscalationIncident]:
        """Return active incidents whose next escalation time has passed.

        ISO-8601 timestamps compare correctly as strings, so the
        ``<=`` comparison is done directly in SQL.
        """
        now = datetime.utcnow().isoformat()
        conn = self._get_connection()
        cursor = conn.execute(
            """
            SELECT * FROM escalation_incidents
            WHERE state IN (?, ?)
            AND next_escalation_at IS NOT NULL
            AND next_escalation_at <= ?
            """,
            (EscalationState.TRIGGERED.value, EscalationState.ESCALATED.value, now),
        )
        return [self._row_to_incident(row) for row in cursor.fetchall()]

    def _row_to_incident(self, row: sqlite3.Row) -> EscalationIncident:
        """Rehydrate an EscalationIncident from a database row."""
        from .models import EscalationEvent

        events_data = json.loads(row["events"]) if row["events"] else []
        events = [EscalationEvent.from_dict(e) for e in events_data]

        return EscalationIncident(
            id=row["id"],
            policy_id=row["policy_id"],
            incident_ref=row["incident_ref"],
            state=EscalationState(row["state"]),
            current_level=row["current_level"],
            context=json.loads(row["context"]) if row["context"] else {},
            acknowledged_by=row["acknowledged_by"],
            acknowledged_at=datetime.fromisoformat(row["acknowledged_at"]) if row["acknowledged_at"] else None,
            resolved_by=row["resolved_by"],
            resolved_at=datetime.fromisoformat(row["resolved_at"]) if row["resolved_at"] else None,
            created_at=datetime.fromisoformat(row["created_at"]),
            updated_at=datetime.fromisoformat(row["updated_at"]),
            next_escalation_at=datetime.fromisoformat(row["next_escalation_at"]) if row["next_escalation_at"] else None,
            escalation_count=row["escalation_count"],
            events=events,
        )

    def close(self) -> None:
        """Close the calling thread's connection, if one was opened."""
        if hasattr(self._local, "connection"):
            self._local.connection.close()
            del self._local.connection
514
+
515
+
516
+ # ============================================================================
517
+ # Redis Escalation Store
518
+ # ============================================================================
519
+
520
+
521
@dataclass
class EscalationMetrics:
    """Operational counters for escalation store activity.

    Holds per-operation counts alongside aggregate latency totals; the
    mean latency is derived on demand via :attr:`avg_latency_ms`.

    Counters:
        policy_saves / policy_gets / policy_deletes: policy operations.
        incident_saves / incident_gets: incident operations.
        state_transitions: incident state transitions performed.
        errors: Redis errors encountered.
        fallbacks: times the InMemory fallback was used.
        reconnections: successful reconnections.
        pubsub_publishes: Pub/Sub messages published.
        total_operations / total_latency_ms: latency accumulators.
    """

    policy_saves: int = 0
    policy_gets: int = 0
    policy_deletes: int = 0
    incident_saves: int = 0
    incident_gets: int = 0
    state_transitions: int = 0
    errors: int = 0
    fallbacks: int = 0
    reconnections: int = 0
    pubsub_publishes: int = 0
    total_operations: int = 0
    total_latency_ms: float = 0.0

    @property
    def avg_latency_ms(self) -> float:
        """Mean latency per recorded operation (0.0 when none recorded)."""
        ops = self.total_operations
        return self.total_latency_ms / ops if ops else 0.0

    def record_latency(self, latency_ms: float) -> None:
        """Fold one operation's latency into the running totals."""
        self.total_operations += 1
        self.total_latency_ms += latency_ms

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly snapshot of all metrics."""
        counter_names = (
            "policy_saves",
            "policy_gets",
            "policy_deletes",
            "incident_saves",
            "incident_gets",
            "state_transitions",
            "errors",
            "fallbacks",
            "reconnections",
            "pubsub_publishes",
            "total_operations",
        )
        snapshot: dict[str, Any] = {
            name: getattr(self, name) for name in counter_names
        }
        snapshot["avg_latency_ms"] = round(self.avg_latency_ms, 3)
        return snapshot
580
+
581
+
582
+ class RedisEscalationStore(BaseEscalationStore):
583
+ """Production-ready Redis-based escalation store for distributed deployments.
584
+
585
+ Uses Redis for robust distributed escalation state management with:
586
+ - Connection pool management with configurable pool size
587
+ - Automatic reconnection with exponential backoff
588
+ - Proper JSON serialization/deserialization of incident objects
589
+ - Transaction support for atomic state updates (MULTI/EXEC and Lua scripts)
590
+ - Pub/Sub for real-time incident updates
591
+ - TTL management for completed/resolved incidents (auto-cleanup)
592
+ - Index structures for efficient queries (by state, policy_id, created_at)
593
+ - Graceful degradation (fallback to InMemory on Redis failure)
594
+ - Health check endpoint support
595
+ - Comprehensive metrics (operations, latency, errors)
596
+
597
+ Configuration via environment variables:
598
+ TRUTHOUND_ESCALATION_REDIS_URL: Redis connection URL (default: redis://localhost:6379/0)
599
+ TRUTHOUND_ESCALATION_REDIS_PREFIX: Key prefix (default: truthound:escalation:)
600
+ TRUTHOUND_ESCALATION_REDIS_POOL_SIZE: Connection pool size (default: 10)
601
+ TRUTHOUND_ESCALATION_REDIS_SOCKET_TIMEOUT: Socket timeout (default: 5.0)
602
+ TRUTHOUND_ESCALATION_REDIS_CONNECT_TIMEOUT: Connection timeout (default: 5.0)
603
+ TRUTHOUND_ESCALATION_REDIS_MAX_RETRIES: Max retry attempts (default: 3)
604
+ TRUTHOUND_ESCALATION_REDIS_RETRY_BASE_DELAY: Base delay for exponential backoff (default: 1.0)
605
+ TRUTHOUND_ESCALATION_REDIS_RESOLVED_TTL: TTL in seconds for resolved incidents (default: 86400 = 24h)
606
+ TRUTHOUND_ESCALATION_FALLBACK_ENABLED: Enable fallback to InMemory (default: true)
607
+ TRUTHOUND_ESCALATION_PUBSUB_ENABLED: Enable Pub/Sub notifications (default: true)
608
+
609
+ Example:
610
+ # Basic usage
611
+ store = RedisEscalationStore()
612
+
613
+ # Custom configuration
614
+ store = RedisEscalationStore(
615
+ redis_url="redis://myredis:6379/1",
616
+ max_connections=20,
617
+ resolved_ttl=3600, # 1 hour
618
+ enable_fallback=True,
619
+ )
620
+
621
+ # With context manager
622
+ async with RedisEscalationStore() as store:
623
+ policy_id = await store.save_policy_async(policy)
624
+ incident_id = await store.save_incident_async(incident)
625
+
626
+ Note: Requires the 'redis' optional dependency.
627
+ Install with: pip install truthound-dashboard[redis]
628
+ """
629
+
630
    # Redis key patterns (all are joined with ``key_prefix`` via _get_key)
    KEY_POLICY = "policy:{policy_id}"
    KEY_POLICY_INDEX = "policies:all"
    KEY_POLICY_BY_NAME = "policies:name:{name}"
    KEY_POLICY_ACTIVE = "policies:active"

    KEY_INCIDENT = "incident:{incident_id}"
    KEY_INCIDENT_INDEX = "incidents:all"
    KEY_INCIDENT_BY_REF = "incidents:ref:{incident_ref}"
    KEY_INCIDENT_BY_POLICY = "incidents:policy:{policy_id}"
    KEY_INCIDENT_BY_STATE = "incidents:state:{state}"
    KEY_INCIDENT_BY_CREATED = "incidents:created"  # Sorted set
    KEY_INCIDENT_PENDING = "incidents:pending_escalation"  # Sorted set by next_escalation_at

    # Pub/Sub channels
    CHANNEL_INCIDENT_UPDATE = "escalation:incidents:updates"
    CHANNEL_POLICY_UPDATE = "escalation:policies:updates"

    # Lua script for atomic incident state transition.
    # Runs server-side so the data write, the state-index move, and the
    # pending-escalation update cannot interleave with other clients.
    LUA_STATE_TRANSITION = """
    local incident_key = KEYS[1]
    local old_state_key = KEYS[2]
    local new_state_key = KEYS[3]
    local pending_key = KEYS[4]
    local incident_id = ARGV[1]
    local new_state = ARGV[2]
    local updated_data = ARGV[3]
    local next_escalation_score = ARGV[4]

    -- Get current incident
    local current = redis.call('GET', incident_key)
    if not current then
        return {err = 'incident_not_found'}
    end

    -- Update incident data
    redis.call('SET', incident_key, updated_data)

    -- Update state indices
    redis.call('SREM', old_state_key, incident_id)
    redis.call('SADD', new_state_key, incident_id)

    -- Update pending escalation sorted set
    if new_state == 'resolved' or new_state == 'acknowledged' then
        redis.call('ZREM', pending_key, incident_id)
    elseif next_escalation_score ~= '' then
        redis.call('ZADD', pending_key, next_escalation_score, incident_id)
    end

    return 'OK'
    """
681
+
682
+ def __init__(
683
+ self,
684
+ redis_url: str | None = None,
685
+ key_prefix: str | None = None,
686
+ max_connections: int | None = None,
687
+ socket_timeout: float | None = None,
688
+ socket_connect_timeout: float | None = None,
689
+ max_retries: int | None = None,
690
+ retry_base_delay: float | None = None,
691
+ resolved_ttl: int | None = None,
692
+ enable_fallback: bool | None = None,
693
+ enable_pubsub: bool | None = None,
694
+ logger: Any | None = None,
695
+ ) -> None:
696
+ """Initialize Redis escalation store.
697
+
698
+ All parameters can be configured via environment variables if not
699
+ explicitly provided.
700
+
701
+ Args:
702
+ redis_url: Redis connection URL.
703
+ key_prefix: Prefix for all Redis keys.
704
+ max_connections: Maximum connections in the pool.
705
+ socket_timeout: Socket timeout in seconds.
706
+ socket_connect_timeout: Connection timeout in seconds.
707
+ max_retries: Maximum retry attempts for reconnection.
708
+ retry_base_delay: Base delay for exponential backoff.
709
+ resolved_ttl: TTL in seconds for resolved/completed incidents.
710
+ enable_fallback: Enable fallback to InMemory on Redis failure.
711
+ enable_pubsub: Enable Pub/Sub notifications for state changes.
712
+ logger: Custom logger instance.
713
+
714
+ Raises:
715
+ ImportError: If redis package is not installed.
716
+ """
717
+ if not REDIS_AVAILABLE:
718
+ raise ImportError(
719
+ "Redis support requires the 'redis' package. "
720
+ "Install with: pip install truthound-dashboard[redis] "
721
+ "or pip install redis"
722
+ )
723
+
724
+ # Configuration from environment or parameters
725
+ self.redis_url = redis_url or os.getenv(
726
+ "TRUTHOUND_ESCALATION_REDIS_URL", "redis://localhost:6379/0"
727
+ )
728
+ self.key_prefix = key_prefix or os.getenv(
729
+ "TRUTHOUND_ESCALATION_REDIS_PREFIX", "truthound:escalation:"
730
+ )
731
+ self.max_connections = max_connections or int(
732
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_POOL_SIZE", "10")
733
+ )
734
+ self.socket_timeout = socket_timeout or float(
735
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_SOCKET_TIMEOUT", "5.0")
736
+ )
737
+ self.socket_connect_timeout = socket_connect_timeout or float(
738
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_CONNECT_TIMEOUT", "5.0")
739
+ )
740
+ self.max_retries = max_retries or int(
741
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_MAX_RETRIES", "3")
742
+ )
743
+ self.retry_base_delay = retry_base_delay or float(
744
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_RETRY_BASE_DELAY", "1.0")
745
+ )
746
+ self.resolved_ttl = resolved_ttl or int(
747
+ os.getenv("TRUTHOUND_ESCALATION_REDIS_RESOLVED_TTL", "86400")
748
+ )
749
+
750
+ fallback_env = os.getenv("TRUTHOUND_ESCALATION_FALLBACK_ENABLED", "true")
751
+ self.enable_fallback = (
752
+ enable_fallback
753
+ if enable_fallback is not None
754
+ else fallback_env.lower() == "true"
755
+ )
756
+
757
+ pubsub_env = os.getenv("TRUTHOUND_ESCALATION_PUBSUB_ENABLED", "true")
758
+ self.enable_pubsub = (
759
+ enable_pubsub
760
+ if enable_pubsub is not None
761
+ else pubsub_env.lower() == "true"
762
+ )
763
+
764
+ # Logger setup
765
+ self._logger = logger or logging.getLogger(__name__)
766
+
767
+ # Connection pool for sync client
768
+ self._pool: "redis.ConnectionPool | None" = None
769
+ self._client: "redis.Redis | None" = None
770
+
771
+ # Connection pool for async client
772
+ self._async_pool: "redis.asyncio.ConnectionPool | None" = None
773
+ self._async_client: "redis.asyncio.Redis | None" = None
774
+
775
+ # Locks for thread-safe initialization
776
+ self._lock = threading.Lock()
777
+ self._async_lock: Any = None # Created lazily for asyncio
778
+
779
+ # Fallback store for graceful degradation
780
+ self._fallback_store: InMemoryEscalationStore | None = None
781
+ self._using_fallback = False
782
+
783
+ # Connection state tracking
784
+ self._connected = False
785
+ self._retry_count = 0
786
+ self._last_error: Exception | None = None
787
+ self._last_error_time: float | None = None
788
+
789
+ # Metrics
790
+ self._metrics = EscalationMetrics()
791
+
792
+ # Lua script SHA (registered on first use)
793
+ self._state_transition_sha: str | None = None
794
+
795
+ def _get_key(self, pattern: str, **kwargs: str) -> str:
796
+ """Get full Redis key from pattern.
797
+
798
+ Args:
799
+ pattern: Key pattern with placeholders.
800
+ **kwargs: Values to substitute in pattern.
801
+
802
+ Returns:
803
+ Full Redis key with prefix.
804
+ """
805
+ key = pattern.format(**kwargs) if kwargs else pattern
806
+ return f"{self.key_prefix}{key}"
807
+
808
+ def _create_pool(self) -> "redis.ConnectionPool":
809
+ """Create a connection pool for sync client.
810
+
811
+ Returns:
812
+ Configured connection pool.
813
+ """
814
+ return redis.ConnectionPool.from_url(
815
+ self.redis_url,
816
+ max_connections=self.max_connections,
817
+ socket_timeout=self.socket_timeout,
818
+ socket_connect_timeout=self.socket_connect_timeout,
819
+ retry_on_timeout=True,
820
+ decode_responses=True,
821
+ )
822
+
823
+ async def _create_async_pool(self) -> "redis.asyncio.ConnectionPool":
824
+ """Create a connection pool for async client.
825
+
826
+ Returns:
827
+ Configured async connection pool.
828
+ """
829
+ return redis.asyncio.ConnectionPool.from_url(
830
+ self.redis_url,
831
+ max_connections=self.max_connections,
832
+ socket_timeout=self.socket_timeout,
833
+ socket_connect_timeout=self.socket_connect_timeout,
834
+ retry_on_timeout=True,
835
+ decode_responses=True,
836
+ )
837
+
838
+ def _get_fallback_store(self) -> InMemoryEscalationStore:
839
+ """Get or create fallback in-memory store.
840
+
841
+ Returns:
842
+ InMemoryEscalationStore instance.
843
+ """
844
+ if self._fallback_store is None:
845
+ self._fallback_store = InMemoryEscalationStore()
846
+ return self._fallback_store
847
+
848
+ def _calculate_backoff_delay(self) -> float:
849
+ """Calculate exponential backoff delay.
850
+
851
+ Returns:
852
+ Delay in seconds.
853
+ """
854
+ # Exponential backoff with jitter
855
+ delay = self.retry_base_delay * (2**self._retry_count)
856
+ # Add jitter (up to 25% of delay)
857
+ jitter = delay * random.uniform(0, 0.25)
858
+ return min(delay + jitter, 60.0) # Cap at 60 seconds
859
+
860
+ def _handle_redis_error(self, error: Exception, operation: str) -> None:
861
+ """Handle Redis errors with logging and metrics.
862
+
863
+ Args:
864
+ error: The exception that occurred.
865
+ operation: Name of the operation that failed.
866
+ """
867
+ self._metrics.errors += 1
868
+ self._last_error = error
869
+ self._last_error_time = time.time()
870
+ self._connected = False
871
+
872
+ self._logger.error(
873
+ f"Redis error during {operation}: {error}",
874
+ extra={
875
+ "operation": operation,
876
+ "error_type": type(error).__name__,
877
+ "retry_count": self._retry_count,
878
+ },
879
+ )
880
+
881
+ def _try_reconnect_sync(self) -> bool:
882
+ """Attempt to reconnect to Redis synchronously.
883
+
884
+ Returns:
885
+ True if reconnection successful, False otherwise.
886
+ """
887
+ if self._retry_count >= self.max_retries:
888
+ self._logger.warning(
889
+ f"Max retries ({self.max_retries}) reached, using fallback"
890
+ )
891
+ return False
892
+
893
+ delay = self._calculate_backoff_delay()
894
+ self._logger.info(
895
+ f"Attempting Redis reconnection in {delay:.2f}s "
896
+ f"(attempt {self._retry_count + 1}/{self.max_retries})"
897
+ )
898
+
899
+ time.sleep(delay)
900
+ self._retry_count += 1
901
+
902
+ try:
903
+ # Close existing connections
904
+ if self._client:
905
+ try:
906
+ self._client.close()
907
+ except Exception:
908
+ pass
909
+ self._client = None
910
+
911
+ if self._pool:
912
+ try:
913
+ self._pool.disconnect()
914
+ except Exception:
915
+ pass
916
+ self._pool = None
917
+
918
+ # Create new connection
919
+ self._pool = self._create_pool()
920
+ self._client = redis.Redis(connection_pool=self._pool)
921
+
922
+ # Test connection
923
+ if self._client.ping():
924
+ self._connected = True
925
+ self._retry_count = 0
926
+ self._using_fallback = False
927
+ self._metrics.reconnections += 1
928
+ self._logger.info("Redis reconnection successful")
929
+ return True
930
+ except Exception as e:
931
+ self._logger.warning(f"Reconnection attempt failed: {e}")
932
+
933
+ return False
934
+
935
+ async def _try_reconnect_async(self) -> bool:
936
+ """Attempt to reconnect to Redis asynchronously.
937
+
938
+ Returns:
939
+ True if reconnection successful, False otherwise.
940
+ """
941
+ import asyncio
942
+
943
+ if self._retry_count >= self.max_retries:
944
+ self._logger.warning(
945
+ f"Max retries ({self.max_retries}) reached, using fallback"
946
+ )
947
+ return False
948
+
949
+ delay = self._calculate_backoff_delay()
950
+ self._logger.info(
951
+ f"Attempting async Redis reconnection in {delay:.2f}s "
952
+ f"(attempt {self._retry_count + 1}/{self.max_retries})"
953
+ )
954
+
955
+ await asyncio.sleep(delay)
956
+ self._retry_count += 1
957
+
958
+ try:
959
+ # Close existing connections
960
+ if self._async_client:
961
+ try:
962
+ await self._async_client.close()
963
+ except Exception:
964
+ pass
965
+ self._async_client = None
966
+
967
+ if self._async_pool:
968
+ try:
969
+ await self._async_pool.disconnect()
970
+ except Exception:
971
+ pass
972
+ self._async_pool = None
973
+
974
+ # Create new connection
975
+ self._async_pool = await self._create_async_pool()
976
+ self._async_client = redis.asyncio.Redis(connection_pool=self._async_pool)
977
+
978
+ # Test connection
979
+ if await self._async_client.ping():
980
+ self._connected = True
981
+ self._retry_count = 0
982
+ self._using_fallback = False
983
+ self._metrics.reconnections += 1
984
+ self._logger.info("Async Redis reconnection successful")
985
+ return True
986
+ except Exception as e:
987
+ self._logger.warning(f"Async reconnection attempt failed: {e}")
988
+
989
+ return False
990
+
991
+ @property
992
+ def client(self) -> "redis.Redis":
993
+ """Get sync Redis client with connection pooling.
994
+
995
+ Creates the connection pool and client on first access.
996
+ Handles reconnection on failure.
997
+
998
+ Returns:
999
+ Redis client instance.
1000
+ """
1001
+ if self._client is None or not self._connected:
1002
+ with self._lock:
1003
+ if self._client is None or not self._connected:
1004
+ try:
1005
+ self._pool = self._create_pool()
1006
+ self._client = redis.Redis(connection_pool=self._pool)
1007
+ # Test connection
1008
+ self._client.ping()
1009
+ self._connected = True
1010
+ self._retry_count = 0
1011
+ self._logger.debug("Redis sync client connected")
1012
+ except Exception as e:
1013
+ self._handle_redis_error(e, "client_init")
1014
+ raise
1015
+ return self._client
1016
+
1017
+ async def get_async_client(self) -> "redis.asyncio.Redis":
1018
+ """Get async Redis client with connection pooling.
1019
+
1020
+ Creates the async connection pool and client on first access.
1021
+
1022
+ Returns:
1023
+ Async Redis client instance.
1024
+ """
1025
+ import asyncio
1026
+
1027
+ if self._async_lock is None:
1028
+ self._async_lock = asyncio.Lock()
1029
+
1030
+ if self._async_client is None or not self._connected:
1031
+ async with self._async_lock:
1032
+ if self._async_client is None or not self._connected:
1033
+ try:
1034
+ self._async_pool = await self._create_async_pool()
1035
+ self._async_client = redis.asyncio.Redis(
1036
+ connection_pool=self._async_pool
1037
+ )
1038
+ # Test connection
1039
+ await self._async_client.ping()
1040
+ self._connected = True
1041
+ self._retry_count = 0
1042
+ self._logger.debug("Redis async client connected")
1043
+ except Exception as e:
1044
+ self._handle_redis_error(e, "async_client_init")
1045
+ raise
1046
+ return self._async_client
1047
+
1048
+ def _register_lua_scripts(self, client: "redis.Redis") -> None:
1049
+ """Register Lua scripts with Redis.
1050
+
1051
+ Args:
1052
+ client: Redis client instance.
1053
+ """
1054
+ if self._state_transition_sha is None:
1055
+ self._state_transition_sha = client.script_load(self.LUA_STATE_TRANSITION)
1056
+
1057
+ async def _register_lua_scripts_async(
1058
+ self, client: "redis.asyncio.Redis"
1059
+ ) -> None:
1060
+ """Register Lua scripts with Redis asynchronously.
1061
+
1062
+ Args:
1063
+ client: Async Redis client instance.
1064
+ """
1065
+ if self._state_transition_sha is None:
1066
+ self._state_transition_sha = await client.script_load(
1067
+ self.LUA_STATE_TRANSITION
1068
+ )
1069
+
1070
+ def _serialize_policy(self, policy: EscalationPolicy) -> str:
1071
+ """Serialize policy to JSON string.
1072
+
1073
+ Args:
1074
+ policy: Policy to serialize.
1075
+
1076
+ Returns:
1077
+ JSON string.
1078
+ """
1079
+ return json.dumps(policy.to_dict())
1080
+
1081
+ def _deserialize_policy(self, data: str) -> EscalationPolicy:
1082
+ """Deserialize policy from JSON string.
1083
+
1084
+ Args:
1085
+ data: JSON string.
1086
+
1087
+ Returns:
1088
+ EscalationPolicy instance.
1089
+ """
1090
+ return EscalationPolicy.from_dict(json.loads(data))
1091
+
1092
+ def _serialize_incident(self, incident: EscalationIncident) -> str:
1093
+ """Serialize incident to JSON string.
1094
+
1095
+ Args:
1096
+ incident: Incident to serialize.
1097
+
1098
+ Returns:
1099
+ JSON string.
1100
+ """
1101
+ return json.dumps(incident.to_dict())
1102
+
1103
+ def _deserialize_incident(self, data: str) -> EscalationIncident:
1104
+ """Deserialize incident from JSON string.
1105
+
1106
+ Args:
1107
+ data: JSON string.
1108
+
1109
+ Returns:
1110
+ EscalationIncident instance.
1111
+ """
1112
+ return EscalationIncident.from_dict(json.loads(data))
1113
+
1114
+ def _publish_incident_update(
1115
+ self,
1116
+ client: "redis.Redis",
1117
+ incident: EscalationIncident,
1118
+ event_type: str,
1119
+ ) -> None:
1120
+ """Publish incident update via Pub/Sub.
1121
+
1122
+ Args:
1123
+ client: Redis client.
1124
+ incident: Updated incident.
1125
+ event_type: Type of event (created, updated, state_changed, etc.).
1126
+ """
1127
+ if not self.enable_pubsub:
1128
+ return
1129
+
1130
+ try:
1131
+ message = json.dumps({
1132
+ "event_type": event_type,
1133
+ "incident_id": incident.id,
1134
+ "incident_ref": incident.incident_ref,
1135
+ "policy_id": incident.policy_id,
1136
+ "state": incident.state.value,
1137
+ "current_level": incident.current_level,
1138
+ "timestamp": datetime.utcnow().isoformat(),
1139
+ })
1140
+ channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
1141
+ client.publish(channel, message)
1142
+ self._metrics.pubsub_publishes += 1
1143
+ except Exception as e:
1144
+ self._logger.warning(f"Failed to publish incident update: {e}")
1145
+
1146
+ async def _publish_incident_update_async(
1147
+ self,
1148
+ client: "redis.asyncio.Redis",
1149
+ incident: EscalationIncident,
1150
+ event_type: str,
1151
+ ) -> None:
1152
+ """Publish incident update via Pub/Sub asynchronously.
1153
+
1154
+ Args:
1155
+ client: Async Redis client.
1156
+ incident: Updated incident.
1157
+ event_type: Type of event.
1158
+ """
1159
+ if not self.enable_pubsub:
1160
+ return
1161
+
1162
+ try:
1163
+ message = json.dumps({
1164
+ "event_type": event_type,
1165
+ "incident_id": incident.id,
1166
+ "incident_ref": incident.incident_ref,
1167
+ "policy_id": incident.policy_id,
1168
+ "state": incident.state.value,
1169
+ "current_level": incident.current_level,
1170
+ "timestamp": datetime.utcnow().isoformat(),
1171
+ })
1172
+ channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
1173
+ await client.publish(channel, message)
1174
+ self._metrics.pubsub_publishes += 1
1175
+ except Exception as e:
1176
+ self._logger.warning(f"Failed to publish incident update: {e}")
1177
+
1178
+ # =========================================================================
1179
+ # Policy Operations
1180
+ # =========================================================================
1181
+
1182
+ def save_policy(self, policy: EscalationPolicy) -> str:
1183
+ """Save or update a policy.
1184
+
1185
+ Args:
1186
+ policy: Policy to save.
1187
+
1188
+ Returns:
1189
+ Policy ID.
1190
+ """
1191
+ start_time = time.time()
1192
+
1193
+ if self._using_fallback and self.enable_fallback:
1194
+ return self._get_fallback_store().save_policy(policy)
1195
+
1196
+ try:
1197
+ client = self.client
1198
+
1199
+ # Generate ID if not present
1200
+ if not policy.id:
1201
+ policy.id = str(uuid.uuid4())
1202
+
1203
+ # Use pipeline for atomicity
1204
+ pipe = client.pipeline()
1205
+
1206
+ # Store policy
1207
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy.id)
1208
+ pipe.set(policy_key, self._serialize_policy(policy))
1209
+
1210
+ # Update indices
1211
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1212
+ pipe.sadd(index_key, policy.id)
1213
+
1214
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
1215
+ pipe.set(name_key, policy.id)
1216
+
1217
+ active_key = self._get_key(self.KEY_POLICY_ACTIVE)
1218
+ if policy.is_active:
1219
+ pipe.sadd(active_key, policy.id)
1220
+ else:
1221
+ pipe.srem(active_key, policy.id)
1222
+
1223
+ pipe.execute()
1224
+
1225
+ self._metrics.policy_saves += 1
1226
+ latency_ms = (time.time() - start_time) * 1000
1227
+ self._metrics.record_latency(latency_ms)
1228
+
1229
+ return policy.id
1230
+
1231
+ except Exception as e:
1232
+ self._handle_redis_error(e, "save_policy")
1233
+
1234
+ if self.enable_fallback:
1235
+ self._using_fallback = True
1236
+ self._metrics.fallbacks += 1
1237
+ self._logger.warning("Falling back to InMemory store")
1238
+ return self._get_fallback_store().save_policy(policy)
1239
+
1240
+ raise
1241
+
1242
+ async def save_policy_async(self, policy: EscalationPolicy) -> str:
1243
+ """Save or update a policy asynchronously.
1244
+
1245
+ Args:
1246
+ policy: Policy to save.
1247
+
1248
+ Returns:
1249
+ Policy ID.
1250
+ """
1251
+ start_time = time.time()
1252
+
1253
+ if self._using_fallback and self.enable_fallback:
1254
+ return self._get_fallback_store().save_policy(policy)
1255
+
1256
+ try:
1257
+ client = await self.get_async_client()
1258
+
1259
+ # Generate ID if not present
1260
+ if not policy.id:
1261
+ policy.id = str(uuid.uuid4())
1262
+
1263
+ # Use pipeline for atomicity
1264
+ pipe = client.pipeline()
1265
+
1266
+ # Store policy
1267
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy.id)
1268
+ pipe.set(policy_key, self._serialize_policy(policy))
1269
+
1270
+ # Update indices
1271
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1272
+ pipe.sadd(index_key, policy.id)
1273
+
1274
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
1275
+ pipe.set(name_key, policy.id)
1276
+
1277
+ active_key = self._get_key(self.KEY_POLICY_ACTIVE)
1278
+ if policy.is_active:
1279
+ pipe.sadd(active_key, policy.id)
1280
+ else:
1281
+ pipe.srem(active_key, policy.id)
1282
+
1283
+ await pipe.execute()
1284
+
1285
+ self._metrics.policy_saves += 1
1286
+ latency_ms = (time.time() - start_time) * 1000
1287
+ self._metrics.record_latency(latency_ms)
1288
+
1289
+ return policy.id
1290
+
1291
+ except Exception as e:
1292
+ self._handle_redis_error(e, "save_policy_async")
1293
+
1294
+ if self.enable_fallback:
1295
+ self._using_fallback = True
1296
+ self._metrics.fallbacks += 1
1297
+ return self._get_fallback_store().save_policy(policy)
1298
+
1299
+ raise
1300
+
1301
+ def get_policy(self, policy_id: str) -> EscalationPolicy | None:
1302
+ """Get policy by ID.
1303
+
1304
+ Args:
1305
+ policy_id: Policy ID.
1306
+
1307
+ Returns:
1308
+ Policy if found, None otherwise.
1309
+ """
1310
+ start_time = time.time()
1311
+
1312
+ if self._using_fallback and self.enable_fallback:
1313
+ return self._get_fallback_store().get_policy(policy_id)
1314
+
1315
+ try:
1316
+ client = self.client
1317
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
1318
+ data = client.get(policy_key)
1319
+
1320
+ self._metrics.policy_gets += 1
1321
+ latency_ms = (time.time() - start_time) * 1000
1322
+ self._metrics.record_latency(latency_ms)
1323
+
1324
+ if not data:
1325
+ return None
1326
+
1327
+ return self._deserialize_policy(data)
1328
+
1329
+ except Exception as e:
1330
+ self._handle_redis_error(e, "get_policy")
1331
+
1332
+ if self.enable_fallback:
1333
+ self._using_fallback = True
1334
+ self._metrics.fallbacks += 1
1335
+ return self._get_fallback_store().get_policy(policy_id)
1336
+
1337
+ raise
1338
+
1339
+ async def get_policy_async(self, policy_id: str) -> EscalationPolicy | None:
1340
+ """Get policy by ID asynchronously.
1341
+
1342
+ Args:
1343
+ policy_id: Policy ID.
1344
+
1345
+ Returns:
1346
+ Policy if found, None otherwise.
1347
+ """
1348
+ start_time = time.time()
1349
+
1350
+ if self._using_fallback and self.enable_fallback:
1351
+ return self._get_fallback_store().get_policy(policy_id)
1352
+
1353
+ try:
1354
+ client = await self.get_async_client()
1355
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
1356
+ data = await client.get(policy_key)
1357
+
1358
+ self._metrics.policy_gets += 1
1359
+ latency_ms = (time.time() - start_time) * 1000
1360
+ self._metrics.record_latency(latency_ms)
1361
+
1362
+ if not data:
1363
+ return None
1364
+
1365
+ return self._deserialize_policy(data)
1366
+
1367
+ except Exception as e:
1368
+ self._handle_redis_error(e, "get_policy_async")
1369
+
1370
+ if self.enable_fallback:
1371
+ self._using_fallback = True
1372
+ self._metrics.fallbacks += 1
1373
+ return self._get_fallback_store().get_policy(policy_id)
1374
+
1375
+ raise
1376
+
1377
+ def get_policy_by_name(self, name: str) -> EscalationPolicy | None:
1378
+ """Get policy by name.
1379
+
1380
+ Args:
1381
+ name: Policy name.
1382
+
1383
+ Returns:
1384
+ Policy if found, None otherwise.
1385
+ """
1386
+ if self._using_fallback and self.enable_fallback:
1387
+ return self._get_fallback_store().get_policy_by_name(name)
1388
+
1389
+ try:
1390
+ client = self.client
1391
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=name)
1392
+ policy_id = client.get(name_key)
1393
+
1394
+ if not policy_id:
1395
+ return None
1396
+
1397
+ return self.get_policy(policy_id)
1398
+
1399
+ except Exception as e:
1400
+ self._handle_redis_error(e, "get_policy_by_name")
1401
+
1402
+ if self.enable_fallback:
1403
+ self._using_fallback = True
1404
+ self._metrics.fallbacks += 1
1405
+ return self._get_fallback_store().get_policy_by_name(name)
1406
+
1407
+ raise
1408
+
1409
+ async def get_policy_by_name_async(self, name: str) -> EscalationPolicy | None:
1410
+ """Get policy by name asynchronously.
1411
+
1412
+ Args:
1413
+ name: Policy name.
1414
+
1415
+ Returns:
1416
+ Policy if found, None otherwise.
1417
+ """
1418
+ if self._using_fallback and self.enable_fallback:
1419
+ return self._get_fallback_store().get_policy_by_name(name)
1420
+
1421
+ try:
1422
+ client = await self.get_async_client()
1423
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=name)
1424
+ policy_id = await client.get(name_key)
1425
+
1426
+ if not policy_id:
1427
+ return None
1428
+
1429
+ return await self.get_policy_async(policy_id)
1430
+
1431
+ except Exception as e:
1432
+ self._handle_redis_error(e, "get_policy_by_name_async")
1433
+
1434
+ if self.enable_fallback:
1435
+ self._using_fallback = True
1436
+ self._metrics.fallbacks += 1
1437
+ return self._get_fallback_store().get_policy_by_name(name)
1438
+
1439
+ raise
1440
+
1441
+ def list_policies(self, active_only: bool = True) -> list[EscalationPolicy]:
1442
+ """List all policies.
1443
+
1444
+ Args:
1445
+ active_only: If True, only return active policies.
1446
+
1447
+ Returns:
1448
+ List of policies.
1449
+ """
1450
+ if self._using_fallback and self.enable_fallback:
1451
+ return self._get_fallback_store().list_policies(active_only)
1452
+
1453
+ try:
1454
+ client = self.client
1455
+
1456
+ if active_only:
1457
+ index_key = self._get_key(self.KEY_POLICY_ACTIVE)
1458
+ else:
1459
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1460
+
1461
+ policy_ids = client.smembers(index_key)
1462
+ policies = []
1463
+
1464
+ for policy_id in policy_ids:
1465
+ policy = self.get_policy(policy_id)
1466
+ if policy:
1467
+ policies.append(policy)
1468
+
1469
+ return policies
1470
+
1471
+ except Exception as e:
1472
+ self._handle_redis_error(e, "list_policies")
1473
+
1474
+ if self.enable_fallback:
1475
+ self._using_fallback = True
1476
+ self._metrics.fallbacks += 1
1477
+ return self._get_fallback_store().list_policies(active_only)
1478
+
1479
+ raise
1480
+
1481
+ async def list_policies_async(
1482
+ self, active_only: bool = True
1483
+ ) -> list[EscalationPolicy]:
1484
+ """List all policies asynchronously.
1485
+
1486
+ Args:
1487
+ active_only: If True, only return active policies.
1488
+
1489
+ Returns:
1490
+ List of policies.
1491
+ """
1492
+ if self._using_fallback and self.enable_fallback:
1493
+ return self._get_fallback_store().list_policies(active_only)
1494
+
1495
+ try:
1496
+ client = await self.get_async_client()
1497
+
1498
+ if active_only:
1499
+ index_key = self._get_key(self.KEY_POLICY_ACTIVE)
1500
+ else:
1501
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1502
+
1503
+ policy_ids = await client.smembers(index_key)
1504
+ policies = []
1505
+
1506
+ for policy_id in policy_ids:
1507
+ policy = await self.get_policy_async(policy_id)
1508
+ if policy:
1509
+ policies.append(policy)
1510
+
1511
+ return policies
1512
+
1513
+ except Exception as e:
1514
+ self._handle_redis_error(e, "list_policies_async")
1515
+
1516
+ if self.enable_fallback:
1517
+ self._using_fallback = True
1518
+ self._metrics.fallbacks += 1
1519
+ return self._get_fallback_store().list_policies(active_only)
1520
+
1521
+ raise
1522
+
1523
+ def delete_policy(self, policy_id: str) -> bool:
1524
+ """Delete a policy.
1525
+
1526
+ Args:
1527
+ policy_id: Policy ID to delete.
1528
+
1529
+ Returns:
1530
+ True if deleted, False if not found.
1531
+ """
1532
+ if self._using_fallback and self.enable_fallback:
1533
+ return self._get_fallback_store().delete_policy(policy_id)
1534
+
1535
+ try:
1536
+ client = self.client
1537
+
1538
+ # Get policy first to get name for index cleanup
1539
+ policy = self.get_policy(policy_id)
1540
+ if not policy:
1541
+ return False
1542
+
1543
+ pipe = client.pipeline()
1544
+
1545
+ # Delete policy
1546
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
1547
+ pipe.delete(policy_key)
1548
+
1549
+ # Remove from indices
1550
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1551
+ pipe.srem(index_key, policy_id)
1552
+
1553
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
1554
+ pipe.delete(name_key)
1555
+
1556
+ active_key = self._get_key(self.KEY_POLICY_ACTIVE)
1557
+ pipe.srem(active_key, policy_id)
1558
+
1559
+ pipe.execute()
1560
+
1561
+ self._metrics.policy_deletes += 1
1562
+ return True
1563
+
1564
+ except Exception as e:
1565
+ self._handle_redis_error(e, "delete_policy")
1566
+
1567
+ if self.enable_fallback:
1568
+ self._using_fallback = True
1569
+ self._metrics.fallbacks += 1
1570
+ return self._get_fallback_store().delete_policy(policy_id)
1571
+
1572
+ raise
1573
+
1574
+ async def delete_policy_async(self, policy_id: str) -> bool:
1575
+ """Delete a policy asynchronously.
1576
+
1577
+ Args:
1578
+ policy_id: Policy ID to delete.
1579
+
1580
+ Returns:
1581
+ True if deleted, False if not found.
1582
+ """
1583
+ if self._using_fallback and self.enable_fallback:
1584
+ return self._get_fallback_store().delete_policy(policy_id)
1585
+
1586
+ try:
1587
+ client = await self.get_async_client()
1588
+
1589
+ # Get policy first to get name for index cleanup
1590
+ policy = await self.get_policy_async(policy_id)
1591
+ if not policy:
1592
+ return False
1593
+
1594
+ pipe = client.pipeline()
1595
+
1596
+ # Delete policy
1597
+ policy_key = self._get_key(self.KEY_POLICY, policy_id=policy_id)
1598
+ pipe.delete(policy_key)
1599
+
1600
+ # Remove from indices
1601
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
1602
+ pipe.srem(index_key, policy_id)
1603
+
1604
+ name_key = self._get_key(self.KEY_POLICY_BY_NAME, name=policy.name)
1605
+ pipe.delete(name_key)
1606
+
1607
+ active_key = self._get_key(self.KEY_POLICY_ACTIVE)
1608
+ pipe.srem(active_key, policy_id)
1609
+
1610
+ await pipe.execute()
1611
+
1612
+ self._metrics.policy_deletes += 1
1613
+ return True
1614
+
1615
+ except Exception as e:
1616
+ self._handle_redis_error(e, "delete_policy_async")
1617
+
1618
+ if self.enable_fallback:
1619
+ self._using_fallback = True
1620
+ self._metrics.fallbacks += 1
1621
+ return self._get_fallback_store().delete_policy(policy_id)
1622
+
1623
+ raise
1624
+
1625
+ # =========================================================================
1626
+ # Incident Operations
1627
+ # =========================================================================
1628
+
1629
    def save_incident(self, incident: EscalationIncident) -> str:
        """Save or update an incident.

        Persists the serialized incident and refreshes every secondary
        index (global index, external-ref key, per-policy set, per-state
        sets, created-at sorted set, pending-escalation sorted set) in a
        single Redis pipeline. On Redis failure, permanently switches to
        the in-memory fallback store when fallback is enabled.

        Args:
            incident: Incident to save. ``incident.id`` is generated when
                empty; ``incident.updated_at`` is always refreshed.

        Returns:
            Incident ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().save_incident(incident)

        try:
            client = self.client
            # An empty ID marks a brand-new incident (drives the pub/sub
            # event type below); capture this before generating the ID.
            is_new = not incident.id

            # Generate ID if not present
            if not incident.id:
                incident.id = str(uuid.uuid4())

            # NOTE(review): naive UTC timestamp (datetime.utcnow()); assumes
            # all datetimes handled by this store are naive UTC — confirm.
            incident.updated_at = datetime.utcnow()

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store incident
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
            pipe.set(incident_key, self._serialize_incident(incident))

            # Set TTL for resolved incidents
            if incident.state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                pipe.expire(incident_key, self.resolved_ttl)

            # Update indices
            index_key = self._get_key(self.KEY_INCIDENT_INDEX)
            pipe.sadd(index_key, incident.id)

            ref_key = self._get_key(
                self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
            )
            pipe.set(ref_key, incident.id)

            policy_key = self._get_key(
                self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
            )
            pipe.sadd(policy_key, incident.id)

            # Update state index (remove from other states first, so the
            # incident lives in exactly one per-state set)
            for state in EscalationState:
                state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
                if state == incident.state:
                    pipe.sadd(state_key, incident.id)
                else:
                    pipe.srem(state_key, incident.id)

            # Update created_at sorted set
            created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
            created_score = incident.created_at.timestamp()
            pipe.zadd(created_key, {incident.id: created_score})

            # Update pending escalation sorted set: only TRIGGERED/ESCALATED
            # incidents with a scheduled next escalation are eligible;
            # everything else is removed from the pending set.
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            if incident.state in [EscalationState.TRIGGERED, EscalationState.ESCALATED]:
                if incident.next_escalation_at:
                    score = incident.next_escalation_at.timestamp()
                    pipe.zadd(pending_key, {incident.id: score})
                else:
                    pipe.zrem(pending_key, incident.id)
            else:
                pipe.zrem(pending_key, incident.id)

            pipe.execute()

            # Publish update so subscribers see creates vs updates distinctly
            event_type = "created" if is_new else "updated"
            self._publish_incident_update(client, incident, event_type)

            self._metrics.incident_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return incident.id

        except Exception as e:
            self._handle_redis_error(e, "save_incident")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                self._logger.warning("Falling back to InMemory store")
                return self._get_fallback_store().save_incident(incident)

            raise
1724
+
1725
    async def save_incident_async(self, incident: EscalationIncident) -> str:
        """Save or update an incident asynchronously.

        Async mirror of :meth:`save_incident`: serializes the incident and
        refreshes all secondary indices in one Redis pipeline, then
        publishes a "created"/"updated" event.

        Args:
            incident: Incident to save. ``incident.id`` is generated when
                empty; ``incident.updated_at`` is always refreshed.

        Returns:
            Incident ID.
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            # Fallback store is synchronous; called directly.
            return self._get_fallback_store().save_incident(incident)

        try:
            client = await self.get_async_client()
            # Empty ID marks a new incident (drives the pub/sub event type).
            is_new = not incident.id

            # Generate ID if not present
            if not incident.id:
                incident.id = str(uuid.uuid4())

            # NOTE(review): naive UTC timestamp, consistent with the sync path.
            incident.updated_at = datetime.utcnow()

            # Use pipeline for atomicity
            pipe = client.pipeline()

            # Store incident
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
            pipe.set(incident_key, self._serialize_incident(incident))

            # Set TTL for resolved incidents
            if incident.state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                pipe.expire(incident_key, self.resolved_ttl)

            # Update indices
            index_key = self._get_key(self.KEY_INCIDENT_INDEX)
            pipe.sadd(index_key, incident.id)

            ref_key = self._get_key(
                self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
            )
            pipe.set(ref_key, incident.id)

            policy_key = self._get_key(
                self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
            )
            pipe.sadd(policy_key, incident.id)

            # Update state index (membership in exactly one per-state set)
            for state in EscalationState:
                state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
                if state == incident.state:
                    pipe.sadd(state_key, incident.id)
                else:
                    pipe.srem(state_key, incident.id)

            # Update created_at sorted set
            created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
            created_score = incident.created_at.timestamp()
            pipe.zadd(created_key, {incident.id: created_score})

            # Update pending escalation sorted set (see save_incident)
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
            if incident.state in [EscalationState.TRIGGERED, EscalationState.ESCALATED]:
                if incident.next_escalation_at:
                    score = incident.next_escalation_at.timestamp()
                    pipe.zadd(pending_key, {incident.id: score})
                else:
                    pipe.zrem(pending_key, incident.id)
            else:
                pipe.zrem(pending_key, incident.id)

            await pipe.execute()

            # Publish update
            event_type = "created" if is_new else "updated"
            await self._publish_incident_update_async(client, incident, event_type)

            self._metrics.incident_saves += 1
            latency_ms = (time.time() - start_time) * 1000
            self._metrics.record_latency(latency_ms)

            return incident.id

        except Exception as e:
            self._handle_redis_error(e, "save_incident_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().save_incident(incident)

            raise
1819
+
1820
+ def get_incident(self, incident_id: str) -> EscalationIncident | None:
1821
+ """Get incident by ID.
1822
+
1823
+ Args:
1824
+ incident_id: Incident ID.
1825
+
1826
+ Returns:
1827
+ Incident if found, None otherwise.
1828
+ """
1829
+ start_time = time.time()
1830
+
1831
+ if self._using_fallback and self.enable_fallback:
1832
+ return self._get_fallback_store().get_incident(incident_id)
1833
+
1834
+ try:
1835
+ client = self.client
1836
+ incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
1837
+ data = client.get(incident_key)
1838
+
1839
+ self._metrics.incident_gets += 1
1840
+ latency_ms = (time.time() - start_time) * 1000
1841
+ self._metrics.record_latency(latency_ms)
1842
+
1843
+ if not data:
1844
+ return None
1845
+
1846
+ return self._deserialize_incident(data)
1847
+
1848
+ except Exception as e:
1849
+ self._handle_redis_error(e, "get_incident")
1850
+
1851
+ if self.enable_fallback:
1852
+ self._using_fallback = True
1853
+ self._metrics.fallbacks += 1
1854
+ return self._get_fallback_store().get_incident(incident_id)
1855
+
1856
+ raise
1857
+
1858
+ async def get_incident_async(self, incident_id: str) -> EscalationIncident | None:
1859
+ """Get incident by ID asynchronously.
1860
+
1861
+ Args:
1862
+ incident_id: Incident ID.
1863
+
1864
+ Returns:
1865
+ Incident if found, None otherwise.
1866
+ """
1867
+ start_time = time.time()
1868
+
1869
+ if self._using_fallback and self.enable_fallback:
1870
+ return self._get_fallback_store().get_incident(incident_id)
1871
+
1872
+ try:
1873
+ client = await self.get_async_client()
1874
+ incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
1875
+ data = await client.get(incident_key)
1876
+
1877
+ self._metrics.incident_gets += 1
1878
+ latency_ms = (time.time() - start_time) * 1000
1879
+ self._metrics.record_latency(latency_ms)
1880
+
1881
+ if not data:
1882
+ return None
1883
+
1884
+ return self._deserialize_incident(data)
1885
+
1886
+ except Exception as e:
1887
+ self._handle_redis_error(e, "get_incident_async")
1888
+
1889
+ if self.enable_fallback:
1890
+ self._using_fallback = True
1891
+ self._metrics.fallbacks += 1
1892
+ return self._get_fallback_store().get_incident(incident_id)
1893
+
1894
+ raise
1895
+
1896
+ def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
1897
+ """Get incident by external reference.
1898
+
1899
+ Args:
1900
+ incident_ref: External reference.
1901
+
1902
+ Returns:
1903
+ Incident if found, None otherwise.
1904
+ """
1905
+ if self._using_fallback and self.enable_fallback:
1906
+ return self._get_fallback_store().get_incident_by_ref(incident_ref)
1907
+
1908
+ try:
1909
+ client = self.client
1910
+ ref_key = self._get_key(self.KEY_INCIDENT_BY_REF, incident_ref=incident_ref)
1911
+ incident_id = client.get(ref_key)
1912
+
1913
+ if not incident_id:
1914
+ return None
1915
+
1916
+ return self.get_incident(incident_id)
1917
+
1918
+ except Exception as e:
1919
+ self._handle_redis_error(e, "get_incident_by_ref")
1920
+
1921
+ if self.enable_fallback:
1922
+ self._using_fallback = True
1923
+ self._metrics.fallbacks += 1
1924
+ return self._get_fallback_store().get_incident_by_ref(incident_ref)
1925
+
1926
+ raise
1927
+
1928
+ async def get_incident_by_ref_async(
1929
+ self, incident_ref: str
1930
+ ) -> EscalationIncident | None:
1931
+ """Get incident by external reference asynchronously.
1932
+
1933
+ Args:
1934
+ incident_ref: External reference.
1935
+
1936
+ Returns:
1937
+ Incident if found, None otherwise.
1938
+ """
1939
+ if self._using_fallback and self.enable_fallback:
1940
+ return self._get_fallback_store().get_incident_by_ref(incident_ref)
1941
+
1942
+ try:
1943
+ client = await self.get_async_client()
1944
+ ref_key = self._get_key(self.KEY_INCIDENT_BY_REF, incident_ref=incident_ref)
1945
+ incident_id = await client.get(ref_key)
1946
+
1947
+ if not incident_id:
1948
+ return None
1949
+
1950
+ return await self.get_incident_async(incident_id)
1951
+
1952
+ except Exception as e:
1953
+ self._handle_redis_error(e, "get_incident_by_ref_async")
1954
+
1955
+ if self.enable_fallback:
1956
+ self._using_fallback = True
1957
+ self._metrics.fallbacks += 1
1958
+ return self._get_fallback_store().get_incident_by_ref(incident_ref)
1959
+
1960
+ raise
1961
+
1962
    def list_incidents(
        self,
        policy_id: str | None = None,
        states: list[EscalationState] | None = None,
    ) -> list[EscalationIncident]:
        """List incidents with optional filters.

        Candidate IDs come from one index (policy set, union of state
        sets, or the global index); the fetched incidents are then
        re-filtered in Python, since only one index is consulted when both
        filters are supplied.

        Args:
            policy_id: Filter by policy ID.
            states: Filter by states.

        Returns:
            List of incidents.
        """
        if self._using_fallback and self.enable_fallback:
            return self._get_fallback_store().list_incidents(policy_id, states)

        try:
            client = self.client
            incident_ids: set[str] = set()

            # Get IDs based on filters
            if policy_id:
                policy_key = self._get_key(
                    self.KEY_INCIDENT_BY_POLICY, policy_id=policy_id
                )
                incident_ids = client.smembers(policy_key)
            elif states:
                # Get incidents from state indices and union them
                # (an incident matching ANY requested state is included)
                for i, state in enumerate(states):
                    state_key = self._get_key(
                        self.KEY_INCIDENT_BY_STATE, state=state.value
                    )
                    state_ids = client.smembers(state_key)
                    if i == 0:
                        incident_ids = state_ids
                    else:
                        incident_ids = incident_ids.union(state_ids)
            else:
                index_key = self._get_key(self.KEY_INCIDENT_INDEX)
                incident_ids = client.smembers(index_key)

            # Fetch incidents
            incidents = []
            for incident_id in incident_ids:
                incident = self.get_incident(incident_id)
                if incident:
                    # Apply additional filters if needed (covers the case
                    # where both policy_id and states were given)
                    if states and incident.state not in states:
                        continue
                    if policy_id and incident.policy_id != policy_id:
                        continue
                    incidents.append(incident)

            return incidents

        except Exception as e:
            self._handle_redis_error(e, "list_incidents")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                return self._get_fallback_store().list_incidents(policy_id, states)

            raise
2027
+
2028
+ async def list_incidents_async(
2029
+ self,
2030
+ policy_id: str | None = None,
2031
+ states: list[EscalationState] | None = None,
2032
+ ) -> list[EscalationIncident]:
2033
+ """List incidents with optional filters asynchronously.
2034
+
2035
+ Args:
2036
+ policy_id: Filter by policy ID.
2037
+ states: Filter by states.
2038
+
2039
+ Returns:
2040
+ List of incidents.
2041
+ """
2042
+ if self._using_fallback and self.enable_fallback:
2043
+ return self._get_fallback_store().list_incidents(policy_id, states)
2044
+
2045
+ try:
2046
+ client = await self.get_async_client()
2047
+ incident_ids: set[str] = set()
2048
+
2049
+ # Get IDs based on filters
2050
+ if policy_id:
2051
+ policy_key = self._get_key(
2052
+ self.KEY_INCIDENT_BY_POLICY, policy_id=policy_id
2053
+ )
2054
+ incident_ids = await client.smembers(policy_key)
2055
+ elif states:
2056
+ # Get incidents from state indices and union
2057
+ for i, state in enumerate(states):
2058
+ state_key = self._get_key(
2059
+ self.KEY_INCIDENT_BY_STATE, state=state.value
2060
+ )
2061
+ state_ids = await client.smembers(state_key)
2062
+ if i == 0:
2063
+ incident_ids = state_ids
2064
+ else:
2065
+ incident_ids = incident_ids.union(state_ids)
2066
+ else:
2067
+ index_key = self._get_key(self.KEY_INCIDENT_INDEX)
2068
+ incident_ids = await client.smembers(index_key)
2069
+
2070
+ # Fetch incidents
2071
+ incidents = []
2072
+ for incident_id in incident_ids:
2073
+ incident = await self.get_incident_async(incident_id)
2074
+ if incident:
2075
+ # Apply additional filters if needed
2076
+ if states and incident.state not in states:
2077
+ continue
2078
+ if policy_id and incident.policy_id != policy_id:
2079
+ continue
2080
+ incidents.append(incident)
2081
+
2082
+ return incidents
2083
+
2084
+ except Exception as e:
2085
+ self._handle_redis_error(e, "list_incidents_async")
2086
+
2087
+ if self.enable_fallback:
2088
+ self._using_fallback = True
2089
+ self._metrics.fallbacks += 1
2090
+ return self._get_fallback_store().list_incidents(policy_id, states)
2091
+
2092
+ raise
2093
+
2094
+ def get_pending_escalations(self) -> list[EscalationIncident]:
2095
+ """Get incidents due for escalation.
2096
+
2097
+ Returns:
2098
+ List of incidents due for escalation.
2099
+ """
2100
+ if self._using_fallback and self.enable_fallback:
2101
+ return self._get_fallback_store().get_pending_escalations()
2102
+
2103
+ try:
2104
+ client = self.client
2105
+ now = datetime.utcnow().timestamp()
2106
+
2107
+ # Get incident IDs from pending sorted set where score <= now
2108
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2109
+ incident_ids = client.zrangebyscore(pending_key, "-inf", now)
2110
+
2111
+ incidents = []
2112
+ for incident_id in incident_ids:
2113
+ incident = self.get_incident(incident_id)
2114
+ if incident and incident.state in [
2115
+ EscalationState.TRIGGERED,
2116
+ EscalationState.ESCALATED,
2117
+ ]:
2118
+ incidents.append(incident)
2119
+
2120
+ return incidents
2121
+
2122
+ except Exception as e:
2123
+ self._handle_redis_error(e, "get_pending_escalations")
2124
+
2125
+ if self.enable_fallback:
2126
+ self._using_fallback = True
2127
+ self._metrics.fallbacks += 1
2128
+ return self._get_fallback_store().get_pending_escalations()
2129
+
2130
+ raise
2131
+
2132
+ async def get_pending_escalations_async(self) -> list[EscalationIncident]:
2133
+ """Get incidents due for escalation asynchronously.
2134
+
2135
+ Returns:
2136
+ List of incidents due for escalation.
2137
+ """
2138
+ if self._using_fallback and self.enable_fallback:
2139
+ return self._get_fallback_store().get_pending_escalations()
2140
+
2141
+ try:
2142
+ client = await self.get_async_client()
2143
+ now = datetime.utcnow().timestamp()
2144
+
2145
+ # Get incident IDs from pending sorted set where score <= now
2146
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2147
+ incident_ids = await client.zrangebyscore(pending_key, "-inf", now)
2148
+
2149
+ incidents = []
2150
+ for incident_id in incident_ids:
2151
+ incident = await self.get_incident_async(incident_id)
2152
+ if incident and incident.state in [
2153
+ EscalationState.TRIGGERED,
2154
+ EscalationState.ESCALATED,
2155
+ ]:
2156
+ incidents.append(incident)
2157
+
2158
+ return incidents
2159
+
2160
+ except Exception as e:
2161
+ self._handle_redis_error(e, "get_pending_escalations_async")
2162
+
2163
+ if self.enable_fallback:
2164
+ self._using_fallback = True
2165
+ self._metrics.fallbacks += 1
2166
+ return self._get_fallback_store().get_pending_escalations()
2167
+
2168
+ raise
2169
+
2170
+ # =========================================================================
2171
+ # Atomic State Transition
2172
+ # =========================================================================
2173
+
2174
+ def transition_state(
2175
+ self,
2176
+ incident_id: str,
2177
+ new_state: EscalationState,
2178
+ **updates: Any,
2179
+ ) -> EscalationIncident | None:
2180
+ """Atomically transition incident state using Lua script.
2181
+
2182
+ This ensures that state transitions are atomic and consistent,
2183
+ even under concurrent access.
2184
+
2185
+ Args:
2186
+ incident_id: Incident ID.
2187
+ new_state: New state to transition to.
2188
+ **updates: Additional fields to update on the incident.
2189
+
2190
+ Returns:
2191
+ Updated incident if successful, None if not found.
2192
+ """
2193
+ start_time = time.time()
2194
+
2195
+ if self._using_fallback and self.enable_fallback:
2196
+ incident = self._get_fallback_store().get_incident(incident_id)
2197
+ if not incident:
2198
+ return None
2199
+ old_state = incident.state
2200
+ incident.state = new_state
2201
+ for key, value in updates.items():
2202
+ if hasattr(incident, key):
2203
+ setattr(incident, key, value)
2204
+ self._get_fallback_store().save_incident(incident)
2205
+ return incident
2206
+
2207
+ try:
2208
+ client = self.client
2209
+ self._register_lua_scripts(client)
2210
+
2211
+ # Get current incident
2212
+ incident = self.get_incident(incident_id)
2213
+ if not incident:
2214
+ return None
2215
+
2216
+ old_state = incident.state
2217
+
2218
+ # Update incident
2219
+ incident.state = new_state
2220
+ incident.updated_at = datetime.utcnow()
2221
+ for key, value in updates.items():
2222
+ if hasattr(incident, key):
2223
+ setattr(incident, key, value)
2224
+
2225
+ # Prepare keys and args for Lua script
2226
+ incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
2227
+ old_state_key = self._get_key(
2228
+ self.KEY_INCIDENT_BY_STATE, state=old_state.value
2229
+ )
2230
+ new_state_key = self._get_key(
2231
+ self.KEY_INCIDENT_BY_STATE, state=new_state.value
2232
+ )
2233
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2234
+
2235
+ next_escalation_score = ""
2236
+ if incident.next_escalation_at and new_state in [
2237
+ EscalationState.TRIGGERED,
2238
+ EscalationState.ESCALATED,
2239
+ ]:
2240
+ next_escalation_score = str(incident.next_escalation_at.timestamp())
2241
+
2242
+ # Execute Lua script
2243
+ result = client.evalsha(
2244
+ self._state_transition_sha,
2245
+ 4, # Number of keys
2246
+ incident_key,
2247
+ old_state_key,
2248
+ new_state_key,
2249
+ pending_key,
2250
+ incident_id,
2251
+ new_state.value,
2252
+ self._serialize_incident(incident),
2253
+ next_escalation_score,
2254
+ )
2255
+
2256
+ if result == "OK":
2257
+ # Set TTL for resolved incidents
2258
+ if new_state == EscalationState.RESOLVED and self.resolved_ttl > 0:
2259
+ client.expire(incident_key, self.resolved_ttl)
2260
+
2261
+ # Publish state change
2262
+ self._publish_incident_update(client, incident, "state_changed")
2263
+
2264
+ self._metrics.state_transitions += 1
2265
+ latency_ms = (time.time() - start_time) * 1000
2266
+ self._metrics.record_latency(latency_ms)
2267
+
2268
+ return incident
2269
+
2270
+ return None
2271
+
2272
+ except Exception as e:
2273
+ self._handle_redis_error(e, "transition_state")
2274
+
2275
+ if self.enable_fallback:
2276
+ self._using_fallback = True
2277
+ self._metrics.fallbacks += 1
2278
+ # Fallback to non-atomic operation
2279
+ incident = self._get_fallback_store().get_incident(incident_id)
2280
+ if incident:
2281
+ incident.state = new_state
2282
+ for key, value in updates.items():
2283
+ if hasattr(incident, key):
2284
+ setattr(incident, key, value)
2285
+ self._get_fallback_store().save_incident(incident)
2286
+ return incident
2287
+
2288
+ raise
2289
+
2290
    async def transition_state_async(
        self,
        incident_id: str,
        new_state: EscalationState,
        **updates: Any,
    ) -> EscalationIncident | None:
        """Atomically transition incident state using Lua script asynchronously.

        Async mirror of :meth:`transition_state`: the Lua script rewrites
        the payload, moves the ID between per-state sets, and adjusts the
        pending sorted set in one server-side step. Fallback mode degrades
        to a non-atomic read-modify-write on the in-memory store.

        Args:
            incident_id: Incident ID.
            new_state: New state to transition to.
            **updates: Additional fields to update on the incident. Only
                attribute names that already exist on the incident are applied.

        Returns:
            Updated incident if successful, None if not found (or if the
            Lua script did not report success).
        """
        start_time = time.time()

        if self._using_fallback and self.enable_fallback:
            # Non-atomic fallback: read-modify-write on the in-memory store.
            incident = self._get_fallback_store().get_incident(incident_id)
            if not incident:
                return None
            incident.state = new_state
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)
            self._get_fallback_store().save_incident(incident)
            return incident

        try:
            client = await self.get_async_client()
            await self._register_lua_scripts_async(client)

            # Get current incident
            incident = await self.get_incident_async(incident_id)
            if not incident:
                return None

            # Needed to compute which per-state index set to leave.
            old_state = incident.state

            # Update incident in memory before re-serializing it
            incident.state = new_state
            incident.updated_at = datetime.utcnow()
            for key, value in updates.items():
                if hasattr(incident, key):
                    setattr(incident, key, value)

            # Prepare keys and args for Lua script
            incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident_id)
            old_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=old_state.value
            )
            new_state_key = self._get_key(
                self.KEY_INCIDENT_BY_STATE, state=new_state.value
            )
            pending_key = self._get_key(self.KEY_INCIDENT_PENDING)

            # Empty score string signals "remove from pending" to the script.
            next_escalation_score = ""
            if incident.next_escalation_at and new_state in [
                EscalationState.TRIGGERED,
                EscalationState.ESCALATED,
            ]:
                next_escalation_score = str(incident.next_escalation_at.timestamp())

            # Execute Lua script
            result = await client.evalsha(
                self._state_transition_sha,
                4,  # Number of keys
                incident_key,
                old_state_key,
                new_state_key,
                pending_key,
                incident_id,
                new_state.value,
                self._serialize_incident(incident),
                next_escalation_score,
            )

            if result == "OK":
                # Set TTL for resolved incidents
                if new_state == EscalationState.RESOLVED and self.resolved_ttl > 0:
                    await client.expire(incident_key, self.resolved_ttl)

                # Publish state change
                await self._publish_incident_update_async(
                    client, incident, "state_changed"
                )

                self._metrics.state_transitions += 1
                latency_ms = (time.time() - start_time) * 1000
                self._metrics.record_latency(latency_ms)

                return incident

            return None

        except Exception as e:
            self._handle_redis_error(e, "transition_state_async")

            if self.enable_fallback:
                self._using_fallback = True
                self._metrics.fallbacks += 1
                incident = self._get_fallback_store().get_incident(incident_id)
                if incident:
                    incident.state = new_state
                    for key, value in updates.items():
                        if hasattr(incident, key):
                            setattr(incident, key, value)
                    self._get_fallback_store().save_incident(incident)
                return incident

            raise
2402
+
2403
+ # =========================================================================
2404
+ # Pub/Sub Subscription
2405
+ # =========================================================================
2406
+
2407
+ async def subscribe_to_updates(
2408
+ self,
2409
+ ) -> "redis.asyncio.client.PubSub":
2410
+ """Subscribe to incident update channel.
2411
+
2412
+ Returns a Pub/Sub instance that can be used to listen for updates.
2413
+
2414
+ Returns:
2415
+ Async Pub/Sub instance subscribed to the incident updates channel.
2416
+
2417
+ Example:
2418
+ pubsub = await store.subscribe_to_updates()
2419
+ async for message in pubsub.listen():
2420
+ if message["type"] == "message":
2421
+ data = json.loads(message["data"])
2422
+ print(f"Incident {data['incident_id']} changed to {data['state']}")
2423
+ """
2424
+ client = await self.get_async_client()
2425
+ pubsub = client.pubsub()
2426
+ channel = self._get_key(self.CHANNEL_INCIDENT_UPDATE)
2427
+ await pubsub.subscribe(channel)
2428
+ return pubsub
2429
+
2430
+ # =========================================================================
2431
+ # Cleanup Operations
2432
+ # =========================================================================
2433
+
2434
+ def cleanup_resolved_incidents(self, max_age_seconds: int | None = None) -> int:
2435
+ """Clean up old resolved incidents.
2436
+
2437
+ Args:
2438
+ max_age_seconds: Maximum age in seconds. Uses resolved_ttl if not provided.
2439
+
2440
+ Returns:
2441
+ Number of incidents cleaned up.
2442
+ """
2443
+ if self._using_fallback and self.enable_fallback:
2444
+ # InMemory store doesn't have cleanup
2445
+ return 0
2446
+
2447
+ try:
2448
+ client = self.client
2449
+ max_age = max_age_seconds or self.resolved_ttl
2450
+ cutoff = datetime.utcnow() - timedelta(seconds=max_age)
2451
+ cutoff_score = cutoff.timestamp()
2452
+
2453
+ # Get resolved incidents created before cutoff
2454
+ resolved_key = self._get_key(
2455
+ self.KEY_INCIDENT_BY_STATE, state=EscalationState.RESOLVED.value
2456
+ )
2457
+ resolved_ids = client.smembers(resolved_key)
2458
+
2459
+ cleaned = 0
2460
+ for incident_id in resolved_ids:
2461
+ incident = self.get_incident(incident_id)
2462
+ if incident and incident.resolved_at:
2463
+ if incident.resolved_at.timestamp() < cutoff_score:
2464
+ self._delete_incident(client, incident)
2465
+ cleaned += 1
2466
+
2467
+ return cleaned
2468
+
2469
+ except Exception as e:
2470
+ self._handle_redis_error(e, "cleanup_resolved_incidents")
2471
+ return 0
2472
+
2473
+ async def cleanup_resolved_incidents_async(
2474
+ self, max_age_seconds: int | None = None
2475
+ ) -> int:
2476
+ """Clean up old resolved incidents asynchronously.
2477
+
2478
+ Args:
2479
+ max_age_seconds: Maximum age in seconds.
2480
+
2481
+ Returns:
2482
+ Number of incidents cleaned up.
2483
+ """
2484
+ if self._using_fallback and self.enable_fallback:
2485
+ return 0
2486
+
2487
+ try:
2488
+ client = await self.get_async_client()
2489
+ max_age = max_age_seconds or self.resolved_ttl
2490
+ cutoff = datetime.utcnow() - timedelta(seconds=max_age)
2491
+ cutoff_score = cutoff.timestamp()
2492
+
2493
+ # Get resolved incidents
2494
+ resolved_key = self._get_key(
2495
+ self.KEY_INCIDENT_BY_STATE, state=EscalationState.RESOLVED.value
2496
+ )
2497
+ resolved_ids = await client.smembers(resolved_key)
2498
+
2499
+ cleaned = 0
2500
+ for incident_id in resolved_ids:
2501
+ incident = await self.get_incident_async(incident_id)
2502
+ if incident and incident.resolved_at:
2503
+ if incident.resolved_at.timestamp() < cutoff_score:
2504
+ await self._delete_incident_async(client, incident)
2505
+ cleaned += 1
2506
+
2507
+ return cleaned
2508
+
2509
+ except Exception as e:
2510
+ self._handle_redis_error(e, "cleanup_resolved_incidents_async")
2511
+ return 0
2512
+
2513
+ def _delete_incident(
2514
+ self, client: "redis.Redis", incident: EscalationIncident
2515
+ ) -> None:
2516
+ """Delete incident and all its indices.
2517
+
2518
+ Args:
2519
+ client: Redis client.
2520
+ incident: Incident to delete.
2521
+ """
2522
+ pipe = client.pipeline()
2523
+
2524
+ # Delete incident
2525
+ incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
2526
+ pipe.delete(incident_key)
2527
+
2528
+ # Remove from indices
2529
+ index_key = self._get_key(self.KEY_INCIDENT_INDEX)
2530
+ pipe.srem(index_key, incident.id)
2531
+
2532
+ ref_key = self._get_key(
2533
+ self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
2534
+ )
2535
+ pipe.delete(ref_key)
2536
+
2537
+ policy_key = self._get_key(
2538
+ self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
2539
+ )
2540
+ pipe.srem(policy_key, incident.id)
2541
+
2542
+ for state in EscalationState:
2543
+ state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
2544
+ pipe.srem(state_key, incident.id)
2545
+
2546
+ created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
2547
+ pipe.zrem(created_key, incident.id)
2548
+
2549
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2550
+ pipe.zrem(pending_key, incident.id)
2551
+
2552
+ pipe.execute()
2553
+
2554
+ async def _delete_incident_async(
2555
+ self, client: "redis.asyncio.Redis", incident: EscalationIncident
2556
+ ) -> None:
2557
+ """Delete incident and all its indices asynchronously.
2558
+
2559
+ Args:
2560
+ client: Async Redis client.
2561
+ incident: Incident to delete.
2562
+ """
2563
+ pipe = client.pipeline()
2564
+
2565
+ # Delete incident
2566
+ incident_key = self._get_key(self.KEY_INCIDENT, incident_id=incident.id)
2567
+ pipe.delete(incident_key)
2568
+
2569
+ # Remove from indices
2570
+ index_key = self._get_key(self.KEY_INCIDENT_INDEX)
2571
+ pipe.srem(index_key, incident.id)
2572
+
2573
+ ref_key = self._get_key(
2574
+ self.KEY_INCIDENT_BY_REF, incident_ref=incident.incident_ref
2575
+ )
2576
+ pipe.delete(ref_key)
2577
+
2578
+ policy_key = self._get_key(
2579
+ self.KEY_INCIDENT_BY_POLICY, policy_id=incident.policy_id
2580
+ )
2581
+ pipe.srem(policy_key, incident.id)
2582
+
2583
+ for state in EscalationState:
2584
+ state_key = self._get_key(self.KEY_INCIDENT_BY_STATE, state=state.value)
2585
+ pipe.srem(state_key, incident.id)
2586
+
2587
+ created_key = self._get_key(self.KEY_INCIDENT_BY_CREATED)
2588
+ pipe.zrem(created_key, incident.id)
2589
+
2590
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2591
+ pipe.zrem(pending_key, incident.id)
2592
+
2593
+ await pipe.execute()
2594
+
2595
+ # =========================================================================
2596
+ # Health Check & Metrics
2597
+ # =========================================================================
2598
+
2599
    def health_check(self) -> dict[str, Any]:
        """Perform health check and return status.

        Reports connectivity mode ("redis" or "fallback"), entity counts,
        accumulated metrics, and — in Redis mode — basic server info.
        Never raises: Redis errors are captured under the "error" key.

        Returns:
            Dictionary with health status information.
        """
        result = {
            "healthy": False,
            "connected": self._connected,
            "using_fallback": self._using_fallback,
            # URL is masked to avoid leaking credentials in health output
            "redis_url": self._mask_url(self.redis_url),
            "metrics": self._metrics.to_dict(),
        }

        if self._using_fallback and self.enable_fallback:
            # Fallback mode still counts as healthy; report in-memory sizes.
            result["healthy"] = True
            result["mode"] = "fallback"
            result["fallback_policies"] = len(
                self._get_fallback_store().list_policies(active_only=False)
            )
            result["fallback_incidents"] = len(
                self._get_fallback_store().list_incidents()
            )
            return result

        try:
            client = self.client
            ping_ok = client.ping()

            if ping_ok:
                result["healthy"] = True
                result["mode"] = "redis"

                # Get counts
                index_key = self._get_key(self.KEY_POLICY_INDEX)
                result["policies"] = client.scard(index_key)

                incident_index_key = self._get_key(self.KEY_INCIDENT_INDEX)
                result["incidents"] = client.scard(incident_index_key)

                pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
                result["pending_escalations"] = client.zcard(pending_key)

                # Get Redis info
                info = client.info(section="server")
                result["redis_info"] = {
                    "version": info.get("redis_version"),
                    "uptime_seconds": info.get("uptime_in_seconds"),
                }

        except Exception as e:
            # Report the failure instead of raising; "healthy" stays False.
            result["error"] = str(e)
            if self._last_error_time:
                result["last_error_time"] = datetime.fromtimestamp(
                    self._last_error_time
                ).isoformat()

        return result
2657
+
2658
+ async def health_check_async(self) -> dict[str, Any]:
2659
+ """Perform health check asynchronously.
2660
+
2661
+ Returns:
2662
+ Dictionary with health status information.
2663
+ """
2664
+ result = {
2665
+ "healthy": False,
2666
+ "connected": self._connected,
2667
+ "using_fallback": self._using_fallback,
2668
+ "redis_url": self._mask_url(self.redis_url),
2669
+ "metrics": self._metrics.to_dict(),
2670
+ }
2671
+
2672
+ if self._using_fallback and self.enable_fallback:
2673
+ result["healthy"] = True
2674
+ result["mode"] = "fallback"
2675
+ result["fallback_policies"] = len(
2676
+ self._get_fallback_store().list_policies(active_only=False)
2677
+ )
2678
+ result["fallback_incidents"] = len(
2679
+ self._get_fallback_store().list_incidents()
2680
+ )
2681
+ return result
2682
+
2683
+ try:
2684
+ client = await self.get_async_client()
2685
+ ping_ok = await client.ping()
2686
+
2687
+ if ping_ok:
2688
+ result["healthy"] = True
2689
+ result["mode"] = "redis"
2690
+
2691
+ # Get counts
2692
+ index_key = self._get_key(self.KEY_POLICY_INDEX)
2693
+ result["policies"] = await client.scard(index_key)
2694
+
2695
+ incident_index_key = self._get_key(self.KEY_INCIDENT_INDEX)
2696
+ result["incidents"] = await client.scard(incident_index_key)
2697
+
2698
+ pending_key = self._get_key(self.KEY_INCIDENT_PENDING)
2699
+ result["pending_escalations"] = await client.zcard(pending_key)
2700
+
2701
+ # Get Redis info
2702
+ info = await client.info(section="server")
2703
+ result["redis_info"] = {
2704
+ "version": info.get("redis_version"),
2705
+ "uptime_seconds": info.get("uptime_in_seconds"),
2706
+ }
2707
+
2708
+ except Exception as e:
2709
+ result["error"] = str(e)
2710
+ if self._last_error_time:
2711
+ result["last_error_time"] = datetime.fromtimestamp(
2712
+ self._last_error_time
2713
+ ).isoformat()
2714
+
2715
+ return result
2716
+
2717
+ def _mask_url(self, url: str) -> str:
2718
+ """Mask sensitive parts of Redis URL.
2719
+
2720
+ Args:
2721
+ url: Redis URL to mask.
2722
+
2723
+ Returns:
2724
+ Masked URL string.
2725
+ """
2726
+ import re
2727
+
2728
+ # Mask password if present
2729
+ return re.sub(r"://[^:]+:[^@]+@", "://***:***@", url)
2730
+
2731
+ def get_metrics(self) -> dict[str, Any]:
2732
+ """Get current metrics.
2733
+
2734
+ Returns:
2735
+ Dictionary with metrics data.
2736
+ """
2737
+ return self._metrics.to_dict()
2738
+
2739
+ def reset_metrics(self) -> None:
2740
+ """Reset all metrics to zero."""
2741
+ self._metrics = EscalationMetrics()
2742
+
2743
+ # =========================================================================
2744
+ # Connection Management
2745
+ # =========================================================================
2746
+
2747
+ def close(self) -> None:
2748
+ """Close all connections and pools."""
2749
+ if self._client is not None:
2750
+ try:
2751
+ self._client.close()
2752
+ except Exception:
2753
+ pass
2754
+ self._client = None
2755
+
2756
+ if self._pool is not None:
2757
+ try:
2758
+ self._pool.disconnect()
2759
+ except Exception:
2760
+ pass
2761
+ self._pool = None
2762
+
2763
+ self._connected = False
2764
+
2765
+ async def close_async(self) -> None:
2766
+ """Close all connections and pools asynchronously."""
2767
+ if self._async_client is not None:
2768
+ try:
2769
+ await self._async_client.close()
2770
+ except Exception:
2771
+ pass
2772
+ self._async_client = None
2773
+
2774
+ if self._async_pool is not None:
2775
+ try:
2776
+ await self._async_pool.disconnect()
2777
+ except Exception:
2778
+ pass
2779
+ self._async_pool = None
2780
+
2781
+ self._connected = False
2782
+
2783
+ def __enter__(self) -> "RedisEscalationStore":
2784
+ """Context manager entry."""
2785
+ return self
2786
+
2787
+ def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
2788
+ """Context manager exit, closes connections."""
2789
+ self.close()
2790
+
2791
+ async def __aenter__(self) -> "RedisEscalationStore":
2792
+ """Async context manager entry."""
2793
+ return self
2794
+
2795
+ async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
2796
+ """Async context manager exit, closes connections."""
2797
+ await self.close_async()
2798
+
2799
+
2800
+ # ============================================================================
2801
+ # Factory Function
2802
+ # ============================================================================
2803
+
2804
+
2805
class EscalationStoreType:
    """Symbolic names for the supported escalation store backends."""

    # In-process store; contents are lost when the process exits.
    MEMORY = "memory"
    # Single-file SQLite store; persists across restarts on one host.
    SQLITE = "sqlite"
    # Redis-backed store; shared across processes and hosts.
    REDIS = "redis"
2811
+
2812
+
2813
def create_escalation_store(
    store_type: str | None = None,
    **kwargs: Any,
) -> BaseEscalationStore:
    """Factory function to create appropriate escalation store.

    Selects the store type based on configuration or environment variables.

    Environment variables:
        TRUTHOUND_ESCALATION_STORE_TYPE: Store type (memory, sqlite, redis)
        TRUTHOUND_ESCALATION_SQLITE_PATH: SQLite database path
        TRUTHOUND_ESCALATION_REDIS_URL: Redis connection URL (enables redis)

    Args:
        store_type: Explicit store type override. If None, auto-detects.
        **kwargs: Additional arguments passed to the store constructor.

    Returns:
        Configured BaseEscalationStore instance.

    Example:
        # Auto-detect based on environment
        store = create_escalation_store()

        # Explicit type
        store = create_escalation_store("redis", resolved_ttl=7200)

        # SQLite with custom path
        store = create_escalation_store("sqlite", db_path="/tmp/escalation.db")
    """
    logger = logging.getLogger(__name__)

    # Resolve the store type: explicit argument first, then environment.
    # Normalize immediately and treat blank/whitespace-only values as unset,
    # so an empty TRUTHOUND_ESCALATION_STORE_TYPE still triggers
    # auto-detection instead of falling into the "unknown type" branch.
    if store_type is not None:
        store_type = store_type.lower().strip() or None
    if store_type is None:
        store_type = os.getenv("TRUTHOUND_ESCALATION_STORE_TYPE", "").lower().strip() or None

    # Auto-detect from the other environment variables if still unset.
    if store_type is None:
        redis_url = os.getenv("TRUTHOUND_ESCALATION_REDIS_URL")
        if redis_url and REDIS_AVAILABLE:
            store_type = EscalationStoreType.REDIS
            logger.info(
                "Auto-detected Redis store from TRUTHOUND_ESCALATION_REDIS_URL"
            )
        elif os.getenv("TRUTHOUND_ESCALATION_SQLITE_PATH"):
            store_type = EscalationStoreType.SQLITE
            logger.info(
                "Auto-detected SQLite store from TRUTHOUND_ESCALATION_SQLITE_PATH"
            )
        else:
            store_type = EscalationStoreType.MEMORY
            logger.info("Using default InMemory store")

    # Create store based on type.
    if store_type == EscalationStoreType.MEMORY:
        logger.info("Creating InMemory escalation store")
        return InMemoryEscalationStore()

    elif store_type == EscalationStoreType.SQLITE:
        db_path = kwargs.pop("db_path", None) or os.getenv(
            "TRUTHOUND_ESCALATION_SQLITE_PATH", "escalation.db"
        )
        # Lazy %-style args avoid string formatting when INFO is disabled.
        logger.info("Creating SQLite escalation store at %s", db_path)
        return SQLiteEscalationStore(db_path=db_path)

    elif store_type == EscalationStoreType.REDIS:
        if not REDIS_AVAILABLE:
            logger.warning(
                "Redis not available, falling back to InMemory store. "
                "Install with: pip install truthound-dashboard[redis]"
            )
            return InMemoryEscalationStore()

        logger.info("Creating Redis escalation store")
        return RedisEscalationStore(**kwargs)

    else:
        logger.warning(
            "Unknown store type '%s', falling back to InMemory store", store_type
        )
        return InMemoryEscalationStore()