truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/METADATA +147 -23
  162. truthound_dashboard-1.4.1.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
  164. truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
  166. truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1187 @@
1
+ """Escalation scheduler service with APScheduler integration.
2
+
3
+ This module provides automatic escalation checking via APScheduler,
4
+ triggering escalations when incidents reach their scheduled escalation time.
5
+
6
+ Features:
7
+ - Periodic checking of pending escalations
8
+ - Configurable check interval
9
+ - Abstract handler interface for extensibility
10
+ - Multiple escalation strategy support
11
+ - Integration with notification dispatcher
12
+ - **Persistent job storage (SQLAlchemy backend)**
13
+ - **Configurable misfire handling with grace time**
14
+ - **Error recovery with exponential backoff**
15
+ - **Job coalescing to avoid duplicate executions**
16
+ - **Graceful shutdown handling**
17
+
18
+ Usage:
19
+ from truthound_dashboard.core.notifications.escalation.scheduler import (
20
+ EscalationSchedulerService,
21
+ get_escalation_scheduler,
22
+ start_escalation_scheduler,
23
+ stop_escalation_scheduler,
24
+ )
25
+
26
+ # Start the scheduler with persistent backend
27
+ scheduler = get_escalation_scheduler()
28
+ await scheduler.start()
29
+
30
+ # Or use convenience functions
31
+ await start_escalation_scheduler()
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import asyncio
37
+ import logging
38
+ import os
39
+ from abc import ABC, abstractmethod
40
+ from dataclasses import dataclass, field
41
+ from datetime import datetime, timedelta
42
+ from typing import Any
43
+
44
+ from apscheduler.schedulers.asyncio import AsyncIOScheduler
45
+ from apscheduler.triggers.interval import IntervalTrigger
46
+
47
+ from ...validation_limits import get_escalation_limits, ValidationLimitError
48
+ from ....db import get_session
49
+ from ....db.models import (
50
+ EscalationIncidentModel,
51
+ EscalationPolicyModel,
52
+ EscalationStateEnum,
53
+ NotificationChannel,
54
+ )
55
+ from ..dispatcher import create_dispatcher
56
+ from .backends import (
57
+ BackendType,
58
+ JobData,
59
+ JobState,
60
+ SchedulerBackend,
61
+ SchedulerBackendConfig,
62
+ create_scheduler_backend,
63
+ )
64
+
65
+ logger = logging.getLogger(__name__)
66
+
67
+
68
+ # =============================================================================
69
+ # Configuration
70
+ # =============================================================================
71
+
72
+
73
@dataclass
class EscalationSchedulerConfig:
    """Configuration for the escalation scheduler with validation.

    Validation (performed in ``__post_init__``):
        - check_interval_seconds: Must be between 10 and 3600 (configurable);
          delegated to the limits object from ``get_escalation_limits()``.
        - max_escalations_per_check: Must be between 1 and 1000.
        - retry_delay_seconds: Must be between 1 and 3600.
        - max_retries: Must be between 0 and 10.
        - misfire_grace_time: Must be between 1 and 3600.
        - shutdown_timeout: Must be between 1 and 300.

    DoS Prevention:
        - Minimum check interval prevents excessive CPU usage.
        - Maximum escalations per check prevents memory exhaustion.
        - Maximum retry attempts prevents infinite retry loops.

    Environment Variables:
        - TRUTHOUND_ESCALATION_CHECK_INTERVAL_MIN
        - TRUTHOUND_ESCALATION_CHECK_INTERVAL_MAX

    Attributes:
        check_interval_seconds: How often to check for pending escalations.
        max_escalations_per_check: Maximum escalations to process per check.
        retry_on_failure: Whether to retry failed escalations.
        retry_delay_seconds: Delay before retrying failed escalation.
        enabled: Whether the scheduler is enabled.
        backend_type: Type of scheduler backend (memory, sqlalchemy).
        misfire_grace_time: Seconds to allow for late job execution.
        coalesce: Combine multiple pending executions into one.
        max_retries: Maximum retry attempts on failure.
        shutdown_timeout: Seconds to wait for jobs during shutdown.
    """

    check_interval_seconds: int = 60
    max_escalations_per_check: int = 100
    retry_on_failure: bool = True
    retry_delay_seconds: int = 300
    enabled: bool = True
    backend_type: BackendType = BackendType.SQLALCHEMY
    misfire_grace_time: int = 60
    coalesce: bool = True
    max_retries: int = 3
    shutdown_timeout: float = 30.0

    def __post_init__(self) -> None:
        """Validate configuration after initialization.

        Raises:
            ValidationLimitError: On the first field found outside its
                allowed range (fields are checked in declaration order of
                the table below, after ``check_interval_seconds``).
        """
        limits = get_escalation_limits()

        # The check interval bounds are configurable, so they are validated
        # by the shared limits object rather than by a hard-coded range.
        valid, error = limits.validate_check_interval(self.check_interval_seconds)
        if not valid:
            raise ValidationLimitError(
                error or f"Invalid check_interval_seconds: {self.check_interval_seconds}",
                parameter="check_interval_seconds",
                value=self.check_interval_seconds,
            )

        # (field name, inclusive minimum, inclusive maximum, lower-bound wording)
        fixed_ranges = (
            ("max_escalations_per_check", 1, 1000, "must be at least 1"),
            ("retry_delay_seconds", 1, 3600, "must be at least 1"),
            ("max_retries", 0, 10, "must be non-negative"),
            ("misfire_grace_time", 1, 3600, "must be at least 1"),
            ("shutdown_timeout", 1, 300, "must be at least 1"),
        )
        for name, minimum, maximum, too_low_text in fixed_ranges:
            value = getattr(self, name)
            # `not value >= minimum` (rather than `value < minimum`) also
            # rejects NaN, which compares False against every bound and would
            # otherwise slip through for the float-typed shutdown_timeout.
            if not value >= minimum:
                raise ValidationLimitError(
                    f"{name} {too_low_text}, got {value}",
                    parameter=name,
                    value=value,
                )
            if value > maximum:
                raise ValidationLimitError(
                    f"{name} must not exceed {maximum}, got {value}",
                    parameter=name,
                    value=value,
                )

    @classmethod
    def from_env(cls) -> EscalationSchedulerConfig:
        """Create configuration from environment variables with validation.

        Fields without a corresponding variable (``retry_on_failure``,
        ``retry_delay_seconds``) keep their dataclass defaults.

        Environment variables:
            TRUTHOUND_ESCALATION_CHECK_INTERVAL: Check interval in seconds
            TRUTHOUND_ESCALATION_MAX_PER_CHECK: Max escalations per check
            TRUTHOUND_ESCALATION_ENABLED: Enable/disable scheduler (true/false)
            TRUTHOUND_ESCALATION_BACKEND: Backend type (memory, sqlalchemy)
            TRUTHOUND_ESCALATION_MISFIRE_GRACE: Misfire grace time in seconds
            TRUTHOUND_ESCALATION_COALESCE: Enable job coalescing (true/false)
            TRUTHOUND_ESCALATION_MAX_RETRIES: Maximum retry attempts
            TRUTHOUND_ESCALATION_SHUTDOWN_TIMEOUT: Shutdown wait in seconds

        Boolean variables are enabled only by the value ``true``
        (case-insensitive); any other value disables the option.

        Raises:
            ValidationLimitError: If any configuration value is out of range.
            ValueError: If a numeric variable cannot be parsed as a number.
        """
        return cls(
            check_interval_seconds=int(
                os.getenv("TRUTHOUND_ESCALATION_CHECK_INTERVAL", "60")
            ),
            max_escalations_per_check=int(
                os.getenv("TRUTHOUND_ESCALATION_MAX_PER_CHECK", "100")
            ),
            enabled=os.getenv("TRUTHOUND_ESCALATION_ENABLED", "true").lower() == "true",
            backend_type=BackendType(
                os.getenv("TRUTHOUND_ESCALATION_BACKEND", "sqlalchemy")
            ),
            misfire_grace_time=int(
                os.getenv("TRUTHOUND_ESCALATION_MISFIRE_GRACE", "60")
            ),
            coalesce=os.getenv("TRUTHOUND_ESCALATION_COALESCE", "true").lower() == "true",
            max_retries=int(os.getenv("TRUTHOUND_ESCALATION_MAX_RETRIES", "3")),
            shutdown_timeout=float(
                os.getenv("TRUTHOUND_ESCALATION_SHUTDOWN_TIMEOUT", "30")
            ),
        )
245
+
246
+
247
+ # =============================================================================
248
+ # Abstract Escalation Handler
249
+ # =============================================================================
250
+
251
+
252
class EscalationHandler(ABC):
    """Interface that every escalation handler must implement.

    Subclass this to plug in custom behavior that runs whenever an
    incident has to be escalated.

    Example:
        class SlackEscalationHandler(EscalationHandler):
            def __init__(self, webhook_url: str):
                self.webhook_url = webhook_url

            @property
            def handler_type(self) -> str:
                return "slack"

            async def handle_escalation(
                self,
                incident: EscalationIncidentModel,
                policy: EscalationPolicyModel,
                level: int,
                targets: list[dict],
            ) -> EscalationResult:
                # Send Slack notification
                ...
                return EscalationResult(success=True, message="Sent to Slack")

            async def can_handle(self, channel_type: str) -> bool:
                return channel_type == "slack"
    """

    @property
    @abstractmethod
    def handler_type(self) -> str:
        """Identifier string naming this kind of handler."""
        ...

    @abstractmethod
    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> "EscalationResult":
        """Process one escalation event.

        Args:
            incident: Incident being escalated.
            policy: Policy governing the incident.
            level: Escalation level being entered.
            targets: Target configurations belonging to that level.

        Returns:
            An EscalationResult describing whether the escalation worked.
        """
        ...

    @abstractmethod
    async def can_handle(self, channel_type: str) -> bool:
        """Report whether this handler supports a notification channel type.

        Args:
            channel_type: Notification channel type to test.

        Returns:
            True when the channel type is supported by this handler.
        """
        ...
320
+
321
+
322
@dataclass
class EscalationResult:
    """Outcome reported by a handler after attempting an escalation.

    Attributes:
        success: True when the escalation attempt succeeded.
        message: Human-readable status text (empty by default).
        notifications_sent: How many notifications were delivered.
        metadata: Extra result details; each instance gets its own dict.
    """

    success: bool
    message: str = ""
    notifications_sent: int = 0
    # default_factory gives every result a fresh dict instead of a shared one.
    metadata: dict[str, Any] = field(default_factory=dict)
337
+
338
+
339
+ # =============================================================================
340
+ # Built-in Handlers
341
+ # =============================================================================
342
+
343
+
344
class DefaultEscalationHandler(EscalationHandler):
    """Default escalation handler using the notification dispatcher.

    This handler uses the existing notification system to send
    escalation notifications through configured channels.
    """

    @property
    def handler_type(self) -> str:
        """Return the handler type identifier (``"default"``)."""
        return "default"

    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> EscalationResult:
        """Send escalation notifications via dispatcher.

        Each target is dispatched independently: a failure for one target is
        recorded and logged, and the remaining targets are still attempted.

        Args:
            incident: The escalation incident.
            policy: The escalation policy.
            level: The new escalation level.
            targets: Target configurations; the keys read here are
                ``channel_id``, ``identifier`` and ``type``.

        Returns:
            EscalationResult whose ``success`` is True when at least one
            notification was sent or when there were no targets at all.
            All collected error strings are available under
            ``metadata["errors"]``; the message shows at most three of them.
        """
        notifications_sent = 0
        errors: list[str] = []

        async with get_session() as session:
            dispatcher = create_dispatcher(session)

            for target in targets:
                try:
                    channel_id = target.get("channel_id")
                    identifier = target.get("identifier", "")
                    target_type = target.get("type", "user")

                    # Build notification message
                    body = self._build_escalation_message(
                        incident=incident,
                        policy=policy,
                        level=level,
                        target=target,
                    )

                    # Use dispatcher to send notification
                    # Note: This uses the existing notification infrastructure
                    results = await dispatcher.dispatch(
                        channel_ids=[channel_id] if channel_id else None,
                        subject=f"[ESCALATION L{level}] {incident.incident_ref}",
                        message=body,
                        metadata={
                            "escalation": True,
                            "incident_id": incident.id,
                            "policy_id": policy.id,
                            "level": level,
                            "target_type": target_type,
                            "target_identifier": identifier,
                        },
                    )

                    for result in results:
                        if result.success:
                            notifications_sent += 1
                        else:
                            errors.append(
                                f"Failed to notify {identifier}: {result.error_message}"
                            )

                except Exception as e:
                    # Deliberate best-effort: one bad target must not stop the
                    # others. logger.exception keeps the traceback in the log.
                    errors.append(f"Error notifying target: {e}")
                    logger.exception(f"Escalation notification error: {e}")

            await session.commit()

        success = notifications_sent > 0 or len(targets) == 0
        summary = f"Sent {notifications_sent} notifications"
        if errors:
            # Keep the summary short; the full list is returned in metadata.
            summary += f"; Errors: {'; '.join(errors[:3])}"

        return EscalationResult(
            success=success,
            message=summary,
            notifications_sent=notifications_sent,
            metadata={"errors": errors},
        )

    async def can_handle(self, channel_type: str) -> bool:
        """Default handler can handle any channel type."""
        return True

    def _build_escalation_message(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        target: dict[str, Any],
    ) -> str:
        """Build escalation notification message.

        Args:
            incident: Incident whose reference, state, counters, context and
                creation time are rendered.
            policy: Policy supplying the name and the per-level configuration.
            level: Escalation level being announced.
            target: Target configuration; ``identifier`` and ``type`` are
                rendered, falling back to ``N/A``.

        Returns:
            The message as newline-joined text.
        """
        context = incident.context or {}

        message_parts = [
            f"ESCALATION ALERT - Level {level}",
            "",
            f"Incident: {incident.incident_ref}",
            f"Policy: {policy.name}",
            f"State: {incident.state}",
            f"Escalation Count: {incident.escalation_count}",
            "",
        ]

        if context:
            message_parts.append("Context:")
            message_parts.extend(f" {key}: {value}" for key, value in context.items())
            message_parts.append("")

        message_parts.extend([
            f"Created: {incident.created_at.isoformat()}",
            f"Target: {target.get('identifier', 'N/A')} ({target.get('type', 'N/A')})",
        ])

        # Add custom message template if defined in policy level.
        # Only the first entry matching this level is considered.
        for level_config in policy.levels or []:
            if level_config.get("level") == level:
                template = level_config.get("message_template")
                if template:
                    message_parts.extend(["", "---", template])
                break

        return "\n".join(message_parts)
469
+
470
+
471
class LoggingEscalationHandler(EscalationHandler):
    """Handler that writes escalations to the log and does nothing else.

    Intended for tests and for debugging escalation flows.
    """

    @property
    def handler_type(self) -> str:
        """Identifier of this handler (``"logging"``)."""
        return "logging"

    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> EscalationResult:
        """Emit one summary log line plus one line per target.

        Always reports success, counting every target as a sent notification.
        """
        target_count = len(targets)
        logger.info(
            f"Escalation triggered: incident={incident.id}, "
            f"policy={policy.name}, level={level}, targets={target_count}"
        )

        for entry in targets:
            kind = entry.get("type")
            who = entry.get("identifier")
            via = entry.get("channel")
            logger.info(
                f" Target: type={kind}, "
                f"identifier={who}, "
                f"channel={via}"
            )

        outcome = EscalationResult(
            success=True,
            message=f"Logged escalation to level {level}",
            notifications_sent=target_count,
        )
        return outcome

    async def can_handle(self, channel_type: str) -> bool:
        """Accept every channel type."""
        return True
509
+
510
+
511
+ # =============================================================================
512
+ # Escalation Strategy
513
+ # =============================================================================
514
+
515
+
516
class EscalationStrategy(ABC):
    """Interface for escalation strategies.

    A strategy decides whether an incident should escalate and which
    level it should move to next.
    """

    @property
    @abstractmethod
    def strategy_name(self) -> str:
        """Name identifying this strategy."""
        ...

    @abstractmethod
    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Decide whether the incident should be escalated now.

        Args:
            incident: Incident under consideration.
            policy: Policy attached to the incident.

        Returns:
            True when the escalation should go ahead.
        """
        ...

    @abstractmethod
    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Work out which level the incident escalates to.

        Args:
            incident: Incident under consideration.
            policy: Policy attached to the incident.

        Returns:
            The next level number, or None when no further level exists.
        """
        ...
561
+
562
+
563
class TimeBasedEscalationStrategy(EscalationStrategy):
    """Time-based escalation strategy.

    Escalates when the scheduled escalation time has passed.
    This is the default strategy that respects `next_escalation_at`.
    """

    @property
    def strategy_name(self) -> str:
        """Return the strategy name (``"time_based"``)."""
        return "time_based"

    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Check if escalation time has passed.

        Args:
            incident: The escalation incident.
            policy: The escalation policy (not consulted by this strategy).

        Returns:
            False when no escalation time is set or the incident is resolved
            or acknowledged; otherwise True once the current UTC time has
            reached ``incident.next_escalation_at``.
        """
        # Local import: the module only imports datetime/timedelta at the top.
        from datetime import timezone

        due_at = incident.next_escalation_at
        if not due_at:
            return False

        # Don't escalate resolved or acknowledged incidents
        if incident.state in (
            EscalationStateEnum.RESOLVED.value,
            EscalationStateEnum.ACKNOWLEDGED.value,
        ):
            return False

        # datetime.utcnow() is deprecated and always naive, so comparing it
        # with a timezone-aware timestamp raises TypeError. Take an aware UTC
        # reading and drop the tzinfo only when the stored value is naive
        # (naive values are treated as UTC, exactly as before).
        now = datetime.now(timezone.utc)
        if due_at.utcoffset() is None:
            now = now.replace(tzinfo=None)
        return now >= due_at

    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Get the next level based on current level.

        Args:
            incident: The escalation incident.
            policy: The escalation policy whose ``levels`` are searched.

        Returns:
            ``current_level + 1`` when the policy defines exactly that level,
            otherwise None.
        """
        candidate = incident.current_level + 1
        levels = policy.levels or []

        if any(level_config.get("level", 0) == candidate for level_config in levels):
            return candidate
        return None
607
+
608
+
609
class ImmediateEscalationStrategy(EscalationStrategy):
    """Strategy that escalates without waiting for a scheduled time.

    Meant for critical incidents that should move up right away.
    """

    @property
    def strategy_name(self) -> str:
        """Name of this strategy (``"immediate"``)."""
        return "immediate"

    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Return True unless the incident is in the resolved state."""
        blocked_states = (EscalationStateEnum.RESOLVED.value,)
        is_blocked = incident.state in blocked_states
        return not is_blocked

    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Return the level one above the current one, capped by the policy.

        The highest level number found in ``policy.levels`` (0 when the
        policy has no levels) is the ceiling: below it the result is
        ``current_level + 1``, at or above it the result is None.
        """
        defined_levels = [
            level_config.get("level", 0) for level_config in policy.levels or []
        ]
        ceiling = max(defined_levels) if defined_levels else 0
        here = incident.current_level

        if here >= ceiling:
            return None
        return here + 1
645
+
646
+
647
+ # =============================================================================
648
+ # Main Scheduler Service
649
+ # =============================================================================
650
+
651
+
652
+ class EscalationSchedulerService:
653
+ """Service for scheduling automatic escalation checks.
654
+
655
+ This service uses APScheduler with a configurable backend to
656
+ periodically check for incidents that need escalation and
657
+ processes them accordingly.
658
+
659
+ Features:
660
+ - Configurable check interval
661
+ - Multiple handler support
662
+ - Multiple strategy support
663
+ - Metrics and status tracking
664
+ - Thread-safe operations
665
+ - **Persistent job storage (SQLAlchemy backend)**
666
+ - **Automatic job recovery on restart**
667
+ - **Configurable misfire handling**
668
+ - **Exponential backoff for failures**
669
+ - **Graceful shutdown with job persistence**
670
+
671
+ Usage:
672
+ service = EscalationSchedulerService()
673
+ await service.start()
674
+
675
+ # Later...
676
+ await service.stop()
677
+ """
678
+
679
+ DEFAULT_JOB_ID = "escalation_checker"
680
+
681
def __init__(
    self,
    config: EscalationSchedulerConfig | None = None,
    scheduler: AsyncIOScheduler | None = None,
    backend: SchedulerBackend | None = None,
) -> None:
    """Set up the service without starting it.

    Args:
        config: Service configuration; loaded from the environment when omitted.
        scheduler: Existing APScheduler instance to reuse. When omitted a new
            one is created and this service takes responsibility for
            starting and shutting it down.
        backend: Custom scheduler backend. When omitted one is built from
            the backend-related fields of ``config``.
    """
    self.config = config or EscalationSchedulerConfig.from_env()

    # An injected scheduler is only borrowed; a self-created one is owned.
    self._owns_scheduler = scheduler is None
    self._scheduler = scheduler or AsyncIOScheduler()

    # Pluggable behaviour
    self._handlers: list[EscalationHandler] = []
    self._strategy: EscalationStrategy = TimeBasedEscalationStrategy()

    # Runtime state and metrics
    self._running = False
    self._last_check_at: datetime | None = None
    self._check_count = self._escalation_count = 0
    self._error_count = self._misfire_count = 0
    self._lock = asyncio.Lock()

    # Persistence backend: use the injected one or derive it from the config
    if not backend:
        backend = create_scheduler_backend(
            SchedulerBackendConfig(
                backend_type=self.config.backend_type,
                misfire_grace_time=self.config.misfire_grace_time,
                coalesce=self.config.coalesce,
                max_retries=self.config.max_retries,
                shutdown_timeout=self.config.shutdown_timeout,
            )
        )
    self._backend = backend

    # The default handler is always present
    self.register_handler(DefaultEscalationHandler())
722
+
723
@property
def is_running(self) -> bool:
    """Report whether the service is currently started.

    Returns:
        The value of the internal ``_running`` flag.
    """
    return self._running
727
+
728
@property
def backend(self) -> SchedulerBackend:
    """Expose the scheduler backend held by this service.

    Returns:
        The backend stored in ``_backend``.
    """
    return self._backend
732
+
733
def register_handler(self, handler: EscalationHandler) -> None:
    """Register an escalation handler.

    The handler is appended to the end of the handler list; no check for
    an already-registered handler of the same type is performed.

    Args:
        handler: The handler to register.
    """
    self._handlers.append(handler)
    logger.debug(f"Registered escalation handler: {handler.handler_type}")
741
+
742
def unregister_handler(self, handler_type: str) -> bool:
    """Remove the first registered handler of the given type.

    Only one handler is removed per call, even if several handlers share
    the same ``handler_type``.

    Args:
        handler_type: The handler type to unregister.

    Returns:
        True if a handler was found and removed, False otherwise.
    """
    found = next(
        (h for h in self._handlers if h.handler_type == handler_type),
        None,
    )
    if found is None:
        return False

    self._handlers.remove(found)
    logger.debug(f"Unregistered escalation handler: {handler_type}")
    return True
757
+
758
def set_strategy(self, strategy: EscalationStrategy) -> None:
    """Set the escalation strategy.

    Replaces the strategy consulted for ``should_escalate`` and
    ``get_next_level`` decisions.

    Args:
        strategy: The strategy to use.
    """
    self._strategy = strategy
    logger.debug(f"Set escalation strategy: {strategy.strategy_name}")
766
+
767
async def start(self) -> None:
    """Start the escalation scheduler.

    Does nothing (apart from logging) when the service is already running
    or when ``config.enabled`` is false. Otherwise it initializes the
    backend, registers or recovers the checker job record in the backend,
    schedules ``_check_and_escalate`` on the APScheduler instance at the
    configured interval, and starts that scheduler if this service owns it.
    """
    if self._running:
        logger.warning("Escalation scheduler already running")
        return

    if not self.config.enabled:
        logger.info("Escalation scheduler is disabled")
        return

    logger.info("Starting escalation scheduler")

    # Initialize backend
    await self._backend.initialize()
    logger.info(f"Using scheduler backend: {self._backend.backend_type.value}")

    # Register the checker job with backend for persistence.
    # First run is expected one full interval from now.
    job_data = JobData(
        id=self.DEFAULT_JOB_ID,
        name="Escalation Checker",
        func_ref="truthound_dashboard.core.notifications.escalation.scheduler:_check_and_escalate",
        trigger_type="interval",
        trigger_args={"seconds": self.config.check_interval_seconds},
        next_run_time=datetime.utcnow() + timedelta(
            seconds=self.config.check_interval_seconds
        ),
        state=JobState.PENDING,
    )

    try:
        # Check if job exists (recovery scenario)
        existing = await self._backend.get_job(self.DEFAULT_JOB_ID)
        if existing:
            logger.info("Recovered existing escalation checker job")
            # Update next_run_time if it was in the past
            if existing.next_run_time and existing.next_run_time < datetime.utcnow():
                # Only jobs the backend classifies as misfired are counted
                if self._backend.is_misfired(existing):
                    self._misfire_count += 1
                    logger.warning("Escalation checker job misfired, rescheduling")
                existing.next_run_time = datetime.utcnow()
                existing.state = JobState.PENDING
                await self._backend.update_job(existing)
        else:
            await self._backend.add_job(job_data)
            logger.debug("Created escalation checker job")
    except ValueError:
        # Job already exists
        logger.debug("Escalation checker job already registered")

    # Schedule the checker job with APScheduler; replace_existing makes a
    # repeated start() overwrite a job left under the same id.
    self._scheduler.add_job(
        self._check_and_escalate,
        trigger=IntervalTrigger(seconds=self.config.check_interval_seconds),
        id=self.DEFAULT_JOB_ID,
        name="Escalation Checker",
        replace_existing=True,
        misfire_grace_time=self.config.misfire_grace_time,
        coalesce=self.config.coalesce,
    )

    # Start scheduler if we own it (an injected scheduler is left to its owner)
    if self._owns_scheduler and not self._scheduler.running:
        self._scheduler.start()

    self._running = True
    logger.info(
        f"Escalation scheduler started "
        f"(interval: {self.config.check_interval_seconds}s, "
        f"backend: {self._backend.backend_type.value})"
    )
837
+
838
async def stop(self) -> None:
    """Stop the escalation scheduler gracefully.

    Removes the checker job from the APScheduler instance, shuts that
    scheduler down when this service owns it, and then shuts down the
    backend. Does nothing when the service is not running.

    The running flag is cleared even if shutdown fails part-way: by then
    the checker job is gone, so reporting the service as running would be
    wrong and would make a later ``start()`` refuse to run.

    Raises:
        Exception: Whatever the scheduler or backend shutdown raises is
            propagated after the running flag has been cleared.
    """
    if not self._running:
        return

    logger.info("Stopping escalation scheduler")

    try:
        try:
            self._scheduler.remove_job(self.DEFAULT_JOB_ID)
        except Exception as exc:
            # Best effort: the job may not exist (never added or already removed)
            logger.debug(f"Could not remove escalation checker job: {exc}")

        # Shutdown scheduler if we own it
        if self._owns_scheduler and self._scheduler.running:
            self._scheduler.shutdown(wait=False)

        # Shutdown backend (handles pending job persistence)
        await self._backend.shutdown()
    finally:
        self._running = False

    logger.info("Escalation scheduler stopped")
859
+
860
async def _check_and_escalate(self) -> None:
    """Check for and process pending escalations.

    This is the main job that runs periodically. One pass:

    1. records the check time and increments the check counter,
    2. marks the checker job as running in the backend,
    3. loads incidents in TRIGGERED or ESCALATED state whose
       ``next_escalation_at`` is not later than now, limited to
       ``config.max_escalations_per_check`` rows,
    4. passes each incident to ``_process_incident`` and commits,
    5. marks the job completed with the next expected run time.

    An exception raised inside the ``try`` block is counted in
    ``_error_count``, logged, and reported to the backend through
    ``mark_job_failed``; it is not re-raised.
    """
    async with self._lock:
        self._last_check_at = datetime.utcnow()
        self._check_count += 1

        logger.debug(f"Checking for pending escalations (check #{self._check_count})")

        # Mark job as running in backend (outside the try: a failure here propagates)
        await self._backend.mark_job_running(self.DEFAULT_JOB_ID)

        try:
            async with get_session() as session:
                from sqlalchemy import select

                # Get pending escalations: active states whose escalation time has arrived
                now = datetime.utcnow()
                query = (
                    select(EscalationIncidentModel)
                    .where(
                        EscalationIncidentModel.state.in_([
                            EscalationStateEnum.TRIGGERED.value,
                            EscalationStateEnum.ESCALATED.value,
                        ])
                    )
                    .where(EscalationIncidentModel.next_escalation_at <= now)
                    .limit(self.config.max_escalations_per_check)
                )

                result = await session.execute(query)
                incidents = result.scalars().all()

                if not incidents:
                    logger.debug("No pending escalations found")
                else:
                    logger.info(f"Found {len(incidents)} incidents due for escalation")

                    for incident in incidents:
                        await self._process_incident(session, incident)

                    await session.commit()

            # Mark job as completed with next run time
            next_run = datetime.utcnow() + timedelta(
                seconds=self.config.check_interval_seconds
            )
            await self._backend.mark_job_completed(self.DEFAULT_JOB_ID, next_run)

        except Exception as e:
            self._error_count += 1
            logger.error(f"Error checking escalations: {e}")
            # Mark job as failed; whether a retry is scheduled is left to the
            # backend via the schedule_retry flag
            await self._backend.mark_job_failed(
                self.DEFAULT_JOB_ID,
                str(e),
                schedule_retry=self.config.retry_on_failure,
            )
920
+
921
async def _process_incident(
    self,
    session: Any,
    incident: EscalationIncidentModel,
) -> None:
    """Process a single incident for escalation.

    Loads the incident's policy, asks the current strategy whether and to
    which level to escalate, runs the registered handlers for that level's
    targets, and then records the new level on the incident. Any exception
    is counted in ``_error_count`` and logged instead of being raised, so
    one failing incident does not abort the caller's loop.

    Args:
        session: Database session used to load the policy.
        incident: The incident to process (mutated in place).
    """
    try:
        # Get the policy
        from sqlalchemy import select

        result = await session.execute(
            select(EscalationPolicyModel)
            .where(EscalationPolicyModel.id == incident.policy_id)
        )
        policy = result.scalar_one_or_none()

        # NOTE(review): the early returns for a missing/inactive policy and
        # for a declining strategy leave next_escalation_at untouched, so the
        # incident still matches the due-incident query on the next check —
        # confirm this is intended.
        if not policy:
            logger.error(f"Policy not found for incident {incident.id}")
            return

        if not policy.is_active:
            logger.debug(f"Policy {policy.id} is inactive, skipping")
            return

        # Check escalation strategy
        if not await self._strategy.should_escalate(incident, policy):
            logger.debug(f"Strategy says don't escalate incident {incident.id}")
            return

        # Get next level
        next_level = await self._strategy.get_next_level(incident, policy)
        if next_level is None:
            logger.debug(f"No more levels for incident {incident.id}")
            # Clear next_escalation_at since we're at max level
            incident.next_escalation_at = None
            return

        # Check if escalation is allowed using model method
        if not incident.can_escalate(policy.max_escalations):
            logger.warning(
                f"Incident {incident.id} cannot escalate: "
                f"count={incident.escalation_count}, max={policy.max_escalations}, "
                f"state={incident.state}"
            )
            incident.next_escalation_at = None
            return

        # Get targets for the next level; an empty list is only warned about,
        # the escalation below still proceeds
        targets = self._get_level_targets(policy, next_level)
        if not targets:
            logger.warning(f"No targets for level {next_level} in policy {policy.id}")

        # Execute escalation through handlers
        # NOTE(review): handlers run before incident.escalate() records the new
        # level; if escalate() returns False below, handlers have already been
        # invoked for this level.
        await self._execute_escalation(incident, policy, next_level, targets)

        # Calculate next escalation time from the delay of the level after
        # next_level; stays None when the policy defines no further level
        further_level = self._get_level_config(policy, next_level + 1)
        next_escalation_at: datetime | None = None
        if further_level:
            delay_minutes = further_level.get("delay_minutes", 15)
            next_escalation_at = datetime.utcnow() + timedelta(minutes=delay_minutes)

        # Use model's escalate method to apply the state update
        if not incident.escalate(
            next_level=next_level,
            next_escalation_at=next_escalation_at,
            max_escalations=policy.max_escalations,
        ):
            logger.warning(f"Escalation blocked for incident {incident.id}")
            return

        self._escalation_count += 1
        logger.info(
            f"Escalated incident {incident.id} to level {next_level}"
        )

    except Exception as e:
        self._error_count += 1
        logger.error(f"Error processing incident {incident.id}: {e}")
1005
+
1006
async def _execute_escalation(
    self,
    incident: EscalationIncidentModel,
    policy: EscalationPolicyModel,
    level: int,
    targets: list[dict[str, Any]],
) -> None:
    """Deliver an escalation to every target through the registered handlers.

    Each target is dispatched on its own. Handlers are tried in
    registration order and the first one whose ``can_handle`` accepts the
    target's channel delivers it, so every target is delivered at most
    once. A handler that raises is logged and the next handler is tried for
    the same target; a failing handler or target never prevents the
    remaining targets from being processed.

    Args:
        incident: The incident being escalated.
        policy: The escalation policy.
        level: The new level.
        targets: Targets for this level; a target without a ``"channel"``
            key is treated as ``"email"``.
    """
    for target in targets:
        channel_type = target.get("channel", "email")

        for handler in self._handlers:
            try:
                if not await handler.can_handle(channel_type):
                    continue

                result = await handler.handle_escalation(
                    incident=incident,
                    policy=policy,
                    level=level,
                    targets=[target],
                )
                if not result.success:
                    logger.warning(
                        f"Handler {handler.handler_type} failed: {result.message}"
                    )
            except Exception as e:
                logger.error(
                    f"Handler {handler.handler_type} error: {e}"
                )
                continue  # Fall through to the next handler for this target

            break  # Only use first matching handler for this target
        else:
            # No handler accepted the channel, or every accepting handler raised
            logger.warning(
                f"No handler delivered escalation for channel '{channel_type}'"
            )
1043
+
1044
def _get_level_targets(
    self,
    policy: EscalationPolicyModel,
    level: int,
) -> list[dict[str, Any]]:
    """Look up the notification targets configured for one level.

    Args:
        policy: The escalation policy.
        level: The level number.

    Returns:
        The level's ``"targets"`` list, or an empty list when the level is
        not configured or has no targets entry.
    """
    config = self._get_level_config(policy, level)
    return config.get("targets", []) if config else []
1062
+
1063
def _get_level_config(
    self,
    policy: EscalationPolicyModel,
    level: int,
) -> dict[str, Any] | None:
    """Find the policy entry whose ``"level"`` equals the requested level.

    Args:
        policy: The escalation policy; ``levels`` may be empty or None.
        level: The level number.

    Returns:
        The first matching level configuration, or None if there is none.
    """
    return next(
        (entry for entry in policy.levels or [] if entry.get("level") == level),
        None,
    )
1081
+
1082
async def trigger_immediate_check(self) -> dict[str, Any]:
    """Run an escalation check right now instead of waiting for the timer.

    Returns:
        A result dictionary. When the scheduler is not running it only
        carries ``success=False`` and a message. Otherwise it reports how
        much ``_escalation_count`` grew during the check, together with a
        UTC ISO-format timestamp.
    """
    if not self._running:
        return dict(success=False, message="Scheduler is not running")

    baseline = self._escalation_count
    await self._check_and_escalate()
    escalations_processed = self._escalation_count - baseline

    outcome: dict[str, Any] = {"success": True}
    outcome["message"] = f"Processed {escalations_processed} escalations"
    outcome["escalations_processed"] = escalations_processed
    outcome["timestamp"] = datetime.utcnow().isoformat()
    return outcome
1104
+
1105
def get_status(self) -> dict[str, Any]:
    """Collect a snapshot of the scheduler's state and counters.

    The next check time is read from the APScheduler job only while the
    service is running; any error while reading it is ignored and the
    value is reported as None.

    Returns:
        Status dictionary with configuration, metrics, handler types,
        strategy name and the backend's own status.
    """
    upcoming: datetime | None = None
    if self._running:
        try:
            job = self._scheduler.get_job(self.DEFAULT_JOB_ID)
            upcoming = job.next_run_time if job else None
        except Exception:
            pass

    backend_status = self._backend.get_status()

    last_check = self._last_check_at
    handler_types = [h.handler_type for h in self._handlers]

    return {
        "running": self._running,
        "enabled": self.config.enabled,
        "check_interval_seconds": self.config.check_interval_seconds,
        "last_check_at": last_check.isoformat() if last_check else None,
        "next_check_at": upcoming.isoformat() if upcoming else None,
        "check_count": self._check_count,
        "escalation_count": self._escalation_count,
        "error_count": self._error_count,
        "misfire_count": self._misfire_count,
        "handlers": handler_types,
        "strategy": self._strategy.strategy_name,
        "backend": backend_status,
    }
1136
+
1137
def reset_metrics(self) -> None:
    """Zero the check, escalation, error and misfire counters.

    Other state, such as the last check time, is left untouched.
    """
    self._check_count = self._escalation_count = 0
    self._error_count = self._misfire_count = 0
1143
+
1144
+
1145
+ # =============================================================================
1146
+ # Singleton Instance Management
1147
+ # =============================================================================
1148
+
1149
# Module-wide scheduler service instance; None until the first
# get_escalation_scheduler() call creates it.
_scheduler_service: EscalationSchedulerService | None = None
1150
+
1151
+
1152
def get_escalation_scheduler(
    config: EscalationSchedulerConfig | None = None,
) -> EscalationSchedulerService:
    """Return the shared escalation scheduler, creating it on first use.

    Args:
        config: Configuration for the new instance. Ignored when an
            instance already exists.

    Returns:
        The shared EscalationSchedulerService instance.
    """
    global _scheduler_service
    if _scheduler_service is not None:
        return _scheduler_service

    _scheduler_service = EscalationSchedulerService(config=config)
    return _scheduler_service
1167
+
1168
+
1169
def reset_escalation_scheduler() -> None:
    """Reset the singleton scheduler instance.

    Useful for testing or reconfiguration. Only the module-level reference
    is dropped; this function does not call ``stop()`` on the previous
    instance, so stop it first if it is running.
    """
    global _scheduler_service
    _scheduler_service = None
1176
+
1177
+
1178
async def start_escalation_scheduler() -> None:
    """Start the shared escalation scheduler, creating it if necessary."""
    await get_escalation_scheduler().start()
1182
+
1183
+
1184
async def stop_escalation_scheduler() -> None:
    """Stop the shared escalation scheduler if one has been created.

    When no scheduler instance exists there is nothing to stop, so the
    function returns without constructing one. Building a service only to
    call ``stop()`` on it would needlessly load configuration and create a
    backend during shutdown.
    """
    service = _scheduler_service
    if service is None:
        return
    await service.stop()