truthound-dashboard 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -22
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BZG20KuF.js +0 -586
- truthound_dashboard/static/assets/index-D_HyZ3pb.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +0 -1
- truthound_dashboard-1.3.1.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.1.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1187 @@
"""Escalation scheduler service with APScheduler integration.

This module provides automatic escalation checking via APScheduler,
triggering escalations when incidents reach their scheduled escalation time.

Features:
- Periodic checking of pending escalations
- Configurable check interval
- Abstract handler interface for extensibility
- Multiple escalation strategy support
- Integration with notification dispatcher
- **Persistent job storage (SQLAlchemy backend)**
- **Configurable misfire handling with grace time**
- **Error recovery with exponential backoff**
- **Job coalescing to avoid duplicate executions**
- **Graceful shutdown handling**

Usage:
    from truthound_dashboard.core.notifications.escalation.scheduler import (
        EscalationSchedulerService,
        get_escalation_scheduler,
        start_escalation_scheduler,
        stop_escalation_scheduler,
    )

    # Start the scheduler with persistent backend
    scheduler = get_escalation_scheduler()
    await scheduler.start()

    # Or use convenience functions
    await start_escalation_scheduler()
"""

from __future__ import annotations

import asyncio
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

from ...validation_limits import get_escalation_limits, ValidationLimitError
from ....db import get_session
from ....db.models import (
    EscalationIncidentModel,
    EscalationPolicyModel,
    EscalationStateEnum,
    NotificationChannel,
)
from ..dispatcher import create_dispatcher
from .backends import (
    BackendType,
    JobData,
    JobState,
    SchedulerBackend,
    SchedulerBackendConfig,
    create_scheduler_backend,
)

logger = logging.getLogger(__name__)


# =============================================================================
# Configuration
# =============================================================================


@dataclass
class EscalationSchedulerConfig:
    """Configuration for the escalation scheduler with validation.

    Validation:
    - check_interval_seconds: Must be between 10 and 3600 (configurable).
    - max_escalations_per_check: Must be between 1 and 1000.
    - retry_delay_seconds: Must be between 1 and 3600.
    - max_retries: Must be between 0 and 10.

    DoS Prevention:
    - Minimum check interval prevents excessive CPU usage.
    - Maximum escalations per check prevents memory exhaustion.
    - Maximum retry attempts prevents infinite retry loops.

    Environment Variables:
    - TRUTHOUND_ESCALATION_CHECK_INTERVAL_MIN
    - TRUTHOUND_ESCALATION_CHECK_INTERVAL_MAX

    Attributes:
        check_interval_seconds: How often to check for pending escalations.
        max_escalations_per_check: Maximum escalations to process per check.
        retry_on_failure: Whether to retry failed escalations.
        retry_delay_seconds: Delay before retrying failed escalation.
        enabled: Whether the scheduler is enabled.
        backend_type: Type of scheduler backend (memory, sqlalchemy).
        misfire_grace_time: Seconds to allow for late job execution.
        coalesce: Combine multiple pending executions into one.
        max_retries: Maximum retry attempts on failure.
        shutdown_timeout: Seconds to wait for jobs during shutdown.
    """

    check_interval_seconds: int = 60
    max_escalations_per_check: int = 100
    retry_on_failure: bool = True
    retry_delay_seconds: int = 300
    enabled: bool = True
    backend_type: BackendType = BackendType.SQLALCHEMY
    misfire_grace_time: int = 60
    coalesce: bool = True
    max_retries: int = 3
    shutdown_timeout: float = 30.0

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        limits = get_escalation_limits()

        # Validate check_interval_seconds
        valid, error = limits.validate_check_interval(self.check_interval_seconds)
        if not valid:
            raise ValidationLimitError(
                error or f"Invalid check_interval_seconds: {self.check_interval_seconds}",
                parameter="check_interval_seconds",
                value=self.check_interval_seconds,
            )

        # Validate max_escalations_per_check (1-1000)
        if self.max_escalations_per_check < 1:
            raise ValidationLimitError(
                f"max_escalations_per_check must be at least 1, "
                f"got {self.max_escalations_per_check}",
                parameter="max_escalations_per_check",
                value=self.max_escalations_per_check,
            )
        if self.max_escalations_per_check > 1000:
            raise ValidationLimitError(
                f"max_escalations_per_check must not exceed 1000, "
                f"got {self.max_escalations_per_check}",
                parameter="max_escalations_per_check",
                value=self.max_escalations_per_check,
            )

        # Validate retry_delay_seconds (1-3600)
        if self.retry_delay_seconds < 1:
            raise ValidationLimitError(
                f"retry_delay_seconds must be at least 1, "
                f"got {self.retry_delay_seconds}",
                parameter="retry_delay_seconds",
                value=self.retry_delay_seconds,
            )
        if self.retry_delay_seconds > 3600:
            raise ValidationLimitError(
                f"retry_delay_seconds must not exceed 3600, "
                f"got {self.retry_delay_seconds}",
                parameter="retry_delay_seconds",
                value=self.retry_delay_seconds,
            )

        # Validate max_retries (0-10)
        if self.max_retries < 0:
            raise ValidationLimitError(
                f"max_retries must be non-negative, "
                f"got {self.max_retries}",
                parameter="max_retries",
                value=self.max_retries,
            )
        if self.max_retries > 10:
            raise ValidationLimitError(
                f"max_retries must not exceed 10, "
                f"got {self.max_retries}",
                parameter="max_retries",
                value=self.max_retries,
            )

        # Validate misfire_grace_time (1-3600)
        if self.misfire_grace_time < 1:
            raise ValidationLimitError(
                f"misfire_grace_time must be at least 1, "
                f"got {self.misfire_grace_time}",
                parameter="misfire_grace_time",
                value=self.misfire_grace_time,
            )
        if self.misfire_grace_time > 3600:
            raise ValidationLimitError(
                f"misfire_grace_time must not exceed 3600, "
                f"got {self.misfire_grace_time}",
                parameter="misfire_grace_time",
                value=self.misfire_grace_time,
            )

        # Validate shutdown_timeout (1-300)
        if self.shutdown_timeout < 1:
            raise ValidationLimitError(
                f"shutdown_timeout must be at least 1, "
                f"got {self.shutdown_timeout}",
                parameter="shutdown_timeout",
                value=self.shutdown_timeout,
            )
        if self.shutdown_timeout > 300:
            raise ValidationLimitError(
                f"shutdown_timeout must not exceed 300, "
                f"got {self.shutdown_timeout}",
                parameter="shutdown_timeout",
                value=self.shutdown_timeout,
            )

    @classmethod
    def from_env(cls) -> EscalationSchedulerConfig:
        """Create configuration from environment variables with validation.

        Environment variables:
            TRUTHOUND_ESCALATION_CHECK_INTERVAL: Check interval in seconds
            TRUTHOUND_ESCALATION_MAX_PER_CHECK: Max escalations per check
            TRUTHOUND_ESCALATION_ENABLED: Enable/disable scheduler (true/false)
            TRUTHOUND_ESCALATION_BACKEND: Backend type (memory, sqlalchemy)
            TRUTHOUND_ESCALATION_MISFIRE_GRACE: Misfire grace time in seconds
            TRUTHOUND_ESCALATION_COALESCE: Enable job coalescing (true/false)
            TRUTHOUND_ESCALATION_MAX_RETRIES: Maximum retry attempts

        Raises:
            ValidationLimitError: If any configuration value is invalid.
        """
        return cls(
            check_interval_seconds=int(
                os.getenv("TRUTHOUND_ESCALATION_CHECK_INTERVAL", "60")
            ),
            max_escalations_per_check=int(
                os.getenv("TRUTHOUND_ESCALATION_MAX_PER_CHECK", "100")
            ),
            enabled=os.getenv("TRUTHOUND_ESCALATION_ENABLED", "true").lower() == "true",
            backend_type=BackendType(
                os.getenv("TRUTHOUND_ESCALATION_BACKEND", "sqlalchemy")
            ),
            misfire_grace_time=int(
                os.getenv("TRUTHOUND_ESCALATION_MISFIRE_GRACE", "60")
            ),
            coalesce=os.getenv("TRUTHOUND_ESCALATION_COALESCE", "true").lower() == "true",
            max_retries=int(os.getenv("TRUTHOUND_ESCALATION_MAX_RETRIES", "3")),
            shutdown_timeout=float(
                os.getenv("TRUTHOUND_ESCALATION_SHUTDOWN_TIMEOUT", "30")
            ),
        )
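The configuration above can be built directly or from environment variables via from_env(). The following sketch is illustrative and not part of the package; it assumes the wheel is installed and that the limits returned by get_escalation_limits() accept the values used here.

```python
import os

from truthound_dashboard.core.notifications.escalation.backends import BackendType
from truthound_dashboard.core.notifications.escalation.scheduler import (
    EscalationSchedulerConfig,
)
from truthound_dashboard.core.validation_limits import ValidationLimitError

# Values are read once, at the moment from_env() is called.
os.environ["TRUTHOUND_ESCALATION_CHECK_INTERVAL"] = "120"
os.environ["TRUTHOUND_ESCALATION_BACKEND"] = "memory"

try:
    config = EscalationSchedulerConfig.from_env()
except ValidationLimitError as exc:
    # Out-of-range values are rejected in __post_init__.
    print(f"Invalid scheduler configuration: {exc}")
else:
    assert config.check_interval_seconds == 120
    assert config.backend_type == BackendType("memory")
```

The absolute import path for ValidationLimitError follows the relative import in the hunk; core/validation_limits.py is also added in this release.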
# =============================================================================
# Abstract Escalation Handler
# =============================================================================


class EscalationHandler(ABC):
    """Abstract base class for escalation handlers.

    Implement this class to define custom escalation behavior.
    Handlers are called when an incident needs to be escalated.

    Example:
        class SlackEscalationHandler(EscalationHandler):
            def __init__(self, webhook_url: str):
                self.webhook_url = webhook_url

            @property
            def handler_type(self) -> str:
                return "slack"

            async def handle_escalation(
                self,
                incident: EscalationIncidentModel,
                policy: EscalationPolicyModel,
                level: int,
                targets: list[dict],
            ) -> EscalationResult:
                # Send Slack notification
                ...
                return EscalationResult(success=True, message="Sent to Slack")

            async def can_handle(self, channel_type: str) -> bool:
                return channel_type == "slack"
    """

    @property
    @abstractmethod
    def handler_type(self) -> str:
        """Return the handler type identifier."""
        ...

    @abstractmethod
    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> "EscalationResult":
        """Handle an escalation event.

        Args:
            incident: The escalation incident.
            policy: The escalation policy.
            level: The new escalation level.
            targets: List of target configurations for this level.

        Returns:
            EscalationResult indicating success or failure.
        """
        ...

    @abstractmethod
    async def can_handle(self, channel_type: str) -> bool:
        """Check if this handler can handle the given channel type.

        Args:
            channel_type: The notification channel type.

        Returns:
            True if this handler can handle the channel type.
        """
        ...


@dataclass
class EscalationResult:
    """Result of an escalation attempt.

    Attributes:
        success: Whether the escalation succeeded.
        message: Status message.
        notifications_sent: Number of notifications sent.
        metadata: Additional result data.
    """

    success: bool
    message: str = ""
    notifications_sent: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)


# =============================================================================
# Built-in Handlers
# =============================================================================


class DefaultEscalationHandler(EscalationHandler):
    """Default escalation handler using the notification dispatcher.

    This handler uses the existing notification system to send
    escalation notifications through configured channels.
    """

    @property
    def handler_type(self) -> str:
        return "default"

    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> EscalationResult:
        """Send escalation notifications via dispatcher."""
        notifications_sent = 0
        errors: list[str] = []

        async with get_session() as session:
            dispatcher = create_dispatcher(session)

            for target in targets:
                try:
                    channel_type = target.get("channel", "email")
                    channel_id = target.get("channel_id")
                    identifier = target.get("identifier", "")
                    target_type = target.get("type", "user")

                    # Build notification message
                    message = self._build_escalation_message(
                        incident=incident,
                        policy=policy,
                        level=level,
                        target=target,
                    )

                    # Use dispatcher to send notification
                    # Note: This uses the existing notification infrastructure
                    results = await dispatcher.dispatch(
                        channel_ids=[channel_id] if channel_id else None,
                        subject=f"[ESCALATION L{level}] {incident.incident_ref}",
                        message=message,
                        metadata={
                            "escalation": True,
                            "incident_id": incident.id,
                            "policy_id": policy.id,
                            "level": level,
                            "target_type": target_type,
                            "target_identifier": identifier,
                        },
                    )

                    for result in results:
                        if result.success:
                            notifications_sent += 1
                        else:
                            errors.append(f"Failed to notify {identifier}: {result.error_message}")

                except Exception as e:
                    errors.append(f"Error notifying target: {e}")
                    logger.error(f"Escalation notification error: {e}")

            await session.commit()

        success = notifications_sent > 0 or len(targets) == 0
        message = f"Sent {notifications_sent} notifications"
        if errors:
            message += f"; Errors: {'; '.join(errors[:3])}"

        return EscalationResult(
            success=success,
            message=message,
            notifications_sent=notifications_sent,
            metadata={"errors": errors},
        )

    async def can_handle(self, channel_type: str) -> bool:
        """Default handler can handle any channel type."""
        return True

    def _build_escalation_message(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        target: dict[str, Any],
    ) -> str:
        """Build escalation notification message."""
        context = incident.context or {}

        message_parts = [
            f"ESCALATION ALERT - Level {level}",
            "",
            f"Incident: {incident.incident_ref}",
            f"Policy: {policy.name}",
            f"State: {incident.state}",
            f"Escalation Count: {incident.escalation_count}",
            "",
        ]

        if context:
            message_parts.append("Context:")
            for key, value in context.items():
                message_parts.append(f"  {key}: {value}")
            message_parts.append("")

        message_parts.extend([
            f"Created: {incident.created_at.isoformat()}",
            f"Target: {target.get('identifier', 'N/A')} ({target.get('type', 'N/A')})",
        ])

        # Add custom message template if defined in policy level
        levels = policy.levels or []
        for level_config in levels:
            if level_config.get("level") == level:
                template = level_config.get("message_template")
                if template:
                    message_parts.extend(["", "---", template])
                break

        return "\n".join(message_parts)


class LoggingEscalationHandler(EscalationHandler):
    """Escalation handler that only logs escalations.

    Useful for testing and debugging.
    """

    @property
    def handler_type(self) -> str:
        return "logging"

    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> EscalationResult:
        """Log the escalation."""
        logger.info(
            f"Escalation triggered: incident={incident.id}, "
            f"policy={policy.name}, level={level}, targets={len(targets)}"
        )
        for target in targets:
            logger.info(
                f"  Target: type={target.get('type')}, "
                f"identifier={target.get('identifier')}, "
                f"channel={target.get('channel')}"
            )

        return EscalationResult(
            success=True,
            message=f"Logged escalation to level {level}",
            notifications_sent=len(targets),
        )

    async def can_handle(self, channel_type: str) -> bool:
        """Logging handler can handle any channel type."""
        return True
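A custom handler only needs the three members defined by EscalationHandler. The class below is an illustrative sketch (the name RecordingEscalationHandler and its seen list are not part of the package); it records escalations in memory, which can be handy in tests.

```python
from typing import Any

from truthound_dashboard.core.notifications.escalation.scheduler import (
    EscalationHandler,
    EscalationResult,
)
from truthound_dashboard.db.models import (
    EscalationIncidentModel,
    EscalationPolicyModel,
)


class RecordingEscalationHandler(EscalationHandler):
    """Collects escalations in memory instead of sending notifications."""

    def __init__(self) -> None:
        self.seen: list[tuple[str, int, int]] = []

    @property
    def handler_type(self) -> str:
        return "recording"

    async def can_handle(self, channel_type: str) -> bool:
        return True

    async def handle_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> EscalationResult:
        # Record (incident reference, level, target count) for later assertions.
        self.seen.append((incident.incident_ref, level, len(targets)))
        return EscalationResult(success=True, notifications_sent=len(targets))
```

Such a handler is attached with register_handler() on the scheduler service shown further down; unregister_handler("default") removes the dispatcher-backed handler when notifications should not actually be sent.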
# =============================================================================
# Escalation Strategy
# =============================================================================


class EscalationStrategy(ABC):
    """Abstract base class for escalation strategies.

    Strategies determine how and when escalations should proceed.
    """

    @property
    @abstractmethod
    def strategy_name(self) -> str:
        """Return the strategy name."""
        ...

    @abstractmethod
    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Determine if an incident should be escalated.

        Args:
            incident: The escalation incident.
            policy: The escalation policy.

        Returns:
            True if escalation should proceed.
        """
        ...

    @abstractmethod
    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Get the next escalation level.

        Args:
            incident: The escalation incident.
            policy: The escalation policy.

        Returns:
            Next level number or None if no more levels.
        """
        ...


class TimeBasedEscalationStrategy(EscalationStrategy):
    """Time-based escalation strategy.

    Escalates when the scheduled escalation time has passed.
    This is the default strategy that respects `next_escalation_at`.
    """

    @property
    def strategy_name(self) -> str:
        return "time_based"

    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Check if escalation time has passed."""
        if not incident.next_escalation_at:
            return False

        # Don't escalate resolved or acknowledged incidents
        if incident.state in (
            EscalationStateEnum.RESOLVED.value,
            EscalationStateEnum.ACKNOWLEDGED.value,
        ):
            return False

        return datetime.utcnow() >= incident.next_escalation_at

    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Get the next level based on current level."""
        current_level = incident.current_level
        levels = policy.levels or []

        # Find next level
        for level_config in levels:
            if level_config.get("level", 0) == current_level + 1:
                return current_level + 1

        return None


class ImmediateEscalationStrategy(EscalationStrategy):
    """Immediate escalation strategy.

    Always escalates immediately without waiting.
    Useful for critical incidents.
    """

    @property
    def strategy_name(self) -> str:
        return "immediate"

    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        """Always return True for active incidents."""
        return incident.state not in (
            EscalationStateEnum.RESOLVED.value,
        )

    async def get_next_level(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> int | None:
        """Get the next level, skipping to max if needed."""
        current_level = incident.current_level
        max_level = max(
            (l.get("level", 0) for l in policy.levels or []),
            default=0,
        )

        if current_level < max_level:
            return current_level + 1
        return None
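Strategies can also be subclassed. Below is a sketch of one that defers escalations outside working hours, built on TimeBasedEscalationStrategy above (the class name and the 08:00-20:00 UTC window are illustrative, not part of the package); it would be activated with set_strategy() on the scheduler service in the next hunk.

```python
from datetime import datetime

from truthound_dashboard.core.notifications.escalation.scheduler import (
    TimeBasedEscalationStrategy,
)
from truthound_dashboard.db.models import (
    EscalationIncidentModel,
    EscalationPolicyModel,
)


class BusinessHoursEscalationStrategy(TimeBasedEscalationStrategy):
    """Defer escalations to 08:00-20:00 UTC; otherwise behave like time_based."""

    @property
    def strategy_name(self) -> str:
        return "business_hours"

    async def should_escalate(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
    ) -> bool:
        # Outside the window, simply wait for the next periodic check.
        if not 8 <= datetime.utcnow().hour < 20:
            return False
        return await super().should_escalate(incident, policy)
```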
# =============================================================================
# Main Scheduler Service
# =============================================================================


class EscalationSchedulerService:
    """Service for scheduling automatic escalation checks.

    This service uses APScheduler with a configurable backend to
    periodically check for incidents that need escalation and
    processes them accordingly.

    Features:
    - Configurable check interval
    - Multiple handler support
    - Multiple strategy support
    - Metrics and status tracking
    - Thread-safe operations
    - **Persistent job storage (SQLAlchemy backend)**
    - **Automatic job recovery on restart**
    - **Configurable misfire handling**
    - **Exponential backoff for failures**
    - **Graceful shutdown with job persistence**

    Usage:
        service = EscalationSchedulerService()
        await service.start()

        # Later...
        await service.stop()
    """

    DEFAULT_JOB_ID = "escalation_checker"

    def __init__(
        self,
        config: EscalationSchedulerConfig | None = None,
        scheduler: AsyncIOScheduler | None = None,
        backend: SchedulerBackend | None = None,
    ) -> None:
        """Initialize the escalation scheduler service.

        Args:
            config: Service configuration.
            scheduler: Optional existing APScheduler instance.
            backend: Optional custom scheduler backend.
        """
        self.config = config or EscalationSchedulerConfig.from_env()
        self._scheduler = scheduler or AsyncIOScheduler()
        self._owns_scheduler = scheduler is None
        self._handlers: list[EscalationHandler] = []
        self._strategy: EscalationStrategy = TimeBasedEscalationStrategy()
        self._running = False
        self._last_check_at: datetime | None = None
        self._check_count = 0
        self._escalation_count = 0
        self._error_count = 0
        self._misfire_count = 0
        self._lock = asyncio.Lock()

        # Initialize backend
        if backend:
            self._backend = backend
        else:
            backend_config = SchedulerBackendConfig(
                backend_type=self.config.backend_type,
                misfire_grace_time=self.config.misfire_grace_time,
                coalesce=self.config.coalesce,
                max_retries=self.config.max_retries,
                shutdown_timeout=self.config.shutdown_timeout,
            )
            self._backend = create_scheduler_backend(backend_config)

        # Register default handler
        self.register_handler(DefaultEscalationHandler())

    @property
    def is_running(self) -> bool:
        """Check if the scheduler is running."""
        return self._running

    @property
    def backend(self) -> SchedulerBackend:
        """Get the scheduler backend."""
        return self._backend

    def register_handler(self, handler: EscalationHandler) -> None:
        """Register an escalation handler.

        Args:
            handler: The handler to register.
        """
        self._handlers.append(handler)
        logger.debug(f"Registered escalation handler: {handler.handler_type}")

    def unregister_handler(self, handler_type: str) -> bool:
        """Unregister an escalation handler by type.

        Args:
            handler_type: The handler type to unregister.

        Returns:
            True if handler was found and removed.
        """
        for handler in self._handlers[:]:
            if handler.handler_type == handler_type:
                self._handlers.remove(handler)
                logger.debug(f"Unregistered escalation handler: {handler_type}")
                return True
        return False

    def set_strategy(self, strategy: EscalationStrategy) -> None:
        """Set the escalation strategy.

        Args:
            strategy: The strategy to use.
        """
        self._strategy = strategy
        logger.debug(f"Set escalation strategy: {strategy.strategy_name}")

    async def start(self) -> None:
        """Start the escalation scheduler."""
        if self._running:
            logger.warning("Escalation scheduler already running")
            return

        if not self.config.enabled:
            logger.info("Escalation scheduler is disabled")
            return

        logger.info("Starting escalation scheduler")

        # Initialize backend
        await self._backend.initialize()
        logger.info(f"Using scheduler backend: {self._backend.backend_type.value}")

        # Register the checker job with backend for persistence
        job_data = JobData(
            id=self.DEFAULT_JOB_ID,
            name="Escalation Checker",
            func_ref="truthound_dashboard.core.notifications.escalation.scheduler:_check_and_escalate",
            trigger_type="interval",
            trigger_args={"seconds": self.config.check_interval_seconds},
            next_run_time=datetime.utcnow() + timedelta(
                seconds=self.config.check_interval_seconds
            ),
            state=JobState.PENDING,
        )

        try:
            # Check if job exists (recovery scenario)
            existing = await self._backend.get_job(self.DEFAULT_JOB_ID)
            if existing:
                logger.info("Recovered existing escalation checker job")
                # Update next_run_time if it was in the past
                if existing.next_run_time and existing.next_run_time < datetime.utcnow():
                    if self._backend.is_misfired(existing):
                        self._misfire_count += 1
                        logger.warning("Escalation checker job misfired, rescheduling")
                    existing.next_run_time = datetime.utcnow()
                    existing.state = JobState.PENDING
                    await self._backend.update_job(existing)
            else:
                await self._backend.add_job(job_data)
                logger.debug("Created escalation checker job")
        except ValueError:
            # Job already exists
            logger.debug("Escalation checker job already registered")

        # Schedule the checker job with APScheduler
        self._scheduler.add_job(
            self._check_and_escalate,
            trigger=IntervalTrigger(seconds=self.config.check_interval_seconds),
            id=self.DEFAULT_JOB_ID,
            name="Escalation Checker",
            replace_existing=True,
            misfire_grace_time=self.config.misfire_grace_time,
            coalesce=self.config.coalesce,
        )

        # Start scheduler if we own it
        if self._owns_scheduler and not self._scheduler.running:
            self._scheduler.start()

        self._running = True
        logger.info(
            f"Escalation scheduler started "
            f"(interval: {self.config.check_interval_seconds}s, "
            f"backend: {self._backend.backend_type.value})"
        )

    async def stop(self) -> None:
        """Stop the escalation scheduler gracefully."""
        if not self._running:
            return

        logger.info("Stopping escalation scheduler")

        try:
            self._scheduler.remove_job(self.DEFAULT_JOB_ID)
        except Exception:
            pass  # Job may not exist

        # Shutdown scheduler if we own it
        if self._owns_scheduler and self._scheduler.running:
            self._scheduler.shutdown(wait=False)

        # Shutdown backend (handles pending job persistence)
        await self._backend.shutdown()

        self._running = False
        logger.info("Escalation scheduler stopped")

    async def _check_and_escalate(self) -> None:
        """Check for and process pending escalations.

        This is the main job that runs periodically.
        """
        async with self._lock:
            self._last_check_at = datetime.utcnow()
            self._check_count += 1

            logger.debug(f"Checking for pending escalations (check #{self._check_count})")

            # Mark job as running in backend
            await self._backend.mark_job_running(self.DEFAULT_JOB_ID)

            try:
                async with get_session() as session:
                    from sqlalchemy import select

                    # Get pending escalations
                    now = datetime.utcnow()
                    query = (
                        select(EscalationIncidentModel)
                        .where(
                            EscalationIncidentModel.state.in_([
                                EscalationStateEnum.TRIGGERED.value,
                                EscalationStateEnum.ESCALATED.value,
                            ])
                        )
                        .where(EscalationIncidentModel.next_escalation_at <= now)
                        .limit(self.config.max_escalations_per_check)
                    )

                    result = await session.execute(query)
                    incidents = result.scalars().all()

                    if not incidents:
                        logger.debug("No pending escalations found")
                    else:
                        logger.info(f"Found {len(incidents)} incidents due for escalation")

                    for incident in incidents:
                        await self._process_incident(session, incident)

                    await session.commit()

                # Mark job as completed with next run time
                next_run = datetime.utcnow() + timedelta(
                    seconds=self.config.check_interval_seconds
                )
                await self._backend.mark_job_completed(self.DEFAULT_JOB_ID, next_run)

            except Exception as e:
                self._error_count += 1
                logger.error(f"Error checking escalations: {e}")
                # Mark job as failed (will retry with exponential backoff)
                await self._backend.mark_job_failed(
                    self.DEFAULT_JOB_ID,
                    str(e),
                    schedule_retry=self.config.retry_on_failure,
                )

    async def _process_incident(
        self,
        session: Any,
        incident: EscalationIncidentModel,
    ) -> None:
        """Process a single incident for escalation.

        Args:
            session: Database session.
            incident: The incident to process.
        """
        try:
            # Get the policy
            from sqlalchemy import select

            result = await session.execute(
                select(EscalationPolicyModel)
                .where(EscalationPolicyModel.id == incident.policy_id)
            )
            policy = result.scalar_one_or_none()

            if not policy:
                logger.error(f"Policy not found for incident {incident.id}")
                return

            if not policy.is_active:
                logger.debug(f"Policy {policy.id} is inactive, skipping")
                return

            # Check escalation strategy
            if not await self._strategy.should_escalate(incident, policy):
                logger.debug(f"Strategy says don't escalate incident {incident.id}")
                return

            # Get next level
            next_level = await self._strategy.get_next_level(incident, policy)
            if next_level is None:
                logger.debug(f"No more levels for incident {incident.id}")
                # Clear next_escalation_at since we're at max level
                incident.next_escalation_at = None
                return

            # Check if escalation is allowed using model method
            if not incident.can_escalate(policy.max_escalations):
                logger.warning(
                    f"Incident {incident.id} cannot escalate: "
                    f"count={incident.escalation_count}, max={policy.max_escalations}, "
                    f"state={incident.state}"
                )
                incident.next_escalation_at = None
                return

            # Get targets for the next level
            targets = self._get_level_targets(policy, next_level)
            if not targets:
                logger.warning(f"No targets for level {next_level} in policy {policy.id}")

            # Execute escalation through handlers
            await self._execute_escalation(incident, policy, next_level, targets)

            # Calculate next escalation time
            further_level = self._get_level_config(policy, next_level + 1)
            next_escalation_at: datetime | None = None
            if further_level:
                delay_minutes = further_level.get("delay_minutes", 15)
                next_escalation_at = datetime.utcnow() + timedelta(minutes=delay_minutes)

            # Use model's escalate method for atomic state update
            if not incident.escalate(
                next_level=next_level,
                next_escalation_at=next_escalation_at,
                max_escalations=policy.max_escalations,
            ):
                logger.warning(f"Escalation blocked for incident {incident.id}")
                return

            self._escalation_count += 1
            logger.info(
                f"Escalated incident {incident.id} to level {next_level}"
            )

        except Exception as e:
            self._error_count += 1
            logger.error(f"Error processing incident {incident.id}: {e}")

    async def _execute_escalation(
        self,
        incident: EscalationIncidentModel,
        policy: EscalationPolicyModel,
        level: int,
        targets: list[dict[str, Any]],
    ) -> None:
        """Execute escalation through registered handlers.

        Args:
            incident: The incident being escalated.
            policy: The escalation policy.
            level: The new level.
            targets: Targets for this level.
        """
        for handler in self._handlers:
            try:
                # Group targets by channel type
                for target in targets:
                    channel_type = target.get("channel", "email")
                    if await handler.can_handle(channel_type):
                        result = await handler.handle_escalation(
                            incident=incident,
                            policy=policy,
                            level=level,
                            targets=[target],
                        )
                        if not result.success:
                            logger.warning(
                                f"Handler {handler.handler_type} failed: {result.message}"
                            )
                        break  # Only use first matching handler

            except Exception as e:
                logger.error(
                    f"Handler {handler.handler_type} error: {e}"
                )

    def _get_level_targets(
        self,
        policy: EscalationPolicyModel,
        level: int,
    ) -> list[dict[str, Any]]:
        """Get targets for a specific escalation level.

        Args:
            policy: The escalation policy.
            level: The level number.

        Returns:
            List of target configurations.
        """
        level_config = self._get_level_config(policy, level)
        if not level_config:
            return []
        return level_config.get("targets", [])

    def _get_level_config(
        self,
        policy: EscalationPolicyModel,
        level: int,
    ) -> dict[str, Any] | None:
        """Get configuration for a specific level.

        Args:
            policy: The escalation policy.
            level: The level number.

        Returns:
            Level configuration or None.
        """
        for level_config in policy.levels or []:
            if level_config.get("level") == level:
                return level_config
        return None

    async def trigger_immediate_check(self) -> dict[str, Any]:
        """Trigger an immediate escalation check.

        Returns:
            Check result including number of escalations processed.
        """
        if not self._running:
            return {
                "success": False,
                "message": "Scheduler is not running",
            }

        escalations_before = self._escalation_count
        await self._check_and_escalate()
        escalations_processed = self._escalation_count - escalations_before

        return {
            "success": True,
            "message": f"Processed {escalations_processed} escalations",
            "escalations_processed": escalations_processed,
            "timestamp": datetime.utcnow().isoformat(),
        }

    def get_status(self) -> dict[str, Any]:
        """Get current scheduler status.

        Returns:
            Status dictionary with metrics.
        """
        next_run: datetime | None = None
        if self._running:
            try:
                job = self._scheduler.get_job(self.DEFAULT_JOB_ID)
                if job:
                    next_run = job.next_run_time
            except Exception:
                pass

        backend_status = self._backend.get_status()

        return {
            "running": self._running,
            "enabled": self.config.enabled,
            "check_interval_seconds": self.config.check_interval_seconds,
            "last_check_at": self._last_check_at.isoformat() if self._last_check_at else None,
            "next_check_at": next_run.isoformat() if next_run else None,
            "check_count": self._check_count,
            "escalation_count": self._escalation_count,
            "error_count": self._error_count,
            "misfire_count": self._misfire_count,
            "handlers": [h.handler_type for h in self._handlers],
            "strategy": self._strategy.strategy_name,
            "backend": backend_status,
        }

    def reset_metrics(self) -> None:
        """Reset scheduler metrics."""
        self._check_count = 0
        self._escalation_count = 0
        self._error_count = 0
        self._misfire_count = 0
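Putting the pieces together: a sketch of running the service with an in-memory backend, the log-only handler, and the immediate strategy (illustrative, not part of the package; it assumes the dashboard database used by get_session() is reachable and that a 30-second interval is within the configured limits).

```python
import asyncio

from truthound_dashboard.core.notifications.escalation.backends import BackendType
from truthound_dashboard.core.notifications.escalation.scheduler import (
    EscalationSchedulerConfig,
    EscalationSchedulerService,
    ImmediateEscalationStrategy,
    LoggingEscalationHandler,
)


async def main() -> None:
    config = EscalationSchedulerConfig(
        check_interval_seconds=30,
        backend_type=BackendType("memory"),
    )
    service = EscalationSchedulerService(config=config)

    # Swap the dispatcher-backed default handler for the log-only one.
    service.unregister_handler("default")
    service.register_handler(LoggingEscalationHandler())
    service.set_strategy(ImmediateEscalationStrategy())

    await service.start()
    print(await service.trigger_immediate_check())
    print(service.get_status()["escalation_count"])
    await service.stop()


asyncio.run(main())
```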
# =============================================================================
# Singleton Instance Management
# =============================================================================

_scheduler_service: EscalationSchedulerService | None = None


def get_escalation_scheduler(
    config: EscalationSchedulerConfig | None = None,
) -> EscalationSchedulerService:
    """Get the singleton escalation scheduler instance.

    Args:
        config: Optional configuration (only used on first call).

    Returns:
        The EscalationSchedulerService instance.
    """
    global _scheduler_service
    if _scheduler_service is None:
        _scheduler_service = EscalationSchedulerService(config=config)
    return _scheduler_service


def reset_escalation_scheduler() -> None:
    """Reset the singleton scheduler instance.

    Useful for testing or reconfiguration.
    """
    global _scheduler_service
    _scheduler_service = None


async def start_escalation_scheduler() -> None:
    """Start the escalation scheduler."""
    scheduler = get_escalation_scheduler()
    await scheduler.start()


async def stop_escalation_scheduler() -> None:
    """Stop the escalation scheduler."""
    scheduler = get_escalation_scheduler()
    await scheduler.stop()
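The module-level helpers above make the scheduler easy to tie to application lifetime. A sketch of a lifespan-style wrapper (illustrative; the context manager name is not part of the package):

```python
from contextlib import asynccontextmanager

from truthound_dashboard.core.notifications.escalation.scheduler import (
    reset_escalation_scheduler,
    start_escalation_scheduler,
    stop_escalation_scheduler,
)


@asynccontextmanager
async def escalation_scheduler_lifespan():
    """Run the singleton scheduler for the lifetime of the application."""
    await start_escalation_scheduler()
    try:
        yield
    finally:
        await stop_escalation_scheduler()
        # Drop the singleton so tests or a reload start from a clean state.
        reset_escalation_scheduler()
```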
|