truthound-dashboard 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- truthound_dashboard/api/alerts.py +258 -0
- truthound_dashboard/api/anomaly.py +1302 -0
- truthound_dashboard/api/cross_alerts.py +352 -0
- truthound_dashboard/api/deps.py +143 -0
- truthound_dashboard/api/drift_monitor.py +540 -0
- truthound_dashboard/api/lineage.py +1151 -0
- truthound_dashboard/api/maintenance.py +363 -0
- truthound_dashboard/api/middleware.py +373 -1
- truthound_dashboard/api/model_monitoring.py +805 -0
- truthound_dashboard/api/notifications_advanced.py +2452 -0
- truthound_dashboard/api/plugins.py +2096 -0
- truthound_dashboard/api/profile.py +211 -14
- truthound_dashboard/api/reports.py +853 -0
- truthound_dashboard/api/router.py +147 -0
- truthound_dashboard/api/rule_suggestions.py +310 -0
- truthound_dashboard/api/schema_evolution.py +231 -0
- truthound_dashboard/api/sources.py +47 -3
- truthound_dashboard/api/triggers.py +190 -0
- truthound_dashboard/api/validations.py +13 -0
- truthound_dashboard/api/validators.py +333 -4
- truthound_dashboard/api/versioning.py +309 -0
- truthound_dashboard/api/websocket.py +301 -0
- truthound_dashboard/core/__init__.py +27 -0
- truthound_dashboard/core/anomaly.py +1395 -0
- truthound_dashboard/core/anomaly_explainer.py +633 -0
- truthound_dashboard/core/cache.py +206 -0
- truthound_dashboard/core/cached_services.py +422 -0
- truthound_dashboard/core/charts.py +352 -0
- truthound_dashboard/core/connections.py +1069 -42
- truthound_dashboard/core/cross_alerts.py +837 -0
- truthound_dashboard/core/drift_monitor.py +1477 -0
- truthound_dashboard/core/drift_sampling.py +669 -0
- truthound_dashboard/core/i18n/__init__.py +42 -0
- truthound_dashboard/core/i18n/detector.py +173 -0
- truthound_dashboard/core/i18n/messages.py +564 -0
- truthound_dashboard/core/lineage.py +971 -0
- truthound_dashboard/core/maintenance.py +443 -5
- truthound_dashboard/core/model_monitoring.py +1043 -0
- truthound_dashboard/core/notifications/channels.py +1020 -1
- truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
- truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
- truthound_dashboard/core/notifications/deduplication/service.py +400 -0
- truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
- truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
- truthound_dashboard/core/notifications/dispatcher.py +43 -0
- truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
- truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
- truthound_dashboard/core/notifications/escalation/engine.py +429 -0
- truthound_dashboard/core/notifications/escalation/models.py +336 -0
- truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
- truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
- truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
- truthound_dashboard/core/notifications/events.py +49 -0
- truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
- truthound_dashboard/core/notifications/metrics/base.py +528 -0
- truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
- truthound_dashboard/core/notifications/routing/__init__.py +169 -0
- truthound_dashboard/core/notifications/routing/combinators.py +184 -0
- truthound_dashboard/core/notifications/routing/config.py +375 -0
- truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
- truthound_dashboard/core/notifications/routing/engine.py +382 -0
- truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
- truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
- truthound_dashboard/core/notifications/routing/rules.py +625 -0
- truthound_dashboard/core/notifications/routing/validator.py +678 -0
- truthound_dashboard/core/notifications/service.py +2 -0
- truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
- truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
- truthound_dashboard/core/notifications/throttling/builder.py +311 -0
- truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
- truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
- truthound_dashboard/core/openlineage.py +1028 -0
- truthound_dashboard/core/plugins/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/__init__.py +39 -0
- truthound_dashboard/core/plugins/docs/extractor.py +703 -0
- truthound_dashboard/core/plugins/docs/renderers.py +804 -0
- truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
- truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
- truthound_dashboard/core/plugins/hooks/manager.py +403 -0
- truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
- truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
- truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
- truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
- truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
- truthound_dashboard/core/plugins/loader.py +504 -0
- truthound_dashboard/core/plugins/registry.py +810 -0
- truthound_dashboard/core/plugins/reporter_executor.py +588 -0
- truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
- truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
- truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
- truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
- truthound_dashboard/core/plugins/sandbox.py +617 -0
- truthound_dashboard/core/plugins/security/__init__.py +68 -0
- truthound_dashboard/core/plugins/security/analyzer.py +535 -0
- truthound_dashboard/core/plugins/security/policies.py +311 -0
- truthound_dashboard/core/plugins/security/protocols.py +296 -0
- truthound_dashboard/core/plugins/security/signing.py +842 -0
- truthound_dashboard/core/plugins/security.py +446 -0
- truthound_dashboard/core/plugins/validator_executor.py +401 -0
- truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
- truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
- truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
- truthound_dashboard/core/plugins/versioning/semver.py +266 -0
- truthound_dashboard/core/profile_comparison.py +601 -0
- truthound_dashboard/core/report_history.py +570 -0
- truthound_dashboard/core/reporters/__init__.py +57 -0
- truthound_dashboard/core/reporters/base.py +296 -0
- truthound_dashboard/core/reporters/csv_reporter.py +155 -0
- truthound_dashboard/core/reporters/html_reporter.py +598 -0
- truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
- truthound_dashboard/core/reporters/i18n/base.py +494 -0
- truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
- truthound_dashboard/core/reporters/json_reporter.py +160 -0
- truthound_dashboard/core/reporters/junit_reporter.py +233 -0
- truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
- truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
- truthound_dashboard/core/reporters/registry.py +272 -0
- truthound_dashboard/core/rule_generator.py +2088 -0
- truthound_dashboard/core/scheduler.py +822 -12
- truthound_dashboard/core/schema_evolution.py +858 -0
- truthound_dashboard/core/services.py +152 -9
- truthound_dashboard/core/statistics.py +718 -0
- truthound_dashboard/core/streaming_anomaly.py +883 -0
- truthound_dashboard/core/triggers/__init__.py +45 -0
- truthound_dashboard/core/triggers/base.py +226 -0
- truthound_dashboard/core/triggers/evaluators.py +609 -0
- truthound_dashboard/core/triggers/factory.py +363 -0
- truthound_dashboard/core/unified_alerts.py +870 -0
- truthound_dashboard/core/validation_limits.py +509 -0
- truthound_dashboard/core/versioning.py +709 -0
- truthound_dashboard/core/websocket/__init__.py +59 -0
- truthound_dashboard/core/websocket/manager.py +512 -0
- truthound_dashboard/core/websocket/messages.py +130 -0
- truthound_dashboard/db/__init__.py +30 -0
- truthound_dashboard/db/models.py +3375 -3
- truthound_dashboard/main.py +22 -0
- truthound_dashboard/schemas/__init__.py +396 -1
- truthound_dashboard/schemas/anomaly.py +1258 -0
- truthound_dashboard/schemas/base.py +4 -0
- truthound_dashboard/schemas/cross_alerts.py +334 -0
- truthound_dashboard/schemas/drift_monitor.py +890 -0
- truthound_dashboard/schemas/lineage.py +428 -0
- truthound_dashboard/schemas/maintenance.py +154 -0
- truthound_dashboard/schemas/model_monitoring.py +374 -0
- truthound_dashboard/schemas/notifications_advanced.py +1363 -0
- truthound_dashboard/schemas/openlineage.py +704 -0
- truthound_dashboard/schemas/plugins.py +1293 -0
- truthound_dashboard/schemas/profile.py +420 -34
- truthound_dashboard/schemas/profile_comparison.py +242 -0
- truthound_dashboard/schemas/reports.py +285 -0
- truthound_dashboard/schemas/rule_suggestion.py +434 -0
- truthound_dashboard/schemas/schema_evolution.py +164 -0
- truthound_dashboard/schemas/source.py +117 -2
- truthound_dashboard/schemas/triggers.py +511 -0
- truthound_dashboard/schemas/unified_alerts.py +223 -0
- truthound_dashboard/schemas/validation.py +25 -1
- truthound_dashboard/schemas/validators/__init__.py +11 -0
- truthound_dashboard/schemas/validators/base.py +151 -0
- truthound_dashboard/schemas/versioning.py +152 -0
- truthound_dashboard/static/index.html +2 -2
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
- truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
- truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
- truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
- truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
- truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
- {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
"""Escalation engine for managing alert escalations.
|
|
2
|
+
|
|
3
|
+
This module provides the main EscalationEngine that orchestrates
|
|
4
|
+
the escalation lifecycle including triggering, escalating,
|
|
5
|
+
acknowledging, and resolving incidents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime, timedelta
|
|
13
|
+
from typing import Any, Callable
|
|
14
|
+
|
|
15
|
+
from .models import (
|
|
16
|
+
EscalationIncident,
|
|
17
|
+
EscalationLevel,
|
|
18
|
+
EscalationPolicy,
|
|
19
|
+
EscalationState,
|
|
20
|
+
EscalationTarget,
|
|
21
|
+
)
|
|
22
|
+
from .state_machine import EscalationStateMachine
|
|
23
|
+
from .stores import BaseEscalationStore, InMemoryEscalationStore
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class EscalationEngineConfig:
    """Tunable settings for the escalation engine.

    Attributes:
        check_interval_seconds: Seconds between checks for pending
            escalations.
        max_retries: Upper bound on notification retries per level.
        default_delay_minutes: Delay between levels, in minutes, to use
            when a level does not specify its own.
    """

    # Field order is part of the positional constructor signature; keep it.
    check_interval_seconds: int = 60
    max_retries: int = 3
    default_delay_minutes: int = 15
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class EscalationEngine:
    """Main escalation engine.

    Orchestrates the complete escalation lifecycle:
    1. Triggering new incidents
    2. Escalating to next levels based on time
    3. Acknowledging incidents
    4. Resolving incidents
    5. Auto-resolving on success

    The engine can be used standalone or integrated with
    APScheduler for periodic escalation checks.

    Example:
        engine = EscalationEngine(
            store=SQLiteEscalationStore("escalation.db"),
            on_notify=send_notification,
        )

        # Trigger escalation
        await engine.trigger(
            policy_id="critical-policy",
            incident_ref="validation-123",
            context={"severity": "critical"},
        )

        # Acknowledge
        await engine.acknowledge("incident-id", actor="user@example.com")

        # Resolve
        await engine.resolve("incident-id", actor="user@example.com")
    """

    def __init__(
        self,
        store: BaseEscalationStore | None = None,
        config: EscalationEngineConfig | None = None,
        on_notify: Callable[[EscalationIncident, EscalationLevel, EscalationTarget], Any] | None = None,
    ) -> None:
        """Initialize escalation engine.

        Args:
            store: Storage backend. An in-memory store is created when omitted.
            config: Engine configuration. Defaults are used when omitted.
            on_notify: Callback for sending notifications. It is called with
                ``(incident, level, target)`` and may be either a plain
                function or a coroutine function.
        """
        # Compare against None explicitly: a caller-supplied store that is
        # falsy (e.g. defines __len__ and is currently empty) must not be
        # silently swapped for a fresh in-memory store.
        self.store = store if store is not None else InMemoryEscalationStore()
        self.config = config if config is not None else EscalationEngineConfig()
        self.on_notify = on_notify
        self.state_machine = EscalationStateMachine()

    async def trigger(
        self,
        policy_id: str,
        incident_ref: str,
        context: dict[str, Any] | None = None,
    ) -> EscalationIncident:
        """Trigger a new escalation incident.

        Creates a new incident and starts the escalation process.
        If an incident with the same ref already exists and is not
        resolved, returns the existing incident.

        Args:
            policy_id: ID of the escalation policy.
            incident_ref: External reference (e.g., validation ID).
            context: Context data for the incident.

        Returns:
            The created or existing incident.

        Raises:
            ValueError: If the policy is not found or is not active.
        """
        # Check for existing unresolved incident
        existing = self.store.get_incident_by_ref(incident_ref)
        if existing and existing.state != EscalationState.RESOLVED:
            logger.debug(f"Incident {incident_ref} already exists in state {existing.state}")
            return existing

        # Get policy
        policy = self.store.get_policy(policy_id)
        if not policy:
            raise ValueError(f"Escalation policy not found: {policy_id}")

        if not policy.is_active:
            raise ValueError(f"Escalation policy is not active: {policy_id}")

        # Create incident
        incident = EscalationIncident(
            policy_id=policy_id,
            incident_ref=incident_ref,
            context=context or {},
        )

        # Trigger state transition
        incident = self.state_machine.trigger(incident)

        # Set next escalation time
        first_level = policy.get_level(1)
        if first_level:
            delay = self._delay_minutes(first_level)
            # NOTE(review): datetime.utcnow() yields a naive UTC timestamp;
            # kept as-is because the store presumably compares naive values.
            if delay > 0:
                incident.next_escalation_at = datetime.utcnow() + timedelta(minutes=delay)
            else:
                incident.next_escalation_at = datetime.utcnow()
        else:
            # Nothing will be scheduled or notified; make that visible.
            logger.warning(
                f"Escalation policy {policy_id} has no level 1; "
                f"no notification sent for {incident_ref}"
            )

        # Save incident
        self.store.save_incident(incident)

        # Notify first level (no-op when first_level is None)
        await self._notify_level(incident, policy, first_level)

        logger.info(f"Triggered escalation for {incident_ref}")
        return incident

    async def escalate(self, incident_id: str) -> EscalationIncident:
        """Escalate incident to the next level.

        When the incident has already reached the policy's maximum number
        of escalations, or there is no further level, the incident is
        returned without a state transition and any pending escalation
        time is cleared.

        Args:
            incident_id: ID of the incident.

        Returns:
            Updated incident.

        Raises:
            ValueError: If incident not found or can't escalate.
        """
        incident, _advanced = await self._escalate_once(incident_id)
        return incident

    async def _escalate_once(self, incident_id: str) -> tuple[EscalationIncident, bool]:
        """Attempt one escalation step and report whether a level change happened.

        Args:
            incident_id: ID of the incident.

        Returns:
            ``(incident, advanced)`` where ``advanced`` is True only when the
            incident was actually moved to a new level.

        Raises:
            ValueError: If the incident or its policy is not found, or the
                incident is already resolved.
        """
        incident = self.store.get_incident(incident_id)
        if not incident:
            raise ValueError(f"Incident not found: {incident_id}")

        if incident.state == EscalationState.RESOLVED:
            raise ValueError("Cannot escalate resolved incident")

        policy = self.store.get_policy(incident.policy_id)
        if not policy:
            raise ValueError(f"Policy not found: {incident.policy_id}")

        # Check max escalations
        if incident.escalation_count >= policy.max_escalations:
            logger.warning(f"Incident {incident_id} reached max escalations")
            self._clear_schedule(incident)
            return incident, False

        # Get next level
        next_level = policy.get_next_level(incident.current_level)
        if not next_level:
            logger.info(f"Incident {incident_id} at max level {incident.current_level}")
            self._clear_schedule(incident)
            return incident, False

        # Escalate
        incident = self.state_machine.escalate(
            incident,
            to_level=next_level.level,
            message=f"Escalating to level {next_level.level}",
        )

        # Set next escalation time
        further_level = policy.get_next_level(next_level.level)
        if further_level:
            delay = self._delay_minutes(further_level)
            incident.next_escalation_at = datetime.utcnow() + timedelta(minutes=delay)
        else:
            incident.next_escalation_at = None

        # Save
        self.store.save_incident(incident)

        # Notify
        await self._notify_level(incident, policy, next_level)

        logger.info(f"Escalated {incident_id} to level {next_level.level}")
        return incident, True

    def _clear_schedule(self, incident: EscalationIncident) -> None:
        """Drop a stale ``next_escalation_at`` and persist the change.

        Called when no further escalation is possible, so the incident is
        not left with a due time that can never be acted on.
        """
        if incident.next_escalation_at is not None:
            incident.next_escalation_at = None
            self.store.save_incident(incident)

    def _delay_minutes(self, level: EscalationLevel) -> int:
        """Return the level's delay, or the configured default if it is None."""
        delay = level.delay_minutes
        if delay is None:
            return self.config.default_delay_minutes
        return delay

    async def acknowledge(
        self,
        incident_id: str,
        actor: str,
        message: str = "",
    ) -> EscalationIncident:
        """Acknowledge an incident.

        Delegates the transition to the state machine and persists the
        result. This method does not itself modify ``next_escalation_at``.

        Args:
            incident_id: ID of the incident.
            actor: Who is acknowledging.
            message: Optional acknowledgement message. Defaults to
                ``"Acknowledged by <actor>"`` when empty.

        Returns:
            Updated incident.

        Raises:
            ValueError: If incident not found or can't acknowledge.
        """
        incident = self.store.get_incident(incident_id)
        if not incident:
            raise ValueError(f"Incident not found: {incident_id}")

        # Check if can acknowledge
        if not self.state_machine.can_transition(incident, EscalationState.ACKNOWLEDGED):
            raise ValueError(f"Cannot acknowledge incident in state {incident.state}")

        # Acknowledge
        incident = self.state_machine.acknowledge(
            incident,
            actor=actor,
            message=message or f"Acknowledged by {actor}",
        )

        # Save
        self.store.save_incident(incident)

        logger.info(f"Incident {incident_id} acknowledged by {actor}")
        return incident

    async def resolve(
        self,
        incident_id: str,
        actor: str | None = None,
        message: str = "",
        auto: bool = False,
    ) -> EscalationIncident:
        """Resolve an incident.

        Args:
            incident_id: ID of the incident.
            actor: Who is resolving (None for auto-resolve).
            message: Optional resolution message.
            auto: Whether this is auto-resolution.

        Returns:
            Updated incident.

        Raises:
            ValueError: If incident not found or can't resolve.
        """
        incident = self.store.get_incident(incident_id)
        if not incident:
            raise ValueError(f"Incident not found: {incident_id}")

        # Check if can resolve
        if not self.state_machine.can_transition(incident, EscalationState.RESOLVED):
            raise ValueError(f"Cannot resolve incident in state {incident.state}")

        # Resolve
        incident = self.state_machine.resolve(
            incident,
            actor=actor,
            message=message,
            auto=auto,
        )

        # Save
        self.store.save_incident(incident)

        # "auto" takes precedence over the actor in the log line.
        log_msg = f"Incident {incident_id} resolved"
        if auto:
            log_msg += " (auto)"
        elif actor:
            log_msg += f" by {actor}"
        logger.info(log_msg)

        return incident

    async def auto_resolve_by_ref(
        self,
        incident_ref: str,
        message: str = "Auto-resolved - validation passed",
    ) -> EscalationIncident | None:
        """Auto-resolve an incident by reference.

        Called when validation passes to auto-resolve associated
        incidents (if policy allows).

        Args:
            incident_ref: External reference.
            message: Resolution message.

        Returns:
            The incident if it is (or already was) resolved, or None if no
            incident exists for the reference, its policy is missing, or
            the policy does not allow auto-resolution.

        Raises:
            ValueError: Propagated from ``resolve`` when the incident's
                current state cannot transition to resolved.
        """
        incident = self.store.get_incident_by_ref(incident_ref)
        if not incident:
            return None

        if incident.state == EscalationState.RESOLVED:
            return incident

        # Check policy allows auto-resolve
        policy = self.store.get_policy(incident.policy_id)
        if not policy or not policy.auto_resolve_on_success:
            return None

        return await self.resolve(
            incident.id,
            message=message,
            auto=True,
        )

    async def check_and_escalate(self) -> int:
        """Check for and process pending escalations.

        This method should be called periodically (e.g., by APScheduler)
        to process escalations that are due. A failure on one incident is
        logged and does not stop processing of the others.

        Returns:
            Number of incidents that were actually moved to a new level.
            Incidents that were due but could not escalate further are not
            counted.
        """
        pending = self.store.get_pending_escalations()
        escalated = 0

        for incident in pending:
            try:
                _, advanced = await self._escalate_once(incident.id)
            except Exception as e:
                # Best-effort batch: keep going with the remaining incidents.
                logger.error(f"Failed to escalate {incident.id}: {e}")
            else:
                if advanced:
                    escalated += 1

        return escalated

    async def _notify_level(
        self,
        incident: EscalationIncident,
        policy: EscalationPolicy,
        level: EscalationLevel | None,
    ) -> None:
        """Send notifications for an escalation level.

        Calls ``on_notify`` once per target of the level. A failure for one
        target is logged and does not prevent the remaining targets from
        being notified. Does nothing when there is no level or no callback.

        Args:
            incident: The incident.
            policy: The policy (not used by this method).
            level: The level to notify.
        """
        # Local import keeps this block self-contained.
        import inspect

        if not level or not self.on_notify:
            return

        for target in level.targets:
            try:
                outcome = self.on_notify(incident, level, target)
                # on_notify is typed as returning Any: only await when the
                # callback really produced an awaitable, so that plain
                # synchronous callbacks are not reported as failures.
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(
                    f"Failed to notify {target.identifier} for incident {incident.id}: {e}"
                )

    def get_incident(self, incident_id: str) -> EscalationIncident | None:
        """Get incident by ID."""
        return self.store.get_incident(incident_id)

    def get_incident_by_ref(self, incident_ref: str) -> EscalationIncident | None:
        """Get incident by reference."""
        return self.store.get_incident_by_ref(incident_ref)

    def list_active_incidents(self) -> list[EscalationIncident]:
        """List all active (non-resolved) incidents."""
        return self.store.list_incidents(
            states=[
                EscalationState.PENDING,
                EscalationState.TRIGGERED,
                EscalationState.ACKNOWLEDGED,
                EscalationState.ESCALATED,
            ]
        )

    def get_stats(self) -> dict[str, Any]:
        """Get escalation statistics.

        Returns:
            Dictionary with ``total_incidents``, ``by_state`` (count per
            state value), ``active_count`` (incidents not resolved) and
            ``total_policies`` (including inactive policies).
        """
        all_incidents = self.store.list_incidents()

        by_state: dict[str, int] = {}
        for incident in all_incidents:
            state = incident.state.value
            by_state[state] = by_state.get(state, 0) + 1

        return {
            "total_incidents": len(all_incidents),
            "by_state": by_state,
            "active_count": sum(
                1 for i in all_incidents
                if i.state != EscalationState.RESOLVED
            ),
            "total_policies": len(self.store.list_policies(active_only=False)),
        }
|