detectkit 0.3.17__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.3.17/detectkit.egg-info → detectkit-0.4.1}/PKG-INFO +5 -1
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/__init__.py +1 -1
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/factory.py +5 -35
- detectkit-0.4.1/detectkit/alerting/orchestrator/__init__.py +19 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_base.py +46 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_cooldown.py +40 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_decision.py +148 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_dispatch.py +69 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_recovery.py +203 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/_types.py +82 -0
- detectkit-0.4.1/detectkit/alerting/orchestrator/orchestrator.py +36 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/config/profile.py +5 -0
- detectkit-0.4.1/detectkit/database/internal_tables/__init__.py +10 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_alert_states.py +169 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_base.py +43 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_datapoints.py +128 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_detections.py +198 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_metrics.py +93 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_schema.py +25 -0
- detectkit-0.4.1/detectkit/database/internal_tables/_tasks.py +92 -0
- detectkit-0.4.1/detectkit/database/internal_tables/manager.py +26 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/base.py +34 -37
- detectkit-0.4.1/detectkit/detectors/seasonality.py +95 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/statistical/iqr.py +6 -75
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/statistical/mad.py +6 -92
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/statistical/zscore.py +6 -75
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/loaders/metric_loader.py +2 -16
- detectkit-0.4.1/detectkit/orchestration/task_manager/__init__.py +28 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/_alert_step.py +193 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/_base.py +128 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/_detect_step.py +215 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/_load_step.py +138 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/_types.py +46 -0
- detectkit-0.4.1/detectkit/orchestration/task_manager/manager.py +135 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/utils/__init__.py +5 -0
- detectkit-0.4.1/detectkit/utils/env_interpolation.py +50 -0
- detectkit-0.4.1/detectkit/utils/json_utils.py +34 -0
- {detectkit-0.3.17 → detectkit-0.4.1/detectkit.egg-info}/PKG-INFO +5 -1
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit.egg-info/SOURCES.txt +27 -3
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit.egg-info/requires.txt +5 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/pyproject.toml +18 -4
- detectkit-0.3.17/detectkit/alerting/orchestrator.py +0 -777
- detectkit-0.3.17/detectkit/database/internal_tables.py +0 -1066
- detectkit-0.3.17/detectkit/orchestration/task_manager.py +0 -875
- {detectkit-0.3.17 → detectkit-0.4.1}/LICENSE +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/MANIFEST.in +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/README.md +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/email.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/telegram.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/commands/init.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/commands/run.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/commands/test_alert.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/cli/main.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/config/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/config/project_config.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/config/validator.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/core/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/core/interval.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/core/models.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/database/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/database/manager.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/database/tables.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/utils/datetime_utils.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit/utils/stats.py +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/requirements.txt +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/setup.cfg +0 -0
- {detectkit-0.3.17 → detectkit-0.4.1}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.3.17
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -55,9 +55,13 @@ Requires-Dist: timesfm>=0.1.0; extra == "all"
|
|
|
55
55
|
Provides-Extra: dev
|
|
56
56
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
57
57
|
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
58
|
+
Requires-Dist: pytest-requests-mock>=0.1; extra == "dev"
|
|
59
|
+
Requires-Dist: requests-mock>=1.12; extra == "dev"
|
|
58
60
|
Requires-Dist: black>=23.0; extra == "dev"
|
|
59
61
|
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
60
62
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
63
|
+
Provides-Extra: integration
|
|
64
|
+
Requires-Dist: testcontainers[clickhouse]>=4.0; extra == "integration"
|
|
61
65
|
Dynamic: license-file
|
|
62
66
|
|
|
63
67
|
# detectkit
|
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.3.17"
|
|
7
|
+
__version__ = "0.4.1"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
Alert channel factory for creating channel instances from configuration.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
6
5
|
from typing import Dict, List
|
|
7
6
|
|
|
8
7
|
from detectkit.alerting.channels.base import BaseAlertChannel
|
|
@@ -11,6 +10,7 @@ from detectkit.alerting.channels.slack import SlackChannel
|
|
|
11
10
|
from detectkit.alerting.channels.webhook import WebhookChannel
|
|
12
11
|
from detectkit.alerting.channels.telegram import TelegramChannel
|
|
13
12
|
from detectkit.alerting.channels.email import EmailChannel
|
|
13
|
+
from detectkit.utils.env_interpolation import interpolate_env_vars
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class AlertChannelFactory:
|
|
@@ -82,42 +82,12 @@ class AlertChannelFactory:
|
|
|
82
82
|
|
|
83
83
|
@classmethod
|
|
84
84
|
def _interpolate_env_vars(cls, params: Dict) -> Dict:
|
|
85
|
-
"""
|
|
86
|
-
Interpolate environment variables in parameter values.
|
|
87
|
-
|
|
88
|
-
Supports formats:
|
|
89
|
-
- ${VAR_NAME}
|
|
90
|
-
- {{ env_var('VAR_NAME') }}
|
|
85
|
+
"""Interpolate ``${VAR}`` and ``{{ env_var('VAR') }}`` placeholders.
|
|
91
86
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
Returns:
|
|
96
|
-
Parameters with interpolated values
|
|
87
|
+
Delegates to :func:`detectkit.utils.env_interpolation.interpolate_env_vars`,
|
|
88
|
+
which walks nested dicts/lists recursively.
|
|
97
89
|
"""
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
interpolated = {}
|
|
101
|
-
|
|
102
|
-
for key, value in params.items():
|
|
103
|
-
if isinstance(value, str):
|
|
104
|
-
# Handle ${VAR} format
|
|
105
|
-
value = re.sub(
|
|
106
|
-
r'\$\{([^}]+)\}',
|
|
107
|
-
lambda m: os.environ.get(m.group(1), m.group(0)),
|
|
108
|
-
value,
|
|
109
|
-
)
|
|
110
|
-
|
|
111
|
-
# Handle {{ env_var('VAR') }} format
|
|
112
|
-
value = re.sub(
|
|
113
|
-
r"\{\{\s*env_var\(['\"]([^'\"]+)['\"]\)\s*\}\}",
|
|
114
|
-
lambda m: os.environ.get(m.group(1), m.group(0)),
|
|
115
|
-
value,
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
interpolated[key] = value
|
|
119
|
-
|
|
120
|
-
return interpolated
|
|
90
|
+
return interpolate_env_vars(params)
|
|
121
91
|
|
|
122
92
|
@classmethod
|
|
123
93
|
def create_from_config(cls, channel_config: Dict) -> BaseAlertChannel:
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Public surface of the alert-orchestrator package."""
|
|
2
|
+
|
|
3
|
+
from detectkit.alerting.orchestrator._types import (
|
|
4
|
+
AlertConditions,
|
|
5
|
+
DetectionRecord,
|
|
6
|
+
_direction_from_metadata,
|
|
7
|
+
_parse_detection_metadata,
|
|
8
|
+
)
|
|
9
|
+
from detectkit.alerting.orchestrator.orchestrator import AlertOrchestrator
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AlertOrchestrator",
|
|
13
|
+
"AlertConditions",
|
|
14
|
+
"DetectionRecord",
|
|
15
|
+
# Re-exported for callers (notably TaskManager) that build
|
|
16
|
+
# DetectionRecord rows manually before handing them to the orchestrator.
|
|
17
|
+
"_direction_from_metadata",
|
|
18
|
+
"_parse_detection_metadata",
|
|
19
|
+
]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Shared state for orchestrator mixins."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
from detectkit.alerting.orchestrator._types import (
|
|
10
|
+
AlertConditions,
|
|
11
|
+
DetectionRecord,
|
|
12
|
+
)
|
|
13
|
+
from detectkit.core.interval import Interval
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _OrchestratorBase:
    """State holder shared by the orchestrator mixins.

    Stores the constructor arguments as instance attributes and offers a
    small helper for bucketing detection records by timestamp.
    """

    def __init__(
        self,
        metric_name: str,
        interval: Interval,
        alert_config_id: str,
        conditions: Optional[AlertConditions] = None,
        timezone_display: str = "UTC",
        internal=None,  # InternalTablesManager
        alert_config=None,  # AlertConfig
        description: Optional[str] = None,
        mentions: Optional[List[str]] = None,
    ):
        """Record the orchestrator configuration on the instance.

        A falsy ``conditions`` is replaced by a default ``AlertConditions()``
        and a falsy ``mentions`` by a fresh empty list; every other argument
        is stored unchanged.
        """
        # Identity of the metric / alert configuration being handled.
        self.metric_name = metric_name
        self.interval = interval
        self.alert_config_id = alert_config_id

        # Optional collaborators and presentation settings.
        self.internal = internal
        self.alert_config = alert_config
        self.timezone_display = timezone_display
        self.description = description

        # Falsy inputs fall back to defaults.
        self.conditions = conditions if conditions else AlertConditions()
        self.mentions = mentions if mentions else []

    @staticmethod
    def _group_by_timestamp(
        detections: List[DetectionRecord],
    ) -> Dict[np.datetime64, List[DetectionRecord]]:
        """Bucket *detections* by their ``timestamp`` attribute.

        Keys appear in first-seen order and each bucket keeps the input
        order of its records.
        """
        buckets: Dict[np.datetime64, List[DetectionRecord]] = {}
        for record in detections:
            key = record.timestamp
            if key not in buckets:
                buckets[key] = []
            buckets[key].append(record)
        return buckets
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Cooldown logic — suppresses repeat alerts within a configured window."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
6
|
+
from detectkit.core.interval import Interval
|
|
7
|
+
from detectkit.utils.datetime_utils import now_utc_naive
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _CooldownMixin(_OrchestratorBase):
    """Mixin answering "is this alert still cooling down?"."""

    def _is_in_cooldown(self) -> bool:
        """Tell whether a new alert must be suppressed by the cooldown.

        Returns ``False`` (alert allowed) when any of these hold:

        * there is no alert config or no ``alert_cooldown`` on it;
        * no internal manager is attached, so state cannot be read;
        * no previous alert timestamp is stored;
        * ``cooldown_reset_on_recovery`` is set and
          ``_check_recovery_since_last_alert`` reports a recovery.

        Otherwise returns ``True`` exactly while the time elapsed since the
        last alert is smaller than the configured cooldown.
        """
        config = self.alert_config
        if not config or not config.alert_cooldown:
            return False

        store = self.internal
        if not store:
            return False

        last_sent = store.get_last_alert_timestamp(
            self.metric_name, self.alert_config_id
        )
        if not last_sent:
            return False

        # Both values are computed before the recovery check, as before.
        cooldown_seconds = Interval(config.alert_cooldown).seconds
        elapsed = (now_utc_naive() - last_sent).total_seconds()

        recovered = bool(
            config.cooldown_reset_on_recovery
            and self._check_recovery_since_last_alert(last_sent)
        )
        if recovered:
            return False

        return elapsed < cooldown_seconds
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Decision logic: ``should_alert`` and the consecutive-anomaly helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from typing import Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from detectkit.alerting.channels.base import AlertData
|
|
11
|
+
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
12
|
+
from detectkit.alerting.orchestrator._types import DetectionRecord
|
|
13
|
+
from detectkit.utils.datetime_utils import now_utc, to_aware_utc
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _DecisionMixin(_OrchestratorBase):
    """Mixin holding the "should we alert?" decision and its helpers."""

    def should_alert(
        self,
        recent_detections: List[DetectionRecord],
    ) -> Tuple[bool, Optional[AlertData]]:
        """Decide whether *recent_detections* warrant an alert.

        Checks run in this order and the first failure returns
        ``(False, None)``:

        1. the input is non-empty;
        2. the alert is not in cooldown;
        3. the newest timestamp has at least ``min_detectors`` anomalies;
        4. the consecutive-anomaly count reaches ``consecutive_anomalies``.

        On success returns ``(True, alert_data)`` built from the anomalies
        of the newest timestamp.
        """
        if not recent_detections:
            return False, None

        # Cooldown is consulted before any grouping work is done.
        if self._is_in_cooldown():
            return False, None

        by_time = self._group_by_timestamp(recent_detections)
        newest_first = sorted(by_time.keys(), reverse=True)

        firing_now = [rec for rec in by_time[newest_first[0]] if rec.is_anomaly]
        if len(firing_now) < self.conditions.min_detectors:
            return False, None

        streak = self._count_consecutive_anomalies(by_time, newest_first)
        if streak < self.conditions.consecutive_anomalies:
            return False, None

        return True, self._build_alert_data(firing_now, streak)

    def _count_consecutive_anomalies(
        self,
        detections_by_time: Dict[np.datetime64, List[DetectionRecord]],
        timestamps_sorted: List[np.datetime64],
    ) -> int:
        """Count qualifying timestamps, walking *timestamps_sorted* in order.

        A timestamp qualifies when it has at least ``min_detectors``
        anomalies and the direction of its first anomaly satisfies the
        ``direction`` policy. Counting stops at the first timestamp that
        does not qualify.
        """
        policy = self.conditions.direction
        streak = 0
        anchor: Optional[str] = None

        for ts in timestamps_sorted:
            firing = [rec for rec in detections_by_time[ts] if rec.is_anomaly]
            if len(firing) < self.conditions.min_detectors:
                break

            # The first anomaly of the timestamp represents its direction.
            observed = firing[0].direction

            if policy in ("up", "down"):
                if observed != policy:
                    break
                streak += 1
            elif policy == "same":
                if anchor is None:
                    # First qualifying point fixes the direction to match.
                    streak = 1
                    anchor = observed
                elif observed == anchor:
                    streak += 1
                else:
                    break  # direction flipped → stop counting
            else:
                # "any" and unknown policies both count every qualifying point.
                streak += 1

        return streak

    def _build_alert_data(
        self,
        anomalies: List[DetectionRecord],
        consecutive_count: int,
    ) -> AlertData:
        """Assemble the ``AlertData`` payload for *anomalies*.

        Timestamp, value, confidence bounds and direction come from the
        first anomaly. With several anomalies the detector name, params
        and metadata are combined and the severity is the maximum.
        """
        primary = anomalies[0]

        if len(anomalies) <= 1:
            max_severity = primary.severity
            detector_name = primary.detector_name
            detector_params = primary.detector_params
            combined_metadata = primary.detection_metadata
        else:
            max_severity = max(rec.severity for rec in anomalies)
            detector_names = [rec.detector_name for rec in anomalies]
            detector_name = f"{len(anomalies)} detectors"
            detector_params = "; ".join(
                f"{rec.detector_name}: {rec.detector_params}" for rec in anomalies
            )
            combined_metadata = {
                "detectors": detector_names,
                "count": len(anomalies),
            }
            combined_metadata.update(
                {
                    f"detector_{idx}_metadata": rec.detection_metadata
                    for idx, rec in enumerate(anomalies)
                }
            )

        return AlertData(
            metric_name=self.metric_name,
            timestamp=primary.timestamp,
            timezone=self.timezone_display,
            value=primary.value,
            confidence_lower=primary.confidence_lower,
            confidence_upper=primary.confidence_upper,
            detector_name=detector_name,
            detector_params=detector_params,
            direction=primary.direction,
            severity=max_severity,
            detection_metadata=combined_metadata,
            consecutive_count=consecutive_count,
            description=self.description,
            mentions=self.mentions,
        )

    def get_last_complete_point(self, now: Optional[datetime] = None) -> datetime:
        """Return the start of the interval preceding the one containing *now*.

        *now* defaults to ``now_utc()`` and is passed through
        ``to_aware_utc``. Its epoch seconds are floored to a multiple of
        ``self.interval.seconds`` and one further interval is subtracted.
        The result is a UTC-aware ``datetime``.
        """
        moment = to_aware_utc(now_utc() if now is None else now)

        step = self.interval.seconds
        epoch = int(moment.timestamp())
        current_boundary = (epoch // step) * step
        return datetime.fromtimestamp(current_boundary - step, tz=timezone.utc)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Dispatch mixin — actually sends alerts/recoveries via channels."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from detectkit.alerting.channels.base import AlertData, BaseAlertChannel
|
|
8
|
+
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
9
|
+
from detectkit.utils.datetime_utils import now_utc_naive
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class _DispatchMixin(_OrchestratorBase):
    """Mixin that delivers alert and recovery payloads to channels.

    Uses ``self.internal``, ``self.metric_name`` and
    ``self.alert_config_id`` to stamp the alert state after delivery.
    """

    def send_alerts(
        self,
        alert_data: AlertData,
        channels: List[BaseAlertChannel],
        template: Optional[str] = None,
    ) -> Dict[str, bool]:
        """Send *alert_data* to every channel; record success per channel.

        When at least one channel succeeded and an internal manager is
        attached, ``update_alert_timestamp`` is called with the current
        naive-UTC time and ``increment_count=True``.

        Returns:
            Mapping of channel key to delivery success (see ``_dispatch``).
        """
        results = self._dispatch(channels, alert_data, template, "alert")

        if any(results.values()) and self.internal:
            self.internal.update_alert_timestamp(
                metric_name=self.metric_name,
                alert_config_id=self.alert_config_id,
                timestamp=now_utc_naive(),
                increment_count=True,
            )
        return results

    def send_recovery(
        self,
        alert_data: AlertData,
        channels: List[BaseAlertChannel],
        template: Optional[str] = None,
    ) -> Dict[str, bool]:
        """Send a recovery notification to every channel.

        When at least one channel succeeded and an internal manager is
        attached, ``update_recovery_timestamp`` is called with the current
        naive-UTC time.

        Returns:
            Mapping of channel key to delivery success (see ``_dispatch``).
        """
        results = self._dispatch(channels, alert_data, template, "recovery")

        if any(results.values()) and self.internal:
            self.internal.update_recovery_timestamp(
                metric_name=self.metric_name,
                alert_config_id=self.alert_config_id,
                timestamp=now_utc_naive(),
            )
        return results

    @staticmethod
    def _dispatch(
        channels: List[BaseAlertChannel],
        alert_data: AlertData,
        template: Optional[str],
        kind: str,
    ) -> Dict[str, bool]:
        """Call ``send`` on each channel and collect the outcomes.

        The result key is the channel's class name. Further channels of
        the same class get ``Name#2``, ``Name#3``, … so their outcomes do
        not overwrite each other; otherwise a success followed by a
        failure of the same channel type would be reported as a failure.

        An exception raised by a channel is printed and recorded as
        ``False``; remaining channels are still attempted.
        """
        results: Dict[str, bool] = {}
        seen: Dict[str, int] = {}
        for channel in channels:
            class_name = channel.__class__.__name__
            occurrence = seen.get(class_name, 0) + 1
            seen[class_name] = occurrence
            # The first channel of a class keeps the bare class name as key.
            channel_name = (
                class_name if occurrence == 1 else f"{class_name}#{occurrence}"
            )
            try:
                results[channel_name] = bool(channel.send(alert_data, template))
            except Exception as exc:
                # One bad channel must not abort the others.
                print(f"Error sending {kind} via {channel_name}: {exc}")
                results[channel_name] = False
        return results
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Recovery decision and reconstruction logic."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from detectkit.alerting.channels.base import AlertData
|
|
11
|
+
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
12
|
+
from detectkit.alerting.orchestrator._types import (
|
|
13
|
+
DetectionRecord,
|
|
14
|
+
_direction_from_metadata,
|
|
15
|
+
_parse_detection_metadata,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class _RecoveryMixin(_OrchestratorBase):
    """Mixin deciding when a recovery notification is due and building it."""

    def should_send_recovery(
        self,
        recent_detections: List[DetectionRecord],
    ) -> Tuple[bool, Optional[AlertData]]:
        """Decide whether to send a recovery notification.

        Returns ``(True, recovery_data)`` only when all of these hold:

        1. an internal manager is attached and a last-alert timestamp exists;
        2. no recovery timestamp at or after that alert is stored;
        3. ``_check_recovery_since_last_alert`` reports a recovery;
        4. ``_build_recovery_data`` yields a payload (non-empty input).

        Otherwise returns ``(False, None)``.
        """
        if not self.internal:
            return False, None

        last_alert = self.internal.get_last_alert_timestamp(
            self.metric_name, self.alert_config_id
        )
        if not last_alert:
            return False, None

        last_recovery = self.internal.get_last_recovery_timestamp(
            self.metric_name, self.alert_config_id
        )
        if last_recovery and last_recovery >= last_alert:
            return False, None  # already notified for this incident

        if not self._check_recovery_since_last_alert(last_alert):
            return False, None

        recovery_data = self._build_recovery_data(recent_detections)
        if not recovery_data:
            return False, None
        return True, recovery_data

    def _check_recovery_since_last_alert(
        self, last_alert_timestamp: datetime
    ) -> bool:
        """Return ``True`` when the metric has recovered since *last_alert_timestamp*.

        Only the newest timestamp among the fetched detections is
        inspected. Direction-aware: with an "up" or "down" policy only
        anomalies in that direction block recovery; with "same" only
        anomalies matching the direction that triggered the last alert do,
        or all of them if that direction cannot be determined.

        Returns ``False`` without an internal manager, and ``True`` when
        no detections or no per-detector results are available.
        """
        if not self.internal:
            return False

        last_point = self.get_last_complete_point()
        # +5 for safety margin so we don't truncate the consecutive window.
        num_points = self.conditions.consecutive_anomalies + 5

        recent_detections = self.internal.get_recent_detections(
            metric_name=self.metric_name,
            last_point=last_point,
            num_points=num_points,
            created_after=last_alert_timestamp,
        )
        if not recent_detections:
            # No fresh detections at all → assume recovery.
            return True

        records: List[DetectionRecord] = []
        for det in recent_detections:
            detector_ids = det["detector_ids"]
            metadata_list = (
                det.get("detection_metadata_list")
                or [None] * len(detector_ids)
            )
            for i, detector_id in enumerate(detector_ids):
                is_anomaly = det["is_anomaly_flags"][i]
                metadata = _parse_detection_metadata(metadata_list[i])
                records.append(
                    DetectionRecord(
                        timestamp=np.datetime64(det["timestamp"]),
                        detector_name=det["detector_names"][i],
                        detector_id=detector_id,
                        detector_params=det["detector_params_list"][i],
                        value=det["value"],
                        is_anomaly=is_anomaly,
                        confidence_lower=det["confidence_lowers"][i],
                        confidence_upper=det["confidence_uppers"][i],
                        direction=_direction_from_metadata(metadata, is_anomaly),
                        severity=0.0,  # not used for the recovery check
                        detection_metadata=metadata,
                    )
                )

        if not records:
            # Rows without detector results carry no anomaly to block on;
            # treat like the "no fresh detections" case instead of raising
            # IndexError on the empty grouping below.
            return True

        detections_by_time = self._group_by_timestamp(records)
        latest_timestamp = max(detections_by_time)
        latest_anomalies = [
            d for d in detections_by_time[latest_timestamp] if d.is_anomaly
        ]

        direction_condition = self.conditions.direction
        if direction_condition == "down":
            blocking = [d for d in latest_anomalies if d.direction == "down"]
        elif direction_condition == "up":
            blocking = [d for d in latest_anomalies if d.direction == "up"]
        elif direction_condition == "same":
            trigger_direction = self._get_alert_trigger_direction(
                last_alert_timestamp
            )
            if trigger_direction is None:
                blocking = latest_anomalies  # conservative fallback
            else:
                blocking = [
                    d for d in latest_anomalies if d.direction == trigger_direction
                ]
        else:  # "any" / unknown — preserve historical behaviour
            blocking = latest_anomalies

        return len(blocking) == 0

    def _get_alert_trigger_direction(
        self, last_alert_timestamp: datetime
    ) -> Optional[str]:
        """Return the direction of the anomaly that triggered the last alert.

        Fetches the single detection row at or before
        *last_alert_timestamp* (``num_points=1``) and returns the first
        "up"/"down" direction found among its anomalous detectors.
        Returns ``None`` without an internal manager, when no row is
        found, or when no anomalous detector reports such a direction.
        """
        if not self.internal:
            return None

        trigger_detections = self.internal.get_recent_detections(
            metric_name=self.metric_name,
            last_point=last_alert_timestamp,
            num_points=1,
        )
        if not trigger_detections:
            return None

        det = trigger_detections[0]
        detector_ids = det["detector_ids"]
        metadata_list = (
            det.get("detection_metadata_list")
            or [None] * len(detector_ids)
        )
        for i, _detector_id in enumerate(detector_ids):
            if not det["is_anomaly_flags"][i]:
                continue
            # Parse the stored metadata first, exactly as
            # _check_recovery_since_last_alert does, so both code paths
            # hand the same shape to _direction_from_metadata.
            metadata = _parse_detection_metadata(metadata_list[i])
            direction = _direction_from_metadata(metadata, True)
            if direction in ("up", "down"):
                return direction
        return None

    def _build_recovery_data(
        self,
        detections: List[DetectionRecord],
    ) -> Optional[AlertData]:
        """Construct the ``AlertData`` payload sent as a recovery notification.

        Returns ``None`` for empty input. Timestamp and value come from the
        last element of *detections*. Confidence bounds and detector
        name/params come from it too, unless one of its bounds is ``None``;
        then the most recent anomalous record (if any) supplies them.
        """
        if not detections:
            return None

        # ``detections`` is oldest→newest, so the latest point lives at [-1].
        # NOTE(review): ordering is the caller's responsibility — confirm.
        latest = detections[-1]

        # Prefer the latest CI so the message reflects the *current* interval.
        # Fall back to the last anomalous point only if the latest row has no
        # CI (e.g. missing-data / insufficient-data placeholders).
        recovery_ci_lower = latest.confidence_lower
        recovery_ci_upper = latest.confidence_upper
        recovery_detector_name = latest.detector_name
        recovery_detector_params = latest.detector_params

        if recovery_ci_lower is None or recovery_ci_upper is None:
            last_anomalous = next(
                (d for d in reversed(detections) if d.is_anomaly), None
            )
            if last_anomalous:
                recovery_detector_name = last_anomalous.detector_name
                recovery_detector_params = last_anomalous.detector_params
                recovery_ci_lower = last_anomalous.confidence_lower
                recovery_ci_upper = last_anomalous.confidence_upper

        return AlertData(
            metric_name=self.metric_name,
            timestamp=latest.timestamp,
            timezone=self.timezone_display,
            value=latest.value,
            confidence_lower=recovery_ci_lower,
            confidence_upper=recovery_ci_upper,
            detector_name=recovery_detector_name,
            detector_params=recovery_detector_params,
            direction="none",
            severity=0.0,
            detection_metadata={},
            consecutive_count=0,
            is_recovery=True,
            description=self.description,
            mentions=self.mentions,
        )
|