detectkit 0.6.0__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.6.0/detectkit.egg-info → detectkit-0.8.0}/PKG-INFO +11 -5
- {detectkit-0.6.0 → detectkit-0.8.0}/README.md +10 -4
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/__init__.py +1 -1
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/email.py +9 -2
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/telegram.py +9 -2
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/__init__.py +4 -2
- detectkit-0.8.0/detectkit/alerting/orchestrator/_decision.py +287 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_dispatch.py +7 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_recovery.py +31 -34
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_types.py +51 -3
- detectkit-0.8.0/detectkit/cli/commands/clean.py +333 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/init.py +72 -68
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/main.py +67 -1
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_alert_states.py +31 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_detections.py +27 -1
- detectkit-0.8.0/detectkit/database/internal_tables/_maintenance.py +70 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/manager.py +2 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/base.py +23 -46
- detectkit-0.8.0/detectkit/detectors/statistical/_windowed.py +570 -0
- detectkit-0.8.0/detectkit/detectors/statistical/iqr.py +69 -0
- detectkit-0.8.0/detectkit/detectors/statistical/mad.py +76 -0
- detectkit-0.8.0/detectkit/detectors/statistical/zscore.py +74 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/metric_loader.py +16 -15
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_alert_step.py +10 -2
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_base.py +2 -36
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/manager.py +12 -1
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/json_utils.py +11 -3
- detectkit-0.8.0/detectkit/utils/stats.py +148 -0
- {detectkit-0.6.0 → detectkit-0.8.0/detectkit.egg-info}/PKG-INFO +11 -5
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/SOURCES.txt +3 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/pyproject.toml +4 -1
- detectkit-0.6.0/detectkit/alerting/orchestrator/_decision.py +0 -195
- detectkit-0.6.0/detectkit/detectors/statistical/iqr.py +0 -436
- detectkit-0.6.0/detectkit/detectors/statistical/mad.py +0 -399
- detectkit-0.6.0/detectkit/detectors/statistical/zscore.py +0 -421
- detectkit-0.6.0/detectkit/utils/stats.py +0 -183
- {detectkit-0.6.0 → detectkit-0.8.0}/LICENSE +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/MANIFEST.in +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_base.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/run.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/test_alert.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/unlock.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/profile.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/project_config.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/validator.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/interval.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/models.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_base.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_metrics.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_schema.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_tasks.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/manager.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/tables.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/seasonality.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/error_dispatch.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_types.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/__init__.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/datetime_utils.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/env_interpolation.py +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/requirements.txt +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/setup.cfg +0 -0
- {detectkit-0.6.0 → detectkit-0.8.0}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -77,13 +77,14 @@ Dynamic: license-file
|
|
|
77
77
|
|
|
78
78
|
- **Pure numpy arrays** — no pandas dependency in core logic
|
|
79
79
|
- **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
|
|
80
|
+
- **Trend & seasonality handling** — seasonality grouping, recency weighting (`half_life`), robust linear detrending for slowly drifting metrics
|
|
80
81
|
- **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
|
|
81
82
|
- **@mentions** — tag users/groups in alerts, each channel formats natively
|
|
82
83
|
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
|
|
83
84
|
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
84
85
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
85
86
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
86
|
-
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
|
|
87
|
+
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, `dtk clean`, tag-based selectors
|
|
87
88
|
|
|
88
89
|
## Installation
|
|
89
90
|
|
|
@@ -115,6 +116,9 @@ dtk run --select cpu_usage --from 2024-01-01
|
|
|
115
116
|
|
|
116
117
|
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
117
118
|
dtk unlock --select cpu_usage
|
|
119
|
+
|
|
120
|
+
# Prune data orphaned by config edits (dry-run; add --execute to apply)
|
|
121
|
+
dtk clean --select cpu_usage
|
|
118
122
|
```
|
|
119
123
|
|
|
120
124
|
### Metric Configuration
|
|
@@ -129,14 +133,16 @@ query: |
|
|
|
129
133
|
toStartOfInterval(timestamp, INTERVAL 5 MINUTE) AS timestamp,
|
|
130
134
|
countIf(status_code >= 500) / count() * 100 AS value
|
|
131
135
|
FROM http_requests
|
|
132
|
-
WHERE timestamp >=
|
|
136
|
+
WHERE timestamp >= '{{ dtk_start_time }}' AND timestamp < '{{ dtk_end_time }}'
|
|
133
137
|
GROUP BY timestamp ORDER BY timestamp
|
|
134
138
|
|
|
135
139
|
detectors:
|
|
136
140
|
- type: mad
|
|
137
141
|
params:
|
|
138
|
-
threshold: 3.0
|
|
139
|
-
window_size: 2016
|
|
142
|
+
threshold: 3.0 # in sigma-equivalents
|
|
143
|
+
window_size: 2016 # 7 days of 5-min points
|
|
144
|
+
window_weights: exponential # optional: favor recent data
|
|
145
|
+
half_life: "1d" # weight halves every day of age
|
|
140
146
|
|
|
141
147
|
alerting:
|
|
142
148
|
enabled: true
|
|
@@ -11,13 +11,14 @@
|
|
|
11
11
|
|
|
12
12
|
- **Pure numpy arrays** — no pandas dependency in core logic
|
|
13
13
|
- **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
|
|
14
|
+
- **Trend & seasonality handling** — seasonality grouping, recency weighting (`half_life`), robust linear detrending for slowly drifting metrics
|
|
14
15
|
- **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
|
|
15
16
|
- **@mentions** — tag users/groups in alerts, each channel formats natively
|
|
16
17
|
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
|
|
17
18
|
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
18
19
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
19
20
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
20
|
-
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
|
|
21
|
+
- **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, `dtk clean`, tag-based selectors
|
|
21
22
|
|
|
22
23
|
## Installation
|
|
23
24
|
|
|
@@ -49,6 +50,9 @@ dtk run --select cpu_usage --from 2024-01-01
|
|
|
49
50
|
|
|
50
51
|
# Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
|
|
51
52
|
dtk unlock --select cpu_usage
|
|
53
|
+
|
|
54
|
+
# Prune data orphaned by config edits (dry-run; add --execute to apply)
|
|
55
|
+
dtk clean --select cpu_usage
|
|
52
56
|
```
|
|
53
57
|
|
|
54
58
|
### Metric Configuration
|
|
@@ -63,14 +67,16 @@ query: |
|
|
|
63
67
|
toStartOfInterval(timestamp, INTERVAL 5 MINUTE) AS timestamp,
|
|
64
68
|
countIf(status_code >= 500) / count() * 100 AS value
|
|
65
69
|
FROM http_requests
|
|
66
|
-
WHERE timestamp >=
|
|
70
|
+
WHERE timestamp >= '{{ dtk_start_time }}' AND timestamp < '{{ dtk_end_time }}'
|
|
67
71
|
GROUP BY timestamp ORDER BY timestamp
|
|
68
72
|
|
|
69
73
|
detectors:
|
|
70
74
|
- type: mad
|
|
71
75
|
params:
|
|
72
|
-
threshold: 3.0
|
|
73
|
-
window_size: 2016
|
|
76
|
+
threshold: 3.0 # in sigma-equivalents
|
|
77
|
+
window_size: 2016 # 7 days of 5-min points
|
|
78
|
+
window_weights: exponential # optional: favor recent data
|
|
79
|
+
half_life: "1d" # weight halves every day of age
|
|
74
80
|
|
|
75
81
|
alerting:
|
|
76
82
|
enabled: true
|
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.6.0"
|
|
7
|
+
__version__ = "0.8.0"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -95,12 +95,17 @@ class EmailChannel(BaseAlertChannel):
|
|
|
95
95
|
self.subject_template = subject_template
|
|
96
96
|
self.template = template
|
|
97
97
|
|
|
98
|
-
def send(self, alert_data: AlertData) ->
|
|
98
|
+
def send(self, alert_data: AlertData, template: str | None = None) -> bool:
|
|
99
99
|
"""
|
|
100
100
|
Send alert via email.
|
|
101
101
|
|
|
102
102
|
Args:
|
|
103
103
|
alert_data: Alert information to send
|
|
104
|
+
template: Per-call template override (falls back to the
|
|
105
|
+
channel-level template, then the built-in default)
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
True when the email was handed to the SMTP server
|
|
104
109
|
|
|
105
110
|
Raises:
|
|
106
111
|
smtplib.SMTPException: If email sending fails
|
|
@@ -108,7 +113,7 @@ class EmailChannel(BaseAlertChannel):
|
|
|
108
113
|
Example:
|
|
109
114
|
>>> channel.send(alert_data)
|
|
110
115
|
"""
|
|
111
|
-
message_body = self.format_message(alert_data, self.template)
|
|
116
|
+
message_body = self.format_message(alert_data, template or self.template)
|
|
112
117
|
|
|
113
118
|
# Create email message
|
|
114
119
|
msg = MIMEMultipart("alternative")
|
|
@@ -138,6 +143,8 @@ class EmailChannel(BaseAlertChannel):
|
|
|
138
143
|
except smtplib.SMTPException as e:
|
|
139
144
|
raise smtplib.SMTPException(f"Failed to send email alert: {e}") from e
|
|
140
145
|
|
|
146
|
+
return True
|
|
147
|
+
|
|
141
148
|
def format_mentions(self, mentions: list[str]) -> str:
|
|
142
149
|
"""
|
|
143
150
|
Format mentions for email.
|
|
@@ -69,12 +69,17 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
69
69
|
self.disable_notification = disable_notification
|
|
70
70
|
self.template = template
|
|
71
71
|
|
|
72
|
-
def send(self, alert_data: AlertData) ->
|
|
72
|
+
def send(self, alert_data: AlertData, template: str | None = None) -> bool:
|
|
73
73
|
"""
|
|
74
74
|
Send alert to Telegram.
|
|
75
75
|
|
|
76
76
|
Args:
|
|
77
77
|
alert_data: Alert information to send
|
|
78
|
+
template: Per-call template override (falls back to the
|
|
79
|
+
channel-level template, then the built-in default)
|
|
80
|
+
|
|
81
|
+
Returns:
|
|
82
|
+
True when the message was accepted by the Telegram API
|
|
78
83
|
|
|
79
84
|
Raises:
|
|
80
85
|
requests.RequestException: If request fails
|
|
@@ -83,7 +88,7 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
83
88
|
Example:
|
|
84
89
|
>>> channel.send(alert_data)
|
|
85
90
|
"""
|
|
86
|
-
message = self.format_message(alert_data, self.template)
|
|
91
|
+
message = self.format_message(alert_data, template or self.template)
|
|
87
92
|
|
|
88
93
|
# Telegram Bot API URL
|
|
89
94
|
url = f"https://api.telegram.org/bot{self.bot_token}/sendMessage"
|
|
@@ -103,6 +108,8 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
103
108
|
except requests.RequestException as e:
|
|
104
109
|
raise requests.RequestException(f"Failed to send Telegram alert: {e}") from e
|
|
105
110
|
|
|
111
|
+
return True
|
|
112
|
+
|
|
106
113
|
def format_mentions(self, mentions: list[str]) -> str:
|
|
107
114
|
"""
|
|
108
115
|
Format mentions for Telegram.
|
|
@@ -5,6 +5,7 @@ from detectkit.alerting.orchestrator._types import (
|
|
|
5
5
|
DetectionRecord,
|
|
6
6
|
_direction_from_metadata,
|
|
7
7
|
_parse_detection_metadata,
|
|
8
|
+
hydrate_detection_records,
|
|
8
9
|
)
|
|
9
10
|
from detectkit.alerting.orchestrator.orchestrator import AlertOrchestrator
|
|
10
11
|
|
|
@@ -12,8 +13,9 @@ __all__ = [
|
|
|
12
13
|
"AlertOrchestrator",
|
|
13
14
|
"AlertConditions",
|
|
14
15
|
"DetectionRecord",
|
|
15
|
-
#
|
|
16
|
-
#
|
|
16
|
+
# Shared hydration of DetectionRecord rows from get_recent_detections
|
|
17
|
+
# output (used by TaskManager and the recovery mixin).
|
|
18
|
+
"hydrate_detection_records",
|
|
17
19
|
"_direction_from_metadata",
|
|
18
20
|
"_parse_detection_metadata",
|
|
19
21
|
]
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Decision logic: ``should_alert`` and the consecutive-anomaly helpers.
|
|
2
|
+
|
|
3
|
+
The multi-detector alert contract (documented in docs/guides/alerting.md):
|
|
4
|
+
|
|
5
|
+
For every timestamp, an alert *quorum* is the set of anomalous
|
|
6
|
+
detections that match the configured direction policy:
|
|
7
|
+
|
|
8
|
+
- ``"up"`` / ``"down"``: only anomalies in that direction count.
|
|
9
|
+
Detectors firing the other way are ignored (they neither help nor
|
|
10
|
+
block the quorum).
|
|
11
|
+
- ``"any"``: every anomaly counts, regardless of direction (an
|
|
12
|
+
up-anomaly and a down-anomaly together can satisfy
|
|
13
|
+
``min_detectors=2``).
|
|
14
|
+
- ``"same"``: at the latest point at least ``min_detectors`` detectors
|
|
15
|
+
must agree on ONE direction (up- and down-anomalies are counted
|
|
16
|
+
separately; disagreement does not form a quorum). The winning
|
|
17
|
+
direction is then locked for the consecutive walk.
|
|
18
|
+
|
|
19
|
+
An alert fires when the latest ``consecutive_anomalies`` timestamps
|
|
20
|
+
each satisfy the quorum AND are exactly one metric interval apart —
|
|
21
|
+
a gap in the detection grid breaks the chain.
|
|
22
|
+
|
|
23
|
+
The alert payload (value/CI shown in the message) is built from the
|
|
24
|
+
highest-severity record of the latest quorum; ties are broken by
|
|
25
|
+
detector name, then detector id, so the outcome never depends on SQL
|
|
26
|
+
row ordering.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import math
|
|
32
|
+
from datetime import datetime, timezone
|
|
33
|
+
|
|
34
|
+
import numpy as np
|
|
35
|
+
|
|
36
|
+
from detectkit.alerting.channels.base import AlertData
|
|
37
|
+
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
38
|
+
from detectkit.alerting.orchestrator._types import DetectionRecord
|
|
39
|
+
from detectkit.utils.datetime_utils import now_utc, to_aware_utc
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _DecisionMixin(_OrchestratorBase):
|
|
43
|
+
def should_alert(
|
|
44
|
+
self,
|
|
45
|
+
recent_detections: list[DetectionRecord],
|
|
46
|
+
) -> tuple[bool, AlertData | None]:
|
|
47
|
+
"""Decide whether to fire an alert from recent detections.
|
|
48
|
+
|
|
49
|
+
Steps (cheap → expensive):
|
|
50
|
+
1. Bail out on empty input.
|
|
51
|
+
2. Honour the alert cooldown so we don't spam channels.
|
|
52
|
+
3. Walk timestamps newest→oldest counting points where the
|
|
53
|
+
direction-aware quorum holds (see module docstring).
|
|
54
|
+
4. Require ``consecutive_anomalies`` such points on a
|
|
55
|
+
contiguous interval grid.
|
|
56
|
+
"""
|
|
57
|
+
if not recent_detections:
|
|
58
|
+
return False, None
|
|
59
|
+
|
|
60
|
+
# Cooldown is checked first so a noisy run doesn't waste effort.
|
|
61
|
+
if self._is_in_cooldown():
|
|
62
|
+
return False, None
|
|
63
|
+
|
|
64
|
+
detections_by_time = self._group_by_timestamp(recent_detections)
|
|
65
|
+
timestamps_sorted = sorted(detections_by_time.keys(), reverse=True)
|
|
66
|
+
|
|
67
|
+
consecutive, latest_quorum, direction = self._count_consecutive_anomalies(
|
|
68
|
+
detections_by_time, timestamps_sorted
|
|
69
|
+
)
|
|
70
|
+
if not latest_quorum or consecutive < self.conditions.consecutive_anomalies:
|
|
71
|
+
return False, None
|
|
72
|
+
|
|
73
|
+
return True, self._build_alert_data(latest_quorum, consecutive, direction)
|
|
74
|
+
|
|
75
|
+
def _quorum_at(
|
|
76
|
+
self,
|
|
77
|
+
anomalies: list[DetectionRecord],
|
|
78
|
+
locked_direction: str | None,
|
|
79
|
+
) -> tuple[list[DetectionRecord] | None, str | None]:
|
|
80
|
+
"""Anomalies satisfying the direction policy at one timestamp.
|
|
81
|
+
|
|
82
|
+
Returns ``(quorum, direction)`` or ``(None, None)`` when the quorum
|
|
83
|
+
is not met. ``locked_direction`` carries the winning direction of
|
|
84
|
+
the latest point through the consecutive walk for ``"same"``.
|
|
85
|
+
"""
|
|
86
|
+
policy = self.conditions.direction
|
|
87
|
+
required = self.conditions.min_detectors
|
|
88
|
+
|
|
89
|
+
if policy in ("up", "down"):
|
|
90
|
+
qualifying = [d for d in anomalies if d.direction == policy]
|
|
91
|
+
if len(qualifying) >= required:
|
|
92
|
+
return qualifying, policy
|
|
93
|
+
return None, None
|
|
94
|
+
|
|
95
|
+
if policy == "same":
|
|
96
|
+
if locked_direction is not None:
|
|
97
|
+
qualifying = [d for d in anomalies if d.direction == locked_direction]
|
|
98
|
+
if len(qualifying) >= required:
|
|
99
|
+
return qualifying, locked_direction
|
|
100
|
+
return None, None
|
|
101
|
+
|
|
102
|
+
ups = [d for d in anomalies if d.direction == "up"]
|
|
103
|
+
downs = [d for d in anomalies if d.direction == "down"]
|
|
104
|
+
candidates = [c for c in (ups, downs) if len(c) >= required]
|
|
105
|
+
if not candidates:
|
|
106
|
+
return None, None
|
|
107
|
+
if len(candidates) == 2:
|
|
108
|
+
if len(ups) != len(downs):
|
|
109
|
+
winner = ups if len(ups) > len(downs) else downs
|
|
110
|
+
else:
|
|
111
|
+
# Same detector count in both directions: prefer the
|
|
112
|
+
# more severe side (deterministic tie-break).
|
|
113
|
+
winner = max(
|
|
114
|
+
candidates,
|
|
115
|
+
key=lambda c: max((d.severity, d.detector_name) for d in c),
|
|
116
|
+
)
|
|
117
|
+
else:
|
|
118
|
+
winner = candidates[0]
|
|
119
|
+
return winner, winner[0].direction
|
|
120
|
+
|
|
121
|
+
# "any" (unknown policies are rejected at config validation; if one
|
|
122
|
+
# sneaks in through the direct API, fail open like "any")
|
|
123
|
+
if len(anomalies) >= required:
|
|
124
|
+
return anomalies, None
|
|
125
|
+
return None, None
|
|
126
|
+
|
|
127
|
+
def _count_consecutive_anomalies(
|
|
128
|
+
self,
|
|
129
|
+
detections_by_time: dict[np.datetime64, list[DetectionRecord]],
|
|
130
|
+
timestamps_sorted: list[np.datetime64],
|
|
131
|
+
) -> tuple[int, list[DetectionRecord] | None, str | None]:
|
|
132
|
+
"""Walk timestamps newest→oldest counting quorum-satisfying points.
|
|
133
|
+
|
|
134
|
+
The chain requires grid adjacency: each older timestamp must be
|
|
135
|
+
exactly one metric interval before the previous one, so detection
|
|
136
|
+
gaps (days without runs, detector start_time boundaries) are not
|
|
137
|
+
miscounted as "consecutive".
|
|
138
|
+
|
|
139
|
+
Returns ``(count, latest_quorum, direction)`` where direction is
|
|
140
|
+
the locked/policy direction (None for "any").
|
|
141
|
+
"""
|
|
142
|
+
expected_step = np.timedelta64(self.interval.seconds, "s")
|
|
143
|
+
consecutive = 0
|
|
144
|
+
locked_direction: str | None = None
|
|
145
|
+
latest_quorum: list[DetectionRecord] | None = None
|
|
146
|
+
latest_direction: str | None = None
|
|
147
|
+
prev_ts: np.datetime64 | None = None
|
|
148
|
+
|
|
149
|
+
for ts in timestamps_sorted:
|
|
150
|
+
if prev_ts is not None and (prev_ts - ts) != expected_step:
|
|
151
|
+
break
|
|
152
|
+
|
|
153
|
+
anomalies = [d for d in detections_by_time[ts] if d.is_anomaly]
|
|
154
|
+
quorum, direction = self._quorum_at(anomalies, locked_direction)
|
|
155
|
+
if quorum is None:
|
|
156
|
+
break
|
|
157
|
+
|
|
158
|
+
if self.conditions.direction == "same":
|
|
159
|
+
locked_direction = direction
|
|
160
|
+
if latest_quorum is None:
|
|
161
|
+
latest_quorum = quorum
|
|
162
|
+
latest_direction = direction
|
|
163
|
+
|
|
164
|
+
consecutive += 1
|
|
165
|
+
prev_ts = ts
|
|
166
|
+
|
|
167
|
+
return consecutive, latest_quorum, latest_direction
|
|
168
|
+
|
|
169
|
+
@staticmethod
|
|
170
|
+
def _primary_record(anomalies: list[DetectionRecord]) -> DetectionRecord:
|
|
171
|
+
"""Highest-severity record; ties broken by name/id for determinism."""
|
|
172
|
+
|
|
173
|
+
def sort_key(d: DetectionRecord):
|
|
174
|
+
severity = d.severity
|
|
175
|
+
if math.isnan(severity):
|
|
176
|
+
severity = 0.0
|
|
177
|
+
elif math.isinf(severity):
|
|
178
|
+
severity = 1e308 if severity > 0 else -1e308
|
|
179
|
+
return (-severity, d.detector_name, d.detector_id)
|
|
180
|
+
|
|
181
|
+
return min(anomalies, key=sort_key)
|
|
182
|
+
|
|
183
|
+
def _build_alert_data(
|
|
184
|
+
self,
|
|
185
|
+
anomalies: list[DetectionRecord],
|
|
186
|
+
consecutive_count: int,
|
|
187
|
+
direction: str | None,
|
|
188
|
+
) -> AlertData:
|
|
189
|
+
primary = self._primary_record(anomalies)
|
|
190
|
+
|
|
191
|
+
if len(anomalies) > 1:
|
|
192
|
+
max_severity = max(d.severity for d in anomalies)
|
|
193
|
+
detector_names = [d.detector_name for d in anomalies]
|
|
194
|
+
detector_name = f"{len(anomalies)} detectors"
|
|
195
|
+
detector_params = "; ".join(
|
|
196
|
+
f"{d.detector_name}: {d.detector_params}" for d in anomalies
|
|
197
|
+
)
|
|
198
|
+
combined_metadata = {
|
|
199
|
+
"detectors": detector_names,
|
|
200
|
+
"count": len(anomalies),
|
|
201
|
+
}
|
|
202
|
+
for i, d in enumerate(anomalies):
|
|
203
|
+
combined_metadata[f"detector_{i}_metadata"] = d.detection_metadata
|
|
204
|
+
else:
|
|
205
|
+
max_severity = primary.severity
|
|
206
|
+
detector_name = primary.detector_name
|
|
207
|
+
detector_params = primary.detector_params
|
|
208
|
+
combined_metadata = primary.detection_metadata
|
|
209
|
+
|
|
210
|
+
return AlertData(
|
|
211
|
+
metric_name=self.metric_name,
|
|
212
|
+
timestamp=primary.timestamp,
|
|
213
|
+
timezone=self.timezone_display,
|
|
214
|
+
value=primary.value,
|
|
215
|
+
confidence_lower=primary.confidence_lower,
|
|
216
|
+
confidence_upper=primary.confidence_upper,
|
|
217
|
+
detector_name=detector_name,
|
|
218
|
+
detector_params=detector_params,
|
|
219
|
+
direction=direction or primary.direction,
|
|
220
|
+
severity=max_severity,
|
|
221
|
+
detection_metadata=combined_metadata,
|
|
222
|
+
consecutive_count=consecutive_count,
|
|
223
|
+
description=self.description,
|
|
224
|
+
mentions=self.mentions,
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
def should_alert_no_data(
|
|
228
|
+
self,
|
|
229
|
+
last_point: datetime,
|
|
230
|
+
) -> tuple[bool, AlertData | None]:
|
|
231
|
+
"""Decide whether to fire a no-data alert for *last_point*.
|
|
232
|
+
|
|
233
|
+
Conditions (all must hold):
|
|
234
|
+
1. ``alert_config.no_data_alert`` is true.
|
|
235
|
+
2. Not currently in alert cooldown for this alert config.
|
|
236
|
+
3. The latest expected datapoint is missing — there is no row
|
|
237
|
+
in ``_dtk_datapoints`` for *last_point* OR the row's value
|
|
238
|
+
is NULL/NaN. ``get_value_at`` returns ``None`` for both.
|
|
239
|
+
|
|
240
|
+
``min_detectors`` and ``consecutive_anomalies`` deliberately do
|
|
241
|
+
not apply here: missing data is a single binary metric-level
|
|
242
|
+
signal, not a per-detector vote.
|
|
243
|
+
"""
|
|
244
|
+
if not self.alert_config or not getattr(self.alert_config, "no_data_alert", False):
|
|
245
|
+
return False, None
|
|
246
|
+
if not self.internal:
|
|
247
|
+
return False, None
|
|
248
|
+
|
|
249
|
+
if self._is_in_cooldown():
|
|
250
|
+
return False, None
|
|
251
|
+
|
|
252
|
+
value = self.internal.get_value_at(self.metric_name, last_point)
|
|
253
|
+
if value is not None and not (isinstance(value, float) and math.isnan(value)):
|
|
254
|
+
return False, None
|
|
255
|
+
|
|
256
|
+
return True, self._build_no_data_alert_data(last_point)
|
|
257
|
+
|
|
258
|
+
def _build_no_data_alert_data(self, last_point: datetime) -> AlertData:
|
|
259
|
+
"""Construct the AlertData payload for a no-data alert."""
|
|
260
|
+
return AlertData(
|
|
261
|
+
metric_name=self.metric_name,
|
|
262
|
+
timestamp=np.datetime64(last_point, "ms"),
|
|
263
|
+
timezone=self.timezone_display,
|
|
264
|
+
value=None,
|
|
265
|
+
confidence_lower=None,
|
|
266
|
+
confidence_upper=None,
|
|
267
|
+
detector_name="no_data",
|
|
268
|
+
detector_params="",
|
|
269
|
+
direction="none",
|
|
270
|
+
severity=0.0,
|
|
271
|
+
detection_metadata={"reason": "no_data"},
|
|
272
|
+
consecutive_count=0,
|
|
273
|
+
is_no_data=True,
|
|
274
|
+
description=self.description,
|
|
275
|
+
mentions=self.mentions,
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
def get_last_complete_point(self, now: datetime | None = None) -> datetime:
|
|
279
|
+
"""Floor ``now`` to the previous fully completed interval boundary."""
|
|
280
|
+
if now is None:
|
|
281
|
+
now = now_utc()
|
|
282
|
+
now = to_aware_utc(now)
|
|
283
|
+
|
|
284
|
+
interval_seconds = self.interval.seconds
|
|
285
|
+
floored = (int(now.timestamp()) // interval_seconds) * interval_seconds
|
|
286
|
+
last_complete = floored - interval_seconds
|
|
287
|
+
return datetime.fromtimestamp(last_complete, tz=timezone.utc)
|
|
@@ -58,6 +58,13 @@ class _DispatchMixin(_OrchestratorBase):
|
|
|
58
58
|
results: dict[str, bool] = {}
|
|
59
59
|
for channel in channels:
|
|
60
60
|
channel_name = channel.__class__.__name__
|
|
61
|
+
# Two channels of the same type must not collapse into one
|
|
62
|
+
# result entry (that would undercount sends).
|
|
63
|
+
if channel_name in results:
|
|
64
|
+
suffix = 2
|
|
65
|
+
while f"{channel_name}#{suffix}" in results:
|
|
66
|
+
suffix += 1
|
|
67
|
+
channel_name = f"{channel_name}#{suffix}"
|
|
61
68
|
try:
|
|
62
69
|
results[channel_name] = bool(channel.send(alert_data, template))
|
|
63
70
|
except Exception as exc:
|
|
@@ -4,14 +4,11 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
|
|
7
|
-
import numpy as np
|
|
8
|
-
|
|
9
7
|
from detectkit.alerting.channels.base import AlertData
|
|
10
8
|
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
11
9
|
from detectkit.alerting.orchestrator._types import (
|
|
12
10
|
DetectionRecord,
|
|
13
|
-
|
|
14
|
-
_parse_detection_metadata,
|
|
11
|
+
hydrate_detection_records,
|
|
15
12
|
)
|
|
16
13
|
|
|
17
14
|
|
|
@@ -71,27 +68,7 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
71
68
|
# No fresh detections at all → assume recovery.
|
|
72
69
|
return True
|
|
73
70
|
|
|
74
|
-
records
|
|
75
|
-
for det in recent_detections:
|
|
76
|
-
metadata_list = det.get("detection_metadata_list") or [None] * len(det["detector_ids"])
|
|
77
|
-
for i in range(len(det["detector_ids"])):
|
|
78
|
-
is_anomaly = det["is_anomaly_flags"][i]
|
|
79
|
-
metadata = _parse_detection_metadata(metadata_list[i])
|
|
80
|
-
records.append(
|
|
81
|
-
DetectionRecord(
|
|
82
|
-
timestamp=np.datetime64(det["timestamp"]),
|
|
83
|
-
detector_name=det["detector_names"][i],
|
|
84
|
-
detector_id=det["detector_ids"][i],
|
|
85
|
-
detector_params=det["detector_params_list"][i],
|
|
86
|
-
value=det["value"],
|
|
87
|
-
is_anomaly=is_anomaly,
|
|
88
|
-
confidence_lower=det["confidence_lowers"][i],
|
|
89
|
-
confidence_upper=det["confidence_uppers"][i],
|
|
90
|
-
direction=_direction_from_metadata(metadata, is_anomaly),
|
|
91
|
-
severity=0.0, # not used for the recovery check
|
|
92
|
-
detection_metadata=metadata,
|
|
93
|
-
)
|
|
94
|
-
)
|
|
71
|
+
records = hydrate_detection_records(recent_detections)
|
|
95
72
|
|
|
96
73
|
detections_by_time = self._group_by_timestamp(records)
|
|
97
74
|
timestamps_sorted = sorted(detections_by_time.keys(), reverse=True)
|
|
@@ -114,7 +91,14 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
114
91
|
return len(blocking) == 0
|
|
115
92
|
|
|
116
93
|
def _get_alert_trigger_direction(self, last_alert_timestamp: datetime) -> str | None:
|
|
117
|
-
"""Return the direction of the anomaly that triggered the last alert.
|
|
94
|
+
"""Return the direction of the anomaly that triggered the last alert.
|
|
95
|
+
|
|
96
|
+
Mirrors the quorum logic that fired the alert (``_quorum_at`` with
|
|
97
|
+
no locked direction) so recovery checks the SAME direction the
|
|
98
|
+
alert was raised for — not whichever anomalous detector happens to
|
|
99
|
+
sort first. Falls back to a simple majority when the quorum can no
|
|
100
|
+
longer be reconstructed.
|
|
101
|
+
"""
|
|
118
102
|
if not self.internal:
|
|
119
103
|
return None
|
|
120
104
|
|
|
@@ -126,14 +110,27 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
126
110
|
if not trigger_detections:
|
|
127
111
|
return None
|
|
128
112
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
113
|
+
records = hydrate_detection_records(trigger_detections)
|
|
114
|
+
by_time = self._group_by_timestamp(records)
|
|
115
|
+
if not by_time:
|
|
116
|
+
return None
|
|
117
|
+
latest_ts = max(by_time.keys())
|
|
118
|
+
anomalies = [d for d in by_time[latest_ts] if d.is_anomaly]
|
|
119
|
+
if not anomalies:
|
|
120
|
+
return None
|
|
121
|
+
|
|
122
|
+
# _quorum_at lives in _DecisionMixin; both mixins are combined in
|
|
123
|
+
# AlertOrchestrator, so the call resolves at runtime.
|
|
124
|
+
_, direction = self._quorum_at(anomalies, None)
|
|
125
|
+
if direction in ("up", "down"):
|
|
126
|
+
return direction
|
|
127
|
+
|
|
128
|
+
ups = sum(1 for d in anomalies if d.direction == "up")
|
|
129
|
+
downs = sum(1 for d in anomalies if d.direction == "down")
|
|
130
|
+
if ups > downs:
|
|
131
|
+
return "up"
|
|
132
|
+
if downs > ups:
|
|
133
|
+
return "down"
|
|
137
134
|
return None
|
|
138
135
|
|
|
139
136
|
def _build_recovery_data(
|
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
+
from datetime import datetime
|
|
6
7
|
from typing import Any
|
|
7
8
|
|
|
8
9
|
import numpy as np
|
|
9
10
|
|
|
11
|
+
from detectkit.utils.datetime_utils import to_naive_utc
|
|
10
12
|
from detectkit.utils.json_utils import json_loads
|
|
11
13
|
|
|
12
14
|
|
|
@@ -58,11 +60,15 @@ def _direction_from_metadata(metadata: Any, is_anomaly: bool) -> str:
|
|
|
58
60
|
|
|
59
61
|
@dataclass
|
|
60
62
|
class AlertConditions:
|
|
61
|
-
"""Conditions that turn a sequence of detections into an alert.
|
|
63
|
+
"""Conditions that turn a sequence of detections into an alert.
|
|
64
|
+
|
|
65
|
+
Defaults mirror :class:`detectkit.config.metric_config.AlertConfig`
|
|
66
|
+
so direct API users get the same behavior as YAML users.
|
|
67
|
+
"""
|
|
62
68
|
|
|
63
69
|
min_detectors: int = 1
|
|
64
|
-
direction: str = "
|
|
65
|
-
consecutive_anomalies: int =
|
|
70
|
+
direction: str = "same" # "any", "same", "up", "down"
|
|
71
|
+
consecutive_anomalies: int = 3
|
|
66
72
|
|
|
67
73
|
|
|
68
74
|
@dataclass
|
|
@@ -80,3 +86,45 @@ class DetectionRecord:
|
|
|
80
86
|
direction: str # "up", "down", "none"
|
|
81
87
|
severity: float
|
|
82
88
|
detection_metadata: dict
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def hydrate_detection_records(rows: list[dict]) -> list[DetectionRecord]:
|
|
92
|
+
"""Build :class:`DetectionRecord` rows from ``get_recent_detections`` output.
|
|
93
|
+
|
|
94
|
+
Emits one record *per detector per timestamp* (the orchestrator counts
|
|
95
|
+
records to evaluate ``min_detectors``). Input rows are timestamp-DESC as
|
|
96
|
+
returned by SQL; output is oldest→newest. Timestamps are normalized to
|
|
97
|
+
``datetime64[ms]`` so grid-adjacency arithmetic is well-defined.
|
|
98
|
+
"""
|
|
99
|
+
records: list[DetectionRecord] = []
|
|
100
|
+
for row in reversed(rows):
|
|
101
|
+
raw_ts = row["timestamp"]
|
|
102
|
+
if isinstance(raw_ts, datetime):
|
|
103
|
+
raw_ts = to_naive_utc(raw_ts)
|
|
104
|
+
timestamp = np.datetime64(raw_ts, "ms")
|
|
105
|
+
metadata_list = row.get("detection_metadata_list") or [None] * len(row["detector_ids"])
|
|
106
|
+
for i in range(len(row["detector_ids"])):
|
|
107
|
+
is_anomaly = bool(row["is_anomaly_flags"][i])
|
|
108
|
+
metadata = _parse_detection_metadata(metadata_list[i])
|
|
109
|
+
try:
|
|
110
|
+
severity = float(metadata.get("severity", 0.0) or 0.0)
|
|
111
|
+
except (TypeError, ValueError):
|
|
112
|
+
severity = 0.0
|
|
113
|
+
|
|
114
|
+
records.append(
|
|
115
|
+
DetectionRecord(
|
|
116
|
+
timestamp=timestamp,
|
|
117
|
+
detector_name=row["detector_names"][i],
|
|
118
|
+
detector_id=row["detector_ids"][i],
|
|
119
|
+
detector_params=row["detector_params_list"][i],
|
|
120
|
+
value=row["value"],
|
|
121
|
+
is_anomaly=is_anomaly,
|
|
122
|
+
confidence_lower=row["confidence_lowers"][i],
|
|
123
|
+
confidence_upper=row["confidence_uppers"][i],
|
|
124
|
+
direction=_direction_from_metadata(metadata, is_anomaly),
|
|
125
|
+
severity=severity,
|
|
126
|
+
detection_metadata=metadata,
|
|
127
|
+
)
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
return records
|