detectkit 0.6.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. {detectkit-0.6.0/detectkit.egg-info → detectkit-0.8.0}/PKG-INFO +11 -5
  2. {detectkit-0.6.0 → detectkit-0.8.0}/README.md +10 -4
  3. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/__init__.py +1 -1
  4. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/email.py +9 -2
  5. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/telegram.py +9 -2
  6. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/__init__.py +4 -2
  7. detectkit-0.8.0/detectkit/alerting/orchestrator/_decision.py +287 -0
  8. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_dispatch.py +7 -0
  9. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_recovery.py +31 -34
  10. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_types.py +51 -3
  11. detectkit-0.8.0/detectkit/cli/commands/clean.py +333 -0
  12. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/init.py +72 -68
  13. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/main.py +67 -1
  14. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_alert_states.py +31 -0
  15. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_detections.py +27 -1
  16. detectkit-0.8.0/detectkit/database/internal_tables/_maintenance.py +70 -0
  17. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/manager.py +2 -0
  18. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/base.py +23 -46
  19. detectkit-0.8.0/detectkit/detectors/statistical/_windowed.py +570 -0
  20. detectkit-0.8.0/detectkit/detectors/statistical/iqr.py +69 -0
  21. detectkit-0.8.0/detectkit/detectors/statistical/mad.py +76 -0
  22. detectkit-0.8.0/detectkit/detectors/statistical/zscore.py +74 -0
  23. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/metric_loader.py +16 -15
  24. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_alert_step.py +10 -2
  25. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_base.py +2 -36
  26. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/manager.py +12 -1
  27. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/json_utils.py +11 -3
  28. detectkit-0.8.0/detectkit/utils/stats.py +148 -0
  29. {detectkit-0.6.0 → detectkit-0.8.0/detectkit.egg-info}/PKG-INFO +11 -5
  30. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/SOURCES.txt +3 -0
  31. {detectkit-0.6.0 → detectkit-0.8.0}/pyproject.toml +4 -1
  32. detectkit-0.6.0/detectkit/alerting/orchestrator/_decision.py +0 -195
  33. detectkit-0.6.0/detectkit/detectors/statistical/iqr.py +0 -436
  34. detectkit-0.6.0/detectkit/detectors/statistical/mad.py +0 -399
  35. detectkit-0.6.0/detectkit/detectors/statistical/zscore.py +0 -421
  36. detectkit-0.6.0/detectkit/utils/stats.py +0 -183
  37. {detectkit-0.6.0 → detectkit-0.8.0}/LICENSE +0 -0
  38. {detectkit-0.6.0 → detectkit-0.8.0}/MANIFEST.in +0 -0
  39. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/__init__.py +0 -0
  40. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/__init__.py +0 -0
  41. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/base.py +0 -0
  42. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/factory.py +0 -0
  43. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/mattermost.py +0 -0
  44. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/slack.py +0 -0
  45. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/channels/webhook.py +0 -0
  46. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_base.py +0 -0
  47. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
  48. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
  49. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/__init__.py +0 -0
  50. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/__init__.py +0 -0
  51. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/run.py +0 -0
  52. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/test_alert.py +0 -0
  53. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/cli/commands/unlock.py +0 -0
  54. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/__init__.py +0 -0
  55. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/metric_config.py +0 -0
  56. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/profile.py +0 -0
  57. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/project_config.py +0 -0
  58. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/config/validator.py +0 -0
  59. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/__init__.py +0 -0
  60. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/interval.py +0 -0
  61. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/core/models.py +0 -0
  62. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/__init__.py +0 -0
  63. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/clickhouse_manager.py +0 -0
  64. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/__init__.py +0 -0
  65. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_base.py +0 -0
  66. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
  67. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_metrics.py +0 -0
  68. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_schema.py +0 -0
  69. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/internal_tables/_tasks.py +0 -0
  70. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/manager.py +0 -0
  71. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/database/tables.py +0 -0
  72. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/__init__.py +0 -0
  73. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/factory.py +0 -0
  74. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/seasonality.py +0 -0
  75. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/statistical/__init__.py +0 -0
  76. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
  77. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/__init__.py +0 -0
  78. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/loaders/query_template.py +0 -0
  79. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/__init__.py +0 -0
  80. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/error_dispatch.py +0 -0
  81. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
  82. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
  83. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
  84. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/orchestration/task_manager/_types.py +0 -0
  85. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/__init__.py +0 -0
  86. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/datetime_utils.py +0 -0
  87. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit/utils/env_interpolation.py +0 -0
  88. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/dependency_links.txt +0 -0
  89. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/entry_points.txt +0 -0
  90. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/requires.txt +0 -0
  91. {detectkit-0.6.0 → detectkit-0.8.0}/detectkit.egg-info/top_level.txt +0 -0
  92. {detectkit-0.6.0 → detectkit-0.8.0}/requirements.txt +0 -0
  93. {detectkit-0.6.0 → detectkit-0.8.0}/setup.cfg +0 -0
  94. {detectkit-0.6.0 → detectkit-0.8.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: detectkit
3
- Version: 0.6.0
3
+ Version: 0.8.0
4
4
  Summary: Metric monitoring with automatic anomaly detection
5
5
  Author: detectkit team
6
6
  License: MIT
@@ -77,13 +77,14 @@ Dynamic: license-file
77
77
 
78
78
  - **Pure numpy arrays** — no pandas dependency in core logic
79
79
  - **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
80
+ - **Trend & seasonality handling** — seasonality grouping, recency weighting (`half_life`), robust linear detrending for slowly drifting metrics
80
81
  - **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
81
82
  - **@mentions** — tag users/groups in alerts, each channel formats natively
82
83
  - **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
83
84
  - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
84
85
  - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
85
86
  - **Idempotent** — resume from interruptions, no duplicate processing
86
- - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
87
+ - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, `dtk clean`, tag-based selectors
87
88
 
88
89
  ## Installation
89
90
 
@@ -115,6 +116,9 @@ dtk run --select cpu_usage --from 2024-01-01
115
116
 
116
117
  # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
117
118
  dtk unlock --select cpu_usage
119
+
120
+ # Prune data orphaned by config edits (dry-run; add --execute to apply)
121
+ dtk clean --select cpu_usage
118
122
  ```
119
123
 
120
124
  ### Metric Configuration
@@ -129,14 +133,16 @@ query: |
129
133
  toStartOfInterval(timestamp, INTERVAL 5 MINUTE) AS timestamp,
130
134
  countIf(status_code >= 500) / count() * 100 AS value
131
135
  FROM http_requests
132
- WHERE timestamp >= %(from_date)s AND timestamp < %(to_date)s
136
+ WHERE timestamp >= '{{ dtk_start_time }}' AND timestamp < '{{ dtk_end_time }}'
133
137
  GROUP BY timestamp ORDER BY timestamp
134
138
 
135
139
  detectors:
136
140
  - type: mad
137
141
  params:
138
- threshold: 3.0
139
- window_size: 2016 # 7 days
142
+ threshold: 3.0 # in sigma-equivalents
143
+ window_size: 2016 # 7 days of 5-min points
144
+ window_weights: exponential # optional: favor recent data
145
+ half_life: "1d" # weight halves every day of age
140
146
 
141
147
  alerting:
142
148
  enabled: true
@@ -11,13 +11,14 @@
11
11
 
12
12
  - **Pure numpy arrays** — no pandas dependency in core logic
13
13
  - **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
14
+ - **Trend & seasonality handling** — seasonality grouping, recency weighting (`half_life`), robust linear detrending for slowly drifting metrics
14
15
  - **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
15
16
  - **@mentions** — tag users/groups in alerts, each channel formats natively
16
17
  - **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
17
18
  - **Project-level error alerts** — catch DB outages and pipeline crashes once per run
18
19
  - **Database agnostic** — ClickHouse, PostgreSQL, MySQL
19
20
  - **Idempotent** — resume from interruptions, no duplicate processing
20
- - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, tag-based selectors
21
+ - **CLI** — `dtk init`, `dtk run --select`, `dtk unlock`, `dtk clean`, tag-based selectors
21
22
 
22
23
  ## Installation
23
24
 
@@ -49,6 +50,9 @@ dtk run --select cpu_usage --from 2024-01-01
49
50
 
50
51
  # Clear a stuck lock left by a crashed run (e.g. DB restarted mid-run)
51
52
  dtk unlock --select cpu_usage
53
+
54
+ # Prune data orphaned by config edits (dry-run; add --execute to apply)
55
+ dtk clean --select cpu_usage
52
56
  ```
53
57
 
54
58
  ### Metric Configuration
@@ -63,14 +67,16 @@ query: |
63
67
  toStartOfInterval(timestamp, INTERVAL 5 MINUTE) AS timestamp,
64
68
  countIf(status_code >= 500) / count() * 100 AS value
65
69
  FROM http_requests
66
- WHERE timestamp >= %(from_date)s AND timestamp < %(to_date)s
70
+ WHERE timestamp >= '{{ dtk_start_time }}' AND timestamp < '{{ dtk_end_time }}'
67
71
  GROUP BY timestamp ORDER BY timestamp
68
72
 
69
73
  detectors:
70
74
  - type: mad
71
75
  params:
72
- threshold: 3.0
73
- window_size: 2016 # 7 days
76
+ threshold: 3.0 # in sigma-equivalents
77
+ window_size: 2016 # 7 days of 5-min points
78
+ window_weights: exponential # optional: favor recent data
79
+ half_life: "1d" # weight halves every day of age
74
80
 
75
81
  alerting:
76
82
  enabled: true
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
4
4
  A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
5
5
  """
6
6
 
7
- __version__ = "0.5.3"
7
+ __version__ = "0.8.0"
8
8
 
9
9
  from detectkit.core.interval import Interval
10
10
  from detectkit.core.models import ColumnDefinition, TableModel
@@ -95,12 +95,17 @@ class EmailChannel(BaseAlertChannel):
95
95
  self.subject_template = subject_template
96
96
  self.template = template
97
97
 
98
- def send(self, alert_data: AlertData) -> None:
98
+ def send(self, alert_data: AlertData, template: str | None = None) -> bool:
99
99
  """
100
100
  Send alert via email.
101
101
 
102
102
  Args:
103
103
  alert_data: Alert information to send
104
+ template: Per-call template override (falls back to the
105
+ channel-level template, then the built-in default)
106
+
107
+ Returns:
108
+ True when the email was handed to the SMTP server
104
109
 
105
110
  Raises:
106
111
  smtplib.SMTPException: If email sending fails
@@ -108,7 +113,7 @@ class EmailChannel(BaseAlertChannel):
108
113
  Example:
109
114
  >>> channel.send(alert_data)
110
115
  """
111
- message_body = self.format_message(alert_data, self.template)
116
+ message_body = self.format_message(alert_data, template or self.template)
112
117
 
113
118
  # Create email message
114
119
  msg = MIMEMultipart("alternative")
@@ -138,6 +143,8 @@ class EmailChannel(BaseAlertChannel):
138
143
  except smtplib.SMTPException as e:
139
144
  raise smtplib.SMTPException(f"Failed to send email alert: {e}") from e
140
145
 
146
+ return True
147
+
141
148
  def format_mentions(self, mentions: list[str]) -> str:
142
149
  """
143
150
  Format mentions for email.
@@ -69,12 +69,17 @@ class TelegramChannel(BaseAlertChannel):
69
69
  self.disable_notification = disable_notification
70
70
  self.template = template
71
71
 
72
- def send(self, alert_data: AlertData) -> None:
72
+ def send(self, alert_data: AlertData, template: str | None = None) -> bool:
73
73
  """
74
74
  Send alert to Telegram.
75
75
 
76
76
  Args:
77
77
  alert_data: Alert information to send
78
+ template: Per-call template override (falls back to the
79
+ channel-level template, then the built-in default)
80
+
81
+ Returns:
82
+ True when the message was accepted by the Telegram API
78
83
 
79
84
  Raises:
80
85
  requests.RequestException: If request fails
@@ -83,7 +88,7 @@ class TelegramChannel(BaseAlertChannel):
83
88
  Example:
84
89
  >>> channel.send(alert_data)
85
90
  """
86
- message = self.format_message(alert_data, self.template)
91
+ message = self.format_message(alert_data, template or self.template)
87
92
 
88
93
  # Telegram Bot API URL
89
94
  url = f"https://api.telegram.org/bot{self.bot_token}/sendMessage"
@@ -103,6 +108,8 @@ class TelegramChannel(BaseAlertChannel):
103
108
  except requests.RequestException as e:
104
109
  raise requests.RequestException(f"Failed to send Telegram alert: {e}") from e
105
110
 
111
+ return True
112
+
106
113
  def format_mentions(self, mentions: list[str]) -> str:
107
114
  """
108
115
  Format mentions for Telegram.
@@ -5,6 +5,7 @@ from detectkit.alerting.orchestrator._types import (
5
5
  DetectionRecord,
6
6
  _direction_from_metadata,
7
7
  _parse_detection_metadata,
8
+ hydrate_detection_records,
8
9
  )
9
10
  from detectkit.alerting.orchestrator.orchestrator import AlertOrchestrator
10
11
 
@@ -12,8 +13,9 @@ __all__ = [
12
13
  "AlertOrchestrator",
13
14
  "AlertConditions",
14
15
  "DetectionRecord",
15
- # Re-exported for callers (notably TaskManager) that build
16
- # DetectionRecord rows manually before handing them to the orchestrator.
16
+ # Shared hydration of DetectionRecord rows from get_recent_detections
17
+ # output (used by TaskManager and the recovery mixin).
18
+ "hydrate_detection_records",
17
19
  "_direction_from_metadata",
18
20
  "_parse_detection_metadata",
19
21
  ]
@@ -0,0 +1,287 @@
1
+ """Decision logic: ``should_alert`` and the consecutive-anomaly helpers.
2
+
3
+ The multi-detector alert contract (documented in docs/guides/alerting.md):
4
+
5
+ For every timestamp, an alert *quorum* is the set of anomalous
6
+ detections that match the configured direction policy:
7
+
8
+ - ``"up"`` / ``"down"``: only anomalies in that direction count.
9
+ Detectors firing the other way are ignored (they neither help nor
10
+ block the quorum).
11
+ - ``"any"``: every anomaly counts, regardless of direction (an
12
+ up-anomaly and a down-anomaly together can satisfy
13
+ ``min_detectors=2``).
14
+ - ``"same"``: at the latest point at least ``min_detectors`` detectors
15
+ must agree on ONE direction (up- and down-anomalies are counted
16
+ separately; disagreement does not form a quorum). The winning
17
+ direction is then locked for the consecutive walk.
18
+
19
+ An alert fires when the latest ``consecutive_anomalies`` timestamps
20
+ each satisfy the quorum AND are exactly one metric interval apart —
21
+ a gap in the detection grid breaks the chain.
22
+
23
+ The alert payload (value/CI shown in the message) is built from the
24
+ highest-severity record of the latest quorum; ties are broken by
25
+ detector name, then detector id, so the outcome never depends on SQL
26
+ row ordering.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import math
32
+ from datetime import datetime, timezone
33
+
34
+ import numpy as np
35
+
36
+ from detectkit.alerting.channels.base import AlertData
37
+ from detectkit.alerting.orchestrator._base import _OrchestratorBase
38
+ from detectkit.alerting.orchestrator._types import DetectionRecord
39
+ from detectkit.utils.datetime_utils import now_utc, to_aware_utc
40
+
41
+
42
+ class _DecisionMixin(_OrchestratorBase):
43
+ def should_alert(
44
+ self,
45
+ recent_detections: list[DetectionRecord],
46
+ ) -> tuple[bool, AlertData | None]:
47
+ """Decide whether to fire an alert from recent detections.
48
+
49
+ Steps (cheap → expensive):
50
+ 1. Bail out on empty input.
51
+ 2. Honour the alert cooldown so we don't spam channels.
52
+ 3. Walk timestamps newest→oldest counting points where the
53
+ direction-aware quorum holds (see module docstring).
54
+ 4. Require ``consecutive_anomalies`` such points on a
55
+ contiguous interval grid.
56
+ """
57
+ if not recent_detections:
58
+ return False, None
59
+
60
+ # Cooldown is checked first so a noisy run doesn't waste effort.
61
+ if self._is_in_cooldown():
62
+ return False, None
63
+
64
+ detections_by_time = self._group_by_timestamp(recent_detections)
65
+ timestamps_sorted = sorted(detections_by_time.keys(), reverse=True)
66
+
67
+ consecutive, latest_quorum, direction = self._count_consecutive_anomalies(
68
+ detections_by_time, timestamps_sorted
69
+ )
70
+ if not latest_quorum or consecutive < self.conditions.consecutive_anomalies:
71
+ return False, None
72
+
73
+ return True, self._build_alert_data(latest_quorum, consecutive, direction)
74
+
75
+ def _quorum_at(
76
+ self,
77
+ anomalies: list[DetectionRecord],
78
+ locked_direction: str | None,
79
+ ) -> tuple[list[DetectionRecord] | None, str | None]:
80
+ """Anomalies satisfying the direction policy at one timestamp.
81
+
82
+ Returns ``(quorum, direction)`` or ``(None, None)`` when the quorum
83
+ is not met. ``locked_direction`` carries the winning direction of
84
+ the latest point through the consecutive walk for ``"same"``.
85
+ """
86
+ policy = self.conditions.direction
87
+ required = self.conditions.min_detectors
88
+
89
+ if policy in ("up", "down"):
90
+ qualifying = [d for d in anomalies if d.direction == policy]
91
+ if len(qualifying) >= required:
92
+ return qualifying, policy
93
+ return None, None
94
+
95
+ if policy == "same":
96
+ if locked_direction is not None:
97
+ qualifying = [d for d in anomalies if d.direction == locked_direction]
98
+ if len(qualifying) >= required:
99
+ return qualifying, locked_direction
100
+ return None, None
101
+
102
+ ups = [d for d in anomalies if d.direction == "up"]
103
+ downs = [d for d in anomalies if d.direction == "down"]
104
+ candidates = [c for c in (ups, downs) if len(c) >= required]
105
+ if not candidates:
106
+ return None, None
107
+ if len(candidates) == 2:
108
+ if len(ups) != len(downs):
109
+ winner = ups if len(ups) > len(downs) else downs
110
+ else:
111
+ # Same detector count in both directions: prefer the
112
+ # more severe side (deterministic tie-break).
113
+ winner = max(
114
+ candidates,
115
+ key=lambda c: max((d.severity, d.detector_name) for d in c),
116
+ )
117
+ else:
118
+ winner = candidates[0]
119
+ return winner, winner[0].direction
120
+
121
+ # "any" (unknown policies are rejected at config validation; if one
122
+ # sneaks in through the direct API, fail open like "any")
123
+ if len(anomalies) >= required:
124
+ return anomalies, None
125
+ return None, None
126
+
127
+ def _count_consecutive_anomalies(
128
+ self,
129
+ detections_by_time: dict[np.datetime64, list[DetectionRecord]],
130
+ timestamps_sorted: list[np.datetime64],
131
+ ) -> tuple[int, list[DetectionRecord] | None, str | None]:
132
+ """Walk timestamps newest→oldest counting quorum-satisfying points.
133
+
134
+ The chain requires grid adjacency: each older timestamp must be
135
+ exactly one metric interval before the previous one, so detection
136
+ gaps (days without runs, detector start_time boundaries) are not
137
+ miscounted as "consecutive".
138
+
139
+ Returns ``(count, latest_quorum, direction)`` where direction is
140
+ the locked/policy direction (None for "any").
141
+ """
142
+ expected_step = np.timedelta64(self.interval.seconds, "s")
143
+ consecutive = 0
144
+ locked_direction: str | None = None
145
+ latest_quorum: list[DetectionRecord] | None = None
146
+ latest_direction: str | None = None
147
+ prev_ts: np.datetime64 | None = None
148
+
149
+ for ts in timestamps_sorted:
150
+ if prev_ts is not None and (prev_ts - ts) != expected_step:
151
+ break
152
+
153
+ anomalies = [d for d in detections_by_time[ts] if d.is_anomaly]
154
+ quorum, direction = self._quorum_at(anomalies, locked_direction)
155
+ if quorum is None:
156
+ break
157
+
158
+ if self.conditions.direction == "same":
159
+ locked_direction = direction
160
+ if latest_quorum is None:
161
+ latest_quorum = quorum
162
+ latest_direction = direction
163
+
164
+ consecutive += 1
165
+ prev_ts = ts
166
+
167
+ return consecutive, latest_quorum, latest_direction
168
+
169
+ @staticmethod
170
+ def _primary_record(anomalies: list[DetectionRecord]) -> DetectionRecord:
171
+ """Highest-severity record; ties broken by name/id for determinism."""
172
+
173
+ def sort_key(d: DetectionRecord):
174
+ severity = d.severity
175
+ if math.isnan(severity):
176
+ severity = 0.0
177
+ elif math.isinf(severity):
178
+ severity = 1e308 if severity > 0 else -1e308
179
+ return (-severity, d.detector_name, d.detector_id)
180
+
181
+ return min(anomalies, key=sort_key)
182
+
183
+ def _build_alert_data(
184
+ self,
185
+ anomalies: list[DetectionRecord],
186
+ consecutive_count: int,
187
+ direction: str | None,
188
+ ) -> AlertData:
189
+ primary = self._primary_record(anomalies)
190
+
191
+ if len(anomalies) > 1:
192
+ max_severity = max(d.severity for d in anomalies)
193
+ detector_names = [d.detector_name for d in anomalies]
194
+ detector_name = f"{len(anomalies)} detectors"
195
+ detector_params = "; ".join(
196
+ f"{d.detector_name}: {d.detector_params}" for d in anomalies
197
+ )
198
+ combined_metadata = {
199
+ "detectors": detector_names,
200
+ "count": len(anomalies),
201
+ }
202
+ for i, d in enumerate(anomalies):
203
+ combined_metadata[f"detector_{i}_metadata"] = d.detection_metadata
204
+ else:
205
+ max_severity = primary.severity
206
+ detector_name = primary.detector_name
207
+ detector_params = primary.detector_params
208
+ combined_metadata = primary.detection_metadata
209
+
210
+ return AlertData(
211
+ metric_name=self.metric_name,
212
+ timestamp=primary.timestamp,
213
+ timezone=self.timezone_display,
214
+ value=primary.value,
215
+ confidence_lower=primary.confidence_lower,
216
+ confidence_upper=primary.confidence_upper,
217
+ detector_name=detector_name,
218
+ detector_params=detector_params,
219
+ direction=direction or primary.direction,
220
+ severity=max_severity,
221
+ detection_metadata=combined_metadata,
222
+ consecutive_count=consecutive_count,
223
+ description=self.description,
224
+ mentions=self.mentions,
225
+ )
226
+
227
+ def should_alert_no_data(
228
+ self,
229
+ last_point: datetime,
230
+ ) -> tuple[bool, AlertData | None]:
231
+ """Decide whether to fire a no-data alert for *last_point*.
232
+
233
+ Conditions (all must hold):
234
+ 1. ``alert_config.no_data_alert`` is true.
235
+ 2. Not currently in alert cooldown for this alert config.
236
+ 3. The latest expected datapoint is missing — there is no row
237
+ in ``_dtk_datapoints`` for *last_point* OR the row's value
238
+ is NULL/NaN. ``get_value_at`` returns ``None`` for both.
239
+
240
+ ``min_detectors`` and ``consecutive_anomalies`` deliberately do
241
+ not apply here: missing data is a single binary metric-level
242
+ signal, not a per-detector vote.
243
+ """
244
+ if not self.alert_config or not getattr(self.alert_config, "no_data_alert", False):
245
+ return False, None
246
+ if not self.internal:
247
+ return False, None
248
+
249
+ if self._is_in_cooldown():
250
+ return False, None
251
+
252
+ value = self.internal.get_value_at(self.metric_name, last_point)
253
+ if value is not None and not (isinstance(value, float) and math.isnan(value)):
254
+ return False, None
255
+
256
+ return True, self._build_no_data_alert_data(last_point)
257
+
258
+ def _build_no_data_alert_data(self, last_point: datetime) -> AlertData:
259
+ """Construct the AlertData payload for a no-data alert."""
260
+ return AlertData(
261
+ metric_name=self.metric_name,
262
+ timestamp=np.datetime64(last_point, "ms"),
263
+ timezone=self.timezone_display,
264
+ value=None,
265
+ confidence_lower=None,
266
+ confidence_upper=None,
267
+ detector_name="no_data",
268
+ detector_params="",
269
+ direction="none",
270
+ severity=0.0,
271
+ detection_metadata={"reason": "no_data"},
272
+ consecutive_count=0,
273
+ is_no_data=True,
274
+ description=self.description,
275
+ mentions=self.mentions,
276
+ )
277
+
278
+ def get_last_complete_point(self, now: datetime | None = None) -> datetime:
279
+ """Floor ``now`` to the previous fully completed interval boundary."""
280
+ if now is None:
281
+ now = now_utc()
282
+ now = to_aware_utc(now)
283
+
284
+ interval_seconds = self.interval.seconds
285
+ floored = (int(now.timestamp()) // interval_seconds) * interval_seconds
286
+ last_complete = floored - interval_seconds
287
+ return datetime.fromtimestamp(last_complete, tz=timezone.utc)
@@ -58,6 +58,13 @@ class _DispatchMixin(_OrchestratorBase):
58
58
  results: dict[str, bool] = {}
59
59
  for channel in channels:
60
60
  channel_name = channel.__class__.__name__
61
+ # Two channels of the same type must not collapse into one
62
+ # result entry (that would undercount sends).
63
+ if channel_name in results:
64
+ suffix = 2
65
+ while f"{channel_name}#{suffix}" in results:
66
+ suffix += 1
67
+ channel_name = f"{channel_name}#{suffix}"
61
68
  try:
62
69
  results[channel_name] = bool(channel.send(alert_data, template))
63
70
  except Exception as exc:
@@ -4,14 +4,11 @@ from __future__ import annotations
4
4
 
5
5
  from datetime import datetime
6
6
 
7
- import numpy as np
8
-
9
7
  from detectkit.alerting.channels.base import AlertData
10
8
  from detectkit.alerting.orchestrator._base import _OrchestratorBase
11
9
  from detectkit.alerting.orchestrator._types import (
12
10
  DetectionRecord,
13
- _direction_from_metadata,
14
- _parse_detection_metadata,
11
+ hydrate_detection_records,
15
12
  )
16
13
 
17
14
 
@@ -71,27 +68,7 @@ class _RecoveryMixin(_OrchestratorBase):
71
68
  # No fresh detections at all → assume recovery.
72
69
  return True
73
70
 
74
- records: list[DetectionRecord] = []
75
- for det in recent_detections:
76
- metadata_list = det.get("detection_metadata_list") or [None] * len(det["detector_ids"])
77
- for i in range(len(det["detector_ids"])):
78
- is_anomaly = det["is_anomaly_flags"][i]
79
- metadata = _parse_detection_metadata(metadata_list[i])
80
- records.append(
81
- DetectionRecord(
82
- timestamp=np.datetime64(det["timestamp"]),
83
- detector_name=det["detector_names"][i],
84
- detector_id=det["detector_ids"][i],
85
- detector_params=det["detector_params_list"][i],
86
- value=det["value"],
87
- is_anomaly=is_anomaly,
88
- confidence_lower=det["confidence_lowers"][i],
89
- confidence_upper=det["confidence_uppers"][i],
90
- direction=_direction_from_metadata(metadata, is_anomaly),
91
- severity=0.0, # not used for the recovery check
92
- detection_metadata=metadata,
93
- )
94
- )
71
+ records = hydrate_detection_records(recent_detections)
95
72
 
96
73
  detections_by_time = self._group_by_timestamp(records)
97
74
  timestamps_sorted = sorted(detections_by_time.keys(), reverse=True)
@@ -114,7 +91,14 @@ class _RecoveryMixin(_OrchestratorBase):
114
91
  return len(blocking) == 0
115
92
 
116
93
  def _get_alert_trigger_direction(self, last_alert_timestamp: datetime) -> str | None:
117
- """Return the direction of the anomaly that triggered the last alert."""
94
+ """Return the direction of the anomaly that triggered the last alert.
95
+
96
+ Mirrors the quorum logic that fired the alert (``_quorum_at`` with
97
+ no locked direction) so recovery checks the SAME direction the
98
+ alert was raised for — not whichever anomalous detector happens to
99
+ sort first. Falls back to a simple majority when the quorum can no
100
+ longer be reconstructed.
101
+ """
118
102
  if not self.internal:
119
103
  return None
120
104
 
@@ -126,14 +110,27 @@ class _RecoveryMixin(_OrchestratorBase):
126
110
  if not trigger_detections:
127
111
  return None
128
112
 
129
- det = trigger_detections[0]
130
- metadata_list = det.get("detection_metadata_list") or [None] * len(det["detector_ids"])
131
- for i in range(len(det["detector_ids"])):
132
- if not det["is_anomaly_flags"][i]:
133
- continue
134
- direction = _direction_from_metadata(metadata_list[i], True)
135
- if direction in ("up", "down"):
136
- return direction
113
+ records = hydrate_detection_records(trigger_detections)
114
+ by_time = self._group_by_timestamp(records)
115
+ if not by_time:
116
+ return None
117
+ latest_ts = max(by_time.keys())
118
+ anomalies = [d for d in by_time[latest_ts] if d.is_anomaly]
119
+ if not anomalies:
120
+ return None
121
+
122
+ # _quorum_at lives in _DecisionMixin; both mixins are combined in
123
+ # AlertOrchestrator, so the call resolves at runtime.
124
+ _, direction = self._quorum_at(anomalies, None)
125
+ if direction in ("up", "down"):
126
+ return direction
127
+
128
+ ups = sum(1 for d in anomalies if d.direction == "up")
129
+ downs = sum(1 for d in anomalies if d.direction == "down")
130
+ if ups > downs:
131
+ return "up"
132
+ if downs > ups:
133
+ return "down"
137
134
  return None
138
135
 
139
136
  def _build_recovery_data(
@@ -3,10 +3,12 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from dataclasses import dataclass
6
+ from datetime import datetime
6
7
  from typing import Any
7
8
 
8
9
  import numpy as np
9
10
 
11
+ from detectkit.utils.datetime_utils import to_naive_utc
10
12
  from detectkit.utils.json_utils import json_loads
11
13
 
12
14
 
@@ -58,11 +60,15 @@ def _direction_from_metadata(metadata: Any, is_anomaly: bool) -> str:
58
60
 
59
61
  @dataclass
60
62
  class AlertConditions:
61
- """Conditions that turn a sequence of detections into an alert."""
63
+ """Conditions that turn a sequence of detections into an alert.
64
+
65
+ Defaults mirror :class:`detectkit.config.metric_config.AlertConfig`
66
+ so direct API users get the same behavior as YAML users.
67
+ """
62
68
 
63
69
  min_detectors: int = 1
64
- direction: str = "any" # "any", "same", "up", "down"
65
- consecutive_anomalies: int = 1
70
+ direction: str = "same" # "any", "same", "up", "down"
71
+ consecutive_anomalies: int = 3
66
72
 
67
73
 
68
74
  @dataclass
@@ -80,3 +86,45 @@ class DetectionRecord:
80
86
  direction: str # "up", "down", "none"
81
87
  severity: float
82
88
  detection_metadata: dict
89
+
90
+
91
+ def hydrate_detection_records(rows: list[dict]) -> list[DetectionRecord]:
92
+ """Build :class:`DetectionRecord` rows from ``get_recent_detections`` output.
93
+
94
+ Emits one record *per detector per timestamp* (the orchestrator counts
95
+ records to evaluate ``min_detectors``). Input rows are timestamp-DESC as
96
+ returned by SQL; output is oldest→newest. Timestamps are normalized to
97
+ ``datetime64[ms]`` so grid-adjacency arithmetic is well-defined.
98
+ """
99
+ records: list[DetectionRecord] = []
100
+ for row in reversed(rows):
101
+ raw_ts = row["timestamp"]
102
+ if isinstance(raw_ts, datetime):
103
+ raw_ts = to_naive_utc(raw_ts)
104
+ timestamp = np.datetime64(raw_ts, "ms")
105
+ metadata_list = row.get("detection_metadata_list") or [None] * len(row["detector_ids"])
106
+ for i in range(len(row["detector_ids"])):
107
+ is_anomaly = bool(row["is_anomaly_flags"][i])
108
+ metadata = _parse_detection_metadata(metadata_list[i])
109
+ try:
110
+ severity = float(metadata.get("severity", 0.0) or 0.0)
111
+ except (TypeError, ValueError):
112
+ severity = 0.0
113
+
114
+ records.append(
115
+ DetectionRecord(
116
+ timestamp=timestamp,
117
+ detector_name=row["detector_names"][i],
118
+ detector_id=row["detector_ids"][i],
119
+ detector_params=row["detector_params_list"][i],
120
+ value=row["value"],
121
+ is_anomaly=is_anomaly,
122
+ confidence_lower=row["confidence_lowers"][i],
123
+ confidence_upper=row["confidence_uppers"][i],
124
+ direction=_direction_from_metadata(metadata, is_anomaly),
125
+ severity=severity,
126
+ detection_metadata=metadata,
127
+ )
128
+ )
129
+
130
+ return records