detectkit 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.5.0/detectkit.egg-info → detectkit-0.5.2}/PKG-INFO +3 -2
- {detectkit-0.5.0 → detectkit-0.5.2}/README.md +2 -1
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/__init__.py +1 -1
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/commands/run.py +16 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/commands/test_alert.py +9 -3
- detectkit-0.5.2/detectkit/orchestration/error_dispatch.py +151 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/manager.py +12 -80
- {detectkit-0.5.0 → detectkit-0.5.2/detectkit.egg-info}/PKG-INFO +3 -2
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit.egg-info/SOURCES.txt +1 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/pyproject.toml +1 -1
- {detectkit-0.5.0 → detectkit-0.5.2}/LICENSE +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/MANIFEST.in +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/email.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/telegram.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_base.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_decision.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_recovery.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/_types.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/commands/init.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/cli/main.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/config/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/config/profile.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/config/project_config.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/config/validator.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/core/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/core/interval.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/core/models.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_alert_states.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_base.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_datapoints.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_detections.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_metrics.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_schema.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/_tasks.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/internal_tables/manager.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/manager.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/database/tables.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/base.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/seasonality.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/statistical/iqr.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/statistical/mad.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/detectors/statistical/zscore.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/loaders/metric_loader.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/_base.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/_load_step.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/orchestration/task_manager/_types.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/utils/__init__.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/utils/datetime_utils.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/utils/env_interpolation.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/utils/json_utils.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit/utils/stats.py +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/requirements.txt +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/setup.cfg +0 -0
- {detectkit-0.5.0 → detectkit-0.5.2}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -79,7 +79,8 @@ Dynamic: license-file
|
|
|
79
79
|
- **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
|
|
80
80
|
- **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
|
|
81
81
|
- **@mentions** — tag users/groups in alerts, each channel formats natively
|
|
82
|
-
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications
|
|
82
|
+
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
|
|
83
|
+
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
83
84
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
84
85
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
85
86
|
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
- **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
|
|
14
14
|
- **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
|
|
15
15
|
- **@mentions** — tag users/groups in alerts, each channel formats natively
|
|
16
|
-
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications
|
|
16
|
+
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
|
|
17
|
+
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
17
18
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
18
19
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
19
20
|
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.5.0"
|
|
7
|
+
__version__ = "0.5.2"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -14,6 +14,7 @@ from detectkit.config.profile import ProfilesConfig
|
|
|
14
14
|
from detectkit.config.project_config import ProjectConfig
|
|
15
15
|
from detectkit.config.validator import validate_metric_uniqueness
|
|
16
16
|
from detectkit.database.internal_tables import InternalTablesManager
|
|
17
|
+
from detectkit.orchestration.error_dispatch import dispatch_project_error_alert
|
|
17
18
|
from detectkit.orchestration.task_manager import PipelineStep, TaskManager
|
|
18
19
|
|
|
19
20
|
|
|
@@ -159,6 +160,15 @@ def run_command(
|
|
|
159
160
|
bold=True,
|
|
160
161
|
)
|
|
161
162
|
)
|
|
163
|
+
# Profiles are loaded → channels can be resolved → fire the
|
|
164
|
+
# project-level error alert before bailing. Otherwise a dead DB
|
|
165
|
+
# silently kills the entire run with no notification.
|
|
166
|
+
dispatch_project_error_alert(
|
|
167
|
+
profiles_config=profiles_config,
|
|
168
|
+
project_config=project_config,
|
|
169
|
+
metric_name="<startup>",
|
|
170
|
+
exc=e,
|
|
171
|
+
)
|
|
162
172
|
return
|
|
163
173
|
|
|
164
174
|
# Create internal tables manager
|
|
@@ -175,6 +185,12 @@ def run_command(
|
|
|
175
185
|
bold=True,
|
|
176
186
|
)
|
|
177
187
|
)
|
|
188
|
+
dispatch_project_error_alert(
|
|
189
|
+
profiles_config=profiles_config,
|
|
190
|
+
project_config=project_config,
|
|
191
|
+
metric_name="<startup>",
|
|
192
|
+
exc=e,
|
|
193
|
+
)
|
|
178
194
|
return
|
|
179
195
|
|
|
180
196
|
# Create task manager
|
|
@@ -20,6 +20,7 @@ from detectkit.utils.datetime_utils import now_utc
|
|
|
20
20
|
|
|
21
21
|
def create_mock_alert_data(
|
|
22
22
|
metric_config: MetricConfig,
|
|
23
|
+
alerting_config,
|
|
23
24
|
timezone_display: str = "UTC",
|
|
24
25
|
) -> AlertData:
|
|
25
26
|
"""
|
|
@@ -27,6 +28,10 @@ def create_mock_alert_data(
|
|
|
27
28
|
|
|
28
29
|
Args:
|
|
29
30
|
metric_config: Metric configuration
|
|
31
|
+
alerting_config: Single ``AlertingConfig`` from
|
|
32
|
+
``metric_config.alerting`` to source mentions/timezone from.
|
|
33
|
+
``metric_config.alerting`` is a list — the test command
|
|
34
|
+
iterates it and passes one entry at a time.
|
|
30
35
|
timezone_display: Timezone for display
|
|
31
36
|
|
|
32
37
|
Returns:
|
|
@@ -35,8 +40,9 @@ def create_mock_alert_data(
|
|
|
35
40
|
# Use current time
|
|
36
41
|
now = now_utc()
|
|
37
42
|
|
|
38
|
-
#
|
|
39
|
-
|
|
43
|
+
# Mentions are per-AlertingConfig (different alert routes can mention
|
|
44
|
+
# different teams). Pull them from the specific config we're testing.
|
|
45
|
+
mentions = list(alerting_config.mentions) if alerting_config else []
|
|
40
46
|
|
|
41
47
|
# Create realistic mock data
|
|
42
48
|
return AlertData(
|
|
@@ -147,7 +153,7 @@ def run_test_alert(metric_name: str, profile: str | None = None):
|
|
|
147
153
|
print(f" Timezone: {timezone_display}")
|
|
148
154
|
print(f" Channels: {', '.join(alerting_config.channels)}\n")
|
|
149
155
|
|
|
150
|
-
alert_data = create_mock_alert_data(metric_config, timezone_display)
|
|
156
|
+
alert_data = create_mock_alert_data(metric_config, alerting_config, timezone_display)
|
|
151
157
|
|
|
152
158
|
success_count = 0
|
|
153
159
|
for channel_name in alerting_config.channels:
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Project-level error alert dispatch shared by ``TaskManager`` and the CLI.
|
|
2
|
+
|
|
3
|
+
Why this lives here, not on ``TaskManager``:
|
|
4
|
+
|
|
5
|
+
The task manager only sees errors that happen during ``run_metric``. Three
|
|
6
|
+
classes of failures crash earlier — at the CLI level, before a TaskManager
|
|
7
|
+
exists at all:
|
|
8
|
+
|
|
9
|
+
1. ``profiles_config = ProfilesConfig.from_yaml(...)`` (no profile, can't
|
|
10
|
+
build channels — out of scope for this dispatcher)
|
|
11
|
+
2. ``db_manager = profiles_config.create_manager(profile)`` (DB unreachable;
|
|
12
|
+
profiles ARE loaded — channels can be built and we should alert)
|
|
13
|
+
3. ``internal_manager.ensure_tables()`` (DB reachable but DDL fails)
|
|
14
|
+
|
|
15
|
+
For (2) and (3) the operator needs the same project-level error alert as for
|
|
16
|
+
runtime failures. Extracting the dispatch into a free function lets the CLI
|
|
17
|
+
call it directly without needing a TaskManager.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import click
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
from detectkit.alerting.channels.base import AlertData
|
|
28
|
+
from detectkit.alerting.channels.factory import AlertChannelFactory
|
|
29
|
+
from detectkit.utils.datetime_utils import now_utc_naive
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def dispatch_project_error_alert(
|
|
33
|
+
*,
|
|
34
|
+
profiles_config: Any,
|
|
35
|
+
project_config: Any,
|
|
36
|
+
metric_name: str,
|
|
37
|
+
exc: BaseException,
|
|
38
|
+
) -> bool:
|
|
39
|
+
"""Send a project-level error alert based on ``project_config.error_alerting``.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
profiles_config: Loaded ``ProfilesConfig`` (needed to resolve channel
|
|
43
|
+
names → channel instances). ``None`` short-circuits the dispatch.
|
|
44
|
+
project_config: Loaded ``ProjectConfig``. Reads ``error_alerting``.
|
|
45
|
+
metric_name: A string identifier for the failure context. Use the
|
|
46
|
+
real metric name when failing inside a metric run, or a
|
|
47
|
+
placeholder like ``"<startup>"`` for early failures.
|
|
48
|
+
exc: The exception that triggered the alert. Its type name and
|
|
49
|
+
``str(exc)`` are passed to the channel template as
|
|
50
|
+
``{error_type}`` and ``{error_message}``.
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
``True`` when the alert was actually attempted (caller should treat
|
|
54
|
+
this as "abort the rest of the run"). ``False`` when alerting is
|
|
55
|
+
disabled, has no channels configured, no profiles to resolve them
|
|
56
|
+
against, or the dispatch itself raised.
|
|
57
|
+
"""
|
|
58
|
+
cfg = getattr(project_config, "error_alerting", None)
|
|
59
|
+
if not cfg or not cfg.enabled or not cfg.channels:
|
|
60
|
+
return False
|
|
61
|
+
if profiles_config is None:
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
channels = _build_channels(profiles_config, cfg.channels)
|
|
66
|
+
if not channels:
|
|
67
|
+
click.echo(
|
|
68
|
+
click.style(
|
|
69
|
+
" │ Project error_alerting enabled but no valid channels "
|
|
70
|
+
"resolved — skipping.",
|
|
71
|
+
fg="yellow",
|
|
72
|
+
),
|
|
73
|
+
err=True,
|
|
74
|
+
)
|
|
75
|
+
return False
|
|
76
|
+
|
|
77
|
+
alert_data = AlertData(
|
|
78
|
+
metric_name=metric_name,
|
|
79
|
+
timestamp=np.datetime64(now_utc_naive(), "ms"),
|
|
80
|
+
timezone=cfg.timezone or "UTC",
|
|
81
|
+
value=None,
|
|
82
|
+
confidence_lower=None,
|
|
83
|
+
confidence_upper=None,
|
|
84
|
+
detector_name="pipeline",
|
|
85
|
+
detector_params="",
|
|
86
|
+
direction="none",
|
|
87
|
+
severity=0.0,
|
|
88
|
+
detection_metadata={"reason": "pipeline_error"},
|
|
89
|
+
consecutive_count=0,
|
|
90
|
+
is_error=True,
|
|
91
|
+
error_type=type(exc).__name__,
|
|
92
|
+
error_message=str(exc),
|
|
93
|
+
description=None,
|
|
94
|
+
mentions=cfg.mentions,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
click.echo(
|
|
98
|
+
click.style(
|
|
99
|
+
f" │ ⚠ Project error alert → sending to {len(channels)} channel(s)...",
|
|
100
|
+
fg="yellow",
|
|
101
|
+
bold=True,
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
for channel in channels:
|
|
105
|
+
channel_name = channel.__class__.__name__
|
|
106
|
+
try:
|
|
107
|
+
ok = bool(channel.send(alert_data, template=cfg.template))
|
|
108
|
+
mark = click.style("✓", fg="green") if ok else click.style("✗", fg="red")
|
|
109
|
+
click.echo(f" │ {mark} {channel_name}")
|
|
110
|
+
except Exception as channel_exc:
|
|
111
|
+
click.echo(
|
|
112
|
+
click.style(
|
|
113
|
+
f" │ ✗ {channel_name}: " f"{type(channel_exc).__name__}: {channel_exc}",
|
|
114
|
+
fg="red",
|
|
115
|
+
),
|
|
116
|
+
err=True,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
return True
|
|
120
|
+
except Exception as dispatch_exc:
|
|
121
|
+
# Never let alert dispatch crash the caller — they're already
|
|
122
|
+
# handling another error and need to surface it cleanly.
|
|
123
|
+
click.echo(
|
|
124
|
+
click.style(
|
|
125
|
+
f" │ Failed to dispatch project error alert: "
|
|
126
|
+
f"{type(dispatch_exc).__name__}: {dispatch_exc}",
|
|
127
|
+
fg="red",
|
|
128
|
+
),
|
|
129
|
+
err=True,
|
|
130
|
+
)
|
|
131
|
+
return False
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _build_channels(profiles_config: Any, channel_names: list[str]) -> list:
|
|
135
|
+
"""Resolve channel names against the loaded profiles config.
|
|
136
|
+
|
|
137
|
+
Mirrors ``_TaskManagerBase._create_alert_channels`` but lives outside
|
|
138
|
+
the TaskManager so the CLI early-failure paths can call it before a
|
|
139
|
+
TaskManager exists.
|
|
140
|
+
"""
|
|
141
|
+
channels = []
|
|
142
|
+
for name in channel_names:
|
|
143
|
+
try:
|
|
144
|
+
channel_config = profiles_config.get_alert_channel_config(name)
|
|
145
|
+
channels.append(AlertChannelFactory.create_from_config(channel_config))
|
|
146
|
+
except (ValueError, KeyError, ImportError, TypeError) as exc:
|
|
147
|
+
# Config-level problems (missing channel, bad type, missing
|
|
148
|
+
# driver, wrong constructor args) — skip this channel but
|
|
149
|
+
# keep going so a single typo doesn't kill the whole alert.
|
|
150
|
+
print(f"Warning: Failed to create channel '{name}': {type(exc).__name__}: {exc}")
|
|
151
|
+
return channels
|
|
@@ -7,15 +7,13 @@ from datetime import datetime
|
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
9
|
import click
|
|
10
|
-
import numpy as np
|
|
11
10
|
|
|
12
|
-
from detectkit.alerting.channels.base import AlertData
|
|
13
11
|
from detectkit.config.metric_config import MetricConfig
|
|
12
|
+
from detectkit.orchestration.error_dispatch import dispatch_project_error_alert
|
|
14
13
|
from detectkit.orchestration.task_manager._alert_step import _AlertStepMixin
|
|
15
14
|
from detectkit.orchestration.task_manager._detect_step import _DetectStepMixin
|
|
16
15
|
from detectkit.orchestration.task_manager._load_step import _LoadStepMixin
|
|
17
16
|
from detectkit.orchestration.task_manager._types import PipelineStep, TaskStatus
|
|
18
|
-
from detectkit.utils.datetime_utils import now_utc_naive
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
@@ -138,78 +136,23 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
|
138
136
|
|
|
139
137
|
Returns ``True`` when an alert was actually attempted (meaning the
|
|
140
138
|
caller should abort the rest of the run). ``False`` when alerting
|
|
141
|
-
is disabled
|
|
142
|
-
|
|
139
|
+
is disabled — in that case the run continues normally. Within one
|
|
140
|
+
TaskManager instance the alert fires at most once; subsequent
|
|
141
|
+
failures still return ``True`` so the CLI keeps aborting.
|
|
143
142
|
"""
|
|
144
143
|
cfg = getattr(self.project_config, "error_alerting", None)
|
|
145
144
|
if not cfg or not cfg.enabled:
|
|
146
145
|
return False
|
|
147
146
|
if self._error_alert_sent_in_run:
|
|
148
|
-
# Already alerted in this run — suppress and abort.
|
|
149
147
|
return True
|
|
150
|
-
if not cfg.channels:
|
|
151
|
-
return False
|
|
152
|
-
|
|
153
|
-
try:
|
|
154
|
-
channels = self._create_alert_channels(cfg.channels)
|
|
155
|
-
if not channels:
|
|
156
|
-
click.echo(
|
|
157
|
-
click.style(
|
|
158
|
-
" │ Project error_alerting enabled but no valid "
|
|
159
|
-
"channels resolved — skipping.",
|
|
160
|
-
fg="yellow",
|
|
161
|
-
),
|
|
162
|
-
err=True,
|
|
163
|
-
)
|
|
164
|
-
return False
|
|
165
|
-
|
|
166
|
-
alert_data = AlertData(
|
|
167
|
-
metric_name=metric_name,
|
|
168
|
-
timestamp=np.datetime64(now_utc_naive(), "ms"),
|
|
169
|
-
timezone=cfg.timezone or "UTC",
|
|
170
|
-
value=None,
|
|
171
|
-
confidence_lower=None,
|
|
172
|
-
confidence_upper=None,
|
|
173
|
-
detector_name="pipeline",
|
|
174
|
-
detector_params="",
|
|
175
|
-
direction="none",
|
|
176
|
-
severity=0.0,
|
|
177
|
-
detection_metadata={"reason": "pipeline_error"},
|
|
178
|
-
consecutive_count=0,
|
|
179
|
-
is_error=True,
|
|
180
|
-
error_type=type(exc).__name__,
|
|
181
|
-
error_message=str(exc),
|
|
182
|
-
description=None,
|
|
183
|
-
mentions=cfg.mentions,
|
|
184
|
-
)
|
|
185
|
-
|
|
186
|
-
click.echo(
|
|
187
|
-
click.style(
|
|
188
|
-
f" │ ⚠ Project error alert → sending to " f"{len(channels)} channel(s)...",
|
|
189
|
-
fg="yellow",
|
|
190
|
-
bold=True,
|
|
191
|
-
)
|
|
192
|
-
)
|
|
193
|
-
sent = 0
|
|
194
|
-
for channel in channels:
|
|
195
|
-
channel_name = channel.__class__.__name__
|
|
196
|
-
try:
|
|
197
|
-
if channel.send(alert_data, template=cfg.template):
|
|
198
|
-
sent += 1
|
|
199
|
-
mark = click.style("✓", fg="green")
|
|
200
|
-
else:
|
|
201
|
-
mark = click.style("✗", fg="red")
|
|
202
|
-
click.echo(f" │ {mark} {channel_name}")
|
|
203
|
-
except Exception as channel_exc:
|
|
204
|
-
click.echo(
|
|
205
|
-
click.style(
|
|
206
|
-
f" │ ✗ {channel_name}: "
|
|
207
|
-
f"{type(channel_exc).__name__}: {channel_exc}",
|
|
208
|
-
fg="red",
|
|
209
|
-
),
|
|
210
|
-
err=True,
|
|
211
|
-
)
|
|
212
148
|
|
|
149
|
+
sent = dispatch_project_error_alert(
|
|
150
|
+
profiles_config=self.profiles_config,
|
|
151
|
+
project_config=self.project_config,
|
|
152
|
+
metric_name=metric_name,
|
|
153
|
+
exc=exc,
|
|
154
|
+
)
|
|
155
|
+
if sent:
|
|
213
156
|
click.echo(
|
|
214
157
|
click.style(
|
|
215
158
|
" │ Aborting remaining metrics for this run.",
|
|
@@ -217,18 +160,7 @@ class TaskManager(_LoadStepMixin, _DetectStepMixin, _AlertStepMixin):
|
|
|
217
160
|
)
|
|
218
161
|
)
|
|
219
162
|
self._error_alert_sent_in_run = True
|
|
220
|
-
|
|
221
|
-
except Exception as dispatch_exc:
|
|
222
|
-
# Never let alert dispatch crash the run.
|
|
223
|
-
click.echo(
|
|
224
|
-
click.style(
|
|
225
|
-
f" │ Failed to dispatch project error alert: "
|
|
226
|
-
f"{type(dispatch_exc).__name__}: {dispatch_exc}",
|
|
227
|
-
fg="red",
|
|
228
|
-
),
|
|
229
|
-
err=True,
|
|
230
|
-
)
|
|
231
|
-
return False
|
|
163
|
+
return sent
|
|
232
164
|
|
|
233
165
|
def __repr__(self) -> str:
|
|
234
166
|
return f"TaskManager(db={self.db_manager.__class__.__name__})"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: detectkit
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: Metric monitoring with automatic anomaly detection
|
|
5
5
|
Author: detectkit team
|
|
6
6
|
License: MIT
|
|
@@ -79,7 +79,8 @@ Dynamic: license-file
|
|
|
79
79
|
- **Statistical detectors** — Z-Score, MAD, IQR, Manual Bounds
|
|
80
80
|
- **Multi-channel alerting** — Mattermost, Slack, Telegram, Email, Webhook
|
|
81
81
|
- **@mentions** — tag users/groups in alerts, each channel formats natively
|
|
82
|
-
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications
|
|
82
|
+
- **Alert lifecycle** — consecutive anomalies, cooldown, recovery notifications, no-data alerts
|
|
83
|
+
- **Project-level error alerts** — catch DB outages and pipeline crashes once per run
|
|
83
84
|
- **Database agnostic** — ClickHouse, PostgreSQL, MySQL
|
|
84
85
|
- **Idempotent** — resume from interruptions, no duplicate processing
|
|
85
86
|
- **CLI** — `dtk init`, `dtk run --select`, tag-based selectors
|
|
@@ -68,6 +68,7 @@ detectkit/loaders/__init__.py
|
|
|
68
68
|
detectkit/loaders/metric_loader.py
|
|
69
69
|
detectkit/loaders/query_template.py
|
|
70
70
|
detectkit/orchestration/__init__.py
|
|
71
|
+
detectkit/orchestration/error_dispatch.py
|
|
71
72
|
detectkit/orchestration/task_manager/__init__.py
|
|
72
73
|
detectkit/orchestration/task_manager/_alert_step.py
|
|
73
74
|
detectkit/orchestration/task_manager/_base.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|