detectkit 0.16.4__tar.gz → 0.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.16.4/detectkit.egg-info → detectkit-0.18.0}/PKG-INFO +1 -1
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/__init__.py +1 -1
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/base.py +118 -26
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/email.py +27 -31
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/telegram.py +26 -14
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/webhook.py +29 -26
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_base.py +9 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_decision.py +49 -3
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_recovery.py +79 -2
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/alerting.md +42 -25
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/detectors.md +2 -1
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/test_alert.py +10 -1
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/_windowed.py +15 -3
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/utils/datetime_utils.py +36 -0
- {detectkit-0.16.4 → detectkit-0.18.0/detectkit.egg-info}/PKG-INFO +1 -1
- {detectkit-0.16.4 → detectkit-0.18.0}/LICENSE +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/MANIFEST.in +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/README.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/branding.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/_types.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/_output.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/CLAUDE.section.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/cli.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/metrics.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/overview.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/rules/project.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/skills/dtk-feedback/SKILL.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/skills/dtk-new-metric/SKILL.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/assets/claude/skills/dtk-setup-project/SKILL.md +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/clean.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/init.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/init_claude.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/run.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/commands/unlock.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/cli/main.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/config/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/config/profile.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/config/project_config.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/config/validator.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/core/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/core/interval.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/core/models.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/_sql_manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_alert_states.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_base.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_detections.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_maintenance.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_metrics.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_schema.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/_tasks.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/internal_tables/manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/mysql_manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/postgres_manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/database/tables.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/base.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/seasonality.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/iqr.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/mad.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/detectors/statistical/zscore.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/loaders/metric_loader.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/error_dispatch.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/_base.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/_types.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/orchestration/task_manager/manager.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/utils/__init__.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/utils/env_interpolation.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/utils/json_utils.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit/utils/stats.py +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit.egg-info/SOURCES.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/pyproject.toml +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/requirements.txt +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/setup.cfg +0 -0
- {detectkit-0.16.4 → detectkit-0.18.0}/setup.py +0 -0
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.16.4"
|
|
7
|
+
__version__ = "0.18.0"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -92,6 +92,18 @@ class AlertData:
|
|
|
92
92
|
direction_policy: str | None = None
|
|
93
93
|
consecutive_required: int | None = None
|
|
94
94
|
detector_count: int = 1
|
|
95
|
+
# Incident timing — answers "how long has this been going on". The metric
|
|
96
|
+
# ``interval_seconds`` lets the message express the streak in wall-clock
|
|
97
|
+
# time; ``onset_timestamp`` is the first timestamp of the current anomalous
|
|
98
|
+
# run (anomaly) / the just-ended incident (recovery); ``streak_capped`` is
|
|
99
|
+
# True when the run is at least as long as the orchestrator's lookback
|
|
100
|
+
# window, so the duration is rendered as a lower bound ("over …"). The
|
|
101
|
+
# consecutive streak length itself rides on ``consecutive_count`` (the true
|
|
102
|
+
# run length, resolved at fire time). All default to None/False so
|
|
103
|
+
# direct-API callers and non-anomaly alerts render unchanged.
|
|
104
|
+
interval_seconds: int | None = None
|
|
105
|
+
onset_timestamp: Any | None = None
|
|
106
|
+
streak_capped: bool = False
|
|
95
107
|
|
|
96
108
|
|
|
97
109
|
class BaseAlertChannel(ABC):
|
|
@@ -165,8 +177,18 @@ class BaseAlertChannel(ABC):
|
|
|
165
177
|
- {direction} — observed/locked direction of the anomaly
|
|
166
178
|
- {direction_policy} — configured direction rule ("same"/"any"/...)
|
|
167
179
|
- {min_detectors} — configured quorum threshold (the rule)
|
|
168
|
-
- {consecutive_count} —
|
|
180
|
+
- {consecutive_count} — true consecutive streak length (resolved at
|
|
181
|
+
fire time, not capped at the rule's threshold)
|
|
169
182
|
- {consecutive_required} — configured consecutive threshold (rule)
|
|
183
|
+
- {interval_display} — metric interval as a string (e.g. "10min")
|
|
184
|
+
- {duration_display} — how long the streak/incident lasted
|
|
185
|
+
(e.g. "2h 30m"; "over …" when it predates the lookback window)
|
|
186
|
+
- {onset_display} / {started_display} — first timestamp of the run
|
|
187
|
+
({started_display} adds "or earlier" when the run is capped)
|
|
188
|
+
- {anomaly_lead} / {recovery_lead} — the ready-made plain-language
|
|
189
|
+
lead sentence ("Anomalous for …" / "… Incident lasted …")
|
|
190
|
+
- {window_line} — "Started: … | Latest/Cleared: …" (or a single
|
|
191
|
+
"Detected at: …" line when the onset is unknown)
|
|
170
192
|
- {severity}
|
|
171
193
|
- {status}
|
|
172
194
|
|
|
@@ -228,22 +250,27 @@ class BaseAlertChannel(ABC):
|
|
|
228
250
|
"""
|
|
229
251
|
import math
|
|
230
252
|
from datetime import datetime
|
|
253
|
+
from zoneinfo import ZoneInfo
|
|
231
254
|
|
|
232
255
|
import numpy as np
|
|
233
256
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
257
|
+
def _fmt_ts(value: Any) -> str:
|
|
258
|
+
"""Format a timestamp the same way for the main point and the onset:
|
|
259
|
+
naive UTC → target timezone, with a ``(tz)`` suffix when set."""
|
|
260
|
+
if value is None:
|
|
261
|
+
return ""
|
|
262
|
+
t = value
|
|
263
|
+
if isinstance(t, np.datetime64):
|
|
264
|
+
t = t.astype("datetime64[ms]").astype(datetime)
|
|
265
|
+
if not isinstance(t, datetime):
|
|
266
|
+
return str(t)
|
|
267
|
+
if alert_data.timezone:
|
|
268
|
+
t = t.replace(tzinfo=ZoneInfo("UTC")).astimezone(ZoneInfo(alert_data.timezone))
|
|
269
|
+
return f"{t.strftime('%Y-%m-%d %H:%M:%S')} ({alert_data.timezone})"
|
|
270
|
+
return t.strftime("%Y-%m-%d %H:%M:%S")
|
|
271
|
+
|
|
272
|
+
ts_str = _fmt_ts(alert_data.timestamp)
|
|
273
|
+
onset_str = _fmt_ts(alert_data.onset_timestamp)
|
|
247
274
|
|
|
248
275
|
# Format confidence interval
|
|
249
276
|
if alert_data.confidence_lower is not None and alert_data.confidence_upper is not None:
|
|
@@ -287,6 +314,66 @@ class BaseAlertChannel(ABC):
|
|
|
287
314
|
)
|
|
288
315
|
direction_policy = alert_data.direction_policy or alert_data.direction
|
|
289
316
|
|
|
317
|
+
# Incident timing — the "how long has this been going on" story shared by
|
|
318
|
+
# every channel. ``consecutive_count`` carries the *true* streak length
|
|
319
|
+
# (resolved at fire time); together with the metric interval it becomes a
|
|
320
|
+
# wall-clock duration and a plain-language lead. ``streak_capped`` means
|
|
321
|
+
# the run is at least as long as the orchestrator's lookback window, so
|
|
322
|
+
# the duration/started values render as lower bounds. Degrades cleanly to
|
|
323
|
+
# the legacy "Latest X/Y consecutive points met the quorum." lead when no
|
|
324
|
+
# interval is wired in (direct-API callers).
|
|
325
|
+
from detectkit.core.interval import Interval
|
|
326
|
+
from detectkit.utils.datetime_utils import format_duration
|
|
327
|
+
|
|
328
|
+
interval_seconds = alert_data.interval_seconds
|
|
329
|
+
streak = alert_data.consecutive_count or 0
|
|
330
|
+
capped = alert_data.streak_capped
|
|
331
|
+
interval_display = str(Interval(interval_seconds)) if interval_seconds else ""
|
|
332
|
+
|
|
333
|
+
if interval_seconds and streak >= 1:
|
|
334
|
+
duration_display = format_duration(streak * interval_seconds)
|
|
335
|
+
if capped:
|
|
336
|
+
duration_display = f"over {duration_display}"
|
|
337
|
+
streak_display = f"{streak}+" if capped else f"{streak}"
|
|
338
|
+
started_display = f"{onset_str} or earlier" if (capped and onset_str) else onset_str
|
|
339
|
+
intervals_word = "interval" if streak == 1 else "intervals"
|
|
340
|
+
anomaly_lead = (
|
|
341
|
+
f"Anomalous for {duration_display} — "
|
|
342
|
+
f"{streak_display} consecutive {interval_display} {intervals_word}."
|
|
343
|
+
)
|
|
344
|
+
recovery_lead = (
|
|
345
|
+
"The alert condition no longer holds — the metric is back within "
|
|
346
|
+
f"expected bounds. Incident lasted {duration_display} "
|
|
347
|
+
f"({streak_display} consecutive {interval_display} {intervals_word})."
|
|
348
|
+
)
|
|
349
|
+
else:
|
|
350
|
+
duration_display = ""
|
|
351
|
+
streak_display = f"{streak}" if streak else ""
|
|
352
|
+
started_display = onset_str
|
|
353
|
+
anomaly_lead = (
|
|
354
|
+
f"Latest {alert_data.consecutive_count}/{consecutive_required} "
|
|
355
|
+
"consecutive points met the quorum."
|
|
356
|
+
)
|
|
357
|
+
recovery_lead = (
|
|
358
|
+
"The alert condition no longer holds — the metric is back within "
|
|
359
|
+
"expected bounds."
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
# Kind-aware "window" line for the plain-text templates: the anomalous
|
|
363
|
+
# span (onset → latest/cleared) when known, else the single point.
|
|
364
|
+
kind = self.status_kind(alert_data)
|
|
365
|
+
if started_display and kind == "anomaly":
|
|
366
|
+
window_line = f"Started: {started_display} | Latest: {ts_str}\n"
|
|
367
|
+
elif started_display and kind == "recovery":
|
|
368
|
+
window_line = f"Started: {started_display} | Cleared: {ts_str}\n"
|
|
369
|
+
else:
|
|
370
|
+
window_label = {
|
|
371
|
+
"recovery": "Cleared at",
|
|
372
|
+
"no_data": "Expected at",
|
|
373
|
+
"error": "Detected at",
|
|
374
|
+
}.get(kind, "Detected at")
|
|
375
|
+
window_line = f"{window_label}: {ts_str}\n"
|
|
376
|
+
|
|
290
377
|
# Display-safe value: stays usable even when value is None/NaN (no-data).
|
|
291
378
|
raw_value = alert_data.value
|
|
292
379
|
if raw_value is None or (isinstance(raw_value, float) and math.isnan(raw_value)):
|
|
@@ -350,6 +437,15 @@ class BaseAlertChannel(ABC):
|
|
|
350
437
|
"severity": alert_data.severity,
|
|
351
438
|
"consecutive_count": alert_data.consecutive_count,
|
|
352
439
|
"consecutive_required": consecutive_required,
|
|
440
|
+
"interval_display": interval_display,
|
|
441
|
+
"duration_display": duration_display,
|
|
442
|
+
"streak_display": streak_display,
|
|
443
|
+
"streak_capped": capped,
|
|
444
|
+
"onset_display": onset_str,
|
|
445
|
+
"started_display": started_display,
|
|
446
|
+
"anomaly_lead": anomaly_lead,
|
|
447
|
+
"recovery_lead": recovery_lead,
|
|
448
|
+
"window_line": window_line,
|
|
353
449
|
"status": status,
|
|
354
450
|
"error_type": alert_data.error_type or "",
|
|
355
451
|
"error_message": alert_data.error_message or "",
|
|
@@ -472,16 +568,14 @@ class BaseAlertChannel(ABC):
|
|
|
472
568
|
return (
|
|
473
569
|
"🔴 {project_name_prefix}Alert: {metric_name}\n"
|
|
474
570
|
"{description_line}"
|
|
475
|
-
"
|
|
476
|
-
"direction {direction} (policy {direction_policy}) · "
|
|
477
|
-
"consecutive {consecutive_count}/{consecutive_required}\n"
|
|
571
|
+
"{anomaly_lead}\n"
|
|
478
572
|
"Rule: min_detectors={min_detectors} · "
|
|
479
573
|
"direction={direction_policy} · consecutive={consecutive_required}\n"
|
|
480
574
|
"\n"
|
|
481
|
-
"
|
|
482
|
-
"
|
|
483
|
-
"
|
|
484
|
-
"
|
|
575
|
+
"Value: {value_display} | Expected: {expected_range}\n"
|
|
576
|
+
"Quorum: {detector_count}/{min_detectors} · {direction}\n"
|
|
577
|
+
"Severity: {severity:.2f}\n"
|
|
578
|
+
"{window_line}"
|
|
485
579
|
"Detectors: {detector_name}\n"
|
|
486
580
|
"Parameters: {detector_params}\n"
|
|
487
581
|
"{dashboard_line}"
|
|
@@ -499,14 +593,12 @@ class BaseAlertChannel(ABC):
|
|
|
499
593
|
return (
|
|
500
594
|
"🟢 {project_name_prefix}Alert cleared: {metric_name}\n"
|
|
501
595
|
"{description_line}"
|
|
502
|
-
"
|
|
503
|
-
"the metric is back within expected bounds.\n"
|
|
596
|
+
"{recovery_lead}\n"
|
|
504
597
|
"Rule: min_detectors={min_detectors} · "
|
|
505
598
|
"direction={direction_policy} · consecutive={consecutive_required}\n"
|
|
506
599
|
"\n"
|
|
507
|
-
"
|
|
508
|
-
"
|
|
509
|
-
"· Value: {value_display} | Expected: {expected_range}\n"
|
|
600
|
+
"Value: {value_display} | Expected: {expected_range}\n"
|
|
601
|
+
"{window_line}"
|
|
510
602
|
"Detectors: {detector_name}\n"
|
|
511
603
|
"{dashboard_line}"
|
|
512
604
|
"{help_line}"
|
|
@@ -315,42 +315,38 @@ class EmailChannel(BaseAlertChannel):
|
|
|
315
315
|
parts.append(self._lead_html(ctx["description"]))
|
|
316
316
|
|
|
317
317
|
if kind == "anomaly":
|
|
318
|
+
# Description (how long it's been going on) leads; the Rule chip sits
|
|
319
|
+
# right above the stat grid it explains.
|
|
320
|
+
parts.append(self._lead_html(ctx["anomaly_lead"]))
|
|
318
321
|
parts.append(self._rule_html(ctx))
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
)
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
("Detected at", ctx["timestamp"]),
|
|
332
|
-
]
|
|
333
|
-
)
|
|
334
|
-
)
|
|
322
|
+
stats = [
|
|
323
|
+
("Value", ctx["value_display"]),
|
|
324
|
+
("Expected", ctx["expected_range"]),
|
|
325
|
+
("Severity", f"{alert_data.severity:.2f}"),
|
|
326
|
+
("Quorum", f"{ctx['detector_count']}/{ctx['min_detectors']} · {ctx['direction']}"),
|
|
327
|
+
]
|
|
328
|
+
if ctx["started_display"]:
|
|
329
|
+
stats.append(("Started", ctx["started_display"]))
|
|
330
|
+
stats.append(("Latest", ctx["timestamp"]))
|
|
331
|
+
else:
|
|
332
|
+
stats.append(("Detected at", ctx["timestamp"]))
|
|
333
|
+
parts.append(self._stat_grid(stats))
|
|
335
334
|
if ctx["detector_params"]:
|
|
336
335
|
parts.append(self._params_html(ctx["detector_name"], ctx["detector_params"]))
|
|
337
336
|
elif kind == "recovery":
|
|
338
|
-
|
|
339
|
-
"The alert condition no longer holds — the metric is back within "
|
|
340
|
-
"expected bounds."
|
|
341
|
-
)
|
|
342
|
-
parts.append(self._lead_html(lead))
|
|
337
|
+
parts.append(self._lead_html(ctx["recovery_lead"]))
|
|
343
338
|
parts.append(self._rule_html(ctx))
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
)
|
|
353
|
-
)
|
|
339
|
+
stats = [
|
|
340
|
+
("Value", ctx["value_display"]),
|
|
341
|
+
("Expected", ctx["expected_range"]),
|
|
342
|
+
]
|
|
343
|
+
if ctx["started_display"]:
|
|
344
|
+
stats.append(("Started", ctx["started_display"]))
|
|
345
|
+
stats.append(("Cleared", ctx["timestamp"]))
|
|
346
|
+
else:
|
|
347
|
+
stats.append(("Cleared at", ctx["timestamp"]))
|
|
348
|
+
stats.append(("Detector", ctx["detector_name"]))
|
|
349
|
+
parts.append(self._stat_grid(stats))
|
|
354
350
|
elif kind == "no_data":
|
|
355
351
|
lead = "Query returned no datapoint for the latest expected interval."
|
|
356
352
|
parts.append(self._lead_html(lead))
|
|
@@ -24,9 +24,10 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
24
24
|
|
|
25
25
|
Sends formatted messages to a Telegram chat using a bot token. The default
|
|
26
26
|
(no custom ``template``) message is a structured **HTML** layout — a colored
|
|
27
|
-
status dot, a bold headline, the
|
|
28
|
-
|
|
29
|
-
|
|
27
|
+
status dot, a bold headline, the lead (how long the anomaly has been
|
|
28
|
+
running) followed by the rule that fired, then the evidence
|
|
29
|
+
(value / expected / quorum / severity / started → latest / detector /
|
|
30
|
+
params) in ``<code>``, plus an optional "Open dashboard" link and @mentions.
|
|
30
31
|
|
|
31
32
|
HTML is the default ``parse_mode`` because the legacy ``Markdown`` mode
|
|
32
33
|
breaks on the detector params JSON (an unmatched ``_`` in e.g.
|
|
@@ -161,11 +162,9 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
161
162
|
lines.append("") # blank line
|
|
162
163
|
|
|
163
164
|
if kind == "anomaly":
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
f"<b>{ctx['consecutive_count']}/{ctx['consecutive_required']}</b> consecutive"
|
|
168
|
-
)
|
|
165
|
+
# Description (how long it's been going on) leads; the Rule chip sits
|
|
166
|
+
# right above the evidence it explains.
|
|
167
|
+
lines.append(esc(ctx["anomaly_lead"]))
|
|
169
168
|
lines.append(
|
|
170
169
|
f"<b>Rule</b> <code>min_detectors={ctx['min_detectors']} · "
|
|
171
170
|
f"direction={esc(ctx['direction_policy'])} · "
|
|
@@ -176,17 +175,24 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
176
175
|
f"• Value: <code>{esc(ctx['value_display'])}</code> · "
|
|
177
176
|
f"Expected: <code>{esc(ctx['expected_range'])}</code>"
|
|
178
177
|
)
|
|
178
|
+
lines.append(
|
|
179
|
+
f"• Quorum: <code>{ctx['detector_count']}/{ctx['min_detectors']} · "
|
|
180
|
+
f"{esc(ctx['direction'])}</code>"
|
|
181
|
+
)
|
|
179
182
|
lines.append(f"• Severity: <code>{alert_data.severity:.2f}</code>")
|
|
180
|
-
|
|
183
|
+
if ctx["started_display"]:
|
|
184
|
+
lines.append(
|
|
185
|
+
f"• Started: <code>{esc(ctx['started_display'])}</code> · "
|
|
186
|
+
f"Latest: <code>{esc(ctx['timestamp'])}</code>"
|
|
187
|
+
)
|
|
188
|
+
else:
|
|
189
|
+
lines.append(f"• Time: <code>{esc(ctx['timestamp'])}</code>")
|
|
181
190
|
lines.append(f"• Detector: <code>{esc(ctx['detector_name'])}</code>")
|
|
182
191
|
if ctx["detector_params"]:
|
|
183
192
|
params = self._cap(ctx["detector_params"], _PARAMS_CAP)
|
|
184
193
|
lines.append(f"• Parameters: <code>{esc(params)}</code>")
|
|
185
194
|
elif kind == "recovery":
|
|
186
|
-
lines.append(
|
|
187
|
-
"The alert condition no longer holds — the metric is back within "
|
|
188
|
-
"expected bounds."
|
|
189
|
-
)
|
|
195
|
+
lines.append(esc(ctx["recovery_lead"]))
|
|
190
196
|
lines.append(
|
|
191
197
|
f"<b>Rule</b> <code>min_detectors={ctx['min_detectors']} · "
|
|
192
198
|
f"direction={esc(ctx['direction_policy'])} · "
|
|
@@ -197,7 +203,13 @@ class TelegramChannel(BaseAlertChannel):
|
|
|
197
203
|
f"• Value: <code>{esc(ctx['value_display'])}</code> · "
|
|
198
204
|
f"Expected: <code>{esc(ctx['expected_range'])}</code>"
|
|
199
205
|
)
|
|
200
|
-
|
|
206
|
+
if ctx["started_display"]:
|
|
207
|
+
lines.append(
|
|
208
|
+
f"• Started: <code>{esc(ctx['started_display'])}</code> · "
|
|
209
|
+
f"Cleared: <code>{esc(ctx['timestamp'])}</code>"
|
|
210
|
+
)
|
|
211
|
+
else:
|
|
212
|
+
lines.append(f"• Cleared: <code>{esc(ctx['timestamp'])}</code>")
|
|
201
213
|
lines.append(f"• Detector: <code>{esc(ctx['detector_name'])}</code>")
|
|
202
214
|
elif kind == "no_data":
|
|
203
215
|
lines.append("Query returned no datapoint for the latest expected interval.")
|
|
@@ -27,9 +27,11 @@ class WebhookChannel(BaseAlertChannel):
|
|
|
27
27
|
|
|
28
28
|
Rendering: the default (no custom ``template``) payload is a single
|
|
29
29
|
**Slack/Mattermost message attachment** — a colored accent bar, a title,
|
|
30
|
-
a short markdown lead
|
|
31
|
-
|
|
32
|
-
|
|
30
|
+
a short markdown lead (how long the anomaly has been running) with the
|
|
31
|
+
**Rule** chip beneath it, and a compact **fields grid** (Value / Expected /
|
|
32
|
+
Quorum / Severity / Started / Latest — Started / Cleared on recovery — then
|
|
33
|
+
full-width Detectors / Parameters), branded with a ``footer`` +
|
|
34
|
+
``footer_icon``. This renders richly on both
|
|
33
35
|
Slack and Mattermost from one payload. A custom ``template`` degrades to a
|
|
34
36
|
plain text-only attachment (the template is one opaque string that can't be
|
|
35
37
|
sliced into fields), keeping the color, title and branding.
|
|
@@ -267,27 +269,32 @@ class WebhookChannel(BaseAlertChannel):
|
|
|
267
269
|
)
|
|
268
270
|
|
|
269
271
|
if kind == "anomaly":
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
"consecutive points met the quorum."
|
|
274
|
-
)
|
|
272
|
+
# Description (how long it's been going on) leads; the Rule chip sits
|
|
273
|
+
# right above the value/expected fields it explains.
|
|
274
|
+
lead = f"{ctx['anomaly_lead']}\n{rule_chip}"
|
|
275
275
|
short("Value", code(ctx["value_display"]))
|
|
276
276
|
short("Expected", code(ctx["expected_range"]))
|
|
277
277
|
short("Quorum", f"{ctx['detector_count']}/{ctx['min_detectors']} · {ctx['direction']}")
|
|
278
278
|
short("Severity", f"{alert_data.severity:.2f}")
|
|
279
|
-
|
|
279
|
+
# The problematic span: when it started and the latest point in it.
|
|
280
|
+
if ctx["started_display"]:
|
|
281
|
+
short("Started", ctx["started_display"])
|
|
282
|
+
short("Latest", ctx["timestamp"])
|
|
283
|
+
else:
|
|
284
|
+
full("Detected at", ctx["timestamp"])
|
|
280
285
|
full("Detectors", code(ctx["detector_name"]))
|
|
281
286
|
if ctx["detector_params"]:
|
|
282
287
|
full("Parameters", f"```{ctx['detector_params']}```")
|
|
283
288
|
elif kind == "recovery":
|
|
284
|
-
lead =
|
|
285
|
-
"The alert condition no longer holds — the metric is back within "
|
|
286
|
-
f"expected bounds.\n{rule_chip}"
|
|
287
|
-
)
|
|
289
|
+
lead = f"{ctx['recovery_lead']}\n{rule_chip}"
|
|
288
290
|
short("Value", code(ctx["value_display"]))
|
|
289
291
|
short("Expected", code(ctx["expected_range"]))
|
|
290
|
-
|
|
292
|
+
# The incident span: when it started and when it cleared.
|
|
293
|
+
if ctx["started_display"]:
|
|
294
|
+
short("Started", ctx["started_display"])
|
|
295
|
+
short("Cleared", ctx["timestamp"])
|
|
296
|
+
else:
|
|
297
|
+
full("Cleared at", ctx["timestamp"])
|
|
291
298
|
full("Detectors", code(ctx["detector_name"]))
|
|
292
299
|
elif kind == "no_data":
|
|
293
300
|
lead = "Query returned no datapoint for the latest expected interval."
|
|
@@ -383,16 +390,14 @@ class WebhookChannel(BaseAlertChannel):
|
|
|
383
390
|
"""
|
|
384
391
|
return (
|
|
385
392
|
"{description_line}"
|
|
386
|
-
"
|
|
387
|
-
"direction {direction} (policy {direction_policy}) · "
|
|
388
|
-
"consecutive {consecutive_count}/{consecutive_required}\n"
|
|
393
|
+
"{anomaly_lead}\n"
|
|
389
394
|
"Rule: min_detectors={min_detectors} · "
|
|
390
395
|
"direction={direction_policy} · consecutive={consecutive_required}\n"
|
|
391
396
|
"\n"
|
|
392
|
-
"
|
|
393
|
-
"
|
|
394
|
-
"
|
|
395
|
-
"
|
|
397
|
+
"Value: {value_display} | Expected: {expected_range}\n"
|
|
398
|
+
"Quorum: {detector_count}/{min_detectors} · {direction}\n"
|
|
399
|
+
"Severity: {severity:.2f}\n"
|
|
400
|
+
"{window_line}"
|
|
396
401
|
"Detectors: {detector_name}\n"
|
|
397
402
|
"Parameters: {detector_params}\n"
|
|
398
403
|
"{dashboard_line}"
|
|
@@ -406,14 +411,12 @@ class WebhookChannel(BaseAlertChannel):
|
|
|
406
411
|
"""
|
|
407
412
|
return (
|
|
408
413
|
"{description_line}"
|
|
409
|
-
"
|
|
410
|
-
"the metric is back within expected bounds.\n"
|
|
414
|
+
"{recovery_lead}\n"
|
|
411
415
|
"Rule: min_detectors={min_detectors} · "
|
|
412
416
|
"direction={direction_policy} · consecutive={consecutive_required}\n"
|
|
413
417
|
"\n"
|
|
414
|
-
"
|
|
415
|
-
"
|
|
416
|
-
"· Value: {value_display} | Expected: {expected_range}\n"
|
|
418
|
+
"Value: {value_display} | Expected: {expected_range}\n"
|
|
419
|
+
"{window_line}"
|
|
417
420
|
"Detectors: {detector_name}\n"
|
|
418
421
|
"{dashboard_line}"
|
|
419
422
|
"{help_line}"
|
|
@@ -10,6 +10,15 @@ from detectkit.alerting.orchestrator._types import (
|
|
|
10
10
|
)
|
|
11
11
|
from detectkit.core.interval import Interval
|
|
12
12
|
|
|
13
|
+
# How far back the orchestrator looks to reconstruct the *true* length of an
|
|
14
|
+
# anomalous run when an alert fires / clears. The decision itself only needs
|
|
15
|
+
# ``consecutive_anomalies`` points, but the message reports "how long has this
|
|
16
|
+
# been going on", which needs the full streak. Bounded so a metric stuck
|
|
17
|
+
# anomalous for a very long time never loads unboundedly — past this the run is
|
|
18
|
+
# reported as a lower bound ("over …"). Only queried on fire/recovery, never on
|
|
19
|
+
# the hot no-alert path.
|
|
20
|
+
STREAK_LOOKBACK_POINTS = 1000
|
|
21
|
+
|
|
13
22
|
|
|
14
23
|
class _OrchestratorBase:
|
|
15
24
|
def __init__(
|
|
@@ -34,8 +34,8 @@ from datetime import datetime, timezone
|
|
|
34
34
|
import numpy as np
|
|
35
35
|
|
|
36
36
|
from detectkit.alerting.channels.base import AlertData
|
|
37
|
-
from detectkit.alerting.orchestrator._base import _OrchestratorBase
|
|
38
|
-
from detectkit.alerting.orchestrator._types import DetectionRecord
|
|
37
|
+
from detectkit.alerting.orchestrator._base import STREAK_LOOKBACK_POINTS, _OrchestratorBase
|
|
38
|
+
from detectkit.alerting.orchestrator._types import DetectionRecord, hydrate_detection_records
|
|
39
39
|
from detectkit.utils.datetime_utils import now_utc, to_aware_utc
|
|
40
40
|
|
|
41
41
|
|
|
@@ -70,7 +70,46 @@ class _DecisionMixin(_OrchestratorBase):
|
|
|
70
70
|
if not latest_quorum or consecutive < self.conditions.consecutive_anomalies:
|
|
71
71
|
return False, None
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
# The decision is made; now resolve the *true* streak length/onset for
|
|
74
|
+
# the message (the shallow alert window caps ``consecutive`` at the rule
|
|
75
|
+
# threshold, which can't answer "how long has this been going on").
|
|
76
|
+
streak, onset_ts, capped = self._resolve_streak(latest_quorum[0].timestamp)
|
|
77
|
+
return True, self._build_alert_data(latest_quorum, streak, direction, onset_ts, capped)
|
|
78
|
+
|
|
79
|
+
def _resolve_streak(self, latest_ts: np.datetime64) -> tuple[int, np.datetime64, bool]:
|
|
80
|
+
"""Resolve the full anomalous run ending at *latest_ts*.
|
|
81
|
+
|
|
82
|
+
Loads up to :data:`STREAK_LOOKBACK_POINTS` detections and re-walks the
|
|
83
|
+
same direction-aware quorum logic used to fire, so the message can report
|
|
84
|
+
the real onset/duration rather than the shallow alert-window count.
|
|
85
|
+
Returns ``(streak_count, onset_timestamp, capped)`` — ``capped`` is True
|
|
86
|
+
when the run fills the whole lookback window (onset is older than we saw).
|
|
87
|
+
Only runs when an alert actually fires, so the hot no-alert path is
|
|
88
|
+
untouched.
|
|
89
|
+
"""
|
|
90
|
+
step = np.timedelta64(self.interval.seconds, "s")
|
|
91
|
+
if not self.internal:
|
|
92
|
+
# Direct-API path with no DB to walk: report the rule's required
|
|
93
|
+
# length so the message still carries a duration.
|
|
94
|
+
n = max(self.conditions.consecutive_anomalies, 1)
|
|
95
|
+
return n, latest_ts - step * (n - 1), False
|
|
96
|
+
|
|
97
|
+
last_point = latest_ts.astype("datetime64[ms]").astype(datetime)
|
|
98
|
+
rows = self.internal.get_recent_detections(
|
|
99
|
+
metric_name=self.metric_name,
|
|
100
|
+
last_point=last_point,
|
|
101
|
+
num_points=STREAK_LOOKBACK_POINTS,
|
|
102
|
+
)
|
|
103
|
+
records = hydrate_detection_records(rows)
|
|
104
|
+
if not records:
|
|
105
|
+
return 1, latest_ts, False
|
|
106
|
+
|
|
107
|
+
by_time = self._group_by_timestamp(records)
|
|
108
|
+
timestamps_sorted = sorted(by_time.keys(), reverse=True)
|
|
109
|
+
count, _, _ = self._count_consecutive_anomalies(by_time, timestamps_sorted)
|
|
110
|
+
count = max(count, 1)
|
|
111
|
+
capped = count >= STREAK_LOOKBACK_POINTS
|
|
112
|
+
return count, latest_ts - step * (count - 1), capped
|
|
74
113
|
|
|
75
114
|
def _quorum_at(
|
|
76
115
|
self,
|
|
@@ -185,6 +224,8 @@ class _DecisionMixin(_OrchestratorBase):
|
|
|
185
224
|
anomalies: list[DetectionRecord],
|
|
186
225
|
consecutive_count: int,
|
|
187
226
|
direction: str | None,
|
|
227
|
+
onset_timestamp: np.datetime64 | None = None,
|
|
228
|
+
streak_capped: bool = False,
|
|
188
229
|
) -> AlertData:
|
|
189
230
|
primary = self._primary_record(anomalies)
|
|
190
231
|
|
|
@@ -249,6 +290,10 @@ class _DecisionMixin(_OrchestratorBase):
|
|
|
249
290
|
direction_policy=self.conditions.direction,
|
|
250
291
|
consecutive_required=self.conditions.consecutive_anomalies,
|
|
251
292
|
detector_count=len(anomalies),
|
|
293
|
+
# Incident timing for the "how long has this been going on" line.
|
|
294
|
+
interval_seconds=self.interval.seconds,
|
|
295
|
+
onset_timestamp=onset_timestamp,
|
|
296
|
+
streak_capped=streak_capped,
|
|
252
297
|
)
|
|
253
298
|
|
|
254
299
|
def should_alert_no_data(
|
|
@@ -304,6 +349,7 @@ class _DecisionMixin(_OrchestratorBase):
|
|
|
304
349
|
links=self.links,
|
|
305
350
|
project_name=self.project_name,
|
|
306
351
|
help_url=self.help_url,
|
|
352
|
+
interval_seconds=self.interval.seconds,
|
|
307
353
|
)
|
|
308
354
|
|
|
309
355
|
def get_last_complete_point(self, now: datetime | None = None) -> datetime:
|