detectkit 0.16.3__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. {detectkit-0.16.3/detectkit.egg-info → detectkit-0.17.0}/PKG-INFO +2 -1
  2. {detectkit-0.16.3 → detectkit-0.17.0}/README.md +1 -0
  3. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/__init__.py +1 -1
  4. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/base.py +118 -26
  5. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/email.py +27 -31
  6. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/telegram.py +26 -14
  7. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/webhook.py +29 -26
  8. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_base.py +9 -0
  9. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_decision.py +49 -3
  10. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_recovery.py +79 -2
  11. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/alerting.md +42 -25
  12. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/test_alert.py +10 -1
  13. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/utils/datetime_utils.py +36 -0
  14. {detectkit-0.16.3 → detectkit-0.17.0/detectkit.egg-info}/PKG-INFO +2 -1
  15. {detectkit-0.16.3 → detectkit-0.17.0}/LICENSE +0 -0
  16. {detectkit-0.16.3 → detectkit-0.17.0}/MANIFEST.in +0 -0
  17. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/__init__.py +0 -0
  18. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/__init__.py +0 -0
  19. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/branding.py +0 -0
  20. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/factory.py +0 -0
  21. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/mattermost.py +0 -0
  22. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/channels/slack.py +0 -0
  23. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/__init__.py +0 -0
  24. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
  25. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
  26. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/_types.py +0 -0
  27. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/alerting/orchestrator/orchestrator.py +0 -0
  28. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/__init__.py +0 -0
  29. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/_output.py +0 -0
  30. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/CLAUDE.section.md +0 -0
  31. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/cli.md +0 -0
  32. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/detectors.md +0 -0
  33. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/metrics.md +0 -0
  34. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/overview.md +0 -0
  35. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/rules/project.md +0 -0
  36. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/skills/dtk-feedback/SKILL.md +0 -0
  37. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/skills/dtk-new-metric/SKILL.md +0 -0
  38. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/assets/claude/skills/dtk-setup-project/SKILL.md +0 -0
  39. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/__init__.py +0 -0
  40. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/clean.py +0 -0
  41. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/init.py +0 -0
  42. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/init_claude.py +0 -0
  43. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/run.py +0 -0
  44. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/commands/unlock.py +0 -0
  45. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/cli/main.py +0 -0
  46. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/config/__init__.py +0 -0
  47. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/config/metric_config.py +0 -0
  48. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/config/profile.py +0 -0
  49. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/config/project_config.py +0 -0
  50. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/config/validator.py +0 -0
  51. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/core/__init__.py +0 -0
  52. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/core/interval.py +0 -0
  53. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/core/models.py +0 -0
  54. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/__init__.py +0 -0
  55. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/_sql_manager.py +0 -0
  56. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/clickhouse_manager.py +0 -0
  57. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/__init__.py +0 -0
  58. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_alert_states.py +0 -0
  59. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_base.py +0 -0
  60. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
  61. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_detections.py +0 -0
  62. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_maintenance.py +0 -0
  63. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_metrics.py +0 -0
  64. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_schema.py +0 -0
  65. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/_tasks.py +0 -0
  66. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/internal_tables/manager.py +0 -0
  67. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/manager.py +0 -0
  68. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/mysql_manager.py +0 -0
  69. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/postgres_manager.py +0 -0
  70. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/database/tables.py +0 -0
  71. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/__init__.py +0 -0
  72. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/base.py +0 -0
  73. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/factory.py +0 -0
  74. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/seasonality.py +0 -0
  75. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/__init__.py +0 -0
  76. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/_windowed.py +0 -0
  77. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/iqr.py +0 -0
  78. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/mad.py +0 -0
  79. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
  80. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/detectors/statistical/zscore.py +0 -0
  81. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/loaders/__init__.py +0 -0
  82. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/loaders/metric_loader.py +0 -0
  83. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/loaders/query_template.py +0 -0
  84. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/__init__.py +0 -0
  85. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/error_dispatch.py +0 -0
  86. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
  87. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
  88. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/_base.py +0 -0
  89. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
  90. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
  91. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/_types.py +0 -0
  92. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/orchestration/task_manager/manager.py +0 -0
  93. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/utils/__init__.py +0 -0
  94. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/utils/env_interpolation.py +0 -0
  95. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/utils/json_utils.py +0 -0
  96. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit/utils/stats.py +0 -0
  97. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit.egg-info/SOURCES.txt +0 -0
  98. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit.egg-info/dependency_links.txt +0 -0
  99. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit.egg-info/entry_points.txt +0 -0
  100. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit.egg-info/requires.txt +0 -0
  101. {detectkit-0.16.3 → detectkit-0.17.0}/detectkit.egg-info/top_level.txt +0 -0
  102. {detectkit-0.16.3 → detectkit-0.17.0}/pyproject.toml +0 -0
  103. {detectkit-0.16.3 → detectkit-0.17.0}/requirements.txt +0 -0
  104. {detectkit-0.16.3 → detectkit-0.17.0}/setup.cfg +0 -0
  105. {detectkit-0.16.3 → detectkit-0.17.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: detectkit
3
- Version: 0.16.3
3
+ Version: 0.17.0
4
4
  Summary: Metric monitoring with automatic anomaly detection
5
5
  Author: detectkit team
6
6
  License: MIT
@@ -185,6 +185,7 @@ for r in results:
185
185
  - [Configuration Guide](docs/guides/configuration.md) — all config options
186
186
  - [Detectors Guide](docs/guides/detectors.md) — choosing the right detector
187
187
  - [Alerting Guide](docs/guides/alerting.md) — channels, mentions, cooldown, recovery
188
+ - [Reading Alerts](docs/guides/reading-alerts.md) — what a received alert means (for stakeholders)
188
189
  - [CLI Reference](docs/reference/cli.md) — command-line documentation
189
190
  - [Examples](docs/examples/) — real-world monitoring scenarios
190
191
  - [Changelog](CHANGELOG.md) — version history
@@ -117,6 +117,7 @@ for r in results:
117
117
  - [Configuration Guide](docs/guides/configuration.md) — all config options
118
118
  - [Detectors Guide](docs/guides/detectors.md) — choosing the right detector
119
119
  - [Alerting Guide](docs/guides/alerting.md) — channels, mentions, cooldown, recovery
120
+ - [Reading Alerts](docs/guides/reading-alerts.md) — what a received alert means (for stakeholders)
120
121
  - [CLI Reference](docs/reference/cli.md) — command-line documentation
121
122
  - [Examples](docs/examples/) — real-world monitoring scenarios
122
123
  - [Changelog](CHANGELOG.md) — version history
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
4
4
  A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
5
5
  """
6
6
 
7
- __version__ = "0.16.3"
7
+ __version__ = "0.17.0"
8
8
 
9
9
  from detectkit.core.interval import Interval
10
10
  from detectkit.core.models import ColumnDefinition, TableModel
@@ -92,6 +92,18 @@ class AlertData:
92
92
  direction_policy: str | None = None
93
93
  consecutive_required: int | None = None
94
94
  detector_count: int = 1
95
+ # Incident timing — answers "how long has this been going on". The metric
96
+ # ``interval_seconds`` lets the message express the streak in wall-clock
97
+ # time; ``onset_timestamp`` is the first timestamp of the current anomalous
98
+ # run (anomaly) / the just-ended incident (recovery); ``streak_capped`` is
99
+ # True when the run is at least as long as the orchestrator's lookback
100
+ # window, so the duration is rendered as a lower bound ("over …"). The
101
+ # consecutive streak length itself rides on ``consecutive_count`` (the true
102
+ # run length, resolved at fire time). All default to None/False so
103
+ # direct-API callers and non-anomaly alerts render unchanged.
104
+ interval_seconds: int | None = None
105
+ onset_timestamp: Any | None = None
106
+ streak_capped: bool = False
95
107
 
96
108
 
97
109
  class BaseAlertChannel(ABC):
@@ -165,8 +177,18 @@ class BaseAlertChannel(ABC):
165
177
  - {direction} — observed/locked direction of the anomaly
166
178
  - {direction_policy} — configured direction rule ("same"/"any"/...)
167
179
  - {min_detectors} — configured quorum threshold (the rule)
168
- - {consecutive_count} — observed consecutive points
180
+ - {consecutive_count} — true consecutive streak length (resolved at
181
+ fire time, not capped at the rule's threshold)
169
182
  - {consecutive_required} — configured consecutive threshold (rule)
183
+ - {interval_display} — metric interval as a string (e.g. "10min")
184
+ - {duration_display} — how long the streak/incident lasted
185
+ (e.g. "2h 30m"; "over …" when it predates the lookback window)
186
+ - {onset_display} / {started_display} — first timestamp of the run
187
+ ({started_display} adds "or earlier" when the run is capped)
188
+ - {anomaly_lead} / {recovery_lead} — the ready-made plain-language
189
+ lead sentence ("Anomalous for …" / "… Incident lasted …")
190
+ - {window_line} — "Started: … | Latest/Cleared: …" (or a single
191
+ "Detected at: …" line when the onset is unknown)
170
192
  - {severity}
171
193
  - {status}
172
194
 
@@ -228,22 +250,27 @@ class BaseAlertChannel(ABC):
228
250
  """
229
251
  import math
230
252
  from datetime import datetime
253
+ from zoneinfo import ZoneInfo
231
254
 
232
255
  import numpy as np
233
256
 
234
- # Format timestamp to string
235
- ts = alert_data.timestamp
236
- if isinstance(ts, np.datetime64):
237
- ts = ts.astype(datetime)
238
-
239
- # Convert naive UTC timestamp to target timezone if specified
240
- if alert_data.timezone:
241
- from zoneinfo import ZoneInfo
242
-
243
- ts = ts.replace(tzinfo=ZoneInfo("UTC")).astimezone(ZoneInfo(alert_data.timezone))
244
- ts_str = f"{ts.strftime('%Y-%m-%d %H:%M:%S')} ({alert_data.timezone})"
245
- else:
246
- ts_str = ts.strftime("%Y-%m-%d %H:%M:%S")
257
+ def _fmt_ts(value: Any) -> str:
258
+ """Format a timestamp the same way for the main point and the onset:
259
+ naive UTC → target timezone, with a ``(tz)`` suffix when set."""
260
+ if value is None:
261
+ return ""
262
+ t = value
263
+ if isinstance(t, np.datetime64):
264
+ t = t.astype("datetime64[ms]").astype(datetime)
265
+ if not isinstance(t, datetime):
266
+ return str(t)
267
+ if alert_data.timezone:
268
+ t = t.replace(tzinfo=ZoneInfo("UTC")).astimezone(ZoneInfo(alert_data.timezone))
269
+ return f"{t.strftime('%Y-%m-%d %H:%M:%S')} ({alert_data.timezone})"
270
+ return t.strftime("%Y-%m-%d %H:%M:%S")
271
+
272
+ ts_str = _fmt_ts(alert_data.timestamp)
273
+ onset_str = _fmt_ts(alert_data.onset_timestamp)
247
274
 
248
275
  # Format confidence interval
249
276
  if alert_data.confidence_lower is not None and alert_data.confidence_upper is not None:
@@ -287,6 +314,66 @@ class BaseAlertChannel(ABC):
287
314
  )
288
315
  direction_policy = alert_data.direction_policy or alert_data.direction
289
316
 
317
+ # Incident timing — the "how long has this been going on" story shared by
318
+ # every channel. ``consecutive_count`` carries the *true* streak length
319
+ # (resolved at fire time); together with the metric interval it becomes a
320
+ # wall-clock duration and a plain-language lead. ``streak_capped`` means
321
+ # the run is at least as long as the orchestrator's lookback window, so
322
+ # the duration/started values render as lower bounds. Degrades cleanly to
323
+ # the legacy "Latest X/Y consecutive points met the quorum." lead when no
324
+ # interval is wired in (direct-API callers).
325
+ from detectkit.core.interval import Interval
326
+ from detectkit.utils.datetime_utils import format_duration
327
+
328
+ interval_seconds = alert_data.interval_seconds
329
+ streak = alert_data.consecutive_count or 0
330
+ capped = alert_data.streak_capped
331
+ interval_display = str(Interval(interval_seconds)) if interval_seconds else ""
332
+
333
+ if interval_seconds and streak >= 1:
334
+ duration_display = format_duration(streak * interval_seconds)
335
+ if capped:
336
+ duration_display = f"over {duration_display}"
337
+ streak_display = f"{streak}+" if capped else f"{streak}"
338
+ started_display = f"{onset_str} or earlier" if (capped and onset_str) else onset_str
339
+ intervals_word = "interval" if streak == 1 else "intervals"
340
+ anomaly_lead = (
341
+ f"Anomalous for {duration_display} — "
342
+ f"{streak_display} consecutive {interval_display} {intervals_word}."
343
+ )
344
+ recovery_lead = (
345
+ "The alert condition no longer holds — the metric is back within "
346
+ f"expected bounds. Incident lasted {duration_display} "
347
+ f"({streak_display} consecutive {interval_display} {intervals_word})."
348
+ )
349
+ else:
350
+ duration_display = ""
351
+ streak_display = f"{streak}" if streak else ""
352
+ started_display = onset_str
353
+ anomaly_lead = (
354
+ f"Latest {alert_data.consecutive_count}/{consecutive_required} "
355
+ "consecutive points met the quorum."
356
+ )
357
+ recovery_lead = (
358
+ "The alert condition no longer holds — the metric is back within "
359
+ "expected bounds."
360
+ )
361
+
362
+ # Kind-aware "window" line for the plain-text templates: the anomalous
363
+ # span (onset → latest/cleared) when known, else the single point.
364
+ kind = self.status_kind(alert_data)
365
+ if started_display and kind == "anomaly":
366
+ window_line = f"Started: {started_display} | Latest: {ts_str}\n"
367
+ elif started_display and kind == "recovery":
368
+ window_line = f"Started: {started_display} | Cleared: {ts_str}\n"
369
+ else:
370
+ window_label = {
371
+ "recovery": "Cleared at",
372
+ "no_data": "Expected at",
373
+ "error": "Detected at",
374
+ }.get(kind, "Detected at")
375
+ window_line = f"{window_label}: {ts_str}\n"
376
+
290
377
  # Display-safe value: stays usable even when value is None/NaN (no-data).
291
378
  raw_value = alert_data.value
292
379
  if raw_value is None or (isinstance(raw_value, float) and math.isnan(raw_value)):
@@ -350,6 +437,15 @@ class BaseAlertChannel(ABC):
350
437
  "severity": alert_data.severity,
351
438
  "consecutive_count": alert_data.consecutive_count,
352
439
  "consecutive_required": consecutive_required,
440
+ "interval_display": interval_display,
441
+ "duration_display": duration_display,
442
+ "streak_display": streak_display,
443
+ "streak_capped": capped,
444
+ "onset_display": onset_str,
445
+ "started_display": started_display,
446
+ "anomaly_lead": anomaly_lead,
447
+ "recovery_lead": recovery_lead,
448
+ "window_line": window_line,
353
449
  "status": status,
354
450
  "error_type": alert_data.error_type or "",
355
451
  "error_message": alert_data.error_message or "",
@@ -472,16 +568,14 @@ class BaseAlertChannel(ABC):
472
568
  return (
473
569
  "🔴 {project_name_prefix}Alert: {metric_name}\n"
474
570
  "{description_line}"
475
- "Quorum {detector_count}/{min_detectors} · "
476
- "direction {direction} (policy {direction_policy}) · "
477
- "consecutive {consecutive_count}/{consecutive_required}\n"
571
+ "{anomaly_lead}\n"
478
572
  "Rule: min_detectors={min_detectors} · "
479
573
  "direction={direction_policy} · consecutive={consecutive_required}\n"
480
574
  "\n"
481
- "Latest point (evidence):\n"
482
- "· Time: {timestamp}\n"
483
- "· Value: {value_display} | Expected: {expected_range}\n"
484
- "· Severity: {severity:.2f}\n"
575
+ "Value: {value_display} | Expected: {expected_range}\n"
576
+ "Quorum: {detector_count}/{min_detectors} · {direction}\n"
577
+ "Severity: {severity:.2f}\n"
578
+ "{window_line}"
485
579
  "Detectors: {detector_name}\n"
486
580
  "Parameters: {detector_params}\n"
487
581
  "{dashboard_line}"
@@ -499,14 +593,12 @@ class BaseAlertChannel(ABC):
499
593
  return (
500
594
  "🟢 {project_name_prefix}Alert cleared: {metric_name}\n"
501
595
  "{description_line}"
502
- "The alert condition no longer holds — "
503
- "the metric is back within expected bounds.\n"
596
+ "{recovery_lead}\n"
504
597
  "Rule: min_detectors={min_detectors} · "
505
598
  "direction={direction_policy} · consecutive={consecutive_required}\n"
506
599
  "\n"
507
- "Latest point:\n"
508
- "· Time: {timestamp}\n"
509
- "· Value: {value_display} | Expected: {expected_range}\n"
600
+ "Value: {value_display} | Expected: {expected_range}\n"
601
+ "{window_line}"
510
602
  "Detectors: {detector_name}\n"
511
603
  "{dashboard_line}"
512
604
  "{help_line}"
@@ -315,42 +315,38 @@ class EmailChannel(BaseAlertChannel):
315
315
  parts.append(self._lead_html(ctx["description"]))
316
316
 
317
317
  if kind == "anomaly":
318
+ # Description (how long it's been going on) leads; the Rule chip sits
319
+ # right above the stat grid it explains.
320
+ parts.append(self._lead_html(ctx["anomaly_lead"]))
318
321
  parts.append(self._rule_html(ctx))
319
- parts.append(
320
- self._lead_html(
321
- f"Latest {ctx['consecutive_count']}/{ctx['consecutive_required']} "
322
- "consecutive points met the quorum."
323
- )
324
- )
325
- parts.append(
326
- self._stat_grid(
327
- [
328
- ("Value", ctx["value_display"]),
329
- ("Expected", ctx["expected_range"]),
330
- ("Severity", f"{alert_data.severity:.2f}"),
331
- ("Detected at", ctx["timestamp"]),
332
- ]
333
- )
334
- )
322
+ stats = [
323
+ ("Value", ctx["value_display"]),
324
+ ("Expected", ctx["expected_range"]),
325
+ ("Severity", f"{alert_data.severity:.2f}"),
326
+ ("Quorum", f"{ctx['detector_count']}/{ctx['min_detectors']} · {ctx['direction']}"),
327
+ ]
328
+ if ctx["started_display"]:
329
+ stats.append(("Started", ctx["started_display"]))
330
+ stats.append(("Latest", ctx["timestamp"]))
331
+ else:
332
+ stats.append(("Detected at", ctx["timestamp"]))
333
+ parts.append(self._stat_grid(stats))
335
334
  if ctx["detector_params"]:
336
335
  parts.append(self._params_html(ctx["detector_name"], ctx["detector_params"]))
337
336
  elif kind == "recovery":
338
- lead = (
339
- "The alert condition no longer holds — the metric is back within "
340
- "expected bounds."
341
- )
342
- parts.append(self._lead_html(lead))
337
+ parts.append(self._lead_html(ctx["recovery_lead"]))
343
338
  parts.append(self._rule_html(ctx))
344
- parts.append(
345
- self._stat_grid(
346
- [
347
- ("Value", ctx["value_display"]),
348
- ("Expected", ctx["expected_range"]),
349
- ("Detector", ctx["detector_name"]),
350
- ("Cleared at", ctx["timestamp"]),
351
- ]
352
- )
353
- )
339
+ stats = [
340
+ ("Value", ctx["value_display"]),
341
+ ("Expected", ctx["expected_range"]),
342
+ ]
343
+ if ctx["started_display"]:
344
+ stats.append(("Started", ctx["started_display"]))
345
+ stats.append(("Cleared", ctx["timestamp"]))
346
+ else:
347
+ stats.append(("Cleared at", ctx["timestamp"]))
348
+ stats.append(("Detector", ctx["detector_name"]))
349
+ parts.append(self._stat_grid(stats))
354
350
  elif kind == "no_data":
355
351
  lead = "Query returned no datapoint for the latest expected interval."
356
352
  parts.append(self._lead_html(lead))
@@ -24,9 +24,10 @@ class TelegramChannel(BaseAlertChannel):
24
24
 
25
25
  Sends formatted messages to a Telegram chat using a bot token. The default
26
26
  (no custom ``template``) message is a structured **HTML** layout — a colored
27
- status dot, a bold headline, the rule that fired, then the evidence
28
- (value / expected / severity / time / detector / params) in ``<code>``,
29
- plus an optional "Open dashboard" link and @mentions.
27
+ status dot, a bold headline, the lead (how long the anomaly has been
28
+ running) followed by the rule that fired, then the evidence
29
+ (value / expected / quorum / severity / started → latest / detector /
30
+ params) in ``<code>``, plus an optional "Open dashboard" link and @mentions.
30
31
 
31
32
  HTML is the default ``parse_mode`` because the legacy ``Markdown`` mode
32
33
  breaks on the detector params JSON (an unmatched ``_`` in e.g.
@@ -161,11 +162,9 @@ class TelegramChannel(BaseAlertChannel):
161
162
  lines.append("") # blank line
162
163
 
163
164
  if kind == "anomaly":
164
- lines.append(
165
- f"<b>Quorum</b> {ctx['detector_count']}/{ctx['min_detectors']} · "
166
- f"direction <b>{esc(ctx['direction'])}</b> · "
167
- f"<b>{ctx['consecutive_count']}/{ctx['consecutive_required']}</b> consecutive"
168
- )
165
+ # Description (how long it's been going on) leads; the Rule chip sits
166
+ # right above the evidence it explains.
167
+ lines.append(esc(ctx["anomaly_lead"]))
169
168
  lines.append(
170
169
  f"<b>Rule</b> <code>min_detectors={ctx['min_detectors']} · "
171
170
  f"direction={esc(ctx['direction_policy'])} · "
@@ -176,17 +175,24 @@ class TelegramChannel(BaseAlertChannel):
176
175
  f"• Value: <code>{esc(ctx['value_display'])}</code> · "
177
176
  f"Expected: <code>{esc(ctx['expected_range'])}</code>"
178
177
  )
178
+ lines.append(
179
+ f"• Quorum: <code>{ctx['detector_count']}/{ctx['min_detectors']} · "
180
+ f"{esc(ctx['direction'])}</code>"
181
+ )
179
182
  lines.append(f"• Severity: <code>{alert_data.severity:.2f}</code>")
180
- lines.append(f"• Time: <code>{esc(ctx['timestamp'])}</code>")
183
+ if ctx["started_display"]:
184
+ lines.append(
185
+ f"• Started: <code>{esc(ctx['started_display'])}</code> · "
186
+ f"Latest: <code>{esc(ctx['timestamp'])}</code>"
187
+ )
188
+ else:
189
+ lines.append(f"• Time: <code>{esc(ctx['timestamp'])}</code>")
181
190
  lines.append(f"• Detector: <code>{esc(ctx['detector_name'])}</code>")
182
191
  if ctx["detector_params"]:
183
192
  params = self._cap(ctx["detector_params"], _PARAMS_CAP)
184
193
  lines.append(f"• Parameters: <code>{esc(params)}</code>")
185
194
  elif kind == "recovery":
186
- lines.append(
187
- "The alert condition no longer holds — the metric is back within "
188
- "expected bounds."
189
- )
195
+ lines.append(esc(ctx["recovery_lead"]))
190
196
  lines.append(
191
197
  f"<b>Rule</b> <code>min_detectors={ctx['min_detectors']} · "
192
198
  f"direction={esc(ctx['direction_policy'])} · "
@@ -197,7 +203,13 @@ class TelegramChannel(BaseAlertChannel):
197
203
  f"• Value: <code>{esc(ctx['value_display'])}</code> · "
198
204
  f"Expected: <code>{esc(ctx['expected_range'])}</code>"
199
205
  )
200
- lines.append(f"• Time: <code>{esc(ctx['timestamp'])}</code>")
206
+ if ctx["started_display"]:
207
+ lines.append(
208
+ f"• Started: <code>{esc(ctx['started_display'])}</code> · "
209
+ f"Cleared: <code>{esc(ctx['timestamp'])}</code>"
210
+ )
211
+ else:
212
+ lines.append(f"• Cleared: <code>{esc(ctx['timestamp'])}</code>")
201
213
  lines.append(f"• Detector: <code>{esc(ctx['detector_name'])}</code>")
202
214
  elif kind == "no_data":
203
215
  lines.append("Query returned no datapoint for the latest expected interval.")
@@ -27,9 +27,11 @@ class WebhookChannel(BaseAlertChannel):
27
27
 
28
28
  Rendering: the default (no custom ``template``) payload is a single
29
29
  **Slack/Mattermost message attachment** — a colored accent bar, a title,
30
- a short markdown lead, and a compact **fields grid** (Value / Expected /
31
- Quorum / Severity, then full-width Detected-at / Detectors / Parameters),
32
- branded with a ``footer`` + ``footer_icon``. This renders richly on both
30
+ a short markdown lead (how long the anomaly has been running) with the
31
+ **Rule** chip beneath it, and a compact **fields grid** (Value / Expected /
32
+ Quorum / Severity / Started / Latest (Started / Cleared on recovery) — then
33
+ full-width Detectors / Parameters), branded with a ``footer`` +
34
+ ``footer_icon``. This renders richly on both
33
35
  Slack and Mattermost from one payload. A custom ``template`` degrades to a
34
36
  plain text-only attachment (the template is one opaque string that can't be
35
37
  sliced into fields), keeping the color, title and branding.
@@ -267,27 +269,32 @@ class WebhookChannel(BaseAlertChannel):
267
269
  )
268
270
 
269
271
  if kind == "anomaly":
270
- lead = (
271
- f"{rule_chip}\n"
272
- f"Latest {ctx['consecutive_count']}/{ctx['consecutive_required']} "
273
- "consecutive points met the quorum."
274
- )
272
+ # Description (how long it's been going on) leads; the Rule chip sits
273
+ # right above the value/expected fields it explains.
274
+ lead = f"{ctx['anomaly_lead']}\n{rule_chip}"
275
275
  short("Value", code(ctx["value_display"]))
276
276
  short("Expected", code(ctx["expected_range"]))
277
277
  short("Quorum", f"{ctx['detector_count']}/{ctx['min_detectors']} · {ctx['direction']}")
278
278
  short("Severity", f"{alert_data.severity:.2f}")
279
- full("Detected at", ctx["timestamp"])
279
+ # The problematic span: when it started and the latest point in it.
280
+ if ctx["started_display"]:
281
+ short("Started", ctx["started_display"])
282
+ short("Latest", ctx["timestamp"])
283
+ else:
284
+ full("Detected at", ctx["timestamp"])
280
285
  full("Detectors", code(ctx["detector_name"]))
281
286
  if ctx["detector_params"]:
282
287
  full("Parameters", f"```{ctx['detector_params']}```")
283
288
  elif kind == "recovery":
284
- lead = (
285
- "The alert condition no longer holds — the metric is back within "
286
- f"expected bounds.\n{rule_chip}"
287
- )
289
+ lead = f"{ctx['recovery_lead']}\n{rule_chip}"
288
290
  short("Value", code(ctx["value_display"]))
289
291
  short("Expected", code(ctx["expected_range"]))
290
- full("Detected at", ctx["timestamp"])
292
+ # The incident span: when it started and when it cleared.
293
+ if ctx["started_display"]:
294
+ short("Started", ctx["started_display"])
295
+ short("Cleared", ctx["timestamp"])
296
+ else:
297
+ full("Cleared at", ctx["timestamp"])
291
298
  full("Detectors", code(ctx["detector_name"]))
292
299
  elif kind == "no_data":
293
300
  lead = "Query returned no datapoint for the latest expected interval."
@@ -383,16 +390,14 @@ class WebhookChannel(BaseAlertChannel):
383
390
  """
384
391
  return (
385
392
  "{description_line}"
386
- "Quorum {detector_count}/{min_detectors} · "
387
- "direction {direction} (policy {direction_policy}) · "
388
- "consecutive {consecutive_count}/{consecutive_required}\n"
393
+ "{anomaly_lead}\n"
389
394
  "Rule: min_detectors={min_detectors} · "
390
395
  "direction={direction_policy} · consecutive={consecutive_required}\n"
391
396
  "\n"
392
- "Latest point (evidence):\n"
393
- "· Time: {timestamp}\n"
394
- "· Value: {value_display} | Expected: {expected_range}\n"
395
- "· Severity: {severity:.2f}\n"
397
+ "Value: {value_display} | Expected: {expected_range}\n"
398
+ "Quorum: {detector_count}/{min_detectors} · {direction}\n"
399
+ "Severity: {severity:.2f}\n"
400
+ "{window_line}"
396
401
  "Detectors: {detector_name}\n"
397
402
  "Parameters: {detector_params}\n"
398
403
  "{dashboard_line}"
@@ -406,14 +411,12 @@ class WebhookChannel(BaseAlertChannel):
406
411
  """
407
412
  return (
408
413
  "{description_line}"
409
- "The alert condition no longer holds — "
410
- "the metric is back within expected bounds.\n"
414
+ "{recovery_lead}\n"
411
415
  "Rule: min_detectors={min_detectors} · "
412
416
  "direction={direction_policy} · consecutive={consecutive_required}\n"
413
417
  "\n"
414
- "Latest point:\n"
415
- "· Time: {timestamp}\n"
416
- "· Value: {value_display} | Expected: {expected_range}\n"
418
+ "Value: {value_display} | Expected: {expected_range}\n"
419
+ "{window_line}"
417
420
  "Detectors: {detector_name}\n"
418
421
  "{dashboard_line}"
419
422
  "{help_line}"
@@ -10,6 +10,15 @@ from detectkit.alerting.orchestrator._types import (
10
10
  )
11
11
  from detectkit.core.interval import Interval
12
12
 
13
+ # How far back the orchestrator looks to reconstruct the *true* length of an
14
+ # anomalous run when an alert fires / clears. The decision itself only needs
15
+ # ``consecutive_anomalies`` points, but the message reports "how long has this
16
+ # been going on", which needs the full streak. Bounded so a metric stuck
17
+ # anomalous for a very long time never loads unboundedly — past this the run is
18
+ # reported as a lower bound ("over …"). Only queried on fire/recovery, never on
19
+ # the hot no-alert path.
20
+ STREAK_LOOKBACK_POINTS = 1000
21
+
13
22
 
14
23
  class _OrchestratorBase:
15
24
  def __init__(
@@ -34,8 +34,8 @@ from datetime import datetime, timezone
34
34
  import numpy as np
35
35
 
36
36
  from detectkit.alerting.channels.base import AlertData
37
- from detectkit.alerting.orchestrator._base import _OrchestratorBase
38
- from detectkit.alerting.orchestrator._types import DetectionRecord
37
+ from detectkit.alerting.orchestrator._base import STREAK_LOOKBACK_POINTS, _OrchestratorBase
38
+ from detectkit.alerting.orchestrator._types import DetectionRecord, hydrate_detection_records
39
39
  from detectkit.utils.datetime_utils import now_utc, to_aware_utc
40
40
 
41
41
 
@@ -70,7 +70,46 @@ class _DecisionMixin(_OrchestratorBase):
70
70
  if not latest_quorum or consecutive < self.conditions.consecutive_anomalies:
71
71
  return False, None
72
72
 
73
- return True, self._build_alert_data(latest_quorum, consecutive, direction)
73
+ # The decision is made; now resolve the *true* streak length/onset for
74
+ # the message (the shallow alert window caps ``consecutive`` at the rule
75
+ # threshold, which can't answer "how long has this been going on").
76
+ streak, onset_ts, capped = self._resolve_streak(latest_quorum[0].timestamp)
77
+ return True, self._build_alert_data(latest_quorum, streak, direction, onset_ts, capped)
78
+
79
+ def _resolve_streak(self, latest_ts: np.datetime64) -> tuple[int, np.datetime64, bool]:
80
+ """Resolve the full anomalous run ending at *latest_ts*.
81
+
82
+ Loads up to :data:`STREAK_LOOKBACK_POINTS` detections and re-walks the
83
+ same direction-aware quorum logic used to fire, so the message can report
84
+ the real onset/duration rather than the shallow alert-window count.
85
+ Returns ``(streak_count, onset_timestamp, capped)`` — ``capped`` is True
86
+ when the run fills the whole lookback window (onset is older than we saw).
87
+ Only runs when an alert actually fires, so the hot no-alert path is
88
+ untouched.
89
+ """
90
+ step = np.timedelta64(self.interval.seconds, "s")
91
+ if not self.internal:
92
+ # Direct-API path with no DB to walk: report the rule's required
93
+ # length so the message still carries a duration.
94
+ n = max(self.conditions.consecutive_anomalies, 1)
95
+ return n, latest_ts - step * (n - 1), False
96
+
97
+ last_point = latest_ts.astype("datetime64[ms]").astype(datetime)
98
+ rows = self.internal.get_recent_detections(
99
+ metric_name=self.metric_name,
100
+ last_point=last_point,
101
+ num_points=STREAK_LOOKBACK_POINTS,
102
+ )
103
+ records = hydrate_detection_records(rows)
104
+ if not records:
105
+ return 1, latest_ts, False
106
+
107
+ by_time = self._group_by_timestamp(records)
108
+ timestamps_sorted = sorted(by_time.keys(), reverse=True)
109
+ count, _, _ = self._count_consecutive_anomalies(by_time, timestamps_sorted)
110
+ count = max(count, 1)
111
+ capped = count >= STREAK_LOOKBACK_POINTS
112
+ return count, latest_ts - step * (count - 1), capped
74
113
 
75
114
  def _quorum_at(
76
115
  self,
@@ -185,6 +224,8 @@ class _DecisionMixin(_OrchestratorBase):
185
224
  anomalies: list[DetectionRecord],
186
225
  consecutive_count: int,
187
226
  direction: str | None,
227
+ onset_timestamp: np.datetime64 | None = None,
228
+ streak_capped: bool = False,
188
229
  ) -> AlertData:
189
230
  primary = self._primary_record(anomalies)
190
231
 
@@ -249,6 +290,10 @@ class _DecisionMixin(_OrchestratorBase):
249
290
  direction_policy=self.conditions.direction,
250
291
  consecutive_required=self.conditions.consecutive_anomalies,
251
292
  detector_count=len(anomalies),
293
+ # Incident timing for the "how long has this been going on" line.
294
+ interval_seconds=self.interval.seconds,
295
+ onset_timestamp=onset_timestamp,
296
+ streak_capped=streak_capped,
252
297
  )
253
298
 
254
299
  def should_alert_no_data(
@@ -304,6 +349,7 @@ class _DecisionMixin(_OrchestratorBase):
304
349
  links=self.links,
305
350
  project_name=self.project_name,
306
351
  help_url=self.help_url,
352
+ interval_seconds=self.interval.seconds,
307
353
  )
308
354
 
309
355
  def get_last_complete_point(self, now: datetime | None = None) -> datetime: