detectkit 0.27.0__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.27.0 → detectkit-0.29.0}/MANIFEST.in +1 -0
- {detectkit-0.27.0/detectkit.egg-info → detectkit-0.29.0}/PKG-INFO +1 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/__init__.py +1 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/__init__.py +2 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_recovery.py +23 -15
- detectkit-0.29.0/detectkit/alerting/orchestrator/_replay.py +258 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/orchestrator.py +4 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/crossval.py +24 -3
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/grid_search.py +27 -7
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/html_labeler.py +13 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/label_server.py +8 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/labels.py +38 -2
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/settings.py +3 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/window_select.py +30 -11
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/autotune.md +14 -6
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/cli.md +12 -2
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/autotune.py +57 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/run.py +77 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/main.py +28 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/config/metric_config.py +15 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_detections.py +62 -0
- detectkit-0.29.0/detectkit/reporting/__init__.py +18 -0
- detectkit-0.29.0/detectkit/reporting/assets/report.js +62 -0
- detectkit-0.29.0/detectkit/reporting/builder.py +267 -0
- detectkit-0.29.0/detectkit/reporting/html_report.py +79 -0
- {detectkit-0.27.0 → detectkit-0.29.0/detectkit.egg-info}/PKG-INFO +1 -1
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit.egg-info/SOURCES.txt +5 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/pyproject.toml +3 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/LICENSE +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/README.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/branding.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/email.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/telegram.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_cooldown.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_decision.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_dispatch.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/alerting/orchestrator/_types.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/_base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/_types.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/autotuner.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/config_emitter.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/detector_select.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/distribution.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/result.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/scoring.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/autotune/seasonality_search.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/_output.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/CLAUDE.section.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/alerting.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/detectors.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/metrics.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/overview.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/rules/project.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/skills/dtk-autotune/SKILL.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/skills/dtk-feedback/SKILL.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/skills/dtk-new-metric/SKILL.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/assets/claude/skills/dtk-setup-project/SKILL.md +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/clean.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/init.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/init_claude.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/test_alert.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/cli/commands/unlock.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/config/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/config/profile.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/config/project_config.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/config/validator.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/core/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/core/interval.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/core/models.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/_sql_manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/clickhouse_manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_alert_states.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_autotune_runs.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_datapoints.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_maintenance.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_metrics.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_schema.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/_tasks.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/internal_tables/manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/mysql_manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/postgres_manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/database/tables.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/seasonality.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/_windowed.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/iqr.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/mad.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/detectors/statistical/zscore.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/loaders/metric_loader.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/error_dispatch.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/_alert_step.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/_base.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/_detect_step.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/_load_step.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/_types.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/orchestration/task_manager/manager.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/utils/__init__.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/utils/datetime_utils.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/utils/env_interpolation.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/utils/json_utils.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit/utils/stats.py +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/requirements.txt +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/setup.cfg +0 -0
- {detectkit-0.27.0 → detectkit-0.29.0}/setup.py +0 -0
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.27.0"
|
|
7
|
+
__version__ = "0.29.0"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Public surface of the alert-orchestrator package."""
|
|
2
2
|
|
|
3
|
+
from detectkit.alerting.orchestrator._replay import ReplayedEvent
|
|
3
4
|
from detectkit.alerting.orchestrator._types import (
|
|
4
5
|
AlertConditions,
|
|
5
6
|
DetectionRecord,
|
|
@@ -13,6 +14,7 @@ __all__ = [
|
|
|
13
14
|
"AlertOrchestrator",
|
|
14
15
|
"AlertConditions",
|
|
15
16
|
"DetectionRecord",
|
|
17
|
+
"ReplayedEvent",
|
|
16
18
|
# Shared hydration of DetectionRecord rows from get_recent_detections
|
|
17
19
|
# output (used by TaskManager and the recovery mixin).
|
|
18
20
|
"hydrate_detection_records",
|
|
@@ -139,6 +139,7 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
139
139
|
def _build_recovery_data(
|
|
140
140
|
self,
|
|
141
141
|
detections: list[DetectionRecord],
|
|
142
|
+
incident_records: list[DetectionRecord] | None = None,
|
|
142
143
|
) -> AlertData | None:
|
|
143
144
|
"""Construct the AlertData payload sent as a recovery notification."""
|
|
144
145
|
if not detections:
|
|
@@ -165,7 +166,9 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
165
166
|
|
|
166
167
|
# Reconstruct the just-ended incident so the recovery message can say how
|
|
167
168
|
# long it lasted (symmetric with the anomaly alert's onset/duration).
|
|
168
|
-
incident_count, onset_ts, capped = self._resolve_incident(latest.timestamp)
|
|
169
|
+
incident_count, onset_ts, capped = self._resolve_incident(
|
|
170
|
+
latest.timestamp, records=incident_records
|
|
171
|
+
)
|
|
169
172
|
|
|
170
173
|
return AlertData(
|
|
171
174
|
metric_name=self.metric_name,
|
|
@@ -200,7 +203,9 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
200
203
|
streak_capped=capped,
|
|
201
204
|
)
|
|
202
205
|
|
|
203
|
-
def _resolve_incident(self, cleared_ts: Any) -> tuple[int, Any, bool]:
|
|
206
|
+
def _resolve_incident(
|
|
207
|
+
self, cleared_ts: Any, records: list[DetectionRecord] | None = None
|
|
208
|
+
) -> tuple[int, Any, bool]:
|
|
204
209
|
"""Find the anomalous run that just ended before the recovery point.
|
|
205
210
|
|
|
206
211
|
Walks back from *cleared_ts* (the latest, now-clean point): skips the
|
|
@@ -209,20 +214,23 @@ class _RecoveryMixin(_OrchestratorBase):
|
|
|
209
214
|
capped)`` — ``(0, None, False)`` when no run can be reconstructed, so the
|
|
210
215
|
recovery message just omits the incident duration.
|
|
211
216
|
"""
|
|
212
|
-
if not self.internal:
|
|
213
|
-
return 0, None, False
|
|
214
|
-
|
|
215
217
|
step = np.timedelta64(self.interval.seconds, "s")
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
218
|
+
# ``records`` lets a pure caller (alert replay) supply the in-memory
|
|
219
|
+
# detection slice instead of a DB read; production passes None and the
|
|
220
|
+
# incident is resolved from ``_dtk_detections`` as before.
|
|
221
|
+
if records is None:
|
|
222
|
+
if not self.internal:
|
|
223
|
+
return 0, None, False
|
|
224
|
+
if isinstance(cleared_ts, np.datetime64):
|
|
225
|
+
last_point = cleared_ts.astype("datetime64[ms]").astype(datetime)
|
|
226
|
+
else:
|
|
227
|
+
last_point = cleared_ts
|
|
228
|
+
rows = self.internal.get_recent_detections(
|
|
229
|
+
metric_name=self.metric_name,
|
|
230
|
+
last_point=last_point,
|
|
231
|
+
num_points=STREAK_LOOKBACK_POINTS,
|
|
232
|
+
)
|
|
233
|
+
records = hydrate_detection_records(rows)
|
|
226
234
|
if not records:
|
|
227
235
|
return 0, None, False
|
|
228
236
|
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Pure historical replay of alert/recovery/no-data events.
|
|
2
|
+
|
|
3
|
+
Reconstructs the alert events the orchestrator *would have* produced over a
|
|
4
|
+
historical period from already-persisted detections — **without** any channel
|
|
5
|
+
dispatch, DB state writes or wall-clock. It is the offline counterpart of the
|
|
6
|
+
live ``should_alert`` / ``should_send_recovery`` / ``should_alert_no_data`` path:
|
|
7
|
+
state (last alert / last recovery) is simulated in memory and the decision at
|
|
8
|
+
every grid point is evaluated *causally* (only records with ``timestamp <= t``,
|
|
9
|
+
since the windowed detector is causal), reusing the exact same quorum,
|
|
10
|
+
consecutive-walk, cooldown and recovery arithmetic as the live path.
|
|
11
|
+
|
|
12
|
+
Used to answer "what would these detections have alerted on over this window"
|
|
13
|
+
for backtesting / autotune alert-window sweeps, where firing real channels and
|
|
14
|
+
mutating ``_dtk_alert_states`` would be wrong.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from datetime import datetime, timedelta
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
from detectkit.alerting.channels.base import AlertData
|
|
25
|
+
from detectkit.alerting.orchestrator._base import STREAK_LOOKBACK_POINTS, _OrchestratorBase
|
|
26
|
+
from detectkit.alerting.orchestrator._types import DetectionRecord
|
|
27
|
+
from detectkit.core.interval import Interval
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
|
|
31
|
+
class ReplayedEvent:
|
|
32
|
+
"""One alert event reconstructed by :meth:`_ReplayMixin.replay`.
|
|
33
|
+
|
|
34
|
+
``kind`` is ``"anomaly"``, ``"recovery"`` or ``"no_data"``; ``timestamp`` is
|
|
35
|
+
the grid point at which the event fired (the simulated "now"); ``alert_data``
|
|
36
|
+
is identical in shape to a live :class:`AlertData` (built via the same
|
|
37
|
+
``_build_*`` helpers as the live path).
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
kind: str
|
|
41
|
+
timestamp: np.datetime64
|
|
42
|
+
alert_data: AlertData
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class _ReplayMixin(_OrchestratorBase):
|
|
46
|
+
def replay(
|
|
47
|
+
self,
|
|
48
|
+
detections: list[DetectionRecord],
|
|
49
|
+
value_at: dict[np.datetime64, float | None],
|
|
50
|
+
start: datetime,
|
|
51
|
+
end: datetime,
|
|
52
|
+
) -> list[ReplayedEvent]:
|
|
53
|
+
"""Reconstruct alert/recovery/no-data events over ``[start, end]``.
|
|
54
|
+
|
|
55
|
+
Forward pass over every interval boundary in the closed range
|
|
56
|
+
``[start, end]``. At each grid point ``t`` the decision is evaluated
|
|
57
|
+
causally — only ``detections`` with ``timestamp <= t`` are considered —
|
|
58
|
+
reusing the live quorum / consecutive-walk / cooldown / recovery logic.
|
|
59
|
+
Simulated state (last alert / last recovery) lives in memory, so nothing
|
|
60
|
+
is dispatched and no DB row is written.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
detections: every persisted detection over the period (any order;
|
|
64
|
+
the same per-detector-per-timestamp shape the live path uses).
|
|
65
|
+
value_at: grid ``np.datetime64`` → value, with ``None`` for a
|
|
66
|
+
missing / NaN datapoint (drives the no-data check).
|
|
67
|
+
start: first grid boundary to evaluate (inclusive).
|
|
68
|
+
end: last grid boundary to evaluate (inclusive).
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
The fired events in chronological order.
|
|
72
|
+
"""
|
|
73
|
+
by_time = self._group_by_timestamp(detections)
|
|
74
|
+
|
|
75
|
+
sim_last_alert: np.datetime64 | None = None
|
|
76
|
+
sim_last_recovery: np.datetime64 | None = None
|
|
77
|
+
events: list[ReplayedEvent] = []
|
|
78
|
+
|
|
79
|
+
for t in self._replay_grid(start, end):
|
|
80
|
+
# No-data fires independently of the quorum (a single binary
|
|
81
|
+
# metric-level signal), only when configured and not in cooldown.
|
|
82
|
+
if (
|
|
83
|
+
self.alert_config
|
|
84
|
+
and getattr(self.alert_config, "no_data_alert", False)
|
|
85
|
+
and value_at.get(t) is None
|
|
86
|
+
and not self._replay_in_cooldown(t, sim_last_alert, sim_last_recovery)
|
|
87
|
+
):
|
|
88
|
+
last_point = t.astype("datetime64[ms]").astype(datetime)
|
|
89
|
+
events.append(
|
|
90
|
+
ReplayedEvent("no_data", t, self._build_no_data_alert_data(last_point))
|
|
91
|
+
)
|
|
92
|
+
sim_last_alert = t
|
|
93
|
+
continue
|
|
94
|
+
|
|
95
|
+
causal = {ts: recs for ts, recs in by_time.items() if ts <= t}
|
|
96
|
+
ts_desc = sorted(causal, reverse=True)
|
|
97
|
+
|
|
98
|
+
consecutive, latest_quorum, direction = self._count_consecutive_anomalies(
|
|
99
|
+
causal, ts_desc
|
|
100
|
+
)
|
|
101
|
+
fired = (
|
|
102
|
+
latest_quorum is not None
|
|
103
|
+
and consecutive >= self.conditions.consecutive_anomalies
|
|
104
|
+
and not self._replay_in_cooldown(t, sim_last_alert, sim_last_recovery)
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
if fired:
|
|
108
|
+
assert latest_quorum is not None # narrowed by ``fired``
|
|
109
|
+
streak, onset, capped = self._replay_streak(causal, ts_desc)
|
|
110
|
+
ad = self._build_alert_data(latest_quorum, streak, direction, onset, capped)
|
|
111
|
+
events.append(ReplayedEvent("anomaly", t, ad))
|
|
112
|
+
sim_last_alert = t
|
|
113
|
+
elif (
|
|
114
|
+
self.alert_config
|
|
115
|
+
and getattr(self.alert_config, "notify_on_recovery", False)
|
|
116
|
+
and sim_last_alert is not None
|
|
117
|
+
and (sim_last_recovery is None or sim_last_recovery < sim_last_alert)
|
|
118
|
+
and self._replay_recovered(causal, ts_desc, sim_last_alert)
|
|
119
|
+
):
|
|
120
|
+
slice_ = [d for d in detections if d.timestamp <= t]
|
|
121
|
+
# Pure replay: resolve the just-ended incident from the in-memory
|
|
122
|
+
# slice, never from the DB (keeps replay standalone).
|
|
123
|
+
rd = self._build_recovery_data(slice_, incident_records=slice_)
|
|
124
|
+
if rd is not None:
|
|
125
|
+
events.append(ReplayedEvent("recovery", t, rd))
|
|
126
|
+
sim_last_recovery = t
|
|
127
|
+
|
|
128
|
+
return events
|
|
129
|
+
|
|
130
|
+
def _replay_grid(self, start: datetime, end: datetime) -> list[np.datetime64]:
|
|
131
|
+
"""Every interval boundary in the closed range ``[start, end]``.
|
|
132
|
+
|
|
133
|
+
Boundaries are produced in ``datetime64[ms]`` so they compare exactly
|
|
134
|
+
with hydrated detection timestamps and ``value_at`` keys.
|
|
135
|
+
"""
|
|
136
|
+
step = timedelta(seconds=self.interval.seconds)
|
|
137
|
+
grid: list[np.datetime64] = []
|
|
138
|
+
cur = start
|
|
139
|
+
while cur <= end:
|
|
140
|
+
grid.append(np.datetime64(cur, "ms"))
|
|
141
|
+
cur = cur + step
|
|
142
|
+
return grid
|
|
143
|
+
|
|
144
|
+
def _replay_in_cooldown(
|
|
145
|
+
self,
|
|
146
|
+
t: np.datetime64,
|
|
147
|
+
sim_last_alert: np.datetime64 | None,
|
|
148
|
+
sim_last_recovery: np.datetime64 | None,
|
|
149
|
+
) -> bool:
|
|
150
|
+
"""In-memory analog of :meth:`_CooldownMixin._is_in_cooldown`.
|
|
151
|
+
|
|
152
|
+
Elapsed time is measured on the grid (``t - sim_last_alert``) rather than
|
|
153
|
+
from the wall clock. ``cooldown_reset_on_recovery`` clears the cooldown
|
|
154
|
+
when a recovery has been simulated since the last alert.
|
|
155
|
+
"""
|
|
156
|
+
if not self.alert_config or not getattr(self.alert_config, "alert_cooldown", None):
|
|
157
|
+
return False
|
|
158
|
+
if sim_last_alert is None:
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
cooldown = np.timedelta64(Interval(self.alert_config.alert_cooldown).seconds, "s")
|
|
162
|
+
elapsed = (t - sim_last_alert).astype("timedelta64[s]")
|
|
163
|
+
|
|
164
|
+
if getattr(self.alert_config, "cooldown_reset_on_recovery", True):
|
|
165
|
+
if sim_last_recovery is not None and sim_last_recovery > sim_last_alert:
|
|
166
|
+
return False
|
|
167
|
+
|
|
168
|
+
return bool(elapsed < cooldown)
|
|
169
|
+
|
|
170
|
+
def _replay_recovered(
|
|
171
|
+
self,
|
|
172
|
+
causal: dict[np.datetime64, list[DetectionRecord]],
|
|
173
|
+
ts_desc: list[np.datetime64],
|
|
174
|
+
sim_last_alert: np.datetime64,
|
|
175
|
+
) -> bool:
|
|
176
|
+
"""Pure half of :meth:`_RecoveryMixin._check_recovery_since_last_alert`.
|
|
177
|
+
|
|
178
|
+
Returns ``True`` when the metric has recovered as of the latest causal
|
|
179
|
+
point: no blocking anomalies under the trigger direction, OR no causal
|
|
180
|
+
detections strictly after the last simulated alert.
|
|
181
|
+
"""
|
|
182
|
+
if not ts_desc:
|
|
183
|
+
# No detections at all → nothing blocking → recovered.
|
|
184
|
+
return True
|
|
185
|
+
|
|
186
|
+
# No fresh detections after the alert → assume recovery (mirrors the
|
|
187
|
+
# live "no fresh detections" branch).
|
|
188
|
+
if not any(ts > sim_last_alert for ts in ts_desc):
|
|
189
|
+
return True
|
|
190
|
+
|
|
191
|
+
latest_ts = ts_desc[0]
|
|
192
|
+
latest_anomalies = [d for d in causal[latest_ts] if d.is_anomaly]
|
|
193
|
+
|
|
194
|
+
policy = self.conditions.direction
|
|
195
|
+
if policy == "down":
|
|
196
|
+
blocking = [d for d in latest_anomalies if d.direction == "down"]
|
|
197
|
+
elif policy == "up":
|
|
198
|
+
blocking = [d for d in latest_anomalies if d.direction == "up"]
|
|
199
|
+
elif policy == "same":
|
|
200
|
+
trigger_direction = self._replay_trigger_direction(causal, sim_last_alert)
|
|
201
|
+
if trigger_direction is None:
|
|
202
|
+
blocking = latest_anomalies # conservative fallback
|
|
203
|
+
else:
|
|
204
|
+
blocking = [d for d in latest_anomalies if d.direction == trigger_direction]
|
|
205
|
+
else: # "any" / unknown — preserve historical behaviour
|
|
206
|
+
blocking = latest_anomalies
|
|
207
|
+
|
|
208
|
+
return len(blocking) == 0
|
|
209
|
+
|
|
210
|
+
def _replay_trigger_direction(
|
|
211
|
+
self,
|
|
212
|
+
causal: dict[np.datetime64, list[DetectionRecord]],
|
|
213
|
+
sim_last_alert: np.datetime64,
|
|
214
|
+
) -> str | None:
|
|
215
|
+
"""Direction of the anomaly that triggered the simulated last alert.
|
|
216
|
+
|
|
217
|
+
Pure analog of :meth:`_RecoveryMixin._get_alert_trigger_direction`: the
|
|
218
|
+
live code reads the single detection row at the alert timestamp; here the
|
|
219
|
+
alert fired at the grid point ``sim_last_alert``, so the triggering
|
|
220
|
+
quorum is the latest causal point at or before it.
|
|
221
|
+
"""
|
|
222
|
+
candidates = [ts for ts in causal if ts <= sim_last_alert]
|
|
223
|
+
if not candidates:
|
|
224
|
+
return None
|
|
225
|
+
latest_ts = max(candidates)
|
|
226
|
+
anomalies = [d for d in causal[latest_ts] if d.is_anomaly]
|
|
227
|
+
if not anomalies:
|
|
228
|
+
return None
|
|
229
|
+
|
|
230
|
+
_, direction = self._quorum_at(anomalies, None)
|
|
231
|
+
if direction in ("up", "down"):
|
|
232
|
+
return direction
|
|
233
|
+
|
|
234
|
+
ups = sum(1 for d in anomalies if d.direction == "up")
|
|
235
|
+
downs = sum(1 for d in anomalies if d.direction == "down")
|
|
236
|
+
if ups > downs:
|
|
237
|
+
return "up"
|
|
238
|
+
if downs > ups:
|
|
239
|
+
return "down"
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
def _replay_streak(
|
|
243
|
+
self,
|
|
244
|
+
causal: dict[np.datetime64, list[DetectionRecord]],
|
|
245
|
+
ts_desc: list[np.datetime64],
|
|
246
|
+
) -> tuple[int, np.datetime64, bool]:
|
|
247
|
+
"""In-memory analog of :meth:`_DecisionMixin._resolve_streak`.
|
|
248
|
+
|
|
249
|
+
Re-walks the same direction-aware quorum logic over the causal records to
|
|
250
|
+
get the *true* streak length, then derives the onset and the cap flag the
|
|
251
|
+
same way the live path does.
|
|
252
|
+
"""
|
|
253
|
+
latest_ts = ts_desc[0]
|
|
254
|
+
step = np.timedelta64(self.interval.seconds, "s")
|
|
255
|
+
count, _, _ = self._count_consecutive_anomalies(causal, ts_desc)
|
|
256
|
+
count = max(count, 1)
|
|
257
|
+
capped = count >= STREAK_LOOKBACK_POINTS
|
|
258
|
+
return count, latest_ts - step * (count - 1), capped
|
|
@@ -6,12 +6,14 @@ from detectkit.alerting.orchestrator._cooldown import _CooldownMixin
|
|
|
6
6
|
from detectkit.alerting.orchestrator._decision import _DecisionMixin
|
|
7
7
|
from detectkit.alerting.orchestrator._dispatch import _DispatchMixin
|
|
8
8
|
from detectkit.alerting.orchestrator._recovery import _RecoveryMixin
|
|
9
|
+
from detectkit.alerting.orchestrator._replay import _ReplayMixin
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class AlertOrchestrator(
|
|
12
13
|
_DecisionMixin,
|
|
13
14
|
_CooldownMixin,
|
|
14
15
|
_RecoveryMixin,
|
|
16
|
+
_ReplayMixin,
|
|
15
17
|
_DispatchMixin,
|
|
16
18
|
):
|
|
17
19
|
"""Coordinates alert decisions, cooldown, recovery and dispatch.
|
|
@@ -21,6 +23,8 @@ class AlertOrchestrator(
|
|
|
21
23
|
* ``_DecisionMixin`` — should we alert? builds AlertData.
|
|
22
24
|
* ``_CooldownMixin`` — suppress within the configured window.
|
|
23
25
|
* ``_RecoveryMixin`` — direction-aware "all-clear" detection.
|
|
26
|
+
* ``_ReplayMixin`` — pure historical replay of alert/recovery/no-data
|
|
27
|
+
events (no dispatch, no DB state, no wall-clock).
|
|
24
28
|
* ``_DispatchMixin`` — ship to channels and stamp state.
|
|
25
29
|
"""
|
|
26
30
|
|
|
@@ -61,6 +61,29 @@ def predictions_from_results(
|
|
|
61
61
|
return y_pred, y_score, valid
|
|
62
62
|
|
|
63
63
|
|
|
64
|
+
def _aggregate(per_fold: list[float], stability_lambda: float) -> tuple[float, float]:
|
|
65
|
+
"""Mean across folds minus a **downside-only** dispersion penalty.
|
|
66
|
+
|
|
67
|
+
Returns ``(aggregate, penalty)``. The penalty uses the semi-deviation of the
|
|
68
|
+
folds that score *below* the mean, not the full ``std``: a config that simply
|
|
69
|
+
scores *higher* on some folds — e.g. a recency-aware baseline that fits the
|
|
70
|
+
current regime better than stale history — should not be punished for that
|
|
71
|
+
*upside* spread. Penalizing full ``std`` did exactly that, biasing the search
|
|
72
|
+
against regime-adaptive configs; downside-only keeps the guard against
|
|
73
|
+
genuinely unstable candidates while letting an adaptive one win.
|
|
74
|
+
"""
|
|
75
|
+
arr = np.asarray(per_fold, dtype=float)
|
|
76
|
+
mean = float(np.mean(arr))
|
|
77
|
+
# Downside deviation: square only the shortfalls below the mean (upside → 0),
|
|
78
|
+
# averaged over ALL folds. This is always <= the full std, and reduces to 0
|
|
79
|
+
# when folds are equal — so a config that's merely *better* on recent folds is
|
|
80
|
+
# not penalized, only one that drops below par on some.
|
|
81
|
+
deficits = np.minimum(arr - mean, 0.0)
|
|
82
|
+
downside = float(np.sqrt(np.mean(deficits**2)))
|
|
83
|
+
penalty = stability_lambda * downside
|
|
84
|
+
return mean - penalty, penalty
|
|
85
|
+
|
|
86
|
+
|
|
64
87
|
def run_cv(
|
|
65
88
|
detector: BaseDetector,
|
|
66
89
|
data: dict[str, np.ndarray],
|
|
@@ -95,7 +118,5 @@ def run_cv(
|
|
|
95
118
|
if not per_fold:
|
|
96
119
|
return FoldScores(per_fold=[], aggregate=0.0, stability_penalty=0.0)
|
|
97
120
|
|
|
98
|
-
|
|
99
|
-
penalty = settings.stability_lambda * float(np.std(arr))
|
|
100
|
-
aggregate = float(np.mean(arr)) - penalty
|
|
121
|
+
aggregate, penalty = _aggregate(per_fold, settings.stability_lambda)
|
|
101
122
|
return FoldScores(per_fold=per_fold, aggregate=aggregate, stability_penalty=penalty)
|
|
@@ -13,10 +13,13 @@ from __future__ import annotations
|
|
|
13
13
|
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
16
18
|
from detectkit.autotune._base import _AutoTuneBase
|
|
17
19
|
from detectkit.autotune._types import CandidateEval
|
|
18
20
|
from detectkit.autotune.window_select import (
|
|
19
21
|
detect_level_shift,
|
|
22
|
+
half_life_grid,
|
|
20
23
|
min_samples_for,
|
|
21
24
|
select_window,
|
|
22
25
|
trend_present,
|
|
@@ -50,18 +53,23 @@ def grid_search(
|
|
|
50
53
|
# enough to inflate the global MAD it is measured against. When that
|
|
51
54
|
# happens the engine treats the series as stationary — prefers the largest
|
|
52
55
|
# window, skips detrend — and the baseline quietly averages two regimes.
|
|
53
|
-
# Surface it so the user can narrow the
|
|
54
|
-
|
|
56
|
+
# Surface it (with a concrete --from date) so the user can narrow the
|
|
57
|
+
# window and re-tune; advisory only.
|
|
58
|
+
found, sigmas, idx = detect_level_shift(tuner)
|
|
55
59
|
if found:
|
|
60
|
+
timestamps = tuner.data["timestamp"]
|
|
61
|
+
n = int(len(timestamps))
|
|
62
|
+
from_date = str(np.datetime64(timestamps[idx], "D"))
|
|
63
|
+
pct = round(idx / n * 100) if n else 0
|
|
56
64
|
tuner.log(
|
|
57
65
|
"regime",
|
|
58
66
|
f"series reads stationary, but a large level shift (~{sigmas:.1f}σ "
|
|
59
|
-
f"within-regime) sits ~{
|
|
60
|
-
"
|
|
61
|
-
"
|
|
62
|
-
"`--from
|
|
67
|
+
f"within-regime) sits ~{pct}% in, around {from_date} — the midpoint "
|
|
68
|
+
"trend test misses an off-center shift, so the baseline may average "
|
|
69
|
+
f"two regimes. If the earlier regime is stale, re-tune with "
|
|
70
|
+
f"`--from {from_date}` (or set `autotune.max_history`).",
|
|
63
71
|
shift_sigmas=round(sigmas, 2),
|
|
64
|
-
|
|
72
|
+
shift_at=from_date,
|
|
65
73
|
)
|
|
66
74
|
eps = tuner.settings.min_improvement
|
|
67
75
|
best_overall: CandidateEval | None = None
|
|
@@ -104,6 +112,18 @@ def grid_search(
|
|
|
104
112
|
if ev is not None and ev.score > best.score + eps:
|
|
105
113
|
best, accepted["window_weights"] = ev, weights
|
|
106
114
|
|
|
115
|
+
# Axis 2b: half-life of the recency weighting — only when exponential
|
|
116
|
+
# weighting was adopted. The detector defaults to a fixed half-life; this
|
|
117
|
+
# lets the search pick a faster-forgetting baseline that tracks the current
|
|
118
|
+
# regime (the term that matters on a metric that shifted level).
|
|
119
|
+
if accepted.get("window_weights") == "exponential":
|
|
120
|
+
for half_life in half_life_grid(accepted["window_size"], accepted["min_samples"]):
|
|
121
|
+
if half_life == accepted.get("half_life"):
|
|
122
|
+
continue
|
|
123
|
+
ev = tuner.safe_evaluate(detector_type, {**accepted, "half_life": half_life})
|
|
124
|
+
if ev is not None and ev.score > best.score + eps:
|
|
125
|
+
best, accepted["half_life"] = ev, half_life
|
|
126
|
+
|
|
107
127
|
# Axis 3: detrend (gated by the trend pre-test).
|
|
108
128
|
if has_trend:
|
|
109
129
|
for detrend in (None, "linear"):
|
|
@@ -216,6 +216,8 @@ const INTERVAL_S = __INTERVAL__;
|
|
|
216
216
|
// Incidents to seed the editor with (editing an existing labels file). Each is
|
|
217
217
|
// {start, end, label} in "YYYY-MM-DD HH:MM:SS" UTC; a point is start === end.
|
|
218
218
|
const PRELOAD = __INCIDENTS__;
|
|
219
|
+
// Threshold-capture window(s) to restore (from a saved file): [{start, end}] UTC.
|
|
220
|
+
const CAPWINS = __CAPTURE_WINDOWS__;
|
|
219
221
|
const pts = DATA.points.map(p => ({ts: Date.parse(p.t.replace(' ','T')+'Z'), v: p.v}));
|
|
220
222
|
const N = pts.length;
|
|
221
223
|
const vraw = pts.filter(p => p.v !== null).map(p => p.v);
|
|
@@ -242,6 +244,12 @@ let selObj = null, hoverRow = -1, hoverDel = -1, thMode = false, thHover = null;
|
|
|
242
244
|
// Threshold-capture window: thDown tracks a press, thDragWin a live drag, capWin
|
|
243
245
|
// the committed custom window (null → capture within the current view).
|
|
244
246
|
let thDown = null, thDragWin = null, capWin = null;
|
|
247
|
+
// Restore a saved capture window so re-opening a labels file keeps the painted
|
|
248
|
+
// regime scope (only shown once threshold capture is toggled on).
|
|
249
|
+
if (CAPWINS && CAPWINS.length) { const w0 = CAPWINS[0];
|
|
250
|
+
const a = Date.parse(String(w0.start).replace(' ','T')+'Z'),
|
|
251
|
+
b = Date.parse(String(w0.end).replace(' ','T')+'Z');
|
|
252
|
+
if (!isNaN(a) && !isNaN(b)) capWin = {a: Math.min(a,b), b: Math.max(a,b)}; }
|
|
245
253
|
|
|
246
254
|
const clamp = (x,a,b) => Math.max(a, Math.min(b, x));
|
|
247
255
|
const vspan = () => viewMax - viewMin;
|
|
@@ -710,6 +718,9 @@ const buildYaml = () => {
|
|
|
710
718
|
if (!sorted.length) y+=' []\\n';
|
|
711
719
|
sorted.forEach(iv => { y+=' - {start: "'+fmtTs(iv.a)+'", end: "'+fmtTs(iv.b)+'"'
|
|
712
720
|
+ (iv.label && iv.label.trim() ? ', label: '+yamlStr(iv.label.trim()) : '') + '}\\n'; });
|
|
721
|
+
// Persist the painted threshold-capture window so the regime scope is auditable
|
|
722
|
+
// in the saved file and restored on reopen. Pure metadata — autotune ignores it.
|
|
723
|
+
if (capWin) y+='capture_windows:\\n - {start: "'+fmtTs(capWin.a)+'", end: "'+fmtTs(capWin.b)+'"}\\n';
|
|
713
724
|
return y;
|
|
714
725
|
};
|
|
715
726
|
|
|
@@ -767,6 +778,7 @@ def render_labeler_html(
|
|
|
767
778
|
save_url: str | None = None,
|
|
768
779
|
interval_seconds: int | None = None,
|
|
769
780
|
incidents: list[dict[str, str]] | None = None,
|
|
781
|
+
capture_windows: list[dict[str, str]] | None = None,
|
|
770
782
|
) -> str:
|
|
771
783
|
"""Return a self-contained HTML labeler page for *metric_name*'s series.
|
|
772
784
|
|
|
@@ -791,6 +803,7 @@ def render_labeler_html(
|
|
|
791
803
|
return (
|
|
792
804
|
_TEMPLATE.replace("__PAYLOAD__", payload)
|
|
793
805
|
.replace("__INCIDENTS__", preload)
|
|
806
|
+
.replace("__CAPTURE_WINDOWS__", json_dumps_sorted(capture_windows or []))
|
|
794
807
|
.replace("__FAVICON__", _favicon_data_uri())
|
|
795
808
|
.replace("__SAVE_URL__", json.dumps(save_url))
|
|
796
809
|
.replace("__INTERVAL__", json.dumps(interval_seconds))
|
|
@@ -122,11 +122,14 @@ def build_label_server(
|
|
|
122
122
|
incidents_dir: Path,
|
|
123
123
|
interval_seconds: int,
|
|
124
124
|
preload: list[dict[str, str]] | None = None,
|
|
125
|
+
capture_windows: list[dict[str, str]] | None = None,
|
|
125
126
|
) -> tuple[_LabelServer, str]:
|
|
126
127
|
"""Construct (without running) the labeler server; return ``(server, page_url)``.
|
|
127
128
|
|
|
128
129
|
``preload`` seeds the labeler with already-marked incidents (editing an
|
|
129
130
|
existing labels file); the caller resolves which file to load.
|
|
131
|
+
``capture_windows`` restores the painted threshold-capture window from a saved
|
|
132
|
+
file so the regime scope survives a reopen.
|
|
130
133
|
"""
|
|
131
134
|
server = _LabelServer(("127.0.0.1", 0), _Handler)
|
|
132
135
|
token = secrets.token_urlsafe(16)
|
|
@@ -141,6 +144,7 @@ def build_label_server(
|
|
|
141
144
|
save_url=f"http://127.0.0.1:{port}/save?token={token}",
|
|
142
145
|
interval_seconds=interval_seconds,
|
|
143
146
|
incidents=preload,
|
|
147
|
+
capture_windows=capture_windows,
|
|
144
148
|
)
|
|
145
149
|
return server, f"http://127.0.0.1:{port}/?token={token}"
|
|
146
150
|
|
|
@@ -155,10 +159,12 @@ def serve_labeler(
|
|
|
155
159
|
echo: Callable[[str], None] = print,
|
|
156
160
|
on_ready: Callable[[str], None] | None = None,
|
|
157
161
|
preload: list[dict[str, str]] | None = None,
|
|
162
|
+
capture_windows: list[dict[str, str]] | None = None,
|
|
158
163
|
) -> Path | None:
|
|
159
164
|
"""Serve the labeler until the user saves (returns the file) or cancels (None).
|
|
160
165
|
|
|
161
|
-
``preload`` seeds the page with existing incidents to edit in place
|
|
166
|
+
``preload`` seeds the page with existing incidents to edit in place;
|
|
167
|
+
``capture_windows`` restores the painted threshold-capture scope.
|
|
162
168
|
"""
|
|
163
169
|
server, url = build_label_server(
|
|
164
170
|
metric_name=metric_name,
|
|
@@ -166,6 +172,7 @@ def serve_labeler(
|
|
|
166
172
|
incidents_dir=incidents_dir,
|
|
167
173
|
interval_seconds=interval_seconds,
|
|
168
174
|
preload=preload,
|
|
175
|
+
capture_windows=capture_windows,
|
|
169
176
|
)
|
|
170
177
|
if on_ready is not None:
|
|
171
178
|
on_ready(url)
|
|
@@ -14,7 +14,7 @@ When no labels are supplied the tuner falls back to unsupervised mode.
|
|
|
14
14
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
|
-
from dataclasses import dataclass
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
18
|
from datetime import datetime, timezone
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
from typing import Any
|
|
@@ -62,6 +62,10 @@ class IncidentLabels:
|
|
|
62
62
|
|
|
63
63
|
intervals: list[IncidentInterval]
|
|
64
64
|
points: list[IncidentPoint]
|
|
65
|
+
# Optional threshold-capture time window(s) painted in the labeler. Pure
|
|
66
|
+
# metadata: it records the regime scope the user reasoned about (auditable in
|
|
67
|
+
# the saved file, restored on reopen); it does NOT affect ground truth.
|
|
68
|
+
capture_windows: list[tuple[datetime, datetime]] = field(default_factory=list)
|
|
65
69
|
|
|
66
70
|
def is_empty(self) -> bool:
|
|
67
71
|
return not self.intervals and not self.points
|
|
@@ -152,6 +156,7 @@ def parse_incident_labels(
|
|
|
152
156
|
if raw is None:
|
|
153
157
|
return IncidentLabels([], [])
|
|
154
158
|
|
|
159
|
+
raw_windows: list = []
|
|
155
160
|
if isinstance(raw, list):
|
|
156
161
|
entries = raw
|
|
157
162
|
tz: ZoneInfo | None = None
|
|
@@ -164,6 +169,9 @@ def parse_incident_labels(
|
|
|
164
169
|
entries = raw.get("incidents", [])
|
|
165
170
|
if not isinstance(entries, list):
|
|
166
171
|
raise ValueError("'incidents' must be a list")
|
|
172
|
+
raw_windows = raw.get("capture_windows") or []
|
|
173
|
+
if not isinstance(raw_windows, list):
|
|
174
|
+
raise ValueError("'capture_windows' must be a list")
|
|
167
175
|
else:
|
|
168
176
|
raise ValueError("Labels must be a mapping with 'incidents' or a list of incidents")
|
|
169
177
|
|
|
@@ -187,7 +195,16 @@ def parse_incident_labels(
|
|
|
187
195
|
"Each incident needs either 'at' (a point) or 'start'+'end' (an interval)"
|
|
188
196
|
)
|
|
189
197
|
|
|
190
|
-
|
|
198
|
+
capture_windows: list[tuple[datetime, datetime]] = []
|
|
199
|
+
for win in raw_windows:
|
|
200
|
+
if not isinstance(win, dict) or "start" not in win or "end" not in win:
|
|
201
|
+
raise ValueError("Each capture_windows entry needs 'start' and 'end'")
|
|
202
|
+
ws, we = _parse_dt(win["start"], tz), _parse_dt(win["end"], tz)
|
|
203
|
+
if ws > we:
|
|
204
|
+
raise ValueError(f"Capture window start {ws} is after end {we}")
|
|
205
|
+
capture_windows.append((ws, we))
|
|
206
|
+
|
|
207
|
+
return IncidentLabels(intervals=intervals, points=points, capture_windows=capture_windows)
|
|
191
208
|
|
|
192
209
|
|
|
193
210
|
def parse_labels_file(
|
|
@@ -243,3 +260,22 @@ def load_incidents_for_display(
|
|
|
243
260
|
"""Load a canonical labels file and render it as labeler display dicts."""
|
|
244
261
|
labels = parse_labels_file(path, interval_seconds=interval_seconds, metric_name=metric_name)
|
|
245
262
|
return incidents_to_display(labels)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def capture_windows_to_display(labels: IncidentLabels) -> list[dict[str, str]]:
    """Format parsed capture windows as labeler display dicts.

    Each ``(start, end)`` pair in ``labels.capture_windows`` becomes a
    ``{"start": ..., "end": ...}`` dict whose values are the two bounds
    rendered with ``_DISPLAY_FMT`` (the labeler's naive-UTC string form).
    Output order follows ``labels.capture_windows``.
    """
    rendered: list[dict[str, str]] = []
    for window_start, window_end in labels.capture_windows:
        rendered.append(
            {
                "start": window_start.strftime(_DISPLAY_FMT),
                "end": window_end.strftime(_DISPLAY_FMT),
            }
        )
    return rendered
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def load_capture_windows(
    path: str | Path,
    *,
    interval_seconds: int,
    metric_name: str | None = None,
) -> list[dict[str, str]]:
    """Read a labels file and return its capture windows in display form.

    The file is parsed with ``parse_labels_file`` (forwarding
    ``interval_seconds`` and ``metric_name`` unchanged) and the resulting
    labels are handed to ``capture_windows_to_display``.
    """
    return capture_windows_to_display(
        parse_labels_file(
            path,
            interval_seconds=interval_seconds,
            metric_name=metric_name,
        )
    )
|