detectkit 0.3.12__tar.gz → 0.3.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {detectkit-0.3.12/detectkit.egg-info → detectkit-0.3.13}/PKG-INFO +1 -1
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/__init__.py +1 -1
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/orchestrator.py +28 -20
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/commands/test_alert.py +2 -2
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/database/clickhouse_manager.py +8 -43
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/database/internal_tables.py +138 -140
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/database/tables.py +44 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/loaders/metric_loader.py +6 -8
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/orchestration/task_manager.py +44 -24
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/utils/__init__.py +10 -0
- detectkit-0.3.13/detectkit/utils/datetime_utils.py +54 -0
- {detectkit-0.3.12 → detectkit-0.3.13/detectkit.egg-info}/PKG-INFO +1 -1
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit.egg-info/SOURCES.txt +1 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/pyproject.toml +1 -1
- {detectkit-0.3.12 → detectkit-0.3.13}/LICENSE +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/MANIFEST.in +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/README.md +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/base.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/email.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/factory.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/mattermost.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/slack.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/telegram.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/alerting/channels/webhook.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/commands/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/commands/init.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/commands/run.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/cli/main.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/config/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/config/metric_config.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/config/profile.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/config/project_config.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/config/validator.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/core/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/core/interval.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/core/models.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/database/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/database/manager.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/base.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/factory.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/statistical/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/statistical/iqr.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/statistical/mad.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/statistical/manual_bounds.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/detectors/statistical/zscore.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/loaders/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/loaders/query_template.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/orchestration/__init__.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit/utils/stats.py +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit.egg-info/dependency_links.txt +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit.egg-info/entry_points.txt +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit.egg-info/requires.txt +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/detectkit.egg-info/top_level.txt +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/requirements.txt +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/setup.cfg +0 -0
- {detectkit-0.3.12 → detectkit-0.3.13}/setup.py +0 -0
|
@@ -4,7 +4,7 @@ detectk - Anomaly Detection for Time-Series Metrics
|
|
|
4
4
|
A Python library for data analysts and engineers to monitor metrics with automatic anomaly detection.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.3.12"
|
|
7
|
+
__version__ = "0.3.13"
|
|
8
8
|
|
|
9
9
|
from detectkit.core.interval import Interval
|
|
10
10
|
from detectkit.core.models import ColumnDefinition, TableModel
|
|
@@ -12,6 +12,7 @@ Handles:
|
|
|
12
12
|
from dataclasses import dataclass
|
|
13
13
|
from datetime import datetime, timezone
|
|
14
14
|
from typing import Dict, List, Optional
|
|
15
|
+
from detectkit.utils.datetime_utils import now_utc, now_utc_naive, to_naive_utc, to_aware_utc
|
|
15
16
|
|
|
16
17
|
import numpy as np
|
|
17
18
|
|
|
@@ -60,6 +61,7 @@ class AlertOrchestrator:
|
|
|
60
61
|
>>> orchestrator = AlertOrchestrator(
|
|
61
62
|
... metric_name="cpu_usage",
|
|
62
63
|
... interval=Interval.parse("10min"),
|
|
64
|
+
... alert_config_id="abc123",
|
|
63
65
|
... conditions=AlertConditions(consecutive_anomalies=3, direction="same")
|
|
64
66
|
... )
|
|
65
67
|
>>> should_alert, alert_data = orchestrator.should_alert(recent_detections)
|
|
@@ -71,6 +73,7 @@ class AlertOrchestrator:
|
|
|
71
73
|
self,
|
|
72
74
|
metric_name: str,
|
|
73
75
|
interval: Interval,
|
|
76
|
+
alert_config_id: str,
|
|
74
77
|
conditions: Optional[AlertConditions] = None,
|
|
75
78
|
timezone_display: str = "UTC",
|
|
76
79
|
internal=None, # InternalTablesManager (optional, for cooldown tracking)
|
|
@@ -84,6 +87,7 @@ class AlertOrchestrator:
|
|
|
84
87
|
Args:
|
|
85
88
|
metric_name: Name of the metric
|
|
86
89
|
interval: Metric interval
|
|
90
|
+
alert_config_id: MD5 hash of alerting config params (for independent state per config)
|
|
87
91
|
conditions: Alert conditions (defaults to AlertConditions())
|
|
88
92
|
timezone_display: Timezone for alert display (default: UTC)
|
|
89
93
|
internal: InternalTablesManager instance (optional, for cooldown tracking)
|
|
@@ -93,6 +97,7 @@ class AlertOrchestrator:
|
|
|
93
97
|
"""
|
|
94
98
|
self.metric_name = metric_name
|
|
95
99
|
self.interval = interval
|
|
100
|
+
self.alert_config_id = alert_config_id
|
|
96
101
|
self.conditions = conditions or AlertConditions()
|
|
97
102
|
self.timezone_display = timezone_display
|
|
98
103
|
self.internal = internal
|
|
@@ -335,13 +340,14 @@ class AlertOrchestrator:
|
|
|
335
340
|
print(f"Error sending alert via {channel_name}: {e}")
|
|
336
341
|
results[channel_name] = False
|
|
337
342
|
|
|
338
|
-
#
|
|
343
|
+
# Update alert timestamp after sending (for cooldown tracking)
|
|
339
344
|
if any(results.values()) and self.internal:
|
|
340
345
|
# At least one channel succeeded - update timestamp
|
|
341
346
|
self.internal.update_alert_timestamp(
|
|
342
347
|
metric_name=self.metric_name,
|
|
343
|
-
|
|
344
|
-
|
|
348
|
+
alert_config_id=self.alert_config_id,
|
|
349
|
+
timestamp=now_utc_naive(),
|
|
350
|
+
increment_count=True,
|
|
345
351
|
)
|
|
346
352
|
|
|
347
353
|
return results
|
|
@@ -362,18 +368,17 @@ class AlertOrchestrator:
|
|
|
362
368
|
- Example: now=13:23, interval=10min -> 13:10
|
|
363
369
|
|
|
364
370
|
Example:
|
|
365
|
-
>>> orchestrator = AlertOrchestrator("metric", Interval.parse("10min"))
|
|
371
|
+
>>> orchestrator = AlertOrchestrator("metric", Interval.parse("10min"), alert_config_id="abc123")
|
|
366
372
|
>>> now = datetime(2024, 1, 1, 13, 23, 0, tzinfo=timezone.utc)
|
|
367
373
|
>>> last_point = orchestrator.get_last_complete_point(now)
|
|
368
374
|
>>> print(last_point)
|
|
369
375
|
2024-01-01 13:10:00+00:00
|
|
370
376
|
"""
|
|
371
377
|
if now is None:
|
|
372
|
-
now =
|
|
378
|
+
now = now_utc()
|
|
373
379
|
|
|
374
380
|
# Ensure UTC
|
|
375
|
-
|
|
376
|
-
now = now.replace(tzinfo=timezone.utc)
|
|
381
|
+
now = to_aware_utc(now)
|
|
377
382
|
|
|
378
383
|
# Floor to interval
|
|
379
384
|
interval_seconds = self.interval.seconds
|
|
@@ -412,7 +417,7 @@ class AlertOrchestrator:
|
|
|
412
417
|
return False
|
|
413
418
|
|
|
414
419
|
# Get last alert timestamp
|
|
415
|
-
last_sent = self.internal.get_last_alert_timestamp(self.metric_name)
|
|
420
|
+
last_sent = self.internal.get_last_alert_timestamp(self.metric_name, self.alert_config_id)
|
|
416
421
|
|
|
417
422
|
if not last_sent:
|
|
418
423
|
return False # Never sent alert before
|
|
@@ -423,7 +428,7 @@ class AlertOrchestrator:
|
|
|
423
428
|
cooldown_seconds = cooldown_interval.seconds
|
|
424
429
|
|
|
425
430
|
# Calculate elapsed time
|
|
426
|
-
now =
|
|
431
|
+
now = now_utc_naive()
|
|
427
432
|
elapsed = (now - last_sent).total_seconds()
|
|
428
433
|
|
|
429
434
|
# Check recovery reset (if enabled)
|
|
@@ -523,14 +528,15 @@ class AlertOrchestrator:
|
|
|
523
528
|
detections_by_time = self._group_by_timestamp(detection_records)
|
|
524
529
|
timestamps_sorted = sorted(detections_by_time.keys(), reverse=True)
|
|
525
530
|
|
|
526
|
-
# Check that latest post-alert point is NOT anomalous
|
|
527
|
-
#
|
|
528
|
-
#
|
|
531
|
+
# Check that latest post-alert point is NOT anomalous by ANY detector.
|
|
532
|
+
# Recovery = zero detectors flag the latest point as anomalous.
|
|
533
|
+
# Using > 0 (not >= min_detectors) prevents false recovery when
|
|
534
|
+
# some but not all detectors still flag the metric as anomalous.
|
|
529
535
|
latest_ts = timestamps_sorted[0]
|
|
530
536
|
latest_detections = detections_by_time[latest_ts]
|
|
531
537
|
latest_anomalies = [d for d in latest_detections if d.is_anomaly]
|
|
532
|
-
if len(latest_anomalies)
|
|
533
|
-
#
|
|
538
|
+
if len(latest_anomalies) > 0:
|
|
539
|
+
# At least one detector still considers this point anomalous — no recovery
|
|
534
540
|
return False
|
|
535
541
|
|
|
536
542
|
return True
|
|
@@ -558,12 +564,12 @@ class AlertOrchestrator:
|
|
|
558
564
|
return False, None
|
|
559
565
|
|
|
560
566
|
# Check if there was a previous alert
|
|
561
|
-
last_alert = self.internal.get_last_alert_timestamp(self.metric_name)
|
|
567
|
+
last_alert = self.internal.get_last_alert_timestamp(self.metric_name, self.alert_config_id)
|
|
562
568
|
if not last_alert:
|
|
563
569
|
return False, None # Never alerted, nothing to recover from
|
|
564
570
|
|
|
565
571
|
# Check if recovery already sent for this incident
|
|
566
|
-
last_recovery = self.internal.get_last_recovery_timestamp(self.metric_name)
|
|
572
|
+
last_recovery = self.internal.get_last_recovery_timestamp(self.metric_name, self.alert_config_id)
|
|
567
573
|
if last_recovery and last_recovery >= last_alert:
|
|
568
574
|
return False, None # Already sent recovery for this alert
|
|
569
575
|
|
|
@@ -595,8 +601,9 @@ class AlertOrchestrator:
|
|
|
595
601
|
if not detections:
|
|
596
602
|
return None
|
|
597
603
|
|
|
598
|
-
# Use the latest detection point for recovery info
|
|
599
|
-
|
|
604
|
+
# Use the latest (newest) detection point for recovery info.
|
|
605
|
+
# detections are sorted oldest→newest by _load_recent_detections.
|
|
606
|
+
latest = detections[-1]
|
|
600
607
|
|
|
601
608
|
return AlertData(
|
|
602
609
|
metric_name=self.metric_name,
|
|
@@ -647,10 +654,10 @@ class AlertOrchestrator:
|
|
|
647
654
|
|
|
648
655
|
# Update recovery timestamp after sending
|
|
649
656
|
if any(results.values()) and self.internal:
|
|
650
|
-
from datetime import timezone as tz
|
|
651
657
|
self.internal.update_recovery_timestamp(
|
|
652
658
|
metric_name=self.metric_name,
|
|
653
|
-
|
|
659
|
+
alert_config_id=self.alert_config_id,
|
|
660
|
+
timestamp=now_utc_naive(),
|
|
654
661
|
)
|
|
655
662
|
|
|
656
663
|
return results
|
|
@@ -661,6 +668,7 @@ class AlertOrchestrator:
|
|
|
661
668
|
f"AlertOrchestrator("
|
|
662
669
|
f"metric='{self.metric_name}', "
|
|
663
670
|
f"interval={self.interval}, "
|
|
671
|
+
f"config_id='{self.alert_config_id[:8]}...', "
|
|
664
672
|
f"min_detectors={self.conditions.min_detectors}, "
|
|
665
673
|
f"direction='{self.conditions.direction}', "
|
|
666
674
|
f"consecutive={self.conditions.consecutive_anomalies})"
|
|
@@ -8,11 +8,11 @@ Useful for:
|
|
|
8
8
|
- Previewing alert templates
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
|
-
from datetime import datetime, timezone
|
|
12
11
|
from pathlib import Path
|
|
13
12
|
from typing import Optional
|
|
14
13
|
|
|
15
14
|
import numpy as np
|
|
15
|
+
from detectkit.utils.datetime_utils import now_utc
|
|
16
16
|
|
|
17
17
|
from detectkit.alerting.channels.base import AlertData
|
|
18
18
|
from detectkit.alerting.channels.factory import AlertChannelFactory
|
|
@@ -34,7 +34,7 @@ def create_mock_alert_data(
|
|
|
34
34
|
AlertData with mock anomaly data
|
|
35
35
|
"""
|
|
36
36
|
# Use current time
|
|
37
|
-
now =
|
|
37
|
+
now = now_utc()
|
|
38
38
|
|
|
39
39
|
# Get mentions from alerting config
|
|
40
40
|
mentions = metric_config.alerting.mentions if metric_config.alerting else []
|
|
@@ -8,6 +8,7 @@ from datetime import datetime, timezone
|
|
|
8
8
|
from typing import Any, Dict, List, Optional
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
|
+
from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc
|
|
11
12
|
|
|
12
13
|
try:
|
|
13
14
|
from clickhouse_driver import Client
|
|
@@ -345,38 +346,8 @@ class ClickHouseDatabaseManager(BaseDatabaseManager):
|
|
|
345
346
|
|
|
346
347
|
full_table = self.get_full_table_name(TABLE_TASKS, use_internal=True)
|
|
347
348
|
|
|
348
|
-
# Get current UTC time (
|
|
349
|
-
now =
|
|
350
|
-
|
|
351
|
-
# Read existing alert tracking fields before delete (preserve across upsert)
|
|
352
|
-
existing_last_alert_sent = None
|
|
353
|
-
existing_last_recovery_sent = None
|
|
354
|
-
existing_alert_count = 0
|
|
355
|
-
|
|
356
|
-
preserve_query = f"""
|
|
357
|
-
SELECT last_alert_sent, last_recovery_sent, alert_count
|
|
358
|
-
FROM {full_table}
|
|
359
|
-
WHERE metric_name = %(metric_name)s
|
|
360
|
-
AND detector_id = %(detector_id)s
|
|
361
|
-
AND process_type = %(process_type)s
|
|
362
|
-
ORDER BY updated_at DESC
|
|
363
|
-
LIMIT 1
|
|
364
|
-
"""
|
|
365
|
-
try:
|
|
366
|
-
preserve_results = self.execute_query(
|
|
367
|
-
preserve_query,
|
|
368
|
-
params={
|
|
369
|
-
"metric_name": metric_name,
|
|
370
|
-
"detector_id": detector_id,
|
|
371
|
-
"process_type": process_type,
|
|
372
|
-
}
|
|
373
|
-
)
|
|
374
|
-
if preserve_results:
|
|
375
|
-
existing_last_alert_sent = preserve_results[0].get("last_alert_sent")
|
|
376
|
-
existing_last_recovery_sent = preserve_results[0].get("last_recovery_sent")
|
|
377
|
-
existing_alert_count = preserve_results[0].get("alert_count", 0) or 0
|
|
378
|
-
except Exception:
|
|
379
|
-
pass # If read fails, proceed with defaults
|
|
349
|
+
# Get current UTC time (naive UTC for numpy compatibility)
|
|
350
|
+
now = now_utc_naive()
|
|
380
351
|
|
|
381
352
|
# Delete existing record (if any), sync to ensure old row is gone before insert
|
|
382
353
|
delete_query = f"""
|
|
@@ -396,15 +367,9 @@ class ClickHouseDatabaseManager(BaseDatabaseManager):
|
|
|
396
367
|
}
|
|
397
368
|
)
|
|
398
369
|
|
|
399
|
-
|
|
400
|
-
last_ts_naive = None
|
|
401
|
-
if last_processed_timestamp:
|
|
402
|
-
if last_processed_timestamp.tzinfo is not None:
|
|
403
|
-
last_ts_naive = last_processed_timestamp.replace(tzinfo=None)
|
|
404
|
-
else:
|
|
405
|
-
last_ts_naive = last_processed_timestamp
|
|
370
|
+
last_ts_naive = to_naive_utc(last_processed_timestamp)
|
|
406
371
|
|
|
407
|
-
#
|
|
372
|
+
# Insert new record (alert state is now stored in _dtk_alert_states, not here)
|
|
408
373
|
insert_data = {
|
|
409
374
|
"metric_name": np.array([metric_name]),
|
|
410
375
|
"detector_id": np.array([detector_id]),
|
|
@@ -415,9 +380,9 @@ class ClickHouseDatabaseManager(BaseDatabaseManager):
|
|
|
415
380
|
"last_processed_timestamp": np.array([last_ts_naive], dtype="datetime64[ms]") if last_ts_naive else np.array([None]),
|
|
416
381
|
"error_message": np.array([error_message]),
|
|
417
382
|
"timeout_seconds": np.array([timeout_seconds], dtype=np.int32),
|
|
418
|
-
"last_alert_sent": np.array([
|
|
419
|
-
"alert_count": np.array([
|
|
420
|
-
"last_recovery_sent": np.array([
|
|
383
|
+
"last_alert_sent": np.array([None]),
|
|
384
|
+
"alert_count": np.array([0], dtype=np.uint32),
|
|
385
|
+
"last_recovery_sent": np.array([None]),
|
|
421
386
|
}
|
|
422
387
|
|
|
423
388
|
self.insert_batch(
|
|
@@ -9,14 +9,16 @@ methods underneath. It does NOT duplicate logic - just provides semantic wrapper
|
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
import json
|
|
12
|
-
from datetime import datetime
|
|
12
|
+
from datetime import datetime
|
|
13
13
|
from typing import Dict, List, Optional
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
|
+
from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc, to_aware_utc
|
|
16
17
|
|
|
17
18
|
from detectkit.database.manager import BaseDatabaseManager
|
|
18
19
|
from detectkit.database.tables import (
|
|
19
20
|
INTERNAL_TABLES,
|
|
21
|
+
TABLE_ALERT_STATES,
|
|
20
22
|
TABLE_DATAPOINTS,
|
|
21
23
|
TABLE_DETECTIONS,
|
|
22
24
|
TABLE_METRICS,
|
|
@@ -127,7 +129,7 @@ class InternalTablesManager:
|
|
|
127
129
|
num_rows, ",".join(seasonality_columns), dtype=object
|
|
128
130
|
),
|
|
129
131
|
"created_at": np.full(
|
|
130
|
-
num_rows,
|
|
132
|
+
num_rows, now_utc_naive(), dtype="datetime64[ms]"
|
|
131
133
|
),
|
|
132
134
|
}
|
|
133
135
|
|
|
@@ -198,7 +200,7 @@ class InternalTablesManager:
|
|
|
198
200
|
"detector_params": np.full(num_rows, detector_params, dtype=object),
|
|
199
201
|
"detection_metadata": data["detection_metadata"],
|
|
200
202
|
"created_at": np.full(
|
|
201
|
-
num_rows,
|
|
203
|
+
num_rows, now_utc_naive(), dtype="datetime64[ms]"
|
|
202
204
|
),
|
|
203
205
|
}
|
|
204
206
|
|
|
@@ -343,9 +345,7 @@ class InternalTablesManager:
|
|
|
343
345
|
|
|
344
346
|
# Convert timezone-aware timestamps to naive to avoid numpy warning
|
|
345
347
|
timestamps = [
|
|
346
|
-
row["timestamp"]
|
|
347
|
-
if hasattr(row["timestamp"], 'tzinfo') and row["timestamp"].tzinfo
|
|
348
|
-
else row["timestamp"]
|
|
348
|
+
to_naive_utc(row["timestamp"])
|
|
349
349
|
for row in results
|
|
350
350
|
]
|
|
351
351
|
values = [row["value"] for row in results]
|
|
@@ -560,8 +560,7 @@ class InternalTablesManager:
|
|
|
560
560
|
ts_value = ts
|
|
561
561
|
else:
|
|
562
562
|
# datetime object - normalize and convert to string
|
|
563
|
-
|
|
564
|
-
ts = ts.replace(tzinfo=None)
|
|
563
|
+
ts = to_naive_utc(ts)
|
|
565
564
|
ts_key = ts.isoformat()
|
|
566
565
|
ts_value = ts
|
|
567
566
|
|
|
@@ -791,7 +790,7 @@ class InternalTablesManager:
|
|
|
791
790
|
)
|
|
792
791
|
|
|
793
792
|
# Get current UTC time (naive for numpy compatibility)
|
|
794
|
-
now =
|
|
793
|
+
now = now_utc_naive()
|
|
795
794
|
|
|
796
795
|
# Parse loading_start_time if provided
|
|
797
796
|
loading_start_time_dt = None
|
|
@@ -801,7 +800,7 @@ class InternalTablesManager:
|
|
|
801
800
|
loading_start_time_dt = dt.strptime(
|
|
802
801
|
metric_config.loading_start_time,
|
|
803
802
|
"%Y-%m-%d %H:%M:%S"
|
|
804
|
-
)
|
|
803
|
+
) # already naive UTC from config string
|
|
805
804
|
except (ValueError, AttributeError):
|
|
806
805
|
# If parsing fails, leave as None
|
|
807
806
|
pass
|
|
@@ -858,207 +857,206 @@ class InternalTablesManager:
|
|
|
858
857
|
|
|
859
858
|
def get_last_alert_timestamp(
|
|
860
859
|
self,
|
|
861
|
-
metric_name: str
|
|
860
|
+
metric_name: str,
|
|
861
|
+
alert_config_id: str,
|
|
862
862
|
) -> Optional[datetime]:
|
|
863
863
|
"""
|
|
864
|
-
Get timestamp of last sent alert for a
|
|
865
|
-
|
|
866
|
-
Used for alert cooldown tracking - prevents sending alerts
|
|
867
|
-
too frequently for the same metric.
|
|
864
|
+
Get timestamp of last sent alert for a specific alerting config.
|
|
868
865
|
|
|
869
866
|
Args:
|
|
870
867
|
metric_name: Metric identifier
|
|
868
|
+
alert_config_id: MD5 hash of alerting config params
|
|
871
869
|
|
|
872
870
|
Returns:
|
|
873
871
|
Timestamp of last sent alert, or None if never sent
|
|
874
|
-
|
|
875
|
-
Example:
|
|
876
|
-
>>> last_sent = internal.get_last_alert_timestamp("cpu_usage")
|
|
877
|
-
>>> if last_sent:
|
|
878
|
-
... elapsed = (datetime.utcnow() - last_sent).total_seconds()
|
|
879
|
-
... print(f"Last alert sent {elapsed}s ago")
|
|
880
872
|
"""
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
)
|
|
873
|
+
state = self.get_alert_state(metric_name, alert_config_id)
|
|
874
|
+
return state["last_alert_sent"]
|
|
884
875
|
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
LIMIT 1
|
|
876
|
+
def update_alert_timestamp(
|
|
877
|
+
self,
|
|
878
|
+
metric_name: str,
|
|
879
|
+
alert_config_id: str,
|
|
880
|
+
timestamp: datetime,
|
|
881
|
+
increment_count: bool = True,
|
|
882
|
+
) -> int:
|
|
893
883
|
"""
|
|
884
|
+
Update last_alert_sent timestamp for a specific alerting config.
|
|
894
885
|
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
886
|
+
Args:
|
|
887
|
+
metric_name: Metric identifier
|
|
888
|
+
alert_config_id: MD5 hash of alerting config params
|
|
889
|
+
timestamp: Timestamp when alert was sent
|
|
890
|
+
increment_count: Whether to increment alert_count (default: True)
|
|
899
891
|
|
|
900
|
-
|
|
901
|
-
|
|
892
|
+
Returns:
|
|
893
|
+
1 (always)
|
|
894
|
+
"""
|
|
895
|
+
self.upsert_alert_state(
|
|
896
|
+
metric_name=metric_name,
|
|
897
|
+
alert_config_id=alert_config_id,
|
|
898
|
+
last_alert_sent=timestamp,
|
|
899
|
+
increment_count=increment_count,
|
|
900
|
+
)
|
|
901
|
+
return 1
|
|
902
902
|
|
|
903
|
-
|
|
903
|
+
def get_last_recovery_timestamp(
|
|
904
|
+
self,
|
|
905
|
+
metric_name: str,
|
|
906
|
+
alert_config_id: str,
|
|
907
|
+
) -> Optional[datetime]:
|
|
908
|
+
"""
|
|
909
|
+
Get timestamp of last sent recovery notification for a specific alerting config.
|
|
904
910
|
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
911
|
+
Args:
|
|
912
|
+
metric_name: Metric identifier
|
|
913
|
+
alert_config_id: MD5 hash of alerting config params
|
|
908
914
|
|
|
909
|
-
|
|
915
|
+
Returns:
|
|
916
|
+
Timestamp of last sent recovery, or None if never sent
|
|
917
|
+
"""
|
|
918
|
+
state = self.get_alert_state(metric_name, alert_config_id)
|
|
919
|
+
return state["last_recovery_sent"]
|
|
910
920
|
|
|
911
|
-
def
|
|
921
|
+
def update_recovery_timestamp(
|
|
912
922
|
self,
|
|
913
923
|
metric_name: str,
|
|
924
|
+
alert_config_id: str,
|
|
914
925
|
timestamp: datetime,
|
|
915
|
-
increment_count: bool = True
|
|
916
926
|
) -> int:
|
|
917
927
|
"""
|
|
918
|
-
Update
|
|
919
|
-
|
|
920
|
-
Called after successfully sending an alert to track cooldown state.
|
|
928
|
+
Update last_recovery_sent timestamp for a specific alerting config.
|
|
921
929
|
|
|
922
930
|
Args:
|
|
923
931
|
metric_name: Metric identifier
|
|
924
|
-
|
|
925
|
-
|
|
932
|
+
alert_config_id: MD5 hash of alerting config params
|
|
933
|
+
timestamp: Timestamp when recovery was sent
|
|
926
934
|
|
|
927
935
|
Returns:
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
Example:
|
|
931
|
-
>>> # After sending alert
|
|
932
|
-
>>> internal.update_alert_timestamp(
|
|
933
|
-
... "cpu_usage",
|
|
934
|
-
... datetime.utcnow(),
|
|
935
|
-
... increment_count=True
|
|
936
|
-
... )
|
|
936
|
+
1 (always)
|
|
937
937
|
"""
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
# Normalize timestamp to naive if needed
|
|
943
|
-
if hasattr(timestamp, 'tzinfo') and timestamp.tzinfo is not None:
|
|
944
|
-
timestamp = timestamp.replace(tzinfo=None)
|
|
945
|
-
|
|
946
|
-
if increment_count:
|
|
947
|
-
# Update with alert_count increment
|
|
948
|
-
update_query = f"""
|
|
949
|
-
ALTER TABLE {full_table_name}
|
|
950
|
-
UPDATE
|
|
951
|
-
last_alert_sent = %(timestamp)s,
|
|
952
|
-
alert_count = alert_count + 1,
|
|
953
|
-
updated_at = %(timestamp)s
|
|
954
|
-
WHERE metric_name = %(metric_name)s
|
|
955
|
-
AND detector_id = 'pipeline'
|
|
956
|
-
AND process_type = 'pipeline'
|
|
957
|
-
SETTINGS mutations_sync = 1
|
|
958
|
-
"""
|
|
959
|
-
else:
|
|
960
|
-
# Update without alert_count increment
|
|
961
|
-
update_query = f"""
|
|
962
|
-
ALTER TABLE {full_table_name}
|
|
963
|
-
UPDATE
|
|
964
|
-
last_alert_sent = %(timestamp)s,
|
|
965
|
-
updated_at = %(timestamp)s
|
|
966
|
-
WHERE metric_name = %(metric_name)s
|
|
967
|
-
AND detector_id = 'pipeline'
|
|
968
|
-
AND process_type = 'pipeline'
|
|
969
|
-
SETTINGS mutations_sync = 1
|
|
970
|
-
"""
|
|
971
|
-
|
|
972
|
-
self._manager.execute_query(
|
|
973
|
-
update_query,
|
|
974
|
-
params={
|
|
975
|
-
"metric_name": metric_name,
|
|
976
|
-
"timestamp": timestamp
|
|
977
|
-
}
|
|
938
|
+
self.upsert_alert_state(
|
|
939
|
+
metric_name=metric_name,
|
|
940
|
+
alert_config_id=alert_config_id,
|
|
941
|
+
last_recovery_sent=timestamp,
|
|
978
942
|
)
|
|
979
|
-
|
|
980
943
|
return 1
|
|
981
944
|
|
|
982
|
-
|
|
945
|
+
# ─── Alert States (_dtk_alert_states) ───────────────────────────────────
|
|
946
|
+
|
|
947
|
+
def get_alert_state(
|
|
983
948
|
self,
|
|
984
|
-
metric_name: str
|
|
985
|
-
|
|
949
|
+
metric_name: str,
|
|
950
|
+
alert_config_id: str,
|
|
951
|
+
) -> Dict:
|
|
986
952
|
"""
|
|
987
|
-
Get
|
|
953
|
+
Get alert state for a specific alerting config.
|
|
988
954
|
|
|
989
955
|
Args:
|
|
990
956
|
metric_name: Metric identifier
|
|
957
|
+
alert_config_id: MD5 hash of alerting config params
|
|
991
958
|
|
|
992
959
|
Returns:
|
|
993
|
-
|
|
960
|
+
Dict with keys:
|
|
961
|
+
- last_alert_sent: datetime or None
|
|
962
|
+
- last_recovery_sent: datetime or None
|
|
963
|
+
- alert_count: int
|
|
994
964
|
"""
|
|
995
965
|
full_table_name = self._manager.get_full_table_name(
|
|
996
|
-
|
|
966
|
+
TABLE_ALERT_STATES, use_internal=True
|
|
997
967
|
)
|
|
998
968
|
|
|
999
969
|
query = f"""
|
|
1000
|
-
SELECT last_recovery_sent
|
|
970
|
+
SELECT last_alert_sent, last_recovery_sent, alert_count
|
|
1001
971
|
FROM {full_table_name}
|
|
972
|
+
FINAL
|
|
1002
973
|
WHERE metric_name = %(metric_name)s
|
|
1003
|
-
AND
|
|
1004
|
-
AND process_type = 'pipeline'
|
|
974
|
+
AND alert_config_id = %(alert_config_id)s
|
|
1005
975
|
LIMIT 1
|
|
1006
976
|
"""
|
|
1007
977
|
|
|
1008
978
|
results = self._manager.execute_query(
|
|
1009
979
|
query,
|
|
1010
|
-
params={
|
|
980
|
+
params={
|
|
981
|
+
"metric_name": metric_name,
|
|
982
|
+
"alert_config_id": alert_config_id,
|
|
983
|
+
}
|
|
1011
984
|
)
|
|
1012
985
|
|
|
1013
|
-
if not results
|
|
1014
|
-
return
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
last_sent = last_sent.replace(tzinfo=None)
|
|
986
|
+
if not results:
|
|
987
|
+
return {
|
|
988
|
+
"last_alert_sent": None,
|
|
989
|
+
"last_recovery_sent": None,
|
|
990
|
+
"alert_count": 0,
|
|
991
|
+
}
|
|
1020
992
|
|
|
1021
|
-
|
|
993
|
+
row = results[0]
|
|
994
|
+
return {
|
|
995
|
+
"last_alert_sent": to_naive_utc(row.get("last_alert_sent")),
|
|
996
|
+
"last_recovery_sent": to_naive_utc(row.get("last_recovery_sent")),
|
|
997
|
+
"alert_count": row.get("alert_count", 0) or 0,
|
|
998
|
+
}
|
|
1022
999
|
|
|
1023
|
-
def
|
|
1000
|
+
def upsert_alert_state(
|
|
1024
1001
|
self,
|
|
1025
1002
|
metric_name: str,
|
|
1026
|
-
|
|
1027
|
-
|
|
1003
|
+
alert_config_id: str,
|
|
1004
|
+
last_alert_sent: Optional[datetime] = None,
|
|
1005
|
+
last_recovery_sent: Optional[datetime] = None,
|
|
1006
|
+
increment_count: bool = False,
|
|
1007
|
+
) -> None:
|
|
1028
1008
|
"""
|
|
1029
|
-
|
|
1009
|
+
Upsert alert state for a specific alerting config.
|
|
1010
|
+
|
|
1011
|
+
Uses SELECT -> DELETE -> INSERT pattern to handle new rows and updates.
|
|
1030
1012
|
|
|
1031
1013
|
Args:
|
|
1032
1014
|
metric_name: Metric identifier
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1015
|
+
alert_config_id: MD5 hash of alerting config params
|
|
1016
|
+
last_alert_sent: New last_alert_sent timestamp (None = keep existing)
|
|
1017
|
+
last_recovery_sent: New last_recovery_sent timestamp (None = keep existing)
|
|
1018
|
+
increment_count: Whether to increment alert_count by 1
|
|
1037
1019
|
"""
|
|
1038
1020
|
full_table_name = self._manager.get_full_table_name(
|
|
1039
|
-
|
|
1021
|
+
TABLE_ALERT_STATES, use_internal=True
|
|
1040
1022
|
)
|
|
1041
1023
|
|
|
1042
|
-
|
|
1043
|
-
|
|
1024
|
+
# Read existing state to preserve fields not being updated
|
|
1025
|
+
existing = self.get_alert_state(metric_name, alert_config_id)
|
|
1026
|
+
|
|
1027
|
+
now = now_utc_naive()
|
|
1028
|
+
|
|
1029
|
+
new_last_alert = to_naive_utc(last_alert_sent) if last_alert_sent is not None else existing["last_alert_sent"]
|
|
1030
|
+
new_last_recovery = to_naive_utc(last_recovery_sent) if last_recovery_sent is not None else existing["last_recovery_sent"]
|
|
1031
|
+
new_alert_count = existing["alert_count"] + 1 if increment_count else existing["alert_count"]
|
|
1044
1032
|
|
|
1045
|
-
|
|
1033
|
+
# Delete existing row
|
|
1034
|
+
delete_query = f"""
|
|
1046
1035
|
ALTER TABLE {full_table_name}
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
updated_at = %(timestamp)s
|
|
1050
|
-
WHERE metric_name = %(metric_name)s
|
|
1051
|
-
AND detector_id = 'pipeline'
|
|
1052
|
-
AND process_type = 'pipeline'
|
|
1036
|
+
DELETE WHERE metric_name = %(metric_name)s
|
|
1037
|
+
AND alert_config_id = %(alert_config_id)s
|
|
1053
1038
|
SETTINGS mutations_sync = 1
|
|
1054
1039
|
"""
|
|
1055
|
-
|
|
1056
1040
|
self._manager.execute_query(
|
|
1057
|
-
|
|
1041
|
+
delete_query,
|
|
1058
1042
|
params={
|
|
1059
1043
|
"metric_name": metric_name,
|
|
1060
|
-
"
|
|
1044
|
+
"alert_config_id": alert_config_id,
|
|
1061
1045
|
}
|
|
1062
1046
|
)
|
|
1063
1047
|
|
|
1064
|
-
|
|
1048
|
+
# Insert new row
|
|
1049
|
+
insert_data = {
|
|
1050
|
+
"metric_name": np.array([metric_name]),
|
|
1051
|
+
"alert_config_id": np.array([alert_config_id]),
|
|
1052
|
+
"last_alert_sent": np.array([new_last_alert], dtype="datetime64[ms]") if new_last_alert else np.array([None]),
|
|
1053
|
+
"last_recovery_sent": np.array([new_last_recovery], dtype="datetime64[ms]") if new_last_recovery else np.array([None]),
|
|
1054
|
+
"alert_count": np.array([new_alert_count], dtype=np.uint32),
|
|
1055
|
+
"updated_at": np.array([now], dtype="datetime64[ms]"),
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
self._manager.insert_batch(
|
|
1059
|
+
full_table_name,
|
|
1060
|
+
insert_data,
|
|
1061
|
+
conflict_strategy="ignore",
|
|
1062
|
+
)
|
|
@@ -140,6 +140,48 @@ def get_tasks_table_model() -> TableModel:
|
|
|
140
140
|
)
|
|
141
141
|
|
|
142
142
|
|
|
143
|
+
def get_alert_states_table_model() -> TableModel:
    """
    Build the TableModel describing the _dtk_alert_states table.

    Alert state is tracked per alerting config rather than per metric:
    every alerting config block in a metric's YAML owns one row, keyed by
    a hash of that config's parameters.

    Schema:
        - metric_name: Metric identifier
        - alert_config_id: MD5 hash of alerting config params (channels, conditions, etc.)
        - last_alert_sent: Timestamp of last sent alert (nullable)
        - last_recovery_sent: Timestamp of last sent recovery notification (nullable)
        - alert_count: Total alerts sent for this config
        - updated_at: Last update timestamp

    Primary Key: (metric_name, alert_config_id)
    Engine: ReplacingMergeTree(updated_at)

    Returns:
        TableModel for the alert states table.
    """
    # Both "last sent" columns share the same nullable timestamp type.
    nullable_timestamp = "Nullable(DateTime64(3, 'UTC'))"
    key_columns = ["metric_name", "alert_config_id"]

    column_definitions = [
        ColumnDefinition("metric_name", "String"),
        ColumnDefinition("alert_config_id", "String"),
        ColumnDefinition("last_alert_sent", nullable_timestamp, nullable=True),
        ColumnDefinition("last_recovery_sent", nullable_timestamp, nullable=True),
        ColumnDefinition("alert_count", "UInt32", default="0"),
        ColumnDefinition("updated_at", "DateTime64(3, 'UTC')"),
    ]

    # Pass independent list objects so the model never shares one list
    # between its primary key and its sort key.
    return TableModel(
        columns=column_definitions,
        primary_key=list(key_columns),
        engine="ReplacingMergeTree(updated_at)",
        order_by=list(key_columns),
    )
|
|
183
|
+
|
|
184
|
+
|
|
143
185
|
def get_metrics_table_model() -> TableModel:
|
|
144
186
|
"""
|
|
145
187
|
Get TableModel for _dtk_metrics table.
|
|
@@ -203,6 +245,7 @@ TABLE_DATAPOINTS = "_dtk_datapoints"
|
|
|
203
245
|
TABLE_DETECTIONS = "_dtk_detections"
|
|
204
246
|
TABLE_TASKS = "_dtk_tasks"
|
|
205
247
|
TABLE_METRICS = "_dtk_metrics"
|
|
248
|
+
TABLE_ALERT_STATES = "_dtk_alert_states"
|
|
206
249
|
|
|
207
250
|
# Map of table names to model factories
|
|
208
251
|
INTERNAL_TABLES = {
|
|
@@ -210,4 +253,5 @@ INTERNAL_TABLES = {
|
|
|
210
253
|
TABLE_DETECTIONS: get_detections_table_model,
|
|
211
254
|
TABLE_TASKS: get_tasks_table_model,
|
|
212
255
|
TABLE_METRICS: get_metrics_table_model,
|
|
256
|
+
TABLE_ALERT_STATES: get_alert_states_table_model,
|
|
213
257
|
}
|
|
@@ -11,6 +11,7 @@ Loads time-series data from databases with:
|
|
|
11
11
|
|
|
12
12
|
from datetime import datetime, timedelta, timezone
|
|
13
13
|
from typing import Dict, List, Optional
|
|
14
|
+
from detectkit.utils.datetime_utils import now_utc_naive, to_naive_utc
|
|
14
15
|
|
|
15
16
|
import numpy as np
|
|
16
17
|
|
|
@@ -118,12 +119,9 @@ class MetricLoader:
|
|
|
118
119
|
>>> print(data["timestamp"])
|
|
119
120
|
>>> print(data["value"])
|
|
120
121
|
"""
|
|
121
|
-
# Normalize datetimes to naive (
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
from_date = from_date.replace(tzinfo=None)
|
|
125
|
-
if to_date.tzinfo is not None:
|
|
126
|
-
to_date = to_date.replace(tzinfo=None)
|
|
122
|
+
# Normalize datetimes to naive UTC (ClickHouse returns naive UTC for DateTime64)
|
|
123
|
+
from_date = to_naive_utc(from_date)
|
|
124
|
+
to_date = to_naive_utc(to_date)
|
|
127
125
|
|
|
128
126
|
# Get interval
|
|
129
127
|
interval = self.config.get_interval()
|
|
@@ -344,7 +342,7 @@ class MetricLoader:
|
|
|
344
342
|
# Parse loading_start_time string (format: "YYYY-MM-DD HH:MM:SS" in UTC)
|
|
345
343
|
from_date = datetime.strptime(
|
|
346
344
|
self.config.loading_start_time, "%Y-%m-%d %H:%M:%S"
|
|
347
|
-
)
|
|
345
|
+
) # naive UTC from config string
|
|
348
346
|
else:
|
|
349
347
|
# No data and no loading_start_time - need to specify from_date
|
|
350
348
|
raise ValueError(
|
|
@@ -353,7 +351,7 @@ class MetricLoader:
|
|
|
353
351
|
)
|
|
354
352
|
|
|
355
353
|
if to_date is None:
|
|
356
|
-
to_date =
|
|
354
|
+
to_date = now_utc_naive()
|
|
357
355
|
|
|
358
356
|
# Load and save
|
|
359
357
|
data = self.load(from_date, to_date, fill_gaps=True)
|
|
@@ -8,8 +8,10 @@ Orchestrates the complete workflow:
|
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
from datetime import datetime, timezone, timedelta
|
|
11
|
+
from detectkit.utils.datetime_utils import now_utc, now_utc_naive, to_naive_utc, to_aware_utc
|
|
11
12
|
from enum import Enum
|
|
12
13
|
from typing import Dict, List, Optional
|
|
14
|
+
import hashlib
|
|
13
15
|
import json
|
|
14
16
|
|
|
15
17
|
import click
|
|
@@ -30,6 +32,33 @@ from detectkit.detectors.factory import DetectorFactory
|
|
|
30
32
|
from detectkit.loaders.metric_loader import MetricLoader
|
|
31
33
|
|
|
32
34
|
|
|
35
|
+
def _make_alert_config_id(alerting_config) -> str:
    """
    Derive a stable identifier for one alerting config block.

    The identifier is built from every parameter listed below, so:
    - configs sharing channels but differing in conditions get different IDs
    - an unchanged config yields the same ID on every run
    - editing any hashed parameter yields a new ID (and therefore fresh state)

    Args:
        alerting_config: Object exposing ``channels``, ``min_detectors``,
            ``direction``, ``consecutive_anomalies``, ``alert_cooldown`` and
            ``cooldown_reset_on_recovery`` attributes.

    Returns:
        16-character hex string (MD5 digest, truncated).
    """
    identity = {}
    # Channel order in the config is irrelevant to identity, so sort it.
    identity["channels"] = sorted(alerting_config.channels)
    identity["min_detectors"] = alerting_config.min_detectors
    identity["direction"] = alerting_config.direction
    identity["consecutive_anomalies"] = alerting_config.consecutive_anomalies

    # Any falsy cooldown (None, empty string, 0) is hashed as null.
    cooldown = alerting_config.alert_cooldown
    identity["alert_cooldown"] = str(cooldown) if cooldown else None
    identity["cooldown_reset_on_recovery"] = alerting_config.cooldown_reset_on_recovery

    # sort_keys makes the serialized form independent of insertion order.
    payload = json.dumps(identity, sort_keys=True).encode()
    digest = hashlib.md5(payload).hexdigest()
    return digest[:16]
|
|
60
|
+
|
|
61
|
+
|
|
33
62
|
class PipelineStep(str, Enum):
|
|
34
63
|
"""Pipeline execution steps."""
|
|
35
64
|
|
|
@@ -268,7 +297,7 @@ class TaskManager:
|
|
|
268
297
|
if config.loading_start_time:
|
|
269
298
|
actual_from = datetime.strptime(
|
|
270
299
|
config.loading_start_time, "%Y-%m-%d %H:%M:%S"
|
|
271
|
-
)
|
|
300
|
+
) # naive UTC from config string
|
|
272
301
|
click.echo(f" │ Starting fresh from: {config.loading_start_time}")
|
|
273
302
|
else:
|
|
274
303
|
raise ValueError(
|
|
@@ -277,13 +306,12 @@ class TaskManager:
|
|
|
277
306
|
)
|
|
278
307
|
|
|
279
308
|
if actual_to is None:
|
|
280
|
-
actual_to =
|
|
309
|
+
actual_to = now_utc_naive()
|
|
310
|
+
else:
|
|
311
|
+
actual_to = to_naive_utc(actual_to)
|
|
281
312
|
|
|
282
313
|
# Normalize to naive UTC (ClickHouse returns aware UTC for DateTime64(3, 'UTC'))
|
|
283
|
-
|
|
284
|
-
actual_from = actual_from.replace(tzinfo=None)
|
|
285
|
-
if actual_to.tzinfo is not None:
|
|
286
|
-
actual_to = actual_to.replace(tzinfo=None)
|
|
314
|
+
actual_from = to_naive_utc(actual_from)
|
|
287
315
|
|
|
288
316
|
# Guard: next interval hasn't arrived yet
|
|
289
317
|
if actual_from >= actual_to:
|
|
@@ -379,15 +407,8 @@ class TaskManager:
|
|
|
379
407
|
click.echo(f" │ Running {len(config.detectors)} detector(s)...")
|
|
380
408
|
|
|
381
409
|
# Determine to_date if not specified
|
|
382
|
-
actual_to = to_date
|
|
383
|
-
|
|
384
|
-
if actual_to and actual_to.tzinfo is not None:
|
|
385
|
-
actual_to = actual_to.replace(tzinfo=None)
|
|
386
|
-
|
|
387
|
-
# Normalize from_date to naive
|
|
388
|
-
normalized_from_date = from_date
|
|
389
|
-
if normalized_from_date and normalized_from_date.tzinfo is not None:
|
|
390
|
-
normalized_from_date = normalized_from_date.replace(tzinfo=None)
|
|
410
|
+
actual_to = to_naive_utc(to_date) if to_date else now_utc_naive()
|
|
411
|
+
normalized_from_date = to_naive_utc(from_date)
|
|
391
412
|
|
|
392
413
|
# Run each detector
|
|
393
414
|
for idx, detector_config in enumerate(config.detectors, 1):
|
|
@@ -425,9 +446,7 @@ class TaskManager:
|
|
|
425
446
|
metric_name=config.name,
|
|
426
447
|
detector_id=detector_id
|
|
427
448
|
)
|
|
428
|
-
|
|
429
|
-
if last_detection_ts and last_detection_ts.tzinfo is not None:
|
|
430
|
-
last_detection_ts = last_detection_ts.replace(tzinfo=None)
|
|
449
|
+
last_detection_ts = to_naive_utc(last_detection_ts)
|
|
431
450
|
|
|
432
451
|
# Determine actual from_date
|
|
433
452
|
actual_from = normalized_from_date
|
|
@@ -442,17 +461,15 @@ class TaskManager:
|
|
|
442
461
|
# Apply start_time filter if configured
|
|
443
462
|
start_time_str = detector_config.get_start_time()
|
|
444
463
|
if start_time_str:
|
|
445
|
-
start_time =
|
|
446
|
-
|
|
447
|
-
|
|
464
|
+
start_time = to_naive_utc(
|
|
465
|
+
datetime.fromisoformat(start_time_str.replace('Z', '+00:00'))
|
|
466
|
+
)
|
|
448
467
|
if actual_from:
|
|
449
468
|
actual_from = max(actual_from, start_time)
|
|
450
469
|
else:
|
|
451
470
|
actual_from = start_time
|
|
452
471
|
|
|
453
|
-
|
|
454
|
-
if actual_from and actual_from.tzinfo is not None:
|
|
455
|
-
actual_from = actual_from.replace(tzinfo=None)
|
|
472
|
+
actual_from = to_naive_utc(actual_from)
|
|
456
473
|
|
|
457
474
|
# Skip if nothing to detect
|
|
458
475
|
if not actual_from or actual_from >= actual_to:
|
|
@@ -589,9 +606,12 @@ class TaskManager:
|
|
|
589
606
|
|
|
590
607
|
click.echo(f" │ Checking alert conditions...")
|
|
591
608
|
|
|
609
|
+
alert_config_id = _make_alert_config_id(alerting_config)
|
|
610
|
+
|
|
592
611
|
orchestrator = AlertOrchestrator(
|
|
593
612
|
metric_name=config.name,
|
|
594
613
|
interval=interval,
|
|
614
|
+
alert_config_id=alert_config_id,
|
|
595
615
|
conditions=AlertConditions(
|
|
596
616
|
min_detectors=alerting_config.min_detectors,
|
|
597
617
|
direction=alerting_config.direction,
|
|
@@ -7,6 +7,12 @@ from detectkit.utils.stats import (
|
|
|
7
7
|
weighted_percentile,
|
|
8
8
|
weighted_std,
|
|
9
9
|
)
|
|
10
|
+
from detectkit.utils.datetime_utils import (
|
|
11
|
+
now_utc,
|
|
12
|
+
now_utc_naive,
|
|
13
|
+
to_naive_utc,
|
|
14
|
+
to_aware_utc,
|
|
15
|
+
)
|
|
10
16
|
|
|
11
17
|
__all__ = [
|
|
12
18
|
"weighted_percentile",
|
|
@@ -14,4 +20,8 @@ __all__ = [
|
|
|
14
20
|
"weighted_mad",
|
|
15
21
|
"weighted_mean",
|
|
16
22
|
"weighted_std",
|
|
23
|
+
"now_utc",
|
|
24
|
+
"now_utc_naive",
|
|
25
|
+
"to_naive_utc",
|
|
26
|
+
"to_aware_utc",
|
|
17
27
|
]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""UTC datetime utilities.
|
|
2
|
+
|
|
3
|
+
Contract: all internal timestamps are naive UTC (tzinfo=None).
|
|
4
|
+
- ClickHouse DateTime64(3, 'UTC') stores and returns naive UTC
|
|
5
|
+
- numpy datetime64 has no timezone representation
|
|
6
|
+
- Comparisons between timestamps must use the same convention
|
|
7
|
+
|
|
8
|
+
Functions:
|
|
9
|
+
now_utc() -> aware UTC datetime (for calculations requiring timezone)
|
|
10
|
+
now_utc_naive() -> naive UTC datetime (for numpy / ClickHouse inserts)
|
|
11
|
+
to_naive_utc() -> normalize any datetime to naive UTC
|
|
12
|
+
to_aware_utc() -> normalize any datetime to aware UTC
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def now_utc() -> datetime:
    """Return the current instant as a timezone-aware datetime in UTC."""
    current = datetime.now(tz=timezone.utc)
    return current
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def now_utc_naive() -> datetime:
    """Return the current UTC time with tzinfo removed (for numpy / ClickHouse)."""
    aware_now = datetime.now(tz=timezone.utc)
    # Drop the timezone only after reading the clock in UTC, so the wall
    # time kept in the naive value is UTC rather than local time.
    return aware_now.replace(tzinfo=None)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def to_naive_utc(dt: Optional[datetime]) -> Optional[datetime]:
    """Normalize a datetime to naive UTC.

    Naive inputs are assumed to already be in UTC and are returned
    unchanged. Aware inputs are first converted to UTC and only then
    stripped of their tzinfo, so a value such as ``12:00+03:00`` becomes
    ``09:00`` rather than keeping its local wall time.

    Args:
        dt: datetime object (aware or naive) or None

    Returns:
        Naive UTC datetime, or None if input is None
    """
    if dt is None:
        return None
    if dt.tzinfo is None:
        return dt
    # A tzinfo whose utcoffset() is None makes the value effectively naive;
    # astimezone() would then assume local time, so skip the conversion.
    if dt.utcoffset() is not None:
        dt = dt.astimezone(timezone.utc)
    return dt.replace(tzinfo=None)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def to_aware_utc(dt: Optional[datetime]) -> Optional[datetime]:
    """Normalize a datetime to timezone-aware UTC.

    Naive inputs are assumed to be UTC and simply get the UTC timezone
    attached. Aware inputs are converted to UTC, so the result always
    carries a zero offset while still denoting the same instant.

    Args:
        dt: datetime object (aware or naive) or None

    Returns:
        Timezone-aware UTC datetime, or None if input is None
    """
    if dt is None:
        return None
    # A missing tzinfo, or one that reports no offset, means the value is
    # effectively naive: label it as UTC instead of converting it.
    if dt.tzinfo is None or dt.utcoffset() is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|