detectkit-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- detectkit/__init__.py +17 -0
- detectkit/alerting/__init__.py +13 -0
- detectkit/alerting/channels/__init__.py +21 -0
- detectkit/alerting/channels/base.py +191 -0
- detectkit/alerting/channels/email.py +146 -0
- detectkit/alerting/channels/factory.py +193 -0
- detectkit/alerting/channels/mattermost.py +53 -0
- detectkit/alerting/channels/slack.py +55 -0
- detectkit/alerting/channels/telegram.py +110 -0
- detectkit/alerting/channels/webhook.py +139 -0
- detectkit/alerting/orchestrator.py +368 -0
- detectkit/cli/__init__.py +1 -0
- detectkit/cli/commands/__init__.py +1 -0
- detectkit/cli/commands/init.py +282 -0
- detectkit/cli/commands/run.py +427 -0
- detectkit/cli/commands/test_alert.py +184 -0
- detectkit/cli/main.py +186 -0
- detectkit/config/__init__.py +30 -0
- detectkit/config/metric_config.py +467 -0
- detectkit/config/profile.py +285 -0
- detectkit/config/project_config.py +164 -0
- detectkit/core/__init__.py +6 -0
- detectkit/core/interval.py +132 -0
- detectkit/core/models.py +106 -0
- detectkit/database/__init__.py +27 -0
- detectkit/database/clickhouse_manager.py +385 -0
- detectkit/database/internal_tables.py +581 -0
- detectkit/database/manager.py +324 -0
- detectkit/database/tables.py +134 -0
- detectkit/detectors/__init__.py +6 -0
- detectkit/detectors/base.py +222 -0
- detectkit/detectors/factory.py +138 -0
- detectkit/detectors/statistical/__init__.py +8 -0
- detectkit/detectors/statistical/iqr.py +230 -0
- detectkit/detectors/statistical/mad.py +423 -0
- detectkit/detectors/statistical/manual_bounds.py +177 -0
- detectkit/detectors/statistical/zscore.py +225 -0
- detectkit/loaders/__init__.py +6 -0
- detectkit/loaders/metric_loader.py +470 -0
- detectkit/loaders/query_template.py +164 -0
- detectkit/orchestration/__init__.py +9 -0
- detectkit/orchestration/task_manager.py +698 -0
- detectkit/utils/__init__.py +1 -0
- detectkit-0.1.0.dist-info/METADATA +231 -0
- detectkit-0.1.0.dist-info/RECORD +49 -0
- detectkit-0.1.0.dist-info/WHEEL +5 -0
- detectkit-0.1.0.dist-info/entry_points.txt +2 -0
- detectkit-0.1.0.dist-info/licenses/LICENSE +21 -0
- detectkit-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Task manager for metric processing pipeline.
|
|
3
|
+
|
|
4
|
+
Orchestrates the complete workflow:
|
|
5
|
+
1. Load data from database
|
|
6
|
+
2. Run anomaly detection
|
|
7
|
+
3. Send alerts
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from datetime import datetime, timezone, timedelta
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
import json
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
from detectkit.alerting.channels.base import AlertData, BaseAlertChannel
|
|
18
|
+
from detectkit.alerting.channels.factory import AlertChannelFactory
|
|
19
|
+
from detectkit.alerting.orchestrator import (
|
|
20
|
+
AlertConditions,
|
|
21
|
+
AlertOrchestrator,
|
|
22
|
+
DetectionRecord,
|
|
23
|
+
)
|
|
24
|
+
from detectkit.config.metric_config import MetricConfig
|
|
25
|
+
from detectkit.core.interval import Interval
|
|
26
|
+
from detectkit.database.internal_tables import InternalTablesManager
|
|
27
|
+
from detectkit.detectors.base import BaseDetector
|
|
28
|
+
from detectkit.detectors.factory import DetectorFactory
|
|
29
|
+
from detectkit.loaders.metric_loader import MetricLoader
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PipelineStep(str, Enum):
    """
    Stages of the metric pipeline, declared in execution order.

    Members are also plain strings, so they compare equal to their
    lowercase names (``PipelineStep.LOAD == "load"``).
    """

    # Pull metric datapoints from the source database.
    LOAD = "load"
    # Run the configured anomaly detectors over stored datapoints.
    DETECT = "detect"
    # Evaluate alert conditions and notify the configured channels.
    ALERT = "alert"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class TaskStatus(str, Enum):
    """
    Outcome of a pipeline task.

    Members are also plain strings, so they compare equal to their
    lowercase names (``TaskStatus.FAILED == "failed"``).
    """

    # The task is still executing.
    RUNNING = "running"
    # Every requested step finished without raising.
    SUCCESS = "success"
    # A step (or lock handling) raised an exception.
    FAILED = "failed"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TaskManager:
    """
    Manages the metric processing pipeline.

    Responsibilities:
    - Execute pipeline steps (load, detect, alert)
    - Task locking to prevent concurrent runs
    - Idempotency (resume from last processed timestamp)
    - Error handling and status tracking

    Example:
        >>> config = MetricConfig.from_yaml("metrics/cpu_usage.yml")
        >>> manager = TaskManager(
        ...     internal_manager=internal_tables,
        ...     db_manager=clickhouse,
        ... )
        >>> manager.run_metric(
        ...     config,
        ...     steps=[PipelineStep.LOAD, PipelineStep.DETECT, PipelineStep.ALERT]
        ... )
    """

    # Identity of the lock that guards a whole-pipeline run; it is passed as
    # both ``detector_id`` and ``process_type`` to the lock table.
    _PIPELINE_LOCK_ID = "pipeline"
    # Default lock timeout: 1 hour (can be overridden via ProjectConfig in future).
    _LOCK_TIMEOUT_SECONDS = 3600
    # Detection batch size used when the detector config does not set one.
    _DEFAULT_DETECT_BATCH_SIZE = 1000

    def __init__(
        self,
        internal_manager: InternalTablesManager,
        db_manager,  # BaseDatabaseManager
        profiles_config=None,  # ProfilesConfig (optional for backward compatibility)
    ):
        """
        Initialize task manager.

        Args:
            internal_manager: Manager for internal detectkit tables
            db_manager: Database manager for metric data
            profiles_config: Profiles configuration (for alert channels).
                Without it no alert channel can be created, so the alert
                step sends nothing.
        """
        self.internal = internal_manager
        self.db_manager = db_manager
        self.profiles_config = profiles_config

    @staticmethod
    def _to_naive_utc(dt):
        """
        Return ``dt`` as a naive datetime expressed in UTC.

        Aware datetimes are converted to UTC *before* the tzinfo is dropped,
        so a non-UTC offset is not silently reinterpreted as UTC. ``None``,
        naive datetimes and objects without ``tzinfo`` are returned unchanged.
        """
        if dt is None or getattr(dt, "tzinfo", None) is None:
            return dt
        return dt.astimezone(timezone.utc).replace(tzinfo=None)

    def run_metric(
        self,
        config: MetricConfig,
        steps: Optional[List[PipelineStep]] = None,
        from_date: Optional[datetime] = None,
        to_date: Optional[datetime] = None,
        full_refresh: bool = False,
        force: bool = False,
    ) -> Dict[str, Any]:
        """
        Run metric processing pipeline.

        Exceptions raised by the steps or by lock handling are not
        propagated; they are reported through the returned dict.

        Args:
            config: Metric configuration
            steps: Pipeline steps to execute (default, also used for an
                empty list: all steps)
            from_date: Start date for data loading / detection (optional)
            to_date: End date for data loading / detection (optional)
            full_refresh: Delete all existing data and reload from scratch
            force: Ignore task locks (the lock is neither acquired nor released)

        Returns:
            Dict with execution results:
            {
                "status": TaskStatus.SUCCESS | TaskStatus.FAILED,
                "steps_completed": ["load", "detect"],
                "datapoints_loaded": 1000,
                "anomalies_detected": 5,
                "alerts_sent": 2,
                "error": None | "error message"
            }

        Example:
            >>> result = manager.run_metric(
            ...     config,
            ...     steps=[PipelineStep.LOAD, PipelineStep.DETECT],
            ...     from_date=datetime(2024, 1, 1),
            ... )
            >>> print(result["status"])
            success
        """
        steps = steps or [PipelineStep.LOAD, PipelineStep.DETECT, PipelineStep.ALERT]
        metric_name = config.name

        result: Dict[str, Any] = {
            "status": TaskStatus.SUCCESS,
            "steps_completed": [],
            "datapoints_loaded": 0,
            "anomalies_detected": 0,
            "alerts_sent": 0,
            "error": None,
        }

        try:
            # Step 0: Acquire lock
            if not force:
                lock_acquired = self.internal.acquire_lock(
                    metric_name=metric_name,
                    detector_id=self._PIPELINE_LOCK_ID,  # General pipeline lock
                    process_type=self._PIPELINE_LOCK_ID,  # Full pipeline execution
                    timeout_seconds=self._LOCK_TIMEOUT_SECONDS,
                )
                if not lock_acquired:
                    # Raised before the inner try: a lock we do not hold
                    # must not be released.
                    raise RuntimeError(
                        f"Failed to acquire lock for metric '{metric_name}'. "
                        f"Another task is running. Use --force to override."
                    )

            pipeline_ok = False
            try:
                # Step 1: Load data
                if PipelineStep.LOAD in steps:
                    load_result = self._run_load_step(
                        config, from_date, to_date, full_refresh
                    )
                    result["datapoints_loaded"] = load_result["points_loaded"]
                    result["steps_completed"].append(PipelineStep.LOAD)

                # Step 2: Detect anomalies
                if PipelineStep.DETECT in steps:
                    detect_result = self._run_detect_step(
                        config, from_date, to_date, full_refresh
                    )
                    result["anomalies_detected"] = detect_result["anomalies_count"]
                    result["steps_completed"].append(PipelineStep.DETECT)

                # Step 3: Send alerts
                if PipelineStep.ALERT in steps:
                    alert_result = self._run_alert_step(config)
                    result["alerts_sent"] = alert_result["alerts_sent"]
                    result["steps_completed"].append(PipelineStep.ALERT)

                pipeline_ok = True

            except Exception as exc:
                # Record the failure here, before ``finally`` runs, so the
                # lock is released with the real outcome instead of
                # "completed".
                result["status"] = TaskStatus.FAILED
                result["error"] = str(exc)

            finally:
                # Always release the lock we acquired.
                if not force:
                    self.internal.release_lock(
                        metric_name=metric_name,
                        detector_id=self._PIPELINE_LOCK_ID,
                        process_type=self._PIPELINE_LOCK_ID,
                        status="completed" if pipeline_ok else "failed",
                        error_message=result["error"],
                    )

        except Exception as exc:
            # Lock acquisition or lock release failed.
            result["status"] = TaskStatus.FAILED
            if result["error"] is None:
                # Keep the original step error if one was already recorded.
                result["error"] = str(exc)

        return result

    def _run_load_step(
        self,
        config: MetricConfig,
        from_date: Optional[datetime],
        to_date: Optional[datetime],
        full_refresh: bool,
    ) -> Dict[str, int]:
        """
        Execute data loading step with batching.

        The start of the range is, in order of preference: ``from_date``,
        the last stored datapoint timestamp plus one interval, or
        ``config.loading_start_time`` (parsed as UTC). The end defaults
        to "now" in UTC.

        Args:
            config: Metric configuration
            from_date: Start date (optional)
            to_date: End date (optional)
            full_refresh: Delete existing datapoints in the given range first

        Returns:
            Dict with {"points_loaded": N}

        Raises:
            ValueError: If no start can be determined, or if the configured
                batch size / interval cannot advance the batch loop.
        """
        loader = MetricLoader(
            config=config,
            db_manager=self.db_manager,
            internal_manager=self.internal,
        )
        # Resolved before anything is deleted so an invalid interval cannot
        # leave the metric with its data wiped and nothing reloaded.
        interval = config.get_interval()

        if full_refresh:
            # Delete existing data for this metric
            self.internal.delete_datapoints(
                metric_name=config.name,
                from_timestamp=from_date,
                to_timestamp=to_date,
            )

        # Determine actual from_date and to_date
        actual_from = from_date
        if actual_from is None:
            last_ts = self.internal.get_last_datapoint_timestamp(config.name)
            if last_ts:
                # Start from next interval after last timestamp
                actual_from = last_ts + timedelta(seconds=interval.seconds)
            elif config.loading_start_time:
                # No data yet - use loading_start_time from config
                actual_from = datetime.strptime(
                    config.loading_start_time, "%Y-%m-%d %H:%M:%S"
                ).replace(tzinfo=timezone.utc)
            else:
                raise ValueError(
                    "No existing data and no loading_start_time configured. "
                    "Please specify from_date or set loading_start_time in config."
                )

        actual_to = to_date if to_date is not None else datetime.now(timezone.utc)

        # Naive and aware datetimes cannot be subtracted or compared. When
        # exactly one bound is naive, read it as UTC so the range arithmetic
        # below works.
        # NOTE(review): assumes naive timestamps are UTC, which is how the
        # detect step treats them - confirm against InternalTablesManager.
        if (actual_from.tzinfo is None) != (actual_to.tzinfo is None):
            if actual_from.tzinfo is None:
                actual_from = actual_from.replace(tzinfo=timezone.utc)
            else:
                actual_to = actual_to.replace(tzinfo=timezone.utc)

        # Calculate total points and batch size
        total_seconds = (actual_to - actual_from).total_seconds()
        total_points = int(total_seconds / interval.seconds)
        batch_size = config.loading_batch_size

        # If total points <= batch_size, load in one go
        if total_points <= batch_size:
            rows_inserted = loader.load_and_save(from_date=actual_from, to_date=actual_to)
            return {"points_loaded": rows_inserted}

        batch_span = timedelta(seconds=batch_size * interval.seconds)
        if batch_span <= timedelta(0):
            # A non-positive span would never advance ``current_from``.
            raise ValueError(
                f"loading_batch_size ({batch_size}) and interval "
                f"({interval.seconds}s) must both be positive."
            )

        # Load in batches
        total_loaded = 0
        current_from = actual_from
        while current_from < actual_to:
            batch_to = min(current_from + batch_span, actual_to)
            total_loaded += loader.load_and_save(from_date=current_from, to_date=batch_to)
            # Move to next batch
            current_from = batch_to

        return {"points_loaded": total_loaded}

    def _run_detect_step(
        self,
        config: MetricConfig,
        from_date: Optional[datetime],
        to_date: Optional[datetime],
        full_refresh: bool = False,
    ) -> Dict[str, int]:
        """
        Execute anomaly detection step with batching and idempotency.

        All range arithmetic is done on naive UTC datetimes. Each detector
        resumes one interval after its own last stored detection unless
        ``full_refresh`` is set.

        Args:
            config: Metric configuration
            from_date: Start date for detection (optional)
            to_date: End date for detection (optional, default: now in UTC)
            full_refresh: Delete existing detections before running

        Returns:
            Dict with {"anomalies_count": N}

        Raises:
            ValueError: If a detector's batch size / interval cannot advance
                the batch loop.
        """
        anomalies_count = 0

        # Skip if no detectors configured
        if not config.detectors:
            return {"anomalies_count": 0}

        interval = config.get_interval()

        # Normalize both bounds to naive UTC (convert first, then drop tzinfo).
        actual_to = self._to_naive_utc(to_date or datetime.now(timezone.utc))
        normalized_from_date = self._to_naive_utc(from_date)

        # Run each detector
        for detector_config in config.detectors:
            # Combine algorithm params with execution params (seasonality_components)
            detector_params = detector_config.get_algorithm_params()
            seasonality_components = detector_config.get_seasonality_components()
            if seasonality_components is not None:
                detector_params["seasonality_components"] = seasonality_components

            # The detector is created first because its id keys the
            # detections table.
            detector = DetectorFactory.create_from_config(
                {"type": detector_config.type, "params": detector_params}
            )
            detector_id = detector.get_detector_id()

            # Delete existing detections if full_refresh
            if full_refresh:
                self.internal.delete_detections(
                    metric_name=config.name,
                    detector_id=detector_id,
                    from_timestamp=normalized_from_date,
                    to_timestamp=actual_to,
                )

            # IDEMPOTENCY: Get last detected timestamp
            last_detection_ts = self.internal.get_last_detection_timestamp(
                metric_name=config.name,
                detector_id=detector_id,
            )

            # Determine actual from_date
            actual_from = normalized_from_date
            if not full_refresh and last_detection_ts:
                # Resume from last detected point + 1 interval
                resume_from = self._to_naive_utc(last_detection_ts) + timedelta(
                    seconds=interval.seconds
                )
                actual_from = max(actual_from, resume_from) if actual_from else resume_from

            # Apply start_time filter if configured
            start_time_str = detector_config.get_start_time()
            if start_time_str:
                start_time = self._to_naive_utc(
                    datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
                )
                actual_from = max(actual_from, start_time) if actual_from else start_time

            # Skip if nothing to detect.
            # NOTE(review): with no from_date, no previous detections and no
            # start_time there is no starting point, so the detector is
            # skipped silently - confirm this is intended for first runs.
            if not actual_from or actual_from >= actual_to:
                continue

            batch_size = detector_config.get_batch_size() or self._DEFAULT_DETECT_BATCH_SIZE
            window_size = detector_config.params.get("window_size", 0) or 0

            batch_span = timedelta(seconds=batch_size * interval.seconds)
            if batch_span <= timedelta(0):
                # A non-positive span would never advance ``current_from``.
                raise ValueError(
                    f"batch_size ({batch_size}) and interval "
                    f"({interval.seconds}s) must both be positive."
                )
            # Extra history loaded before each batch for windowed detectors.
            window_span = timedelta(seconds=window_size * interval.seconds)

            # BATCHING: Process in batches
            current_from = actual_from
            while current_from < actual_to:
                batch_to = min(current_from + batch_span, actual_to)

                # Load data with window
                data = self.internal.load_datapoints(
                    metric_name=config.name,
                    from_timestamp=current_from - window_span,
                    to_timestamp=batch_to,
                )

                if not data or len(data["timestamp"]) == 0:
                    # No data, move to next batch
                    current_from = batch_to
                    continue

                # Detector will handle window sliding internally
                results = detector.detect(data)

                # Keep only the current batch [current_from, batch_to); the
                # historical window is context, not output.
                current_from_np = np.datetime64(current_from, "ms")
                batch_to_np = np.datetime64(batch_to, "ms")
                batch_results = [
                    r for r in results
                    if current_from_np <= np.datetime64(r.timestamp, "ms") < batch_to_np
                ]

                # Save results to _dtk_detections
                if batch_results:
                    # Convert List[DetectionResult] to dict with numpy arrays
                    detection_data = {
                        "timestamp": np.array(
                            [r.timestamp for r in batch_results], dtype="datetime64[ms]"
                        ),
                        "is_anomaly": np.array(
                            [r.is_anomaly for r in batch_results], dtype=bool
                        ),
                        "confidence_lower": np.array(
                            [r.confidence_lower for r in batch_results], dtype=np.float64
                        ),
                        "confidence_upper": np.array(
                            [r.confidence_upper for r in batch_results], dtype=np.float64
                        ),
                        "value": np.array(
                            [r.value for r in batch_results], dtype=np.float64
                        ),
                        "detection_metadata": np.array(
                            [
                                json.dumps(r.detection_metadata) if r.detection_metadata else "{}"
                                for r in batch_results
                            ],
                            dtype=object,
                        ),
                    }

                    self.internal.save_detections(
                        metric_name=config.name,
                        detector_id=detector_id,
                        data=detection_data,
                        detector_params=detector.get_detector_params(),
                    )

                    # Count anomalies
                    anomalies_count += sum(1 for r in batch_results if r.is_anomaly)

                # Move to next batch
                current_from = batch_to

        return {"anomalies_count": anomalies_count}

    def _run_alert_step(self, config: MetricConfig) -> Dict[str, int]:
        """
        Execute alerting step.

        Nothing is sent when alerting is missing, disabled, has no channels,
        when there are no recent detections, or when no channel can be
        created.

        Args:
            config: Metric configuration

        Returns:
            Dict with {"alerts_sent": N}, where N counts the channels for
            which sending reported success.
        """
        alerting_config = config.alerting

        # Check if alerting is configured
        if not alerting_config or not alerting_config.enabled:
            return {"alerts_sent": 0}
        if not alerting_config.channels:
            return {"alerts_sent": 0}

        # Create alert orchestrator
        orchestrator = AlertOrchestrator(
            metric_name=config.name,
            interval=config.get_interval(),
            conditions=AlertConditions(
                min_detectors=1,  # At least one detector must flag anomaly
                direction="any",  # Any direction (up or down)
                consecutive_anomalies=alerting_config.consecutive_anomalies,
            ),
            timezone_display="UTC",
        )

        # Load recent detections for consecutive anomaly checking.
        # We need N recent points where N = consecutive_anomalies.
        recent_detections = self._load_recent_detections(
            metric_name=config.name,
            last_point=orchestrator.get_last_complete_point(),
            num_points=alerting_config.consecutive_anomalies,
        )
        if not recent_detections:
            return {"alerts_sent": 0}

        # Check if alert should be sent
        should_alert, alert_data = orchestrator.should_alert(recent_detections)
        if not should_alert:
            return {"alerts_sent": 0}

        # Create alert channels from config
        channels = self._create_alert_channels(alerting_config.channels)
        if not channels:
            return {"alerts_sent": 0}

        # Send alerts
        results = orchestrator.send_alerts(alert_data, channels)
        return {"alerts_sent": sum(1 for success in results.values() if success)}

    def _load_recent_detections(
        self,
        metric_name: str,
        last_point: datetime,
        num_points: int,
    ) -> List[DetectionRecord]:
        """
        Load recent detection records for consecutive anomaly checking.

        Rows from all detectors are grouped per timestamp. A point counts
        as anomalous when any detector flagged it; bounds, direction and
        severity come from the first detector that flagged it.

        NOTE(review): the query uses ``groupArray``/``any``, which looks
        ClickHouse-specific - confirm before using another db_manager backend.

        Args:
            metric_name: Metric name
            last_point: Last complete timestamp (inclusive upper bound)
            num_points: Number of recent points to load

        Returns:
            List of DetectionRecord objects in chronological order
        """
        # Get full table name for _dtk_detections
        from detectkit.database.tables import TABLE_DETECTIONS

        full_table_name = self.db_manager.get_full_table_name(
            TABLE_DETECTIONS, use_internal=True
        )

        # Query _dtk_detections for recent points
        # GROUP BY timestamp to combine results from multiple detectors
        query = f"""
            SELECT
                timestamp,
                groupArray(detector_id) as detector_ids,
                groupArray(is_anomaly) as is_anomaly_flags,
                groupArray(confidence_lower) as confidence_lowers,
                groupArray(confidence_upper) as confidence_uppers,
                any(value) as value
            FROM {full_table_name}
            WHERE metric_name = %(metric_name)s
                AND timestamp <= %(last_point)s
            GROUP BY timestamp
            ORDER BY timestamp DESC
            LIMIT %(num_points)s
        """

        results = self.db_manager.execute_query(
            query,
            params={
                "metric_name": metric_name,
                "last_point": last_point,
                "num_points": num_points,
            },
        )

        if not results:
            return []

        # Convert to DetectionRecord objects
        records = []
        for row in results:
            flags = row["is_anomaly_flags"]

            # Check if any detector flagged this point as anomaly
            is_anomaly = any(flags)

            # (position, detector_id) of every detector that flagged the
            # point; the position indexes the parallel bounds arrays.
            flagged = [
                (idx, d_id)
                for idx, (d_id, flag) in enumerate(zip(row["detector_ids"], flags))
                if flag
            ]

            direction = "none"
            severity = 0.0
            confidence_lower = None
            confidence_upper = None

            if is_anomaly and flagged:
                # Use the first flagging detector's bounds (they should be
                # similar across detectors).
                first_idx = flagged[0][0]
                confidence_lower = row["confidence_lowers"][first_idx]
                confidence_upper = row["confidence_uppers"][first_idx]

                # Determine direction; a missing value or bound leaves it
                # as "none" instead of raising on a None comparison.
                value = row["value"]
                if value is not None:
                    if confidence_lower is not None and value < confidence_lower:
                        direction = "down"
                        severity = (confidence_lower - value) / max(abs(confidence_lower), 1e-10)
                    elif confidence_upper is not None and value > confidence_upper:
                        direction = "up"
                        severity = (value - confidence_upper) / max(abs(confidence_upper), 1e-10)

            primary_detector = flagged[0][1] if flagged else "unknown"
            records.append(
                DetectionRecord(
                    timestamp=row["timestamp"],
                    detector_name=primary_detector,
                    detector_id=primary_detector,
                    value=row["value"],
                    is_anomaly=is_anomaly,
                    confidence_lower=confidence_lower,
                    confidence_upper=confidence_upper,
                    direction=direction,
                    severity=severity,
                    detection_metadata={},
                )
            )

        # The query returns newest first; reverse to get chronological order
        return list(reversed(records))

    def _create_alert_channels(
        self, channel_names: List[str]
    ) -> List[BaseAlertChannel]:
        """
        Create alert channel instances from channel names.

        Creation is best-effort: a channel that cannot be configured or
        built is reported on stdout and skipped, so the others can still
        be used.

        Args:
            channel_names: List of channel names to create

        Returns:
            List of alert channel instances (empty when no profiles
            configuration is available)
        """
        if not self.profiles_config:
            # No profiles config available, return empty list
            return []

        channels = []
        for channel_name in channel_names:
            try:
                # Get channel config from profiles
                channel_config = self.profiles_config.get_alert_channel_config(
                    channel_name
                )
                # Create channel instance using factory
                channels.append(AlertChannelFactory.create_from_config(channel_config))
            except Exception as e:
                # Report the error but continue with other channels
                print(f"Warning: Failed to create channel '{channel_name}': {e}")

        return channels

    def get_metric_status(self, metric_name: str) -> Optional[Dict]:
        """
        Get current status of a metric.

        Args:
            metric_name: Name of the metric

        Returns:
            Dict with keys ``metric_name``, ``is_locked``, ``locked_by``,
            ``locked_at`` and ``last_datapoint``. The lock fields are None
            when the metric is not locked.

        Example:
            >>> status = manager.get_metric_status("cpu_usage")
            >>> print(status["last_datapoint"])
            2024-01-01 12:00:00
        """
        # Check if locked
        lock_info = self.internal.check_lock(metric_name)

        # Get last datapoint timestamp
        last_timestamp = self.internal.get_last_datapoint_timestamp(metric_name)

        return {
            "metric_name": metric_name,
            "is_locked": lock_info is not None,
            "locked_by": lock_info.get("locked_by") if lock_info else None,
            "locked_at": lock_info.get("locked_at") if lock_info else None,
            "last_datapoint": last_timestamp,
        }

    def __repr__(self) -> str:
        """String representation."""
        return f"TaskManager(db={self.db_manager.__class__.__name__})"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utility functions for detectk."""
|