detectkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. detectkit/__init__.py +17 -0
  2. detectkit/alerting/__init__.py +13 -0
  3. detectkit/alerting/channels/__init__.py +21 -0
  4. detectkit/alerting/channels/base.py +191 -0
  5. detectkit/alerting/channels/email.py +146 -0
  6. detectkit/alerting/channels/factory.py +193 -0
  7. detectkit/alerting/channels/mattermost.py +53 -0
  8. detectkit/alerting/channels/slack.py +55 -0
  9. detectkit/alerting/channels/telegram.py +110 -0
  10. detectkit/alerting/channels/webhook.py +139 -0
  11. detectkit/alerting/orchestrator.py +368 -0
  12. detectkit/cli/__init__.py +1 -0
  13. detectkit/cli/commands/__init__.py +1 -0
  14. detectkit/cli/commands/init.py +282 -0
  15. detectkit/cli/commands/run.py +427 -0
  16. detectkit/cli/commands/test_alert.py +184 -0
  17. detectkit/cli/main.py +186 -0
  18. detectkit/config/__init__.py +30 -0
  19. detectkit/config/metric_config.py +467 -0
  20. detectkit/config/profile.py +285 -0
  21. detectkit/config/project_config.py +164 -0
  22. detectkit/core/__init__.py +6 -0
  23. detectkit/core/interval.py +132 -0
  24. detectkit/core/models.py +106 -0
  25. detectkit/database/__init__.py +27 -0
  26. detectkit/database/clickhouse_manager.py +385 -0
  27. detectkit/database/internal_tables.py +581 -0
  28. detectkit/database/manager.py +324 -0
  29. detectkit/database/tables.py +134 -0
  30. detectkit/detectors/__init__.py +6 -0
  31. detectkit/detectors/base.py +222 -0
  32. detectkit/detectors/factory.py +138 -0
  33. detectkit/detectors/statistical/__init__.py +8 -0
  34. detectkit/detectors/statistical/iqr.py +230 -0
  35. detectkit/detectors/statistical/mad.py +423 -0
  36. detectkit/detectors/statistical/manual_bounds.py +177 -0
  37. detectkit/detectors/statistical/zscore.py +225 -0
  38. detectkit/loaders/__init__.py +6 -0
  39. detectkit/loaders/metric_loader.py +470 -0
  40. detectkit/loaders/query_template.py +164 -0
  41. detectkit/orchestration/__init__.py +9 -0
  42. detectkit/orchestration/task_manager.py +698 -0
  43. detectkit/utils/__init__.py +1 -0
  44. detectkit-0.1.0.dist-info/METADATA +231 -0
  45. detectkit-0.1.0.dist-info/RECORD +49 -0
  46. detectkit-0.1.0.dist-info/WHEEL +5 -0
  47. detectkit-0.1.0.dist-info/entry_points.txt +2 -0
  48. detectkit-0.1.0.dist-info/licenses/LICENSE +21 -0
  49. detectkit-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,423 @@
1
+ """
2
+ Median Absolute Deviation (MAD) anomaly detector.
3
+
4
+ MAD is a robust statistical method for outlier detection that:
5
+ - Uses median (robust to outliers) instead of mean
6
+ - Measures deviation from median using MAD instead of std
7
+ - Less sensitive to extreme values than Z-Score
8
+
9
+ Formula:
10
+ - median_val = median(values)
11
+ - mad_val = median(|values - median_val|)
12
+ - lower_bound = median_val - threshold × mad_val
13
+ - upper_bound = median_val + threshold × mad_val
14
+
15
+ Seasonality support:
16
+ - Groups data by seasonality components
17
+ - Computes global statistics (entire window)
18
+ - Computes component statistics (per group)
19
+ - Applies multipliers to adjust confidence intervals
20
+ """
21
+
22
+ from typing import Any, Dict, List, Optional, Tuple, Union
23
+ import json
24
+
25
+ import numpy as np
26
+
27
+ from detectkit.detectors.base import BaseDetector, DetectionResult
28
+
29
+
30
class MADDetector(BaseDetector):
    """
    Median Absolute Deviation (MAD) detector for anomaly detection.

    Each point is compared against a confidence interval built from the
    median and the MAD of the preceding window of points:

        lower = median - threshold * MAD
        upper = median + threshold * MAD

    The raw MAD is used as-is; no normal-consistency scaling constant is
    applied before multiplying by ``threshold``.

    Parameters:
        threshold (float): Number of MAD units from median (default: 3.0)
            - Higher = less sensitive (fewer anomalies)
            - Lower = more sensitive (more anomalies)

        window_size (int): Historical window size in points (default: 100)
            - Uses the last N points *before* the current one

        min_samples (int): Minimum non-NaN samples in the window required
            for detection (default: 30)

        seasonality_components (list | None): Seasonality groups, each
            either a column name or a list of column names that are
            matched together (default: None = no seasonality)

        min_samples_per_group (int): Minimum samples a seasonality group
            needs before its multipliers are applied (default: 10)

    Example:
        >>> detector = MADDetector(threshold=3.0, window_size=100)
        >>> results = detector.detect(data)
        >>> for r in results:
        ...     if r.is_anomaly:
        ...         print(f"Anomaly: {r.value} outside [{r.confidence_lower}, {r.confidence_upper}]")
    """

    def __init__(
        self,
        threshold: float = 3.0,
        window_size: int = 100,
        min_samples: int = 30,
        seasonality_components: Optional[List[Union[str, List[str]]]] = None,
        min_samples_per_group: int = 10,
    ):
        """
        Initialize MAD detector with parameters.

        Args:
            threshold: Number of MAD units from median
            window_size: Historical window size in points
            min_samples: Minimum total samples required
            seasonality_components: Optional list of seasonality groups
                Examples:
                - ["day_of_week"] - single component
                - [["day_of_week", "hour"]] - combined group
                - ["day", ["hour", "minute"]] - separate + combined
            min_samples_per_group: Minimum samples per seasonality group
        """
        super().__init__(
            threshold=threshold,
            window_size=window_size,
            min_samples=min_samples,
            seasonality_components=seasonality_components,
            min_samples_per_group=min_samples_per_group,
        )

    def _validate_params(self):
        """
        Validate detector parameters.

        Raises:
            ValueError: If threshold is missing or not positive, if
                window_size or min_samples is missing or below 1, or if
                min_samples exceeds window_size.
        """
        threshold = self.params.get("threshold")
        if threshold is None or threshold <= 0:
            raise ValueError("threshold must be positive")

        window_size = self.params.get("window_size")
        if window_size is None or window_size < 1:
            raise ValueError("window_size must be at least 1")

        min_samples = self.params.get("min_samples")
        if min_samples is None or min_samples < 1:
            raise ValueError("min_samples must be at least 1")

        if min_samples > window_size:
            raise ValueError("min_samples cannot exceed window_size")

    @staticmethod
    def _median_and_mad(samples: np.ndarray) -> Tuple[Any, Any]:
        """Return (median, median absolute deviation from that median)."""
        center = np.median(samples)
        return center, np.median(np.abs(samples - center))

    def _parse_seasonality_data(
        self, seasonality_data: np.ndarray, seasonality_columns: List[str]
    ) -> Dict[str, np.ndarray]:
        """
        Parse seasonality JSON strings into structured data.

        Entries that are None, "{}", not decodable as JSON, or that decode
        to something other than a JSON object contribute None for every
        requested column, so each output array has one element per input
        entry.

        Args:
            seasonality_data: Array of JSON strings
            seasonality_columns: List of column names

        Returns:
            Dict with column names as keys, numpy arrays as values
            (empty dict when seasonality_data is empty)

        Example:
            Input: ['{"day": 1, "hour": 10}', '{"day": 1, "hour": 11}']
            Output: {"day": array([1, 1]), "hour": array([10, 11])}
        """
        if len(seasonality_data) == 0:
            return {}

        parsed_data: Dict[str, list] = {col: [] for col in seasonality_columns}

        for json_str in seasonality_data:
            record: Dict[str, Any] = {}
            if json_str is not None and json_str != "{}":
                try:
                    decoded = json.loads(json_str)
                except (json.JSONDecodeError, TypeError):
                    decoded = None
                # Valid JSON that is not an object (e.g. "[1]" or "3") has
                # no columns to read; treat it like empty seasonality
                # instead of failing on a missing .get().
                if isinstance(decoded, dict):
                    record = decoded

            for col in seasonality_columns:
                parsed_data[col].append(record.get(col))

        # Convert to numpy arrays
        return {col: np.array(vals) for col, vals in parsed_data.items()}

    def _create_seasonality_mask(
        self,
        seasonality_dict: Dict[str, np.ndarray],
        window_start: int,
        current_idx: int,
        group_columns: List[str],
    ) -> np.ndarray:
        """
        Create boolean mask for seasonality group.

        Args:
            seasonality_dict: Parsed seasonality data
            window_start: Start index of window (inclusive)
            current_idx: Current point index (window end, exclusive)
            group_columns: List of columns to group by (e.g., ["day", "hour"])

        Returns:
            Boolean mask of length ``current_idx - window_start`` that is
            True where the window point matches the current point on every
            group column. All True when there is nothing to group by or a
            requested column is absent from seasonality_dict.

        Example:
            Current point: day=1, hour=10
            Group columns: ["day", "hour"]
            Returns: mask where (day==1) AND (hour==10)
        """
        window_len = current_idx - window_start

        if not group_columns or not seasonality_dict:
            # No grouping - return all True
            return np.ones(window_len, dtype=bool)

        if any(col not in seasonality_dict for col in group_columns):
            # Column not found - no filtering
            return np.ones(window_len, dtype=bool)

        # Create combined mask (AND of all columns)
        mask = np.ones(window_len, dtype=bool)
        for col in group_columns:
            column_values = seasonality_dict[col]
            current_val = column_values[current_idx]
            mask &= (column_values[window_start:current_idx] == current_val)

        return mask

    def detect(self, data: Dict[str, np.ndarray]) -> list[DetectionResult]:
        """
        Perform MAD-based anomaly detection with seasonality support.

        Algorithm:
            1. Parse seasonality data
            2. For each point:
                - Compute global statistics (entire window)
                - For each seasonality group:
                    * Create mask matching current point's seasonality
                    * Compute group statistics
                    * Calculate multipliers (group / global)
                - Apply all multipliers to adjust intervals
                - Detect anomalies

        Args:
            data: Dictionary with keys:
                - timestamp: np.array of datetime64[ms]
                - value: np.array of float64 (may contain NaN)
                - seasonality_data: np.array of JSON strings (optional)
                - seasonality_columns: list of column names (optional)

        Returns:
            List of DetectionResult, one per input point. NaN points and
            points whose window has fewer than min_samples valid values
            are reported as non-anomalous with a "reason" in the metadata.
        """
        timestamps = data["timestamp"]
        values = data["value"]
        seasonality_data = data.get("seasonality_data", np.array([]))
        seasonality_columns = data.get("seasonality_columns", [])

        threshold = self.params["threshold"]
        window_size = self.params["window_size"]
        min_samples = self.params["min_samples"]
        seasonality_components = self.params.get("seasonality_components")
        min_samples_per_group = self.params.get("min_samples_per_group", 10)

        # Parse seasonality data once
        seasonality_dict = {}
        if len(seasonality_data) > 0 and seasonality_columns:
            seasonality_dict = self._parse_seasonality_data(
                seasonality_data, seasonality_columns
            )

        results = []
        n_points = len(timestamps)

        for i in range(n_points):
            current_val = values[i]
            current_ts = timestamps[i]

            # Skip NaN values
            if np.isnan(current_val):
                results.append(
                    DetectionResult(
                        timestamp=current_ts,
                        value=current_val,
                        is_anomaly=False,
                        detection_metadata={"reason": "missing_data"},
                    )
                )
                continue

            # Get historical window (not including current point)
            window_start = max(0, i - window_size)
            window_values = values[window_start:i]

            # Filter out NaN values from window
            valid_mask = ~np.isnan(window_values)
            window_valid = window_values[valid_mask]

            # Check if we have enough samples
            if len(window_valid) < min_samples:
                results.append(
                    DetectionResult(
                        timestamp=current_ts,
                        value=current_val,
                        is_anomaly=False,
                        detection_metadata={
                            "reason": "insufficient_data",
                            "window_size": int(len(window_valid)),
                            "min_samples": min_samples,
                        },
                    )
                )
                continue

            # STEP 1: Compute GLOBAL statistics (entire window)
            global_median, global_mad = self._median_and_mad(window_valid)

            # Initialize adjusted statistics with global values
            adjusted_median = global_median
            adjusted_mad = global_mad

            # STEP 2: Apply seasonality adjustments
            multipliers_applied = []

            if seasonality_components and seasonality_dict:
                for group in seasonality_components:
                    # Normalize to list (handle both str and List[str])
                    group_cols = [group] if isinstance(group, str) else group

                    season_mask = self._create_seasonality_mask(
                        seasonality_dict, window_start, i, group_cols
                    )

                    # Both masks span the whole window, so they must be
                    # combined elementwise. Assigning through
                    # combined_mask[valid_mask] would target only the
                    # non-NaN positions and fail with a shape mismatch as
                    # soon as the window contains a NaN.
                    group_values = window_values[valid_mask & season_mask]

                    if len(group_values) < min_samples_per_group:
                        # Insufficient data - skip this group (multiplier = 1.0)
                        multipliers_applied.append({
                            "group": group_cols,
                            "median_multiplier": 1.0,
                            "mad_multiplier": 1.0,
                            "reason": "insufficient_group_data",
                            "group_size": int(len(group_values)),
                        })
                        continue

                    group_median, group_mad = self._median_and_mad(group_values)

                    # Multipliers are group/global ratios; a zero global
                    # statistic leaves the corresponding value unscaled.
                    if global_median != 0:
                        median_multiplier = group_median / global_median
                    else:
                        median_multiplier = 1.0

                    if global_mad != 0:
                        mad_multiplier = group_mad / global_mad
                    else:
                        mad_multiplier = 1.0

                    adjusted_median *= median_multiplier
                    adjusted_mad *= mad_multiplier

                    multipliers_applied.append({
                        "group": group_cols,
                        "median_multiplier": float(median_multiplier),
                        "mad_multiplier": float(mad_multiplier),
                        "group_size": int(len(group_values)),
                    })

            # STEP 3: Build confidence interval
            if adjusted_mad == 0:
                # Zero spread - use a tiny interval so that practically any
                # deviation from the median is anomalous
                half_width = 1e-10
            else:
                half_width = threshold * adjusted_mad
            confidence_lower = adjusted_median - half_width
            confidence_upper = adjusted_median + half_width

            # STEP 4: Check if current value is anomalous.
            # Converted to a plain bool so results do not carry np.bool_.
            is_anomaly = bool(
                (current_val < confidence_lower) or (current_val > confidence_upper)
            )

            # Build metadata
            metadata = {
                "global_median": float(global_median),
                "global_mad": float(global_mad),
                "adjusted_median": float(adjusted_median),
                "adjusted_mad": float(adjusted_mad),
                "window_size": int(len(window_valid)),
            }

            if seasonality_components and multipliers_applied:
                metadata["seasonality_groups"] = multipliers_applied

            if is_anomaly:
                if current_val < confidence_lower:
                    direction = "below"
                    distance = confidence_lower - current_val
                else:
                    direction = "above"
                    distance = current_val - confidence_upper

                # Severity: how many adjusted MAD units beyond the bound
                severity = distance / adjusted_mad if adjusted_mad > 0 else float("inf")

                metadata.update({
                    "direction": direction,
                    "severity": float(severity),
                    "distance": float(distance),
                })

            results.append(
                DetectionResult(
                    timestamp=current_ts,
                    value=current_val,
                    is_anomaly=is_anomaly,
                    confidence_lower=float(confidence_lower),
                    confidence_upper=float(confidence_upper),
                    detection_metadata=metadata,
                )
            )

        return results

    def _get_non_default_params(self) -> Dict[str, Any]:
        """
        Get parameters that differ from defaults.

        Excludes execution parameters (seasonality_components,
        min_samples_per_group) regardless of their value.
        """
        defaults = {
            "threshold": 3.0,
            "window_size": 100,
            "min_samples": 30,
            "min_samples_per_group": 10,
        }
        # Execution parameters are never reported as non-default
        execution_params = {"seasonality_components", "min_samples_per_group"}

        # Check the exclusion first so excluded values are never compared.
        return {
            k: v for k, v in self.params.items()
            if k not in execution_params and v != defaults.get(k)
        }
@@ -0,0 +1,177 @@
1
+ """
2
+ Manual Bounds anomaly detector.
3
+
4
+ Simple detector that uses user-specified thresholds for anomaly detection.
5
+ Useful when domain knowledge exists about acceptable ranges.
6
+
7
+ Examples:
8
+ - CPU usage should be <= 90%
9
+ - Response time should be <= 1000ms
10
+ - Queue size should be >= 0 and <= 10000
11
+ """
12
+
13
+ from typing import Any, Dict, Optional
14
+
15
+ import numpy as np
16
+
17
+ from detectkit.detectors.base import BaseDetector, DetectionResult
18
+
19
+
20
class ManualBoundsDetector(BaseDetector):
    """
    Fixed-threshold anomaly detector.

    A value is anomalous when it falls strictly below ``lower_bound`` or
    strictly above ``upper_bound``. No history is consulted; each point is
    judged on its own.

    Parameters:
        lower_bound (float | None): Minimum acceptable value
            (default: None = no lower limit)

        upper_bound (float | None): Maximum acceptable value
            (default: None = no upper limit)

    At least one bound must be specified.

    Example:
        >>> # Detect values above 100
        >>> detector = ManualBoundsDetector(upper_bound=100.0)
        >>> results = detector.detect(data)

        >>> # Detect values outside [10, 90]
        >>> detector = ManualBoundsDetector(lower_bound=10.0, upper_bound=90.0)
        >>> results = detector.detect(data)
    """

    def __init__(
        self,
        lower_bound: Optional[float] = None,
        upper_bound: Optional[float] = None,
    ):
        """Store the two thresholds as detector parameters."""
        super().__init__(lower_bound=lower_bound, upper_bound=upper_bound)

    def _validate_params(self):
        """
        Check that the configured bounds are usable.

        Raises:
            ValueError: If neither bound is set, or if both are set and
                lower_bound is not strictly less than upper_bound.
        """
        low = self.params.get("lower_bound")
        high = self.params.get("upper_bound")

        if low is None and high is None:
            raise ValueError("At least one of lower_bound or upper_bound must be specified")

        # Only comparable when both ends are present
        both_set = low is not None and high is not None
        if both_set and low >= high:
            raise ValueError("lower_bound must be less than upper_bound")

    def detect(self, data: Dict[str, np.ndarray]) -> list[DetectionResult]:
        """
        Flag every point that lies outside the configured bounds.

        Args:
            data: Dictionary with keys:
                - timestamp: np.array of datetime64[ms]
                - value: np.array of float64 (may contain NaN)
                - seasonality_data / seasonality_columns: not read here

        Returns:
            List of DetectionResult, one per timestamp.

        Notes:
            - NaN values are reported as non-anomalous with
              reason "missing_data"
            - Anomalies carry "direction", "distance" and "severity" in
              their metadata; severity is the distance divided by the
              bound range when both bounds exist, else the raw distance
        """
        timestamps = data["timestamp"]
        values = data["value"]
        low = self.params.get("lower_bound")
        high = self.params.get("upper_bound")

        output = []

        for idx, point_ts in enumerate(timestamps):
            point_val = values[idx]

            if np.isnan(point_val):
                # Missing sample: nothing to compare against the bounds
                output.append(
                    DetectionResult(
                        timestamp=point_ts,
                        value=point_val,
                        is_anomaly=False,
                        detection_metadata={"reason": "missing_data"},
                    )
                )
                continue

            # Lower check first, upper check second (upper wins if both hit)
            side = None
            gap = 0.0
            if low is not None and point_val < low:
                side, gap = "below", low - point_val
            if high is not None and point_val > high:
                side, gap = "above", point_val - high

            flagged = side is not None

            info = {}
            if flagged:
                info["direction"] = side
                info["distance"] = float(gap)

                if low is None or high is None:
                    # One-sided configuration: absolute distance
                    score = gap
                else:
                    # Two-sided configuration: distance relative to range
                    span = high - low
                    score = gap / span if span > 0 else float("inf")

                info["severity"] = float(score)

            output.append(
                DetectionResult(
                    timestamp=point_ts,
                    value=point_val,
                    is_anomaly=flagged,
                    confidence_lower=low,
                    confidence_upper=high,
                    detection_metadata=info,
                )
            )

        return output

    def _get_non_default_params(self) -> Dict[str, Any]:
        """Return every parameter whose value is not None."""
        # The only default is None, so anything set counts as non-default
        selected = {}
        for name, setting in self.params.items():
            if setting is not None:
                selected[name] = setting
        return selected