detectkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. detectkit/__init__.py +17 -0
  2. detectkit/alerting/__init__.py +13 -0
  3. detectkit/alerting/channels/__init__.py +21 -0
  4. detectkit/alerting/channels/base.py +191 -0
  5. detectkit/alerting/channels/email.py +146 -0
  6. detectkit/alerting/channels/factory.py +193 -0
  7. detectkit/alerting/channels/mattermost.py +53 -0
  8. detectkit/alerting/channels/slack.py +55 -0
  9. detectkit/alerting/channels/telegram.py +110 -0
  10. detectkit/alerting/channels/webhook.py +139 -0
  11. detectkit/alerting/orchestrator.py +368 -0
  12. detectkit/cli/__init__.py +1 -0
  13. detectkit/cli/commands/__init__.py +1 -0
  14. detectkit/cli/commands/init.py +282 -0
  15. detectkit/cli/commands/run.py +427 -0
  16. detectkit/cli/commands/test_alert.py +184 -0
  17. detectkit/cli/main.py +186 -0
  18. detectkit/config/__init__.py +30 -0
  19. detectkit/config/metric_config.py +467 -0
  20. detectkit/config/profile.py +285 -0
  21. detectkit/config/project_config.py +164 -0
  22. detectkit/core/__init__.py +6 -0
  23. detectkit/core/interval.py +132 -0
  24. detectkit/core/models.py +106 -0
  25. detectkit/database/__init__.py +27 -0
  26. detectkit/database/clickhouse_manager.py +385 -0
  27. detectkit/database/internal_tables.py +581 -0
  28. detectkit/database/manager.py +324 -0
  29. detectkit/database/tables.py +134 -0
  30. detectkit/detectors/__init__.py +6 -0
  31. detectkit/detectors/base.py +222 -0
  32. detectkit/detectors/factory.py +138 -0
  33. detectkit/detectors/statistical/__init__.py +8 -0
  34. detectkit/detectors/statistical/iqr.py +230 -0
  35. detectkit/detectors/statistical/mad.py +423 -0
  36. detectkit/detectors/statistical/manual_bounds.py +177 -0
  37. detectkit/detectors/statistical/zscore.py +225 -0
  38. detectkit/loaders/__init__.py +6 -0
  39. detectkit/loaders/metric_loader.py +470 -0
  40. detectkit/loaders/query_template.py +164 -0
  41. detectkit/orchestration/__init__.py +9 -0
  42. detectkit/orchestration/task_manager.py +698 -0
  43. detectkit/utils/__init__.py +1 -0
  44. detectkit-0.1.0.dist-info/METADATA +231 -0
  45. detectkit-0.1.0.dist-info/RECORD +49 -0
  46. detectkit-0.1.0.dist-info/WHEEL +5 -0
  47. detectkit-0.1.0.dist-info/entry_points.txt +2 -0
  48. detectkit-0.1.0.dist-info/licenses/LICENSE +21 -0
  49. detectkit-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,225 @@
1
+ """
2
+ Z-Score anomaly detector.
3
+
4
+ Z-Score is a classical statistical method for outlier detection that:
5
+ - Uses mean as measure of center
6
+ - Uses standard deviation as measure of spread
7
+ - Assumes approximately normal distribution
8
+
9
+ Formula:
10
+ - mean_val = mean(values)
11
+ - std_val = std(values)
12
+ - z_score = (value - mean_val) / std_val
13
+ - lower_bound = mean_val - threshold × std_val
14
+ - upper_bound = mean_val + threshold × std_val
15
+
16
+ Note: Z-Score is more sensitive to outliers than MAD because
17
+ both mean and std are affected by extreme values.
18
+ """
19
+
20
+ from typing import Any, Dict
21
+
22
+ import numpy as np
23
+
24
+ from detectkit.detectors.base import BaseDetector, DetectionResult
25
+
26
+
27
class ZScoreDetector(BaseDetector):
    """
    Rolling Z-Score anomaly detector.

    Every point is compared against a confidence band built from the mean
    and sample standard deviation of the points that precede it.

    Parameters:
        threshold (float): Half-width of the band in standard deviations
            (default: 3.0)
            - 3.0 is standard (99.7% of normal data within ±3σ)
            - Higher = less sensitive (fewer anomalies)
            - Lower = more sensitive (more anomalies)

        window_size (int): Historical window size in points (default: 100)
            - Uses the last N points before the current one
            - Larger = more stable but less responsive
            - Smaller = more responsive but less stable

        min_samples (int): Minimum number of non-NaN points the window must
            contain before a point is evaluated (default: 30)

    Example:
        >>> detector = ZScoreDetector(threshold=3.0, window_size=100)
        >>> results = detector.detect(data)
        >>> for r in results:
        ...     if r.is_anomaly:
        ...         print(f"Anomaly: {r.value} outside [{r.confidence_lower}, {r.confidence_upper}]")
    """

    def __init__(
        self,
        threshold: float = 3.0,
        window_size: int = 100,
        min_samples: int = 30,
    ):
        """Forward the detector parameters to BaseDetector.

        NOTE(review): BaseDetector is presumably what stores them in
        ``self.params`` and calls ``_validate_params`` - confirm in base.py.
        """
        super().__init__(
            threshold=threshold,
            window_size=window_size,
            min_samples=min_samples,
        )

    def _validate_params(self):
        """Validate detector parameters.

        Raises:
            ValueError: If threshold is missing, NaN or not positive; if
                window_size is missing or < 1; if min_samples is missing
                or < 2; or if min_samples exceeds window_size.
        """
        threshold = self.params.get("threshold")
        # `not threshold > 0` instead of `threshold <= 0`: NaN fails every
        # comparison, so this form rejects a NaN threshold as well.
        if threshold is None or not threshold > 0:
            raise ValueError("threshold must be positive")

        window_size = self.params.get("window_size")
        if window_size is None or window_size < 1:
            raise ValueError("window_size must be at least 1")

        # At least 2 samples are needed for the ddof=1 standard deviation.
        min_samples = self.params.get("min_samples")
        if min_samples is None or min_samples < 2:
            raise ValueError("min_samples must be at least 2")

        if min_samples > window_size:
            raise ValueError("min_samples cannot exceed window_size")

    def detect(self, data: Dict[str, np.ndarray]) -> list[DetectionResult]:
        """
        Perform Z-Score based anomaly detection.

        For each point the preceding window (current point excluded) gives:
            1. mean_val = mean of the non-NaN window values
            2. std_val = sample standard deviation (ddof=1) of those values
            3. band = [mean - threshold×std, mean + threshold×std]
            4. is_anomaly = value lies strictly outside the band

        Args:
            data: Dictionary with keys:
                - timestamp: array of timestamps, one per point
                - value: float array of the same length (may contain NaN)
                Other keys (e.g. seasonality_data) are not read here.

        Returns:
            One DetectionResult per input point, in input order.

        Notes:
            - NaN points are reported as non-anomalous with
              reason "missing_data".
            - Points whose window holds fewer than min_samples non-NaN
              values are reported as non-anomalous with reason
              "insufficient_data". This always covers the first
              min_samples points, since point i has at most i predecessors.
            - When the window is constant (std == 0) the band collapses to
              mean ± 1e-10 and severity is reported as infinity.
        """
        timestamps = data["timestamp"]
        values = data["value"]
        threshold = self.params["threshold"]
        window_size = self.params["window_size"]
        min_samples = self.params["min_samples"]

        # Computed once instead of once per point inside the loop.
        nan_mask = np.isnan(values)

        results = []
        for i in range(len(timestamps)):
            current_val = values[i]
            current_ts = timestamps[i]

            if nan_mask[i]:
                results.append(
                    DetectionResult(
                        timestamp=current_ts,
                        value=current_val,
                        is_anomaly=False,
                        detection_metadata={"reason": "missing_data"},
                    )
                )
                continue

            # History only: the current point never influences its own band.
            window_start = max(0, i - window_size)
            window_values = values[window_start:i]
            window_valid = window_values[~nan_mask[window_start:i]]
            n_valid = int(len(window_valid))

            if n_valid < min_samples:
                results.append(
                    DetectionResult(
                        timestamp=current_ts,
                        value=current_val,
                        is_anomaly=False,
                        detection_metadata={
                            "reason": "insufficient_data",
                            "window_size": n_valid,
                            "min_samples": min_samples,
                        },
                    )
                )
                continue

            mean_val = np.mean(window_valid)
            std_val = np.std(window_valid, ddof=1)  # Bessel's correction

            # Constant history: use a tiny fixed half-width so that any real
            # deviation from the constant is flagged.
            half_width = 1e-10 if std_val == 0 else threshold * std_val
            confidence_lower = mean_val - half_width
            confidence_upper = mean_val + half_width

            is_below = current_val < confidence_lower
            is_above = current_val > confidence_upper
            # Coerce to a plain bool: NumPy comparisons yield np.bool_, while
            # the skipped-point branches above emit a Python False.
            is_anomaly = bool(is_below or is_above)

            metadata = {
                "mean": float(mean_val),
                "std": float(std_val),
                "window_size": n_valid,
            }

            if is_anomaly:
                if is_below:
                    direction = "below"
                    distance = confidence_lower - current_val
                else:
                    direction = "above"
                    distance = current_val - confidence_upper

                # Severity is the absolute Z-score; undefined (infinite) when
                # the window has zero spread.
                if std_val > 0:
                    z_score = abs((current_val - mean_val) / std_val)
                else:
                    z_score = float("inf")

                metadata.update({
                    "direction": direction,
                    "severity": float(z_score),
                    "distance": float(distance),
                })

            results.append(
                DetectionResult(
                    timestamp=current_ts,
                    value=current_val,
                    is_anomaly=is_anomaly,
                    confidence_lower=float(confidence_lower),
                    confidence_upper=float(confidence_upper),
                    detection_metadata=metadata,
                )
            )

        return results

    def _get_non_default_params(self) -> Dict[str, Any]:
        """Return the entries of ``self.params`` that differ from the defaults.

        Parameters without a known default are always included, because
        they compare against ``None``.
        """
        defaults = {
            "threshold": 3.0,
            "window_size": 100,
            "min_samples": 30,
        }
        return {
            name: value
            for name, value in self.params.items()
            if value != defaults.get(name)
        }
@@ -0,0 +1,6 @@
1
+ """Metric data loaders for detectk."""
2
+
3
+ from detectkit.loaders.query_template import QueryTemplate
4
+ from detectkit.loaders.metric_loader import MetricLoader
5
+
6
+ __all__ = ["QueryTemplate", "MetricLoader"]
@@ -0,0 +1,470 @@
1
+ """
2
+ Metric data loader.
3
+
4
+ Loads time-series data from databases with:
5
+ - SQL query execution (with Jinja2 templating)
6
+ - Gap filling for missing timestamps
7
+ - Seasonality feature extraction
8
+ - Batch processing
9
+ - Integration with InternalTablesManager
10
+ """
11
+
12
+ from datetime import datetime, timedelta, timezone
13
+ from typing import Dict, List, Optional
14
+
15
+ import numpy as np
16
+
17
# orjson is optional; fall back to the standard library when it is missing.
try:
    import orjson
    HAS_ORJSON = True
except ImportError:
    import json
    HAS_ORJSON = False


def json_dumps_sorted(obj):
    """Serialize ``obj`` to a JSON string with keys sorted.

    Both backends are configured to emit the same compact text (no spaces
    after separators, non-ASCII characters left unescaped), so the resulting
    string does not depend on whether orjson is installed.

    Args:
        obj: JSON-serializable object.

    Returns:
        JSON text as ``str``.
    """
    if HAS_ORJSON:
        return orjson.dumps(obj, option=orjson.OPT_SORT_KEYS).decode("utf-8")
    # json.dumps defaults to ", " / ": " separators and \uXXXX escapes;
    # override both to match orjson's output.
    return json.dumps(
        obj,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
31
+
32
+ from detectkit.config.metric_config import MetricConfig
33
+ from detectkit.database.internal_tables import InternalTablesManager
34
+ from detectkit.database.manager import BaseDatabaseManager
35
+ from detectkit.loaders.query_template import QueryTemplate
36
+
37
+
38
class MetricLoader:
    """
    Loads metric data from a database and prepares it for storage.

    Features:
    - Execute SQL queries rendered through QueryTemplate
    - Fill gaps in the time series with NaN
    - Attach seasonality features (from the query, or derived from timestamps)
    - Save to the datapoints table through InternalTablesManager

    All naive datetimes handled by this class are treated as UTC.

    Example:
        >>> config = MetricConfig.from_yaml_file("metrics/cpu_usage.yml")
        >>> manager = ClickHouseDatabaseManager(...)
        >>> internal = InternalTablesManager(manager)
        >>>
        >>> loader = MetricLoader(config, manager, internal)
        >>> data = loader.load(
        ...     from_date=datetime(2024, 1, 1),
        ...     to_date=datetime(2024, 1, 2)
        ... )
        >>> loader.save(data)
    """

    def __init__(
        self,
        config: "MetricConfig",
        db_manager: "BaseDatabaseManager",
        internal_manager: "InternalTablesManager",
    ):
        """
        Initialize metric loader.

        Args:
            config: Metric configuration
            db_manager: Database manager used to execute the metric query
            internal_manager: Internal tables manager used to save data
        """
        self.config = config
        self.db_manager = db_manager
        self.internal_manager = internal_manager
        self.query_template = QueryTemplate()

    def load(
        self,
        from_date: datetime,
        to_date: datetime,
        fill_gaps: bool = True,
    ) -> Dict[str, np.ndarray]:
        """
        Load metric data from database.

        Steps:
            1. Normalize the bounds to naive UTC
            2. Render and execute the SQL query
            3. Drop rows at or after to_date
            4. Collect query-provided seasonality (if configured)
            5. Fill gaps (if enabled), keeping seasonality aligned
            6. Derive seasonality from timestamps when the query gave none

        Args:
            from_date: Start date (inclusive)
            to_date: End date (exclusive)
            fill_gaps: Whether to insert NaN points for missing timestamps

        Returns:
            Dictionary with keys:
                - timestamp: np.array of datetime64[ms]
                - value: np.array of float64 (NaN for missing points)
                - seasonality_data: np.array of JSON strings
                - seasonality_columns: list of column names
            If the query returns no rows, or none before to_date, all arrays
            are empty and no gap filling takes place.

        Raises:
            ValueError: If a row lacks the timestamp, value or a configured
                seasonality column

        Example:
            >>> data = loader.load(
            ...     datetime(2024, 1, 1),
            ...     datetime(2024, 1, 2)
            ... )
            >>> print(data["timestamp"])
            >>> print(data["value"])
        """
        # Convert aware bounds to UTC before dropping tzinfo, so that a
        # non-UTC offset is not silently discarded.
        from_date = self._to_naive_utc(from_date)
        to_date = self._to_naive_utc(to_date)

        interval_seconds = self.config.get_interval().seconds

        rendered_query = self.query_template.render(
            self.config.get_query_text(),
            dtk_start_time=from_date,
            dtk_end_time=to_date,
            interval_seconds=interval_seconds,
        )
        rows = self.db_manager.execute_query(rendered_query)
        if not rows:
            return self._create_empty_result()

        query_columns = self.config.query_columns
        if query_columns:
            timestamp_col = query_columns.timestamp
            value_col = query_columns.metric
        else:
            timestamp_col = "timestamp"
            value_col = "value"

        # Enforce [from_date, to_date): queries often use BETWEEN, which
        # would include the upper bound.
        kept = []  # (timestamp, row) pairs that survive the filter
        for row in rows:
            if timestamp_col not in row:
                raise ValueError(
                    f"Query must return '{timestamp_col}' column "
                    f"(configured as timestamp column). "
                    f"Got columns: {list(row.keys())}"
                )

            row_ts = row[timestamp_col]
            if isinstance(row_ts, datetime):
                # Aware values cannot be compared with the naive bound.
                row_ts = self._to_naive_utc(row_ts)
                comparable = row_ts
            else:
                comparable = np.datetime64(row_ts, "ms").astype(datetime)

            if comparable >= to_date:
                continue
            kept.append((row_ts, row))

        if not kept:
            return self._create_empty_result()

        for _, row in kept:
            if value_col not in row:
                raise ValueError(
                    f"Query must return '{value_col}' column "
                    f"(configured as metric value column). "
                    f"Got columns: {list(row.keys())}"
                )

        timestamp_array = np.array(
            [ts for ts, _ in kept], dtype="datetime64[ms]"
        )
        value_array = np.array(
            [row[value_col] for _, row in kept], dtype=np.float64
        )

        # Seasonality supplied by the query is collected before gap filling,
        # while rows and timestamps still correspond one-to-one.
        seasonality_from_query = None
        seasonality_columns_from_query = []
        if query_columns and query_columns.seasonality:
            seasonality_columns_from_query = query_columns.seasonality
            encoded = []
            for _, row in kept:
                features = {}
                for col in seasonality_columns_from_query:
                    if col not in row:
                        raise ValueError(
                            f"Query must return seasonality column '{col}' "
                            f"(configured in query_columns.seasonality). "
                            f"Got columns: {list(row.keys())}"
                        )
                    features[col] = row[col]
                encoded.append(json_dumps_sorted(features))
            seasonality_from_query = np.array(encoded, dtype=object)

        if fill_gaps:
            source_timestamps = timestamp_array
            timestamp_array, value_array = self._fill_gaps(
                timestamp_array, value_array, from_date, to_date, interval_seconds
            )
            if seasonality_from_query is not None:
                # Re-index by timestamp so each row keeps its own features;
                # inserted gap rows get an empty JSON object.
                seasonality_from_query = self._align_seasonality(
                    source_timestamps,
                    seasonality_from_query,
                    timestamp_array,
                    json_dumps_sorted({}),
                )

        if seasonality_from_query is not None:
            seasonality_data = seasonality_from_query
            seasonality_columns = seasonality_columns_from_query
        else:
            seasonality_columns = self.config.seasonality_columns
            seasonality_data = self._extract_seasonality(
                timestamp_array, seasonality_columns
            )

        return {
            "timestamp": timestamp_array,
            "value": value_array,
            "seasonality_data": seasonality_data,
            "seasonality_columns": seasonality_columns,
        }

    def save(self, data: Dict[str, np.ndarray]) -> int:
        """
        Save loaded data through the internal tables manager.

        Args:
            data: Data dictionary from load()

        Returns:
            Value returned by ``save_datapoints`` (documented by the caller
            contract as the number of rows inserted); 0 when ``data`` holds
            no timestamps, in which case nothing is written.

        Example:
            >>> data = loader.load(from_date, to_date)
            >>> rows = loader.save(data)
            >>> print(f"Saved {rows} data points")
        """
        if len(data["timestamp"]) == 0:
            return 0

        interval = self.config.get_interval()

        return self.internal_manager.save_datapoints(
            metric_name=self.config.name,
            data=data,
            interval_seconds=interval.seconds,
            seasonality_columns=data["seasonality_columns"],
        )

    def load_and_save(
        self,
        from_date: Optional[datetime] = None,
        to_date: Optional[datetime] = None,
    ) -> int:
        """
        Load a date range (with gap filling) and save it in one call.

        Args:
            from_date: Start date. If None, resume one interval after the
                last saved datapoint; if nothing is saved yet, fall back to
                ``config.loading_start_time`` ("YYYY-MM-DD HH:MM:SS", UTC).
            to_date: End date. If None, the current UTC time is used.

        Returns:
            Result of save() for the loaded data

        Raises:
            ValueError: If from_date is None, no datapoint has been saved
                and no loading_start_time is configured

        Example:
            >>> # Load from last saved point until now
            >>> rows = loader.load_and_save()
            >>>
            >>> # Load specific range
            >>> rows = loader.load_and_save(
            ...     from_date=datetime(2024, 1, 1),
            ...     to_date=datetime(2024, 1, 2)
            ... )
        """
        if from_date is None:
            last_ts = self.internal_manager.get_last_datapoint_timestamp(
                self.config.name
            )
            if last_ts:
                # Resume at the interval following the last stored point.
                step = timedelta(seconds=self.config.get_interval().seconds)
                from_date = last_ts + step
            elif self.config.loading_start_time:
                from_date = datetime.strptime(
                    self.config.loading_start_time, "%Y-%m-%d %H:%M:%S"
                ).replace(tzinfo=timezone.utc)
            else:
                raise ValueError(
                    "No existing data for metric and no loading_start_time configured. "
                    "Please specify from_date for initial load or set loading_start_time in config."
                )

        if to_date is None:
            to_date = datetime.now(timezone.utc)

        data = self.load(from_date, to_date, fill_gaps=True)
        return self.save(data)

    @staticmethod
    def _to_naive_utc(moment: datetime) -> datetime:
        """Return ``moment`` as a naive datetime expressed in UTC.

        Naive inputs are returned unchanged in value; aware inputs are
        shifted by their UTC offset before tzinfo is removed.
        """
        offset = moment.utcoffset()
        if offset is None:
            return moment.replace(tzinfo=None)
        return (moment - offset).replace(tzinfo=None)

    @staticmethod
    def _align_seasonality(
        source_timestamps: np.ndarray,
        seasonality: np.ndarray,
        target_timestamps: np.ndarray,
        fill_value: str,
    ) -> np.ndarray:
        """Re-index seasonality strings onto ``target_timestamps``.

        Args:
            source_timestamps: Timestamps the seasonality entries belong to
            seasonality: One JSON string per source timestamp
            target_timestamps: Timestamps of the gap-filled series
            fill_value: String used where a target timestamp has no source

        Returns:
            Object array with exactly one entry per target timestamp
        """
        by_timestamp = dict(zip(source_timestamps, seasonality))
        return np.array(
            [by_timestamp.get(ts, fill_value) for ts in target_timestamps],
            dtype=object,
        )

    def _create_empty_result(self) -> Dict[str, np.ndarray]:
        """Create a result dictionary with empty arrays."""
        return {
            "timestamp": np.array([], dtype="datetime64[ms]"),
            "value": np.array([], dtype=np.float64),
            "seasonality_data": np.array([], dtype=object),
            "seasonality_columns": self.config.seasonality_columns,
        }

    def _fill_gaps(
        self,
        timestamps: np.ndarray,
        values: np.ndarray,
        from_date: datetime,
        to_date: datetime,
        interval_seconds: int,
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Project the series onto a regular grid, using NaN for missing points.

        The grid starts at from_date, advances by interval_seconds and stops
        before to_date. Input points whose timestamp is not on the grid do
        not appear in the output.

        Args:
            timestamps: Existing timestamps
            values: Existing values
            from_date: Range start (inclusive)
            to_date: Range end (exclusive)
            interval_seconds: Interval in seconds

        Returns:
            Tuple of (grid_timestamps, grid_values)
        """
        start_ts = np.datetime64(from_date, "ms")
        end_ts = np.datetime64(to_date, "ms")
        interval_delta = np.timedelta64(interval_seconds, "s")

        full_timestamps = np.arange(start_ts, end_ts, interval_delta)

        if len(timestamps) == 0:
            return full_timestamps, np.full(len(full_timestamps), np.nan)

        # For duplicate timestamps the last value wins.
        ts_to_value = dict(zip(timestamps, values))

        filled_values = np.array(
            [ts_to_value.get(ts, np.nan) for ts in full_timestamps],
            dtype=np.float64,
        )

        return full_timestamps, filled_values

    def _extract_seasonality(
        self,
        timestamps: np.ndarray,
        seasonality_columns: List[str],
    ) -> np.ndarray:
        """
        Derive seasonality features from timestamps.

        Args:
            timestamps: Array of datetime64 timestamps
            seasonality_columns: Names of the features to extract;
                unrecognized names are ignored

        Returns:
            Object array with one JSON string per timestamp

        Supported features:
            - hour: Hour of day (0-23)
            - day_of_week: Day of week (0=Monday, 6=Sunday)
            - day_of_month: Day of month (1-31)
            - month: Month (1-12)
            - is_weekend: True for Saturday and Sunday
            - is_holiday: Always False (holiday calendar not implemented)
        """
        if len(timestamps) == 0:
            return np.array([], dtype=object)

        extractors = {
            "hour": lambda moment: moment.hour,
            "day_of_week": lambda moment: moment.weekday(),  # 0=Monday
            "day_of_month": lambda moment: moment.day,
            "month": lambda moment: moment.month,
            "is_weekend": lambda moment: moment.weekday() >= 5,
            # TODO: Implement holiday calendar
            "is_holiday": lambda moment: False,
        }
        # Resolve the requested features once instead of per timestamp.
        selected = [
            (name, extractors[name])
            for name in seasonality_columns
            if name in extractors
        ]

        encoded = []
        for ts in timestamps:
            # Second precision is enough for every supported feature.
            moment = ts.astype("datetime64[s]").astype(datetime)
            features = {name: extract(moment) for name, extract in selected}
            encoded.append(json_dumps_sorted(features))

        return np.array(encoded, dtype=object)