duckguard 2.2.0__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,724 @@
1
+ """ML-based anomaly detection methods for DuckGuard.
2
+
3
+ Provides advanced anomaly detection methods that learn from historical data
4
+ rather than requiring manual threshold configuration.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from dataclasses import dataclass, field
11
+ from typing import Any
12
+
13
+ from duckguard.anomaly.baselines import BaselineStorage
14
+ from duckguard.anomaly.methods import AnomalyMethod, AnomalyScore
15
+ from duckguard.history.storage import HistoryStorage
16
+
17
+
18
@dataclass
class BaselineComparison:
    """Outcome of checking one column metric against its stored baseline.

    Attributes:
        column_name: Name of the column
        metric: Metric being compared
        baseline_value: Stored baseline value
        current_value: Current value
        deviation: How far current deviates from baseline
        deviation_percent: Deviation as percentage
        is_anomalous: Whether this deviation is anomalous
        details: Additional comparison details
    """

    column_name: str
    metric: str
    baseline_value: float
    current_value: float
    deviation: float
    deviation_percent: float
    is_anomalous: bool
    # Each instance gets its own dict (mutable default via factory)
    details: dict[str, Any] = field(default_factory=dict)
41
+
42
+
43
@dataclass
class DistributionComparison:
    """Outcome of a statistical test comparing two distributions.

    Attributes:
        column_name: Name of the column
        statistic: Test statistic value
        p_value: P-value of the test
        is_drifted: Whether significant drift was detected
        method: Statistical test used
        details: Additional test details
    """

    column_name: str
    statistic: float
    p_value: float
    is_drifted: bool
    method: str
    # Each instance gets its own dict (mutable default via factory)
    details: dict[str, Any] = field(default_factory=dict)
62
+
63
+
64
class BaselineMethod(AnomalyMethod):
    """Detect anomalies by comparing to learned baseline.

    This method learns statistical properties from historical data and
    detects values that deviate significantly from the learned baseline.

    Usage:
        from duckguard.anomaly.ml_methods import BaselineMethod

        # Create method with storage
        method = BaselineMethod(sensitivity=2.0)

        # Fit to baseline data
        baseline_values = [100, 102, 98, 105, 97, 103]
        method.fit(baseline_values)

        # Score new values
        score = method.score(150)  # High score = anomalous
    """

    def __init__(
        self,
        storage: HistoryStorage | None = None,
        sensitivity: float = 2.0,
        min_samples: int = 5,
    ):
        """Initialize baseline method.

        Args:
            storage: Optional HistoryStorage for persisting baselines
            sensitivity: Number of standard deviations for anomaly threshold
            min_samples: Minimum samples needed before flagging anomalies
        """
        self._storage = storage
        self._baseline_storage = BaselineStorage(storage) if storage else None
        self.sensitivity = sensitivity
        self.min_samples = min_samples

        # Learned parameters (defaults used until fit()/load_baseline())
        self._mean: float = 0.0
        self._stddev: float = 1.0
        self._min: float = float('-inf')
        self._max: float = float('inf')
        self._sample_count: int = 0
        self._fitted = False

    @property
    def name(self) -> str:
        return "baseline"

    def fit(self, values: list[float]) -> None:
        """Learn baseline from values.

        None and NaN entries are ignored. With a single sample (or zero
        variance) the stddev falls back to 1.0 so z-scores remain defined.

        Args:
            values: List of numeric values to learn from
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            return

        n = len(clean)
        self._mean = sum(clean) / n
        self._min = min(clean)
        self._max = max(clean)

        if n > 1:
            variance = sum((x - self._mean) ** 2 for x in clean) / (n - 1)
            self._stddev = math.sqrt(variance) if variance > 0 else 1.0
        else:
            self._stddev = 1.0

        self._sample_count = n
        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value against the baseline.

        Args:
            value: Value to score

        Returns:
            AnomalyScore indicating how anomalous the value is
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        # Refuse to flag anomalies until enough history has been observed
        if self._sample_count < self.min_samples:
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "insufficient_samples", "sample_count": self._sample_count}
            )

        # Z-score deviation from the baseline mean
        if self._stddev == 0:
            deviation = 0.0
        else:
            deviation = abs((value - self._mean) / self._stddev)

        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "baseline_mean": self._mean,
                "baseline_stddev": self._stddev,
                "baseline_min": self._min,
                "baseline_max": self._max,
                "deviation_stddevs": deviation,
                "sample_count": self._sample_count,
            }
        )

    def compare_to_baseline(
        self,
        values: list[float],
        metric: str = "mean",
    ) -> BaselineComparison:
        """Compare current values to stored baseline.

        Args:
            values: Current values to compare
            metric: Metric to compare (mean, stddev, min, max)

        Returns:
            BaselineComparison result

        Raises:
            ValueError: If no valid values remain or the metric is unknown.
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            raise ValueError("No valid values to compare")

        # Calculate current metric
        if metric == "mean":
            current = sum(clean) / len(clean)
            baseline = self._mean
        elif metric == "stddev":
            current_mean = sum(clean) / len(clean)
            if len(clean) > 1:
                variance = sum((x - current_mean) ** 2 for x in clean) / (len(clean) - 1)
                current = math.sqrt(variance) if variance > 0 else 0.0
            else:
                # Sample stddev is undefined for a single value; treat as 0.0
                # (previously this divided by len(clean) - 1 == 0 and raised).
                current = 0.0
            baseline = self._stddev
        elif metric == "min":
            current = min(clean)
            baseline = self._min
        elif metric == "max":
            current = max(clean)
            baseline = self._max
        else:
            raise ValueError(f"Unknown metric: {metric}")

        deviation = abs(current - baseline)
        deviation_percent = (deviation / baseline * 100) if baseline != 0 else 0.0

        # Mean deviations use an absolute threshold scaled by the baseline
        # stddev; all other metrics use a fixed 20% relative threshold.
        if metric == "mean":
            threshold = self.sensitivity * self._stddev
            is_anomalous = deviation > threshold
        else:
            threshold = 20.0
            is_anomalous = deviation_percent > threshold

        return BaselineComparison(
            column_name="",  # Set by caller
            metric=metric,
            baseline_value=baseline,
            current_value=current,
            deviation=deviation,
            deviation_percent=deviation_percent,
            is_anomalous=is_anomalous,
            details={
                "sensitivity": self.sensitivity,
                "threshold": threshold,
            }
        )

    def save_baseline(
        self,
        source: str,
        column_name: str,
    ) -> None:
        """Save learned baseline to storage.

        Args:
            source: Data source path
            column_name: Column name

        Raises:
            ValueError: If no storage is configured or fit() was never called.
        """
        if not self._baseline_storage:
            raise ValueError("No storage configured for baseline persistence")

        if not self._fitted:
            raise ValueError("Method not fitted - call fit() first")

        # Persist each learned metric under the same sample size
        for metric, value in (
            ("mean", self._mean),
            ("stddev", self._stddev),
            ("min", self._min),
            ("max", self._max),
        ):
            self._baseline_storage.store(source, column_name, metric, value, sample_size=self._sample_count)

    def load_baseline(
        self,
        source: str,
        column_name: str,
    ) -> bool:
        """Load baseline from storage.

        Args:
            source: Data source path
            column_name: Column name

        Returns:
            True if baseline was loaded, False if not found

        Raises:
            ValueError: If no storage is configured.
        """
        if not self._baseline_storage:
            raise ValueError("No storage configured for baseline persistence")

        # The mean is the anchor metric; without it there is no baseline.
        mean_bl = self._baseline_storage.get(source, column_name, "mean")
        if not mean_bl:
            return False

        self._mean = mean_bl.value
        self._sample_count = mean_bl.sample_size or 0

        stddev_bl = self._baseline_storage.get(source, column_name, "stddev")
        if stddev_bl:
            self._stddev = stddev_bl.value

        min_bl = self._baseline_storage.get(source, column_name, "min")
        if min_bl:
            self._min = min_bl.value

        max_bl = self._baseline_storage.get(source, column_name, "max")
        if max_bl:
            self._max = max_bl.value

        self._fitted = True
        return True
307
+
308
+
309
class KSTestMethod(AnomalyMethod):
    """Detect distribution drift using Kolmogorov-Smirnov test.

    This method compares the current data distribution to a baseline
    distribution and detects statistically significant differences.

    Usage:
        from duckguard.anomaly.ml_methods import KSTestMethod

        method = KSTestMethod(p_value_threshold=0.05)

        # Fit to baseline data
        baseline_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        method.fit(baseline_data)

        # Detect if new data has drifted
        new_data = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
        comparison = method.compare_distributions(new_data)
        if comparison.is_drifted:
            print(f"Distribution drift detected! p-value: {comparison.p_value}")
    """

    def __init__(self, p_value_threshold: float = 0.05):
        """Initialize KS test method.

        Args:
            p_value_threshold: P-value below which drift is detected
        """
        self.p_value_threshold = p_value_threshold
        self._baseline_values: list[float] = []
        self._baseline_ecdf: list[tuple[float, float]] = []
        self._fitted = False

    @property
    def name(self) -> str:
        return "ks_test"

    def fit(self, values: list[float]) -> None:
        """Learn baseline distribution.

        None and NaN entries are dropped; remaining values are kept sorted.

        Args:
            values: List of numeric values for baseline
        """
        clean = sorted(v for v in values if v is not None and not math.isnan(v))
        if not clean:
            return

        self._baseline_values = clean
        self._baseline_ecdf = self._compute_ecdf(clean)
        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a single value (uses empirical CDF).

        For distribution testing, use compare_distributions() instead.

        Args:
            value: Value to score

        Returns:
            AnomalyScore based on position in baseline distribution
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.p_value_threshold,
                details={"reason": "null_or_nan"}
            )

        if not self._fitted:
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.p_value_threshold,
                details={"reason": "not_fitted"}
            )

        # Find percentile in baseline
        percentile = self._get_percentile(value)

        # Values in the extreme 1% tails of the baseline are flagged
        is_anomaly = percentile < 0.01 or percentile > 0.99

        return AnomalyScore(
            value=value,
            score=min(percentile, 1 - percentile),  # Distance from extremes
            is_anomaly=is_anomaly,
            threshold=0.01,
            details={
                "percentile": percentile,
                "baseline_size": len(self._baseline_values),
            }
        )

    def compare_distributions(
        self,
        current_values: list[float],
    ) -> DistributionComparison:
        """Compare current distribution to baseline using KS test.

        Args:
            current_values: Current values to compare

        Returns:
            DistributionComparison with test results

        Raises:
            ValueError: If the method is unfitted or no valid values remain.
        """
        if not self._fitted:
            raise ValueError("Method not fitted - call fit() first")

        clean_current = sorted(v for v in current_values if v is not None and not math.isnan(v))
        if not clean_current:
            raise ValueError("No valid values to compare")

        # Compute KS statistic (two-sample)
        ks_stat, p_value = self._ks_two_sample(self._baseline_values, clean_current)

        return DistributionComparison(
            column_name="",  # Set by caller
            statistic=ks_stat,
            p_value=p_value,
            is_drifted=p_value < self.p_value_threshold,
            method="ks_test",
            details={
                "baseline_size": len(self._baseline_values),
                "current_size": len(clean_current),
                "threshold": self.p_value_threshold,
            }
        )

    def _compute_ecdf(self, sorted_values: list[float]) -> list[tuple[float, float]]:
        """Compute empirical CDF as (value, cumulative probability) pairs."""
        n = len(sorted_values)
        return [(v, (i + 1) / n) for i, v in enumerate(sorted_values)]

    def _get_percentile(self, value: float) -> float:
        """Get percentile of value in baseline distribution (0.5 if unfitted)."""
        if not self._baseline_values:
            return 0.5

        count_below = sum(1 for v in self._baseline_values if v <= value)
        return count_below / len(self._baseline_values)

    def _ks_two_sample(
        self,
        sample1: list[float],
        sample2: list[float],
    ) -> tuple[float, float]:
        """Compute two-sample KS test statistic and approximate p-value.

        The statistic is the maximum difference between the two empirical
        CDFs. Tied values are consumed as a group before the ECDFs are
        compared; comparing mid-tie would report a spurious difference
        (e.g. two identical samples would not score 0).

        Returns:
            Tuple of (KS statistic, approximate p-value)
        """
        n1, n2 = len(sample1), len(sample2)
        if n1 == 0 or n2 == 0:
            return 0.0, 1.0

        # Merge and sort all values with source labels
        combined = [(v, 1) for v in sample1] + [(v, 2) for v in sample2]
        combined.sort(key=lambda x: x[0])

        ecdf1, ecdf2 = 0.0, 0.0
        max_diff = 0.0
        i, total = 0, len(combined)

        while i < total:
            tie_value = combined[i][0]
            # Consume the full run of observations tied at this value
            while i < total and combined[i][0] == tie_value:
                if combined[i][1] == 1:
                    ecdf1 += 1 / n1
                else:
                    ecdf2 += 1 / n2
                i += 1
            max_diff = max(max_diff, abs(ecdf1 - ecdf2))

        # Approximate p-value using asymptotic formula
        # P(D > d) ≈ 2 * exp(-2 * n * d^2) where n = n1*n2/(n1+n2)
        n_effective = (n1 * n2) / (n1 + n2)
        p_value = 2 * math.exp(-2 * n_effective * max_diff ** 2)
        p_value = min(1.0, max(0.0, p_value))

        return max_diff, p_value
490
+
491
+
492
class SeasonalMethod(AnomalyMethod):
    """Detect anomalies accounting for seasonal patterns.

    This method learns typical values for different time periods
    (hour of day, day of week, etc.) and detects deviations from
    expected seasonal patterns.

    Usage:
        from duckguard.anomaly.ml_methods import SeasonalMethod

        method = SeasonalMethod(period="daily", sensitivity=2.0)

        # Fit with time-value pairs
        # values format: [(timestamp, value), ...]
        method.fit_with_timestamps(historical_data)

        # Score new values
        score = method.score_with_timestamp(new_timestamp, new_value)
    """

    # Bucket counts per supported seasonality period
    PERIODS = {
        "hourly": 24,  # 24 buckets (hours of day)
        "daily": 7,  # 7 buckets (days of week)
        "weekly": 52,  # 52 buckets (weeks of year)
        "monthly": 12,  # 12 buckets (months of year)
    }

    def __init__(
        self,
        period: str = "daily",
        sensitivity: float = 2.0,
    ):
        """Initialize seasonal method.

        Args:
            period: Seasonality period (hourly, daily, weekly, monthly)
            sensitivity: Number of standard deviations for anomaly threshold

        Raises:
            ValueError: If period is not one of the supported periods.
        """
        if period not in self.PERIODS:
            raise ValueError(f"Unknown period: {period}. Valid: {list(self.PERIODS.keys())}")

        self.period = period
        self.sensitivity = sensitivity
        self._num_buckets = self.PERIODS[period]

        # Learned parameters per bucket
        self._bucket_means: dict[int, float] = {}
        self._bucket_stddevs: dict[int, float] = {}
        self._bucket_counts: dict[int, int] = {}
        self._fitted = False

        # For non-timestamped fitting (and fallback for sparse buckets)
        self._global_mean: float = 0.0
        self._global_stddev: float = 1.0

    @property
    def name(self) -> str:
        return f"seasonal_{self.period}"

    def fit(self, values: list[float]) -> None:
        """Fit without timestamps (falls back to global statistics).

        For proper seasonal detection, use fit_with_timestamps().

        Args:
            values: List of numeric values
        """
        clean = [v for v in values if v is not None and not math.isnan(v)]
        if not clean:
            return

        n = len(clean)
        self._global_mean = sum(clean) / n

        if n > 1:
            variance = sum((x - self._global_mean) ** 2 for x in clean) / (n - 1)
            self._global_stddev = math.sqrt(variance) if variance > 0 else 1.0
        else:
            # Reset explicitly so a refit with one sample does not keep a
            # stale stddev from a previous fit.
            self._global_stddev = 1.0

        self._fitted = True

    def fit_with_timestamps(
        self,
        data: list[tuple[Any, float]],
    ) -> None:
        """Fit with timestamps for seasonal pattern learning.

        Args:
            data: List of (timestamp, value) tuples.
                  Timestamps can be datetime objects or timestamps.
        """
        from datetime import datetime

        # Group values by bucket
        buckets: dict[int, list[float]] = {i: [] for i in range(self._num_buckets)}

        for timestamp, value in data:
            if value is None or math.isnan(value):
                continue

            if isinstance(timestamp, (int, float)):
                timestamp = datetime.fromtimestamp(timestamp)

            bucket = self._get_bucket(timestamp)
            buckets[bucket].append(value)

        # Compute statistics per bucket
        for bucket, values in buckets.items():
            if values:
                n = len(values)
                mean = sum(values) / n
                self._bucket_means[bucket] = mean
                self._bucket_counts[bucket] = n

                if n > 1:
                    variance = sum((x - mean) ** 2 for x in values) / (n - 1)
                    self._bucket_stddevs[bucket] = math.sqrt(variance) if variance > 0 else 1.0
                else:
                    self._bucket_stddevs[bucket] = 1.0

        # Compute global stats as fallback
        all_values = [v for bucket_values in buckets.values() for v in bucket_values]
        if all_values:
            self._global_mean = sum(all_values) / len(all_values)
            if len(all_values) > 1:
                variance = sum((x - self._global_mean) ** 2 for x in all_values) / (len(all_values) - 1)
                self._global_stddev = math.sqrt(variance) if variance > 0 else 1.0

        self._fitted = True

    def score(self, value: float) -> AnomalyScore:
        """Score a value without timestamp (uses global stats).

        For proper seasonal scoring, use score_with_timestamp().

        Args:
            value: Value to score

        Returns:
            AnomalyScore using global statistics
        """
        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        deviation = abs((value - self._global_mean) / self._global_stddev) if self._global_stddev != 0 else 0.0
        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "global_mean": self._global_mean,
                "global_stddev": self._global_stddev,
                "note": "No timestamp provided - using global statistics",
            }
        )

    def score_with_timestamp(
        self,
        timestamp: Any,
        value: float,
    ) -> AnomalyScore:
        """Score a value with timestamp for seasonal comparison.

        Args:
            timestamp: Datetime or timestamp
            value: Value to score

        Returns:
            AnomalyScore considering seasonal patterns
        """
        from datetime import datetime

        if value is None or math.isnan(value):
            return AnomalyScore(
                value=value,
                score=0.0,
                is_anomaly=False,
                threshold=self.sensitivity,
                details={"reason": "null_or_nan"}
            )

        if isinstance(timestamp, (int, float)):
            timestamp = datetime.fromtimestamp(timestamp)

        bucket = self._get_bucket(timestamp)

        # Use bucket-specific stats if available and well-populated
        # (at least 3 samples), otherwise fall back to global stats
        if bucket in self._bucket_means and self._bucket_counts.get(bucket, 0) >= 3:
            mean = self._bucket_means[bucket]
            stddev = self._bucket_stddevs.get(bucket, 1.0)
            used_bucket = True
        else:
            mean = self._global_mean
            stddev = self._global_stddev
            used_bucket = False

        deviation = abs((value - mean) / stddev) if stddev != 0 else 0.0
        is_anomaly = deviation > self.sensitivity

        return AnomalyScore(
            value=value,
            score=deviation,
            is_anomaly=is_anomaly,
            threshold=self.sensitivity,
            details={
                "bucket": bucket,
                "period": self.period,
                "bucket_mean": mean,
                "bucket_stddev": stddev,
                "used_seasonal": used_bucket,
                "bucket_sample_count": self._bucket_counts.get(bucket, 0),
            }
        )

    def _get_bucket(self, timestamp) -> int:
        """Get bucket index for a timestamp (datetime-like object)."""
        if self.period == "hourly":
            return timestamp.hour
        elif self.period == "daily":
            return timestamp.weekday()
        elif self.period == "weekly":
            # ISO calendars can contain week 53; clamp so the index stays
            # within the 52 configured buckets (week 53 shares bucket 51,
            # previously this produced an out-of-range bucket -> KeyError).
            return min(timestamp.isocalendar()[1], self._num_buckets) - 1
        elif self.period == "monthly":
            return timestamp.month - 1  # Month (0-indexed)
        return 0