duckguard 3.0.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,53 @@ class AnomalyScore:
29
29
  threshold: float
30
30
  details: dict[str, Any] = field(default_factory=dict)
31
31
 
32
+ def __lt__(self, other: AnomalyScore | float) -> bool:
33
+ """Less than comparison based on score."""
34
+ if isinstance(other, AnomalyScore):
35
+ return self.score < other.score
36
+ return self.score < other
37
+
38
+ def __le__(self, other: AnomalyScore | float) -> bool:
39
+ """Less than or equal comparison based on score."""
40
+ if isinstance(other, AnomalyScore):
41
+ return self.score <= other.score
42
+ return self.score <= other
43
+
44
+ def __gt__(self, other: AnomalyScore | float) -> bool:
45
+ """Greater than comparison based on score."""
46
+ if isinstance(other, AnomalyScore):
47
+ return self.score > other.score
48
+ return self.score > other
49
+
50
+ def __ge__(self, other: AnomalyScore | float) -> bool:
51
+ """Greater than or equal comparison based on score."""
52
+ if isinstance(other, AnomalyScore):
53
+ return self.score >= other.score
54
+ return self.score >= other
55
+
56
+ def __eq__(self, other: object) -> bool:
57
+ """Equality comparison based on score."""
58
+ if isinstance(other, AnomalyScore):
59
+ return self.score == other.score
60
+ if isinstance(other, (int, float)):
61
+ return self.score == other
62
+ return NotImplemented
63
+
64
+ def __ne__(self, other: object) -> bool:
65
+ """Inequality comparison based on score."""
66
+ result = self.__eq__(other)
67
+ if result is NotImplemented:
68
+ return result
69
+ return not result
70
+
71
+ def __float__(self) -> float:
72
+ """Convert to float (returns the score)."""
73
+ return self.score
74
+
75
+ def __format__(self, format_spec: str) -> str:
76
+ """Format the score using the given format specification."""
77
+ return format(self.score, format_spec)
78
+
32
79
 
33
80
  class AnomalyMethod(ABC):
34
81
  """Base class for anomaly detection methods."""
@@ -60,6 +60,18 @@ class DistributionComparison:
60
60
  method: str
61
61
  details: dict[str, Any] = field(default_factory=dict)
62
62
 
63
+ @property
64
+ def is_drift(self) -> bool:
65
+ """Alias for is_drifted (backward compatibility)."""
66
+ return self.is_drifted
67
+
68
+ @property
69
+ def message(self) -> str:
70
+ """Generate a human-readable message about the comparison."""
71
+ if self.is_drifted:
72
+ return f"Distribution drift detected (p-value: {self.p_value:.4f} < threshold)"
73
+ return f"No significant drift detected (p-value: {self.p_value:.4f})"
74
+
63
75
 
64
76
  class BaselineMethod(AnomalyMethod):
65
77
  """Detect anomalies by comparing to learned baseline.
@@ -111,12 +123,32 @@ class BaselineMethod(AnomalyMethod):
111
123
  def name(self) -> str:
112
124
  return "baseline"
113
125
 
114
- def fit(self, values: list[float]) -> None:
126
+ @property
127
+ def baseline_mean(self) -> float:
128
+ """Get the baseline mean value."""
129
+ return self._mean
130
+
131
+ @property
132
+ def baseline_std(self) -> float:
133
+ """Get the baseline standard deviation."""
134
+ return self._stddev
135
+
136
+ @property
137
+ def is_fitted(self) -> bool:
138
+ """Check if the model has been fitted."""
139
+ return self._fitted
140
+
141
+ def fit(self, values: list[float] | Any) -> None:
115
142
  """Learn baseline from values.
116
143
 
117
144
  Args:
118
- values: List of numeric values to learn from
145
+ values: List of numeric values or Column object to learn from
119
146
  """
147
+ # Handle Column objects
148
+ from duckguard.core.column import Column
149
+ if isinstance(values, Column):
150
+ values = self._get_column_values(values)
151
+
120
152
  clean = [v for v in values if v is not None and not math.isnan(v)]
121
153
  if not clean:
122
154
  return
@@ -135,15 +167,37 @@ class BaselineMethod(AnomalyMethod):
135
167
  self._sample_count = n
136
168
  self._fitted = True
137
169
 
138
- def score(self, value: float) -> AnomalyScore:
139
- """Score a value against the baseline.
170
+ def _get_column_values(self, column) -> list[float]:
171
+ """Extract numeric values from a Column object."""
172
+ dataset = column._dataset
173
+ column_name = column._name
174
+ engine = dataset._engine
175
+ table_name = dataset._source.replace('\\', '/')
176
+
177
+ query = f"""
178
+ SELECT "{column_name}"
179
+ FROM '{table_name}'
180
+ WHERE "{column_name}" IS NOT NULL
181
+ """
182
+
183
+ result = engine.fetch_all(query)
184
+ return [float(row[0]) for row in result]
185
+
186
+ def score(self, value: float | Any) -> AnomalyScore | list[AnomalyScore]:
187
+ """Score a value or column against the baseline.
140
188
 
141
189
  Args:
142
- value: Value to score
190
+ value: Single numeric value or Column object to score
143
191
 
144
192
  Returns:
145
- AnomalyScore indicating how anomalous the value is
193
+ AnomalyScore for single value, or list of AnomalyScores for Column
146
194
  """
195
+ # Handle Column objects
196
+ from duckguard.core.column import Column
197
+ if isinstance(value, Column):
198
+ values = self._get_column_values(value)
199
+ return [self.score(v) for v in values]
200
+
147
201
  if value is None or math.isnan(value):
148
202
  return AnomalyScore(
149
203
  value=value,
@@ -343,12 +397,17 @@ class KSTestMethod(AnomalyMethod):
343
397
  def name(self) -> str:
344
398
  return "ks_test"
345
399
 
346
- def fit(self, values: list[float]) -> None:
400
+ def fit(self, values: list[float] | Any) -> None:
347
401
  """Learn baseline distribution.
348
402
 
349
403
  Args:
350
- values: List of numeric values for baseline
404
+ values: List of numeric values or Column object for baseline
351
405
  """
406
+ # Handle Column objects
407
+ from duckguard.core.column import Column
408
+ if isinstance(values, Column):
409
+ values = self._get_column_values(values)
410
+
352
411
  clean = sorted(v for v in values if v is not None and not math.isnan(v))
353
412
  if not clean:
354
413
  return
@@ -357,17 +416,39 @@ class KSTestMethod(AnomalyMethod):
357
416
  self._baseline_ecdf = self._compute_ecdf(clean)
358
417
  self._fitted = True
359
418
 
360
- def score(self, value: float) -> AnomalyScore:
361
- """Score a single value (uses empirical CDF).
419
+ def _get_column_values(self, column) -> list[float]:
420
+ """Extract numeric values from a Column object."""
421
+ dataset = column._dataset
422
+ column_name = column._name
423
+ engine = dataset._engine
424
+ table_name = dataset._source.replace('\\', '/')
425
+
426
+ query = f"""
427
+ SELECT "{column_name}"
428
+ FROM '{table_name}'
429
+ WHERE "{column_name}" IS NOT NULL
430
+ """
431
+
432
+ result = engine.fetch_all(query)
433
+ return [float(row[0]) for row in result]
434
+
435
+ def score(self, value: float | Any) -> AnomalyScore | list[AnomalyScore]:
436
+ """Score a value or column (uses empirical CDF).
362
437
 
363
438
  For distribution testing, use compare_distributions() instead.
364
439
 
365
440
  Args:
366
- value: Value to score
441
+ value: Single numeric value or Column object to score
367
442
 
368
443
  Returns:
369
- AnomalyScore based on position in baseline distribution
444
+ AnomalyScore for single value, or list of AnomalyScores for Column
370
445
  """
446
+ # Handle Column objects
447
+ from duckguard.core.column import Column
448
+ if isinstance(value, Column):
449
+ values = self._get_column_values(value)
450
+ return [self.score(v) for v in values]
451
+
371
452
  if value is None or math.isnan(value):
372
453
  return AnomalyScore(
373
454
  value=value,
@@ -405,18 +486,35 @@ class KSTestMethod(AnomalyMethod):
405
486
 
406
487
  def compare_distributions(
407
488
  self,
408
- current_values: list[float],
489
+ current_values: list[float] | Any,
490
+ baseline_values: list[float] | Any | None = None,
409
491
  ) -> DistributionComparison:
410
492
  """Compare current distribution to baseline using KS test.
411
493
 
412
494
  Args:
413
- current_values: Current values to compare
495
+ current_values: List of values or Column object to compare
496
+ baseline_values: Optional baseline data. If not provided and not fitted,
497
+ will use current_values as baseline (self-comparison)
414
498
 
415
499
  Returns:
416
500
  DistributionComparison with test results
417
501
  """
502
+ # Handle Column objects for current_values
503
+ from duckguard.core.column import Column
504
+ if isinstance(current_values, Column):
505
+ current_values = self._get_column_values(current_values)
506
+
507
+ # Handle Column objects for baseline_values
508
+ if baseline_values is not None and isinstance(baseline_values, Column):
509
+ baseline_values = self._get_column_values(baseline_values)
510
+
511
+ # Auto-fit if not fitted and baseline provided
418
512
  if not self._fitted:
419
- raise ValueError("Method not fitted - call fit() first")
513
+ if baseline_values is not None:
514
+ self.fit(baseline_values)
515
+ else:
516
+ # Use current_values as baseline (self-comparison for normality test)
517
+ self.fit(current_values)
420
518
 
421
519
  clean_current = sorted(v for v in current_values if v is not None and not math.isnan(v))
422
520
  if not clean_current:
@@ -548,14 +646,19 @@ class SeasonalMethod(AnomalyMethod):
548
646
  def name(self) -> str:
549
647
  return f"seasonal_{self.period}"
550
648
 
551
- def fit(self, values: list[float]) -> None:
649
+ def fit(self, values: list[float] | Any) -> None:
552
650
  """Fit without timestamps (falls back to global statistics).
553
651
 
554
652
  For proper seasonal detection, use fit_with_timestamps().
555
653
 
556
654
  Args:
557
- values: List of numeric values
655
+ values: List of numeric values or Column object
558
656
  """
657
+ # Handle Column objects
658
+ from duckguard.core.column import Column
659
+ if isinstance(values, Column):
660
+ values = self._get_column_values(values)
661
+
559
662
  clean = [v for v in values if v is not None and not math.isnan(v)]
560
663
  if not clean:
561
664
  return
@@ -569,6 +672,22 @@ class SeasonalMethod(AnomalyMethod):
569
672
 
570
673
  self._fitted = True
571
674
 
675
+ def _get_column_values(self, column) -> list[float]:
676
+ """Extract numeric values from a Column object."""
677
+ dataset = column._dataset
678
+ column_name = column._name
679
+ engine = dataset._engine
680
+ table_name = dataset._source.replace('\\', '/')
681
+
682
+ query = f"""
683
+ SELECT "{column_name}"
684
+ FROM '{table_name}'
685
+ WHERE "{column_name}" IS NOT NULL
686
+ """
687
+
688
+ result = engine.fetch_all(query)
689
+ return [float(row[0]) for row in result]
690
+
572
691
  def fit_with_timestamps(
573
692
  self,
574
693
  data: list[tuple[Any, float]],
@@ -618,17 +737,23 @@ class SeasonalMethod(AnomalyMethod):
618
737
 
619
738
  self._fitted = True
620
739
 
621
- def score(self, value: float) -> AnomalyScore:
622
- """Score a value without timestamp (uses global stats).
740
+ def score(self, value: float | Any) -> AnomalyScore | list[AnomalyScore]:
741
+ """Score a value or column without timestamp (uses global stats).
623
742
 
624
743
  For proper seasonal scoring, use score_with_timestamp().
625
744
 
626
745
  Args:
627
- value: Value to score
746
+ value: Single numeric value or Column object to score
628
747
 
629
748
  Returns:
630
- AnomalyScore using global statistics
749
+ AnomalyScore for single value, or list of AnomalyScores for Column
631
750
  """
751
+ # Handle Column objects
752
+ from duckguard.core.column import Column
753
+ if isinstance(value, Column):
754
+ values = self._get_column_values(value)
755
+ return [self.score(v) for v in values]
756
+
632
757
  if value is None or math.isnan(value):
633
758
  return AnomalyScore(
634
759
  value=value,
duckguard/core/result.py CHANGED
@@ -37,6 +37,11 @@ class FailedRow:
37
37
  reason: str = ""
38
38
  context: dict[str, Any] = field(default_factory=dict)
39
39
 
40
+ @property
41
+ def row_number(self) -> int:
42
+ """Alias for row_index (backward compatibility)."""
43
+ return self.row_index
44
+
40
45
  def __repr__(self) -> str:
41
46
  return f"FailedRow(row={self.row_index}, column='{self.column}', value={self.value!r})"
42
47
 
@@ -138,6 +138,15 @@ class EmailNotifier(BaseNotifier):
138
138
  if not self.email_config.to_addresses:
139
139
  raise ValueError("At least one recipient address (to_addresses) is required")
140
140
 
141
+ # Populate NotificationConfig with email settings for easy access
142
+ self.config.smtp_host = self.email_config.smtp_host
143
+ self.config.smtp_port = self.email_config.smtp_port
144
+ self.config.from_address = self.email_config.from_address
145
+ self.config.to_addresses = self.email_config.to_addresses
146
+ self.config.use_tls = self.email_config.use_tls
147
+ self.config.use_ssl = self.email_config.use_ssl
148
+ self.config.subject_prefix = self.email_config.subject_prefix
149
+
141
150
  # Set webhook_url to a placeholder (not used for email)
142
151
  self.webhook_url = "email://smtp"
143
152
 
@@ -40,6 +40,16 @@ class NotificationConfig:
40
40
  max_failures_shown: int = 10
41
41
  mention_users: list[str] = field(default_factory=list)
42
42
  channel: str | None = None
43
+ username: str | None = None # Slack bot username
44
+
45
+ # Email-specific attributes (set by EmailNotifier)
46
+ smtp_host: str | None = None
47
+ smtp_port: int | None = None
48
+ from_address: str | None = None
49
+ to_addresses: list[str] | None = None
50
+ use_tls: bool | None = None
51
+ use_ssl: bool | None = None
52
+ subject_prefix: str | None = None
43
53
 
44
54
 
45
55
  class BaseNotifier(ABC):
@@ -143,13 +153,39 @@ class SlackNotifier(BaseNotifier):
143
153
  """Slack webhook notifier.
144
154
 
145
155
  Usage:
146
- notifier = SlackNotifier(webhook_url="https://hooks.slack.com/...")
156
+ notifier = SlackNotifier(
157
+ webhook_url="https://hooks.slack.com/...",
158
+ channel="#data-quality",
159
+ username="DuckGuard Bot"
160
+ )
147
161
  # or set DUCKGUARD_SLACK_WEBHOOK environment variable
148
162
 
149
163
  result = execute_rules(rules, "data.csv")
150
164
  notifier.send_results(result)
151
165
  """
152
166
 
167
+ def __init__(
168
+ self,
169
+ webhook_url: str | None = None,
170
+ channel: str | None = None,
171
+ username: str | None = None,
172
+ config: NotificationConfig | None = None,
173
+ ):
174
+ """Initialize Slack notifier.
175
+
176
+ Args:
177
+ webhook_url: Slack webhook URL
178
+ channel: Override default channel (e.g., "#data-quality")
179
+ username: Bot username to display
180
+ config: Notification configuration
181
+ """
182
+ super().__init__(webhook_url=webhook_url, config=config)
183
+ # Only override if explicitly provided (don't overwrite config values with None)
184
+ if channel is not None:
185
+ self.config.channel = channel
186
+ if username is not None:
187
+ self.config.username = username
188
+
153
189
  @property
154
190
  def _env_var_name(self) -> str:
155
191
  return "DUCKGUARD_SLACK_WEBHOOK"
@@ -211,6 +247,8 @@ class SlackNotifier(BaseNotifier):
211
247
 
212
248
  if self.config.channel:
213
249
  message["channel"] = self.config.channel
250
+ if self.config.username:
251
+ message["username"] = self.config.username
214
252
 
215
253
  return message
216
254
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckguard
3
- Version: 3.0.0
3
+ Version: 3.0.1
4
4
  Summary: A Python-native data quality tool with AI superpowers, built on DuckDB for speed
5
5
  Project-URL: Homepage, https://github.com/XDataHubAI/duckguard
6
6
  Project-URL: Documentation, https://github.com/XDataHubAI/duckguard
@@ -3,8 +3,8 @@ duckguard/errors.py,sha256=xhQPxCCeB3dCQspTbQf58h_DvwHP1vAb6vKI9fHYAJ0,11493
3
3
  duckguard/anomaly/__init__.py,sha256=mrTyL70cOR5S7_RNc9QLADdnBimIsbAoFTbKlWiIsbw,1353
4
4
  duckguard/anomaly/baselines.py,sha256=k28CjjqBa8IaZxnIgof-wjw_Xdb7NJZImC2OJJkGXQ8,8776
5
5
  duckguard/anomaly/detector.py,sha256=voA7WS2x2p5h5cnwH3C_2ly7HdYpXLwC4jDiPL2Xleo,12443
6
- duckguard/anomaly/methods.py,sha256=CtV2G-kowXGgz0HYvNoi2Ge7eyHUg2GwGa3oZvunS38,13475
7
- duckguard/anomaly/ml_methods.py,sha256=UyEr8q4K_wNq7pWgTsV23IoBI13aqm0hHIwIFjIxeas,23449
6
+ duckguard/anomaly/methods.py,sha256=IRt7_1YWGaQHz2syfEd89lL6kAjOjheSk6ayLRUi58M,15237
7
+ duckguard/anomaly/ml_methods.py,sha256=Ne8BOULj-bcPmf1_YAqJqnlXDlljfhsxvFbBIjWkJB8,28221
8
8
  duckguard/checks/__init__.py,sha256=aSxO02ZILHnfrGhfomQ5EN69t7NZ4yr61Etwtcv_zIw,847
9
9
  duckguard/checks/conditional.py,sha256=gYFZD_6M-IUs1MGMZeDYH-qC99dyMJ-u63r1SgcBVs8,26646
10
10
  duckguard/checks/distributional.py,sha256=Cy3YlWnSPA5QZdNT_lYuTMRLrwvU1yJGk--RGzOQ5N4,18302
@@ -37,7 +37,7 @@ duckguard/core/__init__.py,sha256=pHndzrdehB0GFtlSQ46uvw8XgUQj55dVZQP1ZK-aDso,35
37
37
  duckguard/core/column.py,sha256=88m3WipKNdNslXNWAk4ofTf0kmNlDDAyhjDUa-Q6UGg,48326
38
38
  duckguard/core/dataset.py,sha256=kQY2ALTsid5x1NWOM5Wse60mOrLdUj8lKUs1cLK7cCo,44364
39
39
  duckguard/core/engine.py,sha256=ld_NHsWyBkVynmWyvbyQcHdXHhpIoSaRDyqAAtVx8J0,7897
40
- duckguard/core/result.py,sha256=BwmP0gNPAKVYHdyque1rDkbAhEvwFaA3PwhxaI7cY14,15178
40
+ duckguard/core/result.py,sha256=kQ_tzDkxjJTGK_k1P6crprrrYIszokhSxQMGlP1laAw,15316
41
41
  duckguard/core/scoring.py,sha256=42CVgxmmfo3Yb3m3Xl8qWnDgR7ndSZd8vXRwy9XSThI,16826
42
42
  duckguard/freshness/__init__.py,sha256=8XR7JxH9tz61En5DTMSDHrjhroPzvwCTVzBbBiRFexs,854
43
43
  duckguard/freshness/monitor.py,sha256=O_b4fh6unyZ2DXioX6O7KP9VpenGdLTpb9OdNb79dX8,14695
@@ -49,9 +49,9 @@ duckguard/integrations/__init__.py,sha256=SuqOzfdaejlMCti372FHD_R6bVaPaUmfEPG9IM
49
49
  duckguard/integrations/airflow.py,sha256=pxC14Kgwou_2xWPvTfx8YWO-xg_vgFeAlGDhgGfXRyM,13195
50
50
  duckguard/integrations/dbt.py,sha256=Dw1meY-UhylDFhUZ2s47FnJGMp_gszHvadGn_hqYkSM,14101
51
51
  duckguard/notifications/__init__.py,sha256=qEfUvt7d_WXlbsGlLB-FaNF4ksLtAyO8JXi1JCdo89w,1541
52
- duckguard/notifications/email.py,sha256=jwgxec8r6NUNqrxz3v5B4A3UL0-ZdxnJZhXQXWgMWH4,17168
52
+ duckguard/notifications/email.py,sha256=6qmHXufExnczyXEpa1dt6A6dli0kgRHZV_DhEkfMsj4,17677
53
53
  duckguard/notifications/formatter.py,sha256=Z2vGMpLdqPWYaYTaVtVjYnIbNU8Haer-7efohZ5IZxM,3991
54
- duckguard/notifications/notifiers.py,sha256=e-UBvoskFSzIwlCFTxIFdkI-z54zZeEeSQkvOvgV6JI,11703
54
+ duckguard/notifications/notifiers.py,sha256=nViWe2rms8C9t05WMbc2mwJrryS7V8N2OBSJ3u0PQGE,13023
55
55
  duckguard/profiler/__init__.py,sha256=a16GYeeFDZzwCemTsTuzO3Ih4M7_hOPb9hS8yt-nHzU,169
56
56
  duckguard/profiler/auto_profile.py,sha256=KbAkty-HrpNbTribi2uD17Fcsb-UiV5eG4zZsbyBOL4,12267
57
57
  duckguard/profiler/distribution_analyzer.py,sha256=I_jnDUtEG260yu7zEBU-2vHRIeYpAzuF-HKX99i8MGU,12644
@@ -79,8 +79,8 @@ duckguard/semantic/analyzer.py,sha256=2be1oofe-owBhTg-Dy88-wihaoTQ7DPxf1NuA1sgfR
79
79
  duckguard/semantic/detector.py,sha256=MPdb2Rv9VGQBko7nmPk4-Kjga_XVjPZdHCr29gdET0M,15665
80
80
  duckguard/semantic/validators.py,sha256=8Zu3vwPwh79U09zGf4_PpcwV85_hbNCwRHcxTIQ7G_I,10945
81
81
  duckguard/validators/__init__.py,sha256=g717IM5xlVLCTg1nLRRccLAFHCsbRO-IgjzG4H6K32A,268
82
- duckguard-3.0.0.dist-info/METADATA,sha256=bkRQeGGM5c3BcvOZpJeHx4byCHWctL1jgCDHa7VR5kc,31770
83
- duckguard-3.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
84
- duckguard-3.0.0.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
85
- duckguard-3.0.0.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
86
- duckguard-3.0.0.dist-info/RECORD,,
82
+ duckguard-3.0.1.dist-info/METADATA,sha256=9jLfixYYUu4coNP0hadedJL2pacYkyjqD6vBtwQj6Og,31770
83
+ duckguard-3.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
84
+ duckguard-3.0.1.dist-info/entry_points.txt,sha256=teP6JdXUvY20E9P44TW_Z24xuQtXMgnCyOuWtd_KIYU,108
85
+ duckguard-3.0.1.dist-info/licenses/LICENSE,sha256=1Li9P3fainL-epQ9kEHZWKDScWtp4inPd6AkhUTJStk,3841
86
+ duckguard-3.0.1.dist-info/RECORD,,