duckguard-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/core/column.py
@@ -0,0 +1,437 @@
+"""Column class with validation methods."""
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING, Any
+
+from duckguard.core.result import ValidationResult
+
+if TYPE_CHECKING:
+    from duckguard.core.dataset import Dataset
+
+
+class Column:
+    """
+    Represents a column in a dataset with validation capabilities.
+
+    Columns provide a fluent interface for data validation that
+    feels natural to Python developers.
+
+    Example:
+        assert orders.customer_id.null_percent < 5
+        assert orders.amount.between(0, 10000)
+        assert orders.email.matches(r'^[\\w.-]+@[\\w.-]+\\.\\w+$')
+    """
+
+    def __init__(self, name: str, dataset: Dataset):
+        """
+        Initialize a Column.
+
+        Args:
+            name: Column name
+            dataset: Parent dataset
+        """
+        self._name = name
+        self._dataset = dataset
+        self._stats_cache: dict[str, Any] | None = None
+        self._numeric_stats_cache: dict[str, Any] | None = None
+
+    @property
+    def name(self) -> str:
+        """Get the column name."""
+        return self._name
+
+    @property
+    def dataset(self) -> Dataset:
+        """Get the parent dataset."""
+        return self._dataset
+
+    def _get_stats(self) -> dict[str, Any]:
+        """Get cached or fetch column statistics."""
+        if self._stats_cache is None:
+            self._stats_cache = self._dataset.engine.get_column_stats(
+                self._dataset.source, self._name
+            )
+        return self._stats_cache
+
+    def _get_numeric_stats(self) -> dict[str, Any]:
+        """Get cached or fetch numeric statistics."""
+        if self._numeric_stats_cache is None:
+            self._numeric_stats_cache = self._dataset.engine.get_numeric_stats(
+                self._dataset.source, self._name
+            )
+        return self._numeric_stats_cache
+
+    # =========================================================================
+    # Basic Statistics (return values for use in assertions)
+    # =========================================================================
+
+    @property
+    def null_count(self) -> int:
+        """Get the number of null values."""
+        return self._get_stats().get("null_count", 0)
+
+    @property
+    def null_percent(self) -> float:
+        """Get the percentage of null values (0-100)."""
+        return self._get_stats().get("null_percent", 0.0)
+
+    @property
+    def non_null_count(self) -> int:
+        """Get the number of non-null values."""
+        return self._get_stats().get("non_null_count", 0)
+
+    @property
+    def unique_count(self) -> int:
+        """Get the number of unique values."""
+        return self._get_stats().get("unique_count", 0)
+
+    @property
+    def unique_percent(self) -> float:
+        """Get the percentage of unique values (0-100)."""
+        return self._get_stats().get("unique_percent", 0.0)
+
+    @property
+    def total_count(self) -> int:
+        """Get the total number of values."""
+        return self._get_stats().get("total_count", 0)
+
+    @property
+    def min(self) -> Any:
+        """Get the minimum value."""
+        return self._get_stats().get("min_value")
+
+    @property
+    def max(self) -> Any:
+        """Get the maximum value."""
+        return self._get_stats().get("max_value")
+
+    @property
+    def mean(self) -> float | None:
+        """Get the mean value (for numeric columns)."""
+        return self._get_numeric_stats().get("mean")
+
+    @property
+    def stddev(self) -> float | None:
+        """Get the standard deviation (for numeric columns)."""
+        return self._get_numeric_stats().get("stddev")
+
+    @property
+    def median(self) -> float | None:
+        """Get the median value (for numeric columns)."""
+        return self._get_numeric_stats().get("median")
+
+    # =========================================================================
+    # Validation Methods (return ValidationResult or bool)
+    # =========================================================================
+
+    def is_not_null(self, threshold: float = 0.0) -> ValidationResult:
+        """
+        Check that null percentage is below threshold.
+
+        Args:
+            threshold: Maximum allowed null percentage (0-100)
+
+        Returns:
+            ValidationResult
+        """
+        actual = self.null_percent
+        passed = actual <= threshold
+        return ValidationResult(
+            passed=passed,
+            actual_value=actual,
+            expected_value=f"<= {threshold}%",
+            message=f"Column '{self._name}' null_percent is {actual:.2f}% (threshold: {threshold}%)",
+        )
+
+    def is_unique(self, threshold: float = 100.0) -> ValidationResult:
+        """
+        Check that unique percentage is at or above threshold.
+
+        Args:
+            threshold: Minimum required unique percentage (0-100)
+
+        Returns:
+            ValidationResult
+        """
+        actual = self.unique_percent
+        passed = actual >= threshold
+        return ValidationResult(
+            passed=passed,
+            actual_value=actual,
+            expected_value=f">= {threshold}%",
+            message=f"Column '{self._name}' unique_percent is {actual:.2f}% (threshold: {threshold}%)",
+        )
+
+    def between(self, min_val: Any, max_val: Any) -> ValidationResult:
+        """
+        Check that all values are between min and max (inclusive).
+
+        Args:
+            min_val: Minimum allowed value
+            max_val: Maximum allowed value
+
+        Returns:
+            ValidationResult indicating if all non-null values are in range
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT COUNT(*) as out_of_range
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND ({col} < {min_val} OR {col} > {max_val})
+        """
+
+        out_of_range = self._dataset.engine.fetch_value(sql) or 0
+        passed = out_of_range == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=out_of_range,
+            expected_value=0,
+            message=f"Column '{self._name}' has {out_of_range} values outside [{min_val}, {max_val}]",
+            details={"min": min_val, "max": max_val, "out_of_range_count": out_of_range},
+        )
+
+    def matches(self, pattern: str) -> ValidationResult:
+        """
+        Check that all non-null values match a regex pattern.
+
+        Args:
+            pattern: Regular expression pattern
+
+        Returns:
+            ValidationResult
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        # DuckDB uses regexp_matches for regex
+        sql = f"""
+            SELECT COUNT(*) as non_matching
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND NOT regexp_matches({col}::VARCHAR, '{pattern}')
+        """
+
+        non_matching = self._dataset.engine.fetch_value(sql) or 0
+        passed = non_matching == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=non_matching,
+            expected_value=0,
+            message=f"Column '{self._name}' has {non_matching} values not matching pattern '{pattern}'",
+            details={"pattern": pattern, "non_matching_count": non_matching},
+        )
+
+    def isin(self, values: list[Any]) -> ValidationResult:
+        """
+        Check that all non-null values are in the allowed set.
+
+        Args:
+            values: List of allowed values
+
+        Returns:
+            ValidationResult
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        # Build value list for SQL
+        formatted_values = ", ".join(
+            f"'{v}'" if isinstance(v, str) else str(v) for v in values
+        )
+
+        sql = f"""
+            SELECT COUNT(*) as invalid_count
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND {col} NOT IN ({formatted_values})
+        """
+
+        invalid_count = self._dataset.engine.fetch_value(sql) or 0
+        passed = invalid_count == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=invalid_count,
+            expected_value=0,
+            message=f"Column '{self._name}' has {invalid_count} values not in allowed set",
+            details={"allowed_values": values, "invalid_count": invalid_count},
+        )
+
+    def has_no_duplicates(self) -> ValidationResult:
+        """
+        Check that all values are unique (no duplicates).
+
+        Returns:
+            ValidationResult
+        """
+        total = self.total_count
+        unique = self.unique_count
+        duplicates = total - unique
+        passed = duplicates == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=duplicates,
+            expected_value=0,
+            message=f"Column '{self._name}' has {duplicates} duplicate values",
+        )
+
+    def greater_than(self, value: Any) -> ValidationResult:
+        """
+        Check that all non-null values are greater than a value.
+
+        Args:
+            value: Minimum value (exclusive)
+
+        Returns:
+            ValidationResult
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT COUNT(*) as invalid_count
+            FROM {ref}
+            WHERE {col} IS NOT NULL AND {col} <= {value}
+        """
+
+        invalid_count = self._dataset.engine.fetch_value(sql) or 0
+        passed = invalid_count == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=invalid_count,
+            expected_value=0,
+            message=f"Column '{self._name}' has {invalid_count} values <= {value}",
+        )
+
+    def less_than(self, value: Any) -> ValidationResult:
+        """
+        Check that all non-null values are less than a value.
+
+        Args:
+            value: Maximum value (exclusive)
+
+        Returns:
+            ValidationResult
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT COUNT(*) as invalid_count
+            FROM {ref}
+            WHERE {col} IS NOT NULL AND {col} >= {value}
+        """
+
+        invalid_count = self._dataset.engine.fetch_value(sql) or 0
+        passed = invalid_count == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=invalid_count,
+            expected_value=0,
+            message=f"Column '{self._name}' has {invalid_count} values >= {value}",
+        )
+
+    def value_lengths_between(self, min_len: int, max_len: int) -> ValidationResult:
+        """
+        Check that string value lengths are within range.
+
+        Args:
+            min_len: Minimum length
+            max_len: Maximum length
+
+        Returns:
+            ValidationResult
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT COUNT(*) as invalid_count
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+              AND (LENGTH({col}::VARCHAR) < {min_len} OR LENGTH({col}::VARCHAR) > {max_len})
+        """
+
+        invalid_count = self._dataset.engine.fetch_value(sql) or 0
+        passed = invalid_count == 0
+
+        return ValidationResult(
+            passed=passed,
+            actual_value=invalid_count,
+            expected_value=0,
+            message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
+        )
+
+    def get_distinct_values(self, limit: int = 100) -> list[Any]:
+        """
+        Get distinct values in the column.
+
+        Args:
+            limit: Maximum number of values to return
+
+        Returns:
+            List of distinct values
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT DISTINCT {col}
+            FROM {ref}
+            WHERE {col} IS NOT NULL
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return [row[0] for row in rows]
+
+    def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
+        """
+        Get value counts for the column.
+
+        Args:
+            limit: Maximum number of values to return
+
+        Returns:
+            Dictionary of value -> count
+        """
+        ref = self._dataset.engine.get_source_reference(self._dataset.source)
+        col = f'"{self._name}"'
+
+        sql = f"""
+            SELECT {col}, COUNT(*) as cnt
+            FROM {ref}
+            GROUP BY {col}
+            ORDER BY cnt DESC
+            LIMIT {limit}
+        """
+
+        rows = self._dataset.engine.fetch_all(sql)
+        return {row[0]: row[1] for row in rows}
+
+    def clear_cache(self) -> None:
+        """Clear cached statistics."""
+        self._stats_cache = None
+        self._numeric_stats_cache = None
+
+    def __repr__(self) -> str:
+        return f"Column('{self._name}', dataset='{self._dataset.name}')"
+
+    def __str__(self) -> str:
+        stats = self._get_stats()
+        return (
+            f"Column: {self._name}\n"
+            f"  Total: {stats.get('total_count', 'N/A')}\n"
+            f"  Nulls: {stats.get('null_count', 'N/A')} ({stats.get('null_percent', 0):.2f}%)\n"
+            f"  Unique: {stats.get('unique_count', 'N/A')} ({stats.get('unique_percent', 0):.2f}%)"
+        )
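
For orientation, the class docstring above sketches an assertion-driven workflow. The following is a minimal, hypothetical usage sketch based on what column.py defines: the statistic properties return plain numbers, and the validation methods return ValidationResult objects that the docstring treats as truthy on success. The duckguard.connect() entry point, the data file, and the column names are illustrative assumptions and are not verified against duckguard/__init__.py or core/dataset.py in this wheel.

import duckguard  # assumption: the top-level package exposes a connect()-style helper


def test_orders_quality():
    # Hypothetical: obtain a Dataset whose attribute access yields Column objects,
    # as the class docstring's `orders.customer_id` example implies.
    orders = duckguard.connect("orders.parquet")

    # Statistic properties return plain numbers, so they compose with normal comparisons.
    assert orders.customer_id.null_percent < 5
    assert orders.order_id.unique_percent == 100.0

    # Validation methods return ValidationResult; the docstring's
    # `assert orders.amount.between(0, 10000)` implies the result is truthy when it passes.
    result = orders.amount.between(0, 10_000)
    assert result, result.message

    allowed = orders.status.isin(["pending", "shipped", "delivered"])
    assert allowed, allowed.message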