duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/core/column.py
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
"""Column class with validation methods."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from duckguard.core.result import ValidationResult
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from duckguard.core.dataset import Dataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Column:
    """
    Represents a column in a dataset with validation capabilities.

    Columns provide a fluent interface for data validation that
    feels natural to Python developers.

    Example:
        assert orders.customer_id.null_percent < 5
        assert orders.amount.between(0, 10000)
        assert orders.email.matches(r'^[\\w.-]+@[\\w.-]+\\.\\w+$')
    """

    def __init__(self, name: str, dataset: Dataset):
        """
        Initialize a Column.

        Args:
            name: Column name
            dataset: Parent dataset
        """
        self._name = name
        self._dataset = dataset
        # Lazily-populated caches; invalidated via clear_cache().
        self._stats_cache: dict[str, Any] | None = None
        self._numeric_stats_cache: dict[str, Any] | None = None

    @property
    def name(self) -> str:
        """Get the column name."""
        return self._name

    @property
    def dataset(self) -> Dataset:
        """Get the parent dataset."""
        return self._dataset

    @property
    def _quoted(self) -> str:
        """Column name quoted as a SQL identifier (embedded quotes doubled)."""
        return '"' + self._name.replace('"', '""') + '"'

    @staticmethod
    def _sql_literal(value: Any) -> str:
        """Format a Python value as a SQL literal, escaping quotes in strings."""
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return str(value)

    def _get_stats(self) -> dict[str, Any]:
        """Get cached or fetch column statistics."""
        if self._stats_cache is None:
            self._stats_cache = self._dataset.engine.get_column_stats(
                self._dataset.source, self._name
            )
        return self._stats_cache

    def _get_numeric_stats(self) -> dict[str, Any]:
        """Get cached or fetch numeric statistics."""
        if self._numeric_stats_cache is None:
            self._numeric_stats_cache = self._dataset.engine.get_numeric_stats(
                self._dataset.source, self._name
            )
        return self._numeric_stats_cache

    # =========================================================================
    # Basic Statistics (return values for use in assertions)
    # =========================================================================

    @property
    def null_count(self) -> int:
        """Get the number of null values."""
        return self._get_stats().get("null_count", 0)

    @property
    def null_percent(self) -> float:
        """Get the percentage of null values (0-100)."""
        return self._get_stats().get("null_percent", 0.0)

    @property
    def non_null_count(self) -> int:
        """Get the number of non-null values."""
        return self._get_stats().get("non_null_count", 0)

    @property
    def unique_count(self) -> int:
        """Get the number of unique values."""
        return self._get_stats().get("unique_count", 0)

    @property
    def unique_percent(self) -> float:
        """Get the percentage of unique values (0-100)."""
        return self._get_stats().get("unique_percent", 0.0)

    @property
    def total_count(self) -> int:
        """Get the total number of values."""
        return self._get_stats().get("total_count", 0)

    @property
    def min(self) -> Any:
        """Get the minimum value."""
        return self._get_stats().get("min_value")

    @property
    def max(self) -> Any:
        """Get the maximum value."""
        return self._get_stats().get("max_value")

    @property
    def mean(self) -> float | None:
        """Get the mean value (for numeric columns)."""
        return self._get_numeric_stats().get("mean")

    @property
    def stddev(self) -> float | None:
        """Get the standard deviation (for numeric columns)."""
        return self._get_numeric_stats().get("stddev")

    @property
    def median(self) -> float | None:
        """Get the median value (for numeric columns)."""
        return self._get_numeric_stats().get("median")

    # =========================================================================
    # Validation Methods (return ValidationResult or bool)
    # =========================================================================

    def is_not_null(self, threshold: float = 0.0) -> ValidationResult:
        """
        Check that null percentage is below threshold.

        Args:
            threshold: Maximum allowed null percentage (0-100)

        Returns:
            ValidationResult
        """
        actual = self.null_percent
        passed = actual <= threshold
        return ValidationResult(
            passed=passed,
            actual_value=actual,
            expected_value=f"<= {threshold}%",
            message=f"Column '{self._name}' null_percent is {actual:.2f}% (threshold: {threshold}%)",
        )

    def is_unique(self, threshold: float = 100.0) -> ValidationResult:
        """
        Check that unique percentage is at or above threshold.

        Args:
            threshold: Minimum required unique percentage (0-100)

        Returns:
            ValidationResult
        """
        actual = self.unique_percent
        passed = actual >= threshold
        return ValidationResult(
            passed=passed,
            actual_value=actual,
            expected_value=f">= {threshold}%",
            message=f"Column '{self._name}' unique_percent is {actual:.2f}% (threshold: {threshold}%)",
        )

    def between(self, min_val: Any, max_val: Any) -> ValidationResult:
        """
        Check that all values are between min and max (inclusive).

        Args:
            min_val: Minimum allowed value
            max_val: Maximum allowed value

        Returns:
            ValidationResult indicating if all non-null values are in range
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted
        # Format bounds as SQL literals so string bounds are quoted/escaped.
        lo = self._sql_literal(min_val)
        hi = self._sql_literal(max_val)

        sql = f"""
            SELECT COUNT(*) as out_of_range
            FROM {ref}
            WHERE {col} IS NOT NULL
              AND ({col} < {lo} OR {col} > {hi})
        """

        out_of_range = self._dataset.engine.fetch_value(sql) or 0
        passed = out_of_range == 0

        return ValidationResult(
            passed=passed,
            actual_value=out_of_range,
            expected_value=0,
            message=f"Column '{self._name}' has {out_of_range} values outside [{min_val}, {max_val}]",
            details={"min": min_val, "max": max_val, "out_of_range_count": out_of_range},
        )

    def matches(self, pattern: str) -> ValidationResult:
        """
        Check that all non-null values match a regex pattern.

        Args:
            pattern: Regular expression pattern

        Returns:
            ValidationResult
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted
        # Double embedded single quotes so the pattern survives SQL string quoting.
        safe_pattern = pattern.replace("'", "''")

        # DuckDB uses regexp_matches for regex
        sql = f"""
            SELECT COUNT(*) as non_matching
            FROM {ref}
            WHERE {col} IS NOT NULL
              AND NOT regexp_matches({col}::VARCHAR, '{safe_pattern}')
        """

        non_matching = self._dataset.engine.fetch_value(sql) or 0
        passed = non_matching == 0

        return ValidationResult(
            passed=passed,
            actual_value=non_matching,
            expected_value=0,
            message=f"Column '{self._name}' has {non_matching} values not matching pattern '{pattern}'",
            details={"pattern": pattern, "non_matching_count": non_matching},
        )

    def isin(self, values: list[Any]) -> ValidationResult:
        """
        Check that all non-null values are in the allowed set.

        Args:
            values: List of allowed values

        Returns:
            ValidationResult
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        # Build value list for SQL; string values are quoted with quotes escaped.
        formatted_values = ", ".join(self._sql_literal(v) for v in values)

        sql = f"""
            SELECT COUNT(*) as invalid_count
            FROM {ref}
            WHERE {col} IS NOT NULL
              AND {col} NOT IN ({formatted_values})
        """

        invalid_count = self._dataset.engine.fetch_value(sql) or 0
        passed = invalid_count == 0

        return ValidationResult(
            passed=passed,
            actual_value=invalid_count,
            expected_value=0,
            message=f"Column '{self._name}' has {invalid_count} values not in allowed set",
            details={"allowed_values": values, "invalid_count": invalid_count},
        )

    def has_no_duplicates(self) -> ValidationResult:
        """
        Check that all values are unique (no duplicates).

        Returns:
            ValidationResult
        """
        total = self.total_count
        unique = self.unique_count
        duplicates = total - unique
        passed = duplicates == 0

        return ValidationResult(
            passed=passed,
            actual_value=duplicates,
            expected_value=0,
            message=f"Column '{self._name}' has {duplicates} duplicate values",
        )

    def greater_than(self, value: Any) -> ValidationResult:
        """
        Check that all non-null values are greater than a value.

        Args:
            value: Minimum value (exclusive)

        Returns:
            ValidationResult
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        sql = f"""
            SELECT COUNT(*) as invalid_count
            FROM {ref}
            WHERE {col} IS NOT NULL AND {col} <= {self._sql_literal(value)}
        """

        invalid_count = self._dataset.engine.fetch_value(sql) or 0
        passed = invalid_count == 0

        return ValidationResult(
            passed=passed,
            actual_value=invalid_count,
            expected_value=0,
            message=f"Column '{self._name}' has {invalid_count} values <= {value}",
        )

    def less_than(self, value: Any) -> ValidationResult:
        """
        Check that all non-null values are less than a value.

        Args:
            value: Maximum value (exclusive)

        Returns:
            ValidationResult
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        sql = f"""
            SELECT COUNT(*) as invalid_count
            FROM {ref}
            WHERE {col} IS NOT NULL AND {col} >= {self._sql_literal(value)}
        """

        invalid_count = self._dataset.engine.fetch_value(sql) or 0
        passed = invalid_count == 0

        return ValidationResult(
            passed=passed,
            actual_value=invalid_count,
            expected_value=0,
            message=f"Column '{self._name}' has {invalid_count} values >= {value}",
        )

    def value_lengths_between(self, min_len: int, max_len: int) -> ValidationResult:
        """
        Check that string value lengths are within range.

        Args:
            min_len: Minimum length
            max_len: Maximum length

        Returns:
            ValidationResult
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        # BUG FIX: the upper bound previously hard-coded "36,626" instead of
        # using the max_len parameter (and the comma made the SQL invalid).
        sql = f"""
            SELECT COUNT(*) as invalid_count
            FROM {ref}
            WHERE {col} IS NOT NULL
              AND (LENGTH({col}::VARCHAR) < {min_len} OR LENGTH({col}::VARCHAR) > {max_len})
        """

        invalid_count = self._dataset.engine.fetch_value(sql) or 0
        passed = invalid_count == 0

        return ValidationResult(
            passed=passed,
            actual_value=invalid_count,
            expected_value=0,
            message=f"Column '{self._name}' has {invalid_count} values with length outside [{min_len}, {max_len}]",
        )

    def get_distinct_values(self, limit: int = 100) -> list[Any]:
        """
        Get distinct values in the column.

        Args:
            limit: Maximum number of values to return

        Returns:
            List of distinct values
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        sql = f"""
            SELECT DISTINCT {col}
            FROM {ref}
            WHERE {col} IS NOT NULL
            LIMIT {limit}
        """

        rows = self._dataset.engine.fetch_all(sql)
        return [row[0] for row in rows]

    def get_value_counts(self, limit: int = 20) -> dict[Any, int]:
        """
        Get value counts for the column.

        Args:
            limit: Maximum number of values to return

        Returns:
            Dictionary of value -> count
        """
        ref = self._dataset.engine.get_source_reference(self._dataset.source)
        col = self._quoted

        sql = f"""
            SELECT {col}, COUNT(*) as cnt
            FROM {ref}
            GROUP BY {col}
            ORDER BY cnt DESC
            LIMIT {limit}
        """

        rows = self._dataset.engine.fetch_all(sql)
        return {row[0]: row[1] for row in rows}

    def clear_cache(self) -> None:
        """Clear cached statistics."""
        self._stats_cache = None
        self._numeric_stats_cache = None

    def __repr__(self) -> str:
        return f"Column('{self._name}', dataset='{self._dataset.name}')"

    def __str__(self) -> str:
        stats = self._get_stats()
        return (
            f"Column: {self._name}\n"
            f"  Total: {stats.get('total_count', 'N/A')}\n"
            f"  Nulls: {stats.get('null_count', 'N/A')} ({stats.get('null_percent', 0):.2f}%)\n"
            f"  Unique: {stats.get('unique_count', 'N/A')} ({stats.get('unique_percent', 0):.2f}%)"
        )