duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Dataset class representing a data source for validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
8
|
+
from duckguard.core.column import Column
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from duckguard.core.scoring import QualityScore
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Dataset:
    """
    Represents a data source with validation capabilities.

    A Dataset wraps a data source (file, database table, etc.) and provides
    a Pythonic interface for accessing columns and performing validations.

    Example:
        orders = Dataset("data/orders.csv")
        assert orders.row_count > 0
        assert orders.customer_id.null_percent < 5
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine | None = None,
        name: str | None = None,
    ):
        """
        Initialize a Dataset.

        Args:
            source: Path to file or connection string
            engine: Optional DuckGuardEngine instance (uses singleton if not provided)
            name: Optional name for the dataset (defaults to source)
        """
        self._source = source
        self._engine = engine or DuckGuardEngine.get_instance()
        self._name = name or source
        # Lazily populated caches; invalidated via clear_cache().
        self._columns_cache: list[str] | None = None
        self._row_count_cache: int | None = None

    @property
    def source(self) -> str:
        """Get the source path or connection string."""
        return self._source

    @property
    def name(self) -> str:
        """Get the dataset name."""
        return self._name

    @property
    def engine(self) -> DuckGuardEngine:
        """Get the underlying engine."""
        return self._engine

    @property
    def row_count(self) -> int:
        """
        Get the number of rows in the dataset.

        The value is computed once via the engine and cached.

        Returns:
            Number of rows
        """
        if self._row_count_cache is None:
            self._row_count_cache = self._engine.get_row_count(self._source)
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """
        Get the list of column names.

        The value is computed once via the engine and cached.

        Returns:
            List of column names
        """
        if self._columns_cache is None:
            self._columns_cache = self._engine.get_columns(self._source)
        return self._columns_cache

    @property
    def column_count(self) -> int:
        """Get the number of columns."""
        return len(self.columns)

    def __getattr__(self, name: str) -> Column:
        """
        Access columns as attributes.

        This allows Pythonic access like: dataset.customer_id

        Args:
            name: Column name

        Returns:
            Column object for the specified column

        Raises:
            AttributeError: If the column doesn't exist
        """
        # __getattr__ is only invoked for missing attributes; reject private
        # names immediately so cache-attribute lookups during __init__ cannot
        # recurse back into self.columns.
        if name.startswith("_"):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")

        # Check if column exists
        if name not in self.columns:
            raise AttributeError(
                f"Column '{name}' not found. Available columns: {', '.join(self.columns)}"
            )

        return Column(name, self)

    def __getitem__(self, name: str) -> Column:
        """
        Access columns using bracket notation.

        This allows access like: dataset["customer_id"]

        Args:
            name: Column name

        Returns:
            Column object for the specified column

        Raises:
            KeyError: If the column doesn't exist
        """
        if name not in self.columns:
            raise KeyError(
                f"Column '{name}' not found. Available columns: {', '.join(self.columns)}"
            )
        return Column(name, self)

    def column(self, name: str) -> Column:
        """
        Get a Column object by name.

        Args:
            name: Column name

        Returns:
            Column object
        """
        return self[name]

    def has_column(self, name: str) -> bool:
        """
        Check if a column exists.

        Args:
            name: Column name to check

        Returns:
            True if column exists
        """
        return name in self.columns

    def sample(self, n: int = 10) -> list[dict[str, Any]]:
        """
        Get a sample of rows from the dataset.

        Args:
            n: Number of rows to sample (must be a non-negative integer)

        Returns:
            List of dictionaries representing rows

        Raises:
            ValueError: If n is negative or not convertible to an integer.
        """
        # Coerce to int so arbitrary strings can never be interpolated into
        # the SQL text below (guards against accidental SQL injection).
        n = int(n)
        if n < 0:
            raise ValueError("n must be non-negative")

        ref = self._engine.get_source_reference(self._source)
        sql = f"SELECT * FROM {ref} LIMIT {n}"
        result = self._engine.execute(sql)

        columns = [desc[0] for desc in result.description]
        rows = result.fetchall()

        return [dict(zip(columns, row)) for row in rows]

    def head(self, n: int = 5) -> list[dict[str, Any]]:
        """
        Get the first n rows from the dataset.

        Args:
            n: Number of rows

        Returns:
            List of dictionaries representing rows
        """
        return self.sample(n)

    def execute_sql(self, sql: str) -> list[tuple[Any, ...]]:
        """
        Execute a custom SQL query against this dataset.

        The query can reference the dataset using {source} placeholder.

        Args:
            sql: SQL query with optional {source} placeholder

        Returns:
            Query results as list of tuples
        """
        ref = self._engine.get_source_reference(self._source)
        formatted_sql = sql.format(source=ref)
        return self._engine.fetch_all(formatted_sql)

    def clear_cache(self) -> None:
        """Clear cached values (row count, columns)."""
        self._row_count_cache = None
        self._columns_cache = None

    def __repr__(self) -> str:
        # NOTE: evaluates row_count/column_count, which may query the engine
        # on first access.
        return f"Dataset('{self._source}', rows={self.row_count}, columns={self.column_count})"

    def __str__(self) -> str:
        return f"Dataset: {self._name} ({self.row_count} rows, {self.column_count} columns)"

    def __len__(self) -> int:
        """Return the number of rows."""
        return self.row_count

    def __contains__(self, column: str) -> bool:
        """Check if a column exists."""
        return column in self.columns

    def __iter__(self):
        """Iterate over column names."""
        return iter(self.columns)

    def score(
        self,
        weights: dict | None = None,
    ) -> "QualityScore":
        """
        Calculate data quality score for this dataset.

        Evaluates data across standard quality dimensions:
        - Completeness: Are all required values present?
        - Uniqueness: Are values appropriately unique?
        - Validity: Do values conform to expected formats/ranges?
        - Consistency: Are values consistent?

        Args:
            weights: Optional custom weights for dimensions.
                Keys: 'completeness', 'uniqueness', 'validity', 'consistency'
                Values must sum to 1.0

        Returns:
            QualityScore with overall score, grade, and dimension breakdowns.

        Raises:
            KeyError: If a string weight key is not a known dimension name.

        Example:
            score = orders.score()
            print(score.overall)  # 87.5
            print(score.grade)  # 'B'
            print(score.completeness)  # 95.0

            # With custom weights
            score = orders.score(weights={
                'completeness': 0.4,
                'uniqueness': 0.2,
                'validity': 0.3,
                'consistency': 0.1,
            })
        """
        from duckguard.core.scoring import QualityScorer, QualityDimension

        # Convert string keys to QualityDimension enums if needed
        scorer_weights = None
        if weights:
            scorer_weights = {}
            key_mapping = {
                "completeness": QualityDimension.COMPLETENESS,
                "uniqueness": QualityDimension.UNIQUENESS,
                "validity": QualityDimension.VALIDITY,
                "consistency": QualityDimension.CONSISTENCY,
            }
            for key, value in weights.items():
                if isinstance(key, str):
                    # Fail with an actionable message instead of a bare
                    # KeyError on a typo such as 'complteness'.
                    if key not in key_mapping:
                        raise KeyError(
                            f"Unknown quality dimension '{key}'. "
                            f"Valid keys: {', '.join(key_mapping)}"
                        )
                    scorer_weights[key_mapping[key]] = value
                else:
                    scorer_weights[key] = value

        scorer = QualityScorer(weights=scorer_weights)
        return scorer.score(self)
|
duckguard/core/engine.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""DuckDB-based execution engine for DuckGuard."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import duckdb
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DuckGuardEngine:
    """
    Central DuckDB execution engine for DuckGuard.

    This engine handles all database operations, providing a fast,
    memory-efficient way to validate data from various sources.
    """

    # Process-wide singleton managed by get_instance()/reset_instance().
    _instance: DuckGuardEngine | None = None

    def __init__(self, memory_limit: str | None = None):
        """
        Initialize the DuckGuard engine.

        Args:
            memory_limit: Optional memory limit for DuckDB (e.g., "4GB")
        """
        self.conn = duckdb.connect(":memory:")

        # Configure DuckDB for optimal performance.
        # Wrap in try-except for compatibility with different DuckDB versions.
        try:
            self.conn.execute("SET enable_progress_bar = false")
        except duckdb.InvalidInputException:
            # Setting not supported in this DuckDB version - ignore
            pass

        if memory_limit:
            try:
                # Double any embedded single quote so the value cannot break
                # out of the SQL string literal.
                safe_limit = memory_limit.replace("'", "''")
                self.conn.execute(f"SET memory_limit = '{safe_limit}'")
            except duckdb.InvalidInputException:
                pass

        # Maps a registered name to its file path, or to "registered:<name>"
        # for in-memory DataFrames.
        self._sources: dict[str, str] = {}

    @classmethod
    def get_instance(cls) -> DuckGuardEngine:
        """Get or create the singleton engine instance."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def reset_instance(cls) -> None:
        """Reset the singleton instance (useful for testing)."""
        if cls._instance is not None:
            cls._instance.close()
        cls._instance = None

    def execute(self, sql: str, params: list[Any] | None = None) -> duckdb.DuckDBPyConnection:
        """
        Execute a SQL query and return the result.

        Args:
            sql: The SQL query to execute
            params: Optional parameters for the query

        Returns:
            The DuckDB connection cursor holding the query results
            (``conn.execute`` returns the connection, so ``fetchone``/
            ``fetchall``/``description`` are available on it).
        """
        if params:
            return self.conn.execute(sql, params)
        return self.conn.execute(sql)

    def fetch_one(self, sql: str, params: list[Any] | None = None) -> tuple[Any, ...] | None:
        """Execute a query and fetch one row."""
        result = self.execute(sql, params)
        return result.fetchone()

    def fetch_all(self, sql: str, params: list[Any] | None = None) -> list[tuple[Any, ...]]:
        """Execute a query and fetch all rows."""
        result = self.execute(sql, params)
        return result.fetchall()

    def fetch_value(self, sql: str, params: list[Any] | None = None) -> Any:
        """Execute a query and fetch a single value (first column of first row)."""
        row = self.fetch_one(sql, params)
        return row[0] if row else None

    def register_file(self, name: str, path: str) -> None:
        """
        Register a file as a named source.

        Args:
            name: Name to reference the source
            path: Path to the file (CSV, Parquet, JSON)
        """
        # DuckDB auto-detects file type from extension
        self._sources[name] = path

    def register_dataframe(self, name: str, df: Any) -> None:
        """
        Register a DataFrame (pandas, polars, or pyarrow) as a named source.

        Args:
            name: Name to reference the source
            df: DataFrame to register
        """
        self.conn.register(name, df)
        self._sources[name] = f"registered:{name}"

    @staticmethod
    def _quote_path(path: str) -> str:
        """Return *path* as a SQL string literal with embedded quotes doubled."""
        # Without the doubling, a path like "o'brien.csv" would terminate the
        # literal early and yield a syntax error (or worse, injected SQL).
        return "'" + path.replace("'", "''") + "'"

    def get_source_reference(self, name: str) -> str:
        """
        Get the SQL reference for a registered source.

        Args:
            name: Name of the registered source

        Returns:
            SQL-safe reference to the source
        """
        if name in self._sources:
            source = self._sources[name]
            if source.startswith("registered:"):
                return name
            # Return quoted path for file sources
            return self._quote_path(source)
        # Assume it's a direct path or table name; anything that looks like a
        # path (or a dotted name) is treated as a file and quoted.
        if "." in name or "/" in name or "\\" in name:
            return self._quote_path(name)
        return name

    def table_exists(self, name: str) -> bool:
        """Check if a table or source exists (by probing it with a query)."""
        try:
            self.execute(f"SELECT 1 FROM {self.get_source_reference(name)} LIMIT 1")
            return True
        except duckdb.Error:
            return False

    def get_columns(self, source: str) -> list[str]:
        """
        Get column names for a source.

        Args:
            source: Source reference (file path or registered name)

        Returns:
            List of column names
        """
        ref = self.get_source_reference(source)
        result = self.execute(f"DESCRIBE SELECT * FROM {ref}")
        return [row[0] for row in result.fetchall()]

    def get_row_count(self, source: str) -> int:
        """
        Get row count for a source.

        Args:
            source: Source reference

        Returns:
            Number of rows
        """
        ref = self.get_source_reference(source)
        return self.fetch_value(f"SELECT COUNT(*) FROM {ref}") or 0

    def get_column_stats(self, source: str, column: str) -> dict[str, Any]:
        """
        Get basic statistics for a column.

        Args:
            source: Source reference
            column: Column name

        Returns:
            Dictionary with column statistics (counts, null/unique
            percentages, min/max); empty dict when the query returns no row.
        """
        ref = self.get_source_reference(source)
        col = f'"{column}"'

        sql = f"""
            SELECT
                COUNT(*) as total_count,
                COUNT({col}) as non_null_count,
                COUNT(*) - COUNT({col}) as null_count,
                COUNT(DISTINCT {col}) as unique_count,
                MIN({col}) as min_value,
                MAX({col}) as max_value
            FROM {ref}
        """

        row = self.fetch_one(sql)
        if not row:
            return {}

        total = row[0] or 0
        non_null = row[1] or 0
        null_count = row[2] or 0
        unique_count = row[3] or 0

        return {
            "total_count": total,
            "non_null_count": non_null,
            "null_count": null_count,
            "null_percent": (null_count / total * 100) if total > 0 else 0.0,
            "unique_count": unique_count,
            "unique_percent": (unique_count / total * 100) if total > 0 else 0.0,
            "min_value": row[4],
            "max_value": row[5],
        }

    def get_numeric_stats(self, source: str, column: str) -> dict[str, Any]:
        """
        Get numeric statistics for a column.

        Args:
            source: Source reference
            column: Column name

        Returns:
            Dictionary with numeric statistics (mean, stddev, median,
            25th/75th percentiles); empty dict when the column is not numeric.
        """
        ref = self.get_source_reference(source)
        col = f'"{column}"'

        sql = f"""
            SELECT
                AVG({col}::DOUBLE) as mean_value,
                STDDEV({col}::DOUBLE) as stddev_value,
                MEDIAN({col}::DOUBLE) as median_value,
                PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {col}::DOUBLE) as p25,
                PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {col}::DOUBLE) as p75
            FROM {ref}
            WHERE {col} IS NOT NULL
        """

        try:
            row = self.fetch_one(sql)
            if not row:
                return {}

            return {
                "mean": row[0],
                "stddev": row[1],
                "median": row[2],
                "p25": row[3],
                "p75": row[4],
            }
        except duckdb.Error:
            # Column might not be numeric
            return {}

    def close(self) -> None:
        """Close the database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self) -> DuckGuardEngine:
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.close()
|
duckguard/core/result.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Result types for validation operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CheckStatus(Enum):
    """Status of a validation check."""

    PASSED = "passed"  # the check ran and its condition held
    FAILED = "failed"  # the check ran and its condition did not hold
    WARNING = "warning"  # the check flagged a concern without failing outright
    ERROR = "error"  # NOTE(review): presumably the check could not be executed — confirm against the executor
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class CheckResult:
    """Outcome of one validation check.

    Truthiness mirrors :attr:`passed`, so a CheckResult can be used
    directly in a boolean context (e.g. ``assert result``).
    """

    name: str  # identifier of the check that produced this result
    status: CheckStatus  # outcome category (passed/failed/warning/error)
    actual_value: Any  # value observed by the check
    expected_value: Any | None = None  # value the check compared against, if any
    message: str = ""  # human-readable detail about the outcome
    column: str | None = None  # column the check targeted, when column-scoped
    timestamp: datetime = field(default_factory=datetime.now)  # creation time (naive local)

    @property
    def passed(self) -> bool:
        """Check if the validation passed."""
        return self.status == CheckStatus.PASSED

    @property
    def failed(self) -> bool:
        """Check if the validation failed."""
        return self.status == CheckStatus.FAILED

    def __bool__(self) -> bool:
        """Allow using CheckResult in boolean context."""
        return self.passed
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class ValidationResult:
    """Outcome of a validation operation, usable directly in assertions."""

    passed: bool  # whether the validation succeeded
    actual_value: Any  # value that was observed
    expected_value: Any | None = None  # value that was expected, if any
    message: str = ""  # human-readable detail
    details: dict[str, Any] = field(default_factory=dict)  # extra context for reporting

    def __bool__(self) -> bool:
        """Truthiness mirrors ``passed`` so ``assert result`` just works."""
        return self.passed

    def __repr__(self) -> str:
        label = "PASSED" if self.passed else "FAILED"
        return f"ValidationResult({label}, actual={self.actual_value})"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class ProfileResult:
    """Result of profiling a dataset."""

    # Source path or registered name that was profiled.
    source: str
    # Total number of rows observed in the source.
    row_count: int
    # Number of columns in the source.
    column_count: int
    # Per-column profile details.
    columns: list[ColumnProfile]
    # Validation rules suggested from the profile, as strings.
    suggested_rules: list[str] = field(default_factory=list)
    # When the profile was produced (naive local time).
    timestamp: datetime = field(default_factory=datetime.now)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class ColumnProfile:
    """Profile information for a single column."""

    # Column name.
    name: str
    # Column data type, as reported by the engine (string form).
    dtype: str
    # Number of NULL values and their share of all rows.
    null_count: int
    null_percent: float
    # Number of distinct values and their share of all rows.
    unique_count: int
    unique_percent: float
    # Min/max are None when unavailable (e.g. column not yet profiled).
    min_value: Any | None = None
    max_value: Any | None = None
    # Numeric-only statistics; None for non-numeric columns.
    mean_value: float | None = None
    stddev_value: float | None = None
    # A few example values drawn from the column.
    sample_values: list[Any] = field(default_factory=list)
    # Validation rules suggested for this column, as strings.
    suggested_rules: list[str] = field(default_factory=list)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class ScanResult:
    """Aggregate outcome of scanning a dataset for issues."""

    source: str  # source that was scanned
    row_count: int  # rows in the scanned source
    checks_run: int  # total checks executed
    checks_passed: int  # checks that passed
    checks_failed: int  # checks that failed
    checks_warned: int  # checks that produced warnings
    results: list[CheckResult] = field(default_factory=list)  # individual check outcomes
    timestamp: datetime = field(default_factory=datetime.now)  # scan time (naive local)

    @property
    def passed(self) -> bool:
        """True when no check failed."""
        return not self.checks_failed

    @property
    def pass_rate(self) -> float:
        """Percentage of checks that passed; 100.0 when no checks were run."""
        if not self.checks_run:
            return 100.0
        return self.checks_passed / self.checks_run * 100
|