duckguard 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +1 -1
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +1 -3
- duckguard/cli/main.py +304 -54
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +110 -5
- duckguard/core/dataset.py +3 -3
- duckguard/core/result.py +92 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +183 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +43 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +515 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +4 -4
- duckguard/rules/loader.py +5 -5
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/validators.py +2 -1
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/METADATA +135 -5
- duckguard-2.2.0.dist-info/RECORD +69 -0
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/licenses/LICENSE +0 -0
duckguard/connectors/postgres.py
CHANGED
|
@@ -2,10 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from urllib.parse import urlparse
|
|
7
6
|
|
|
8
|
-
from duckguard.connectors.base import
|
|
7
|
+
from duckguard.connectors.base import ConnectionConfig, Connector
|
|
9
8
|
from duckguard.core.dataset import Dataset
|
|
10
9
|
from duckguard.core.engine import DuckGuardEngine
|
|
11
10
|
|
duckguard/connectors/redshift.py
CHANGED
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from urllib.parse import urlparse
|
|
6
6
|
|
|
7
|
-
from duckguard.connectors.base import
|
|
7
|
+
from duckguard.connectors.base import ConnectionConfig, Connector
|
|
8
8
|
from duckguard.core.dataset import Dataset
|
|
9
9
|
from duckguard.core.engine import DuckGuardEngine
|
|
10
10
|
|
duckguard/connectors/snowflake.py
CHANGED
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from typing import Any
|
|
7
6
|
from urllib.parse import parse_qs, urlparse
|
|
8
7
|
|
|
9
|
-
from duckguard.connectors.base import
|
|
8
|
+
from duckguard.connectors.base import ConnectionConfig, Connector
|
|
10
9
|
from duckguard.core.dataset import Dataset
|
|
11
10
|
from duckguard.core.engine import DuckGuardEngine
|
|
12
11
|
|
duckguard/connectors/sqlite.py
CHANGED
|
@@ -5,7 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
import os
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from duckguard.connectors.base import
|
|
8
|
+
from duckguard.connectors.base import ConnectionConfig, Connector
|
|
9
9
|
from duckguard.core.dataset import Dataset
|
|
10
10
|
from duckguard.core.engine import DuckGuardEngine
|
|
11
11
|
|
duckguard/connectors/sqlserver.py
CHANGED
|
@@ -5,7 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
from typing import Any
|
|
6
6
|
from urllib.parse import parse_qs, urlparse
|
|
7
7
|
|
|
8
|
-
from duckguard.connectors.base import
|
|
8
|
+
from duckguard.connectors.base import ConnectionConfig, Connector
|
|
9
9
|
from duckguard.core.dataset import Dataset
|
|
10
10
|
from duckguard.core.engine import DuckGuardEngine
|
|
11
11
|
|
|
@@ -55,20 +55,17 @@ class SQLServerConnector(Connector):
|
|
|
55
55
|
Dataset object
|
|
56
56
|
"""
|
|
57
57
|
# Try pyodbc first, then pymssql
|
|
58
|
-
|
|
59
|
-
import pyodbc
|
|
58
|
+
import importlib.util
|
|
60
59
|
|
|
60
|
+
if importlib.util.find_spec("pyodbc") is not None:
|
|
61
61
|
driver_module = "pyodbc"
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
"SQL Server support requires pyodbc or pymssql. "
|
|
70
|
-
"Install with: pip install duckguard[sqlserver]"
|
|
71
|
-
)
|
|
62
|
+
elif importlib.util.find_spec("pymssql") is not None:
|
|
63
|
+
driver_module = "pymssql"
|
|
64
|
+
else:
|
|
65
|
+
raise ImportError(
|
|
66
|
+
"SQL Server support requires pyodbc or pymssql. "
|
|
67
|
+
"Install with: pip install duckguard[sqlserver]"
|
|
68
|
+
)
|
|
72
69
|
|
|
73
70
|
if not config.table:
|
|
74
71
|
raise ValueError("Table name is required for SQL Server connections")
|
duckguard/contracts/__init__.py
CHANGED
|
@@ -14,17 +14,17 @@ Example:
|
|
|
14
14
|
print(f"Contract violations: {result.violations}")
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
|
+
from duckguard.contracts.diff import SchemaDiff, diff_contracts
|
|
18
|
+
from duckguard.contracts.generator import generate_contract
|
|
19
|
+
from duckguard.contracts.loader import contract_to_yaml, load_contract, load_contract_from_string
|
|
17
20
|
from duckguard.contracts.schema import (
|
|
21
|
+
ContractMetadata,
|
|
18
22
|
DataContract,
|
|
19
|
-
SchemaField,
|
|
20
23
|
FieldType,
|
|
21
24
|
QualitySLA,
|
|
22
|
-
|
|
25
|
+
SchemaField,
|
|
23
26
|
)
|
|
24
|
-
from duckguard.contracts.
|
|
25
|
-
from duckguard.contracts.validator import validate_contract, ContractValidationResult
|
|
26
|
-
from duckguard.contracts.generator import generate_contract
|
|
27
|
-
from duckguard.contracts.diff import diff_contracts, SchemaDiff
|
|
27
|
+
from duckguard.contracts.validator import ContractValidationResult, validate_contract
|
|
28
28
|
|
|
29
29
|
__all__ = [
|
|
30
30
|
# Schema
|
duckguard/contracts/diff.py
CHANGED
|
@@ -9,7 +9,7 @@ from dataclasses import dataclass, field
|
|
|
9
9
|
from enum import Enum
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
|
-
from duckguard.contracts.schema import DataContract,
|
|
12
|
+
from duckguard.contracts.schema import DataContract, FieldType, SchemaField
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class ChangeType(Enum):
|
duckguard/contracts/generator.py
CHANGED
|
@@ -7,19 +7,18 @@ from __future__ import annotations
|
|
|
7
7
|
|
|
8
8
|
from datetime import datetime
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
10
|
|
|
12
|
-
from duckguard.core.dataset import Dataset
|
|
13
11
|
from duckguard.connectors import connect
|
|
12
|
+
from duckguard.contracts.loader import contract_to_yaml
|
|
14
13
|
from duckguard.contracts.schema import (
|
|
14
|
+
ContractMetadata,
|
|
15
15
|
DataContract,
|
|
16
|
-
SchemaField,
|
|
17
|
-
FieldType,
|
|
18
16
|
FieldConstraint,
|
|
17
|
+
FieldType,
|
|
19
18
|
QualitySLA,
|
|
20
|
-
|
|
19
|
+
SchemaField,
|
|
21
20
|
)
|
|
22
|
-
from duckguard.
|
|
21
|
+
from duckguard.core.dataset import Dataset
|
|
23
22
|
from duckguard.semantic import SemanticAnalyzer, SemanticType
|
|
24
23
|
|
|
25
24
|
|
duckguard/contracts/loader.py
CHANGED
|
@@ -47,12 +47,12 @@ from typing import Any
|
|
|
47
47
|
import yaml
|
|
48
48
|
|
|
49
49
|
from duckguard.contracts.schema import (
|
|
50
|
+
ContractMetadata,
|
|
50
51
|
DataContract,
|
|
51
|
-
SchemaField,
|
|
52
|
-
FieldType,
|
|
53
52
|
FieldConstraint,
|
|
53
|
+
FieldType,
|
|
54
54
|
QualitySLA,
|
|
55
|
-
|
|
55
|
+
SchemaField,
|
|
56
56
|
)
|
|
57
57
|
|
|
58
58
|
|
|
@@ -82,7 +82,7 @@ def load_contract(path: str | Path) -> DataContract:
|
|
|
82
82
|
if not path.exists():
|
|
83
83
|
raise FileNotFoundError(f"Contract file not found: {path}")
|
|
84
84
|
|
|
85
|
-
with open(path,
|
|
85
|
+
with open(path, encoding="utf-8") as f:
|
|
86
86
|
content = f.read()
|
|
87
87
|
|
|
88
88
|
return load_contract_from_string(content, source_file=str(path))
|
duckguard/contracts/validator.py
CHANGED
|
@@ -6,14 +6,13 @@ Validates datasets against data contracts to ensure compliance.
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
8
|
from dataclasses import dataclass, field
|
|
9
|
-
from datetime import datetime
|
|
9
|
+
from datetime import datetime
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from typing import Any
|
|
12
|
-
import re
|
|
13
12
|
|
|
14
|
-
from duckguard.core.dataset import Dataset
|
|
15
13
|
from duckguard.connectors import connect
|
|
16
|
-
from duckguard.contracts.schema import DataContract, SchemaField
|
|
14
|
+
from duckguard.contracts.schema import DataContract, SchemaField
|
|
15
|
+
from duckguard.core.dataset import Dataset
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
class ViolationType(Enum):
|
duckguard/core/__init__.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""Core module containing the engine, dataset, and column classes."""
|
|
2
2
|
|
|
3
|
-
from duckguard.core.engine import DuckGuardEngine
|
|
4
|
-
from duckguard.core.dataset import Dataset
|
|
5
3
|
from duckguard.core.column import Column
|
|
6
|
-
from duckguard.core.
|
|
4
|
+
from duckguard.core.dataset import Dataset
|
|
5
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
6
|
+
from duckguard.core.result import CheckResult, ValidationResult
|
|
7
7
|
|
|
8
8
|
__all__ = ["DuckGuardEngine", "Dataset", "Column", "ValidationResult", "CheckResult"]
|
duckguard/core/column.py
CHANGED
|
@@ -2,14 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import re
|
|
6
5
|
from typing import TYPE_CHECKING, Any
|
|
7
6
|
|
|
8
|
-
from duckguard.core.result import ValidationResult
|
|
7
|
+
from duckguard.core.result import FailedRow, ValidationResult
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
10
|
from duckguard.core.dataset import Dataset
|
|
12
11
|
|
|
12
|
+
# Default number of failed rows to capture for debugging
|
|
13
|
+
DEFAULT_SAMPLE_SIZE = 10
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
class Column:
|
|
15
17
|
"""
|
|
@@ -164,13 +166,14 @@ class Column:
|
|
|
164
166
|
message=f"Column '{self._name}' unique_percent is {actual:.2f}% (threshold: {threshold}%)",
|
|
165
167
|
)
|
|
166
168
|
|
|
167
|
-
def between(self, min_val: Any, max_val: Any) -> ValidationResult:
|
|
169
|
+
def between(self, min_val: Any, max_val: Any, capture_failures: bool = True) -> ValidationResult:
|
|
168
170
|
"""
|
|
169
171
|
Check that all values are between min and max (inclusive).
|
|
170
172
|
|
|
171
173
|
Args:
|
|
172
174
|
min_val: Minimum allowed value
|
|
173
175
|
max_val: Maximum allowed value
|
|
176
|
+
capture_failures: Whether to capture sample failing rows (default: True)
|
|
174
177
|
|
|
175
178
|
Returns:
|
|
176
179
|
ValidationResult indicating if all non-null values are in range
|
|
@@ -188,20 +191,53 @@ class Column:
|
|
|
188
191
|
out_of_range = self._dataset.engine.fetch_value(sql) or 0
|
|
189
192
|
passed = out_of_range == 0
|
|
190
193
|
|
|
194
|
+
# Capture sample of failing rows for debugging
|
|
195
|
+
failed_rows = []
|
|
196
|
+
if not passed and capture_failures:
|
|
197
|
+
failed_rows = self._get_failed_rows_between(min_val, max_val)
|
|
198
|
+
|
|
191
199
|
return ValidationResult(
|
|
192
200
|
passed=passed,
|
|
193
201
|
actual_value=out_of_range,
|
|
194
202
|
expected_value=0,
|
|
195
203
|
message=f"Column '{self._name}' has {out_of_range} values outside [{min_val}, {max_val}]",
|
|
196
204
|
details={"min": min_val, "max": max_val, "out_of_range_count": out_of_range},
|
|
205
|
+
failed_rows=failed_rows,
|
|
206
|
+
total_failures=out_of_range,
|
|
197
207
|
)
|
|
198
208
|
|
|
199
|
-
def matches(self, pattern: str) -> ValidationResult:
|
|
209
|
+
def _get_failed_rows_between(self, min_val: Any, max_val: Any, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
|
|
210
|
+
"""Get sample of rows that failed between check."""
|
|
211
|
+
ref = self._dataset.engine.get_source_reference(self._dataset.source)
|
|
212
|
+
col = f'"{self._name}"'
|
|
213
|
+
|
|
214
|
+
sql = f"""
|
|
215
|
+
SELECT row_number() OVER () as row_idx, {col} as val
|
|
216
|
+
FROM {ref}
|
|
217
|
+
WHERE {col} IS NOT NULL
|
|
218
|
+
AND ({col} < {min_val} OR {col} > {max_val})
|
|
219
|
+
LIMIT {limit}
|
|
220
|
+
"""
|
|
221
|
+
|
|
222
|
+
rows = self._dataset.engine.fetch_all(sql)
|
|
223
|
+
return [
|
|
224
|
+
FailedRow(
|
|
225
|
+
row_index=row[0],
|
|
226
|
+
column=self._name,
|
|
227
|
+
value=row[1],
|
|
228
|
+
expected=f"between {min_val} and {max_val}",
|
|
229
|
+
reason=f"Value {row[1]} is outside range [{min_val}, {max_val}]",
|
|
230
|
+
)
|
|
231
|
+
for row in rows
|
|
232
|
+
]
|
|
233
|
+
|
|
234
|
+
def matches(self, pattern: str, capture_failures: bool = True) -> ValidationResult:
|
|
200
235
|
"""
|
|
201
236
|
Check that all non-null values match a regex pattern.
|
|
202
237
|
|
|
203
238
|
Args:
|
|
204
239
|
pattern: Regular expression pattern
|
|
240
|
+
capture_failures: Whether to capture sample failing rows (default: True)
|
|
205
241
|
|
|
206
242
|
Returns:
|
|
207
243
|
ValidationResult
|
|
@@ -220,20 +256,53 @@ class Column:
|
|
|
220
256
|
non_matching = self._dataset.engine.fetch_value(sql) or 0
|
|
221
257
|
passed = non_matching == 0
|
|
222
258
|
|
|
259
|
+
# Capture sample of failing rows
|
|
260
|
+
failed_rows = []
|
|
261
|
+
if not passed and capture_failures:
|
|
262
|
+
failed_rows = self._get_failed_rows_pattern(pattern)
|
|
263
|
+
|
|
223
264
|
return ValidationResult(
|
|
224
265
|
passed=passed,
|
|
225
266
|
actual_value=non_matching,
|
|
226
267
|
expected_value=0,
|
|
227
268
|
message=f"Column '{self._name}' has {non_matching} values not matching pattern '{pattern}'",
|
|
228
269
|
details={"pattern": pattern, "non_matching_count": non_matching},
|
|
270
|
+
failed_rows=failed_rows,
|
|
271
|
+
total_failures=non_matching,
|
|
229
272
|
)
|
|
230
273
|
|
|
231
|
-
def isin(self, values: list[Any]) -> ValidationResult:
|
|
274
|
+
def _get_failed_rows_pattern(self, pattern: str, limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
|
|
275
|
+
"""Get sample of rows that failed pattern match."""
|
|
276
|
+
ref = self._dataset.engine.get_source_reference(self._dataset.source)
|
|
277
|
+
col = f'"{self._name}"'
|
|
278
|
+
|
|
279
|
+
sql = f"""
|
|
280
|
+
SELECT row_number() OVER () as row_idx, {col} as val
|
|
281
|
+
FROM {ref}
|
|
282
|
+
WHERE {col} IS NOT NULL
|
|
283
|
+
AND NOT regexp_matches({col}::VARCHAR, '{pattern}')
|
|
284
|
+
LIMIT {limit}
|
|
285
|
+
"""
|
|
286
|
+
|
|
287
|
+
rows = self._dataset.engine.fetch_all(sql)
|
|
288
|
+
return [
|
|
289
|
+
FailedRow(
|
|
290
|
+
row_index=row[0],
|
|
291
|
+
column=self._name,
|
|
292
|
+
value=row[1],
|
|
293
|
+
expected=f"matches pattern '{pattern}'",
|
|
294
|
+
reason=f"Value '{row[1]}' does not match pattern",
|
|
295
|
+
)
|
|
296
|
+
for row in rows
|
|
297
|
+
]
|
|
298
|
+
|
|
299
|
+
def isin(self, values: list[Any], capture_failures: bool = True) -> ValidationResult:
|
|
232
300
|
"""
|
|
233
301
|
Check that all non-null values are in the allowed set.
|
|
234
302
|
|
|
235
303
|
Args:
|
|
236
304
|
values: List of allowed values
|
|
305
|
+
capture_failures: Whether to capture sample failing rows (default: True)
|
|
237
306
|
|
|
238
307
|
Returns:
|
|
239
308
|
ValidationResult
|
|
@@ -256,14 +325,50 @@ class Column:
|
|
|
256
325
|
invalid_count = self._dataset.engine.fetch_value(sql) or 0
|
|
257
326
|
passed = invalid_count == 0
|
|
258
327
|
|
|
328
|
+
# Capture sample of failing rows
|
|
329
|
+
failed_rows = []
|
|
330
|
+
if not passed and capture_failures:
|
|
331
|
+
failed_rows = self._get_failed_rows_isin(values)
|
|
332
|
+
|
|
259
333
|
return ValidationResult(
|
|
260
334
|
passed=passed,
|
|
261
335
|
actual_value=invalid_count,
|
|
262
336
|
expected_value=0,
|
|
263
337
|
message=f"Column '{self._name}' has {invalid_count} values not in allowed set",
|
|
264
338
|
details={"allowed_values": values, "invalid_count": invalid_count},
|
|
339
|
+
failed_rows=failed_rows,
|
|
340
|
+
total_failures=invalid_count,
|
|
265
341
|
)
|
|
266
342
|
|
|
343
|
+
def _get_failed_rows_isin(self, values: list[Any], limit: int = DEFAULT_SAMPLE_SIZE) -> list[FailedRow]:
|
|
344
|
+
"""Get sample of rows that failed isin check."""
|
|
345
|
+
ref = self._dataset.engine.get_source_reference(self._dataset.source)
|
|
346
|
+
col = f'"{self._name}"'
|
|
347
|
+
|
|
348
|
+
formatted_values = ", ".join(
|
|
349
|
+
f"'{v}'" if isinstance(v, str) else str(v) for v in values
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
sql = f"""
|
|
353
|
+
SELECT row_number() OVER () as row_idx, {col} as val
|
|
354
|
+
FROM {ref}
|
|
355
|
+
WHERE {col} IS NOT NULL
|
|
356
|
+
AND {col} NOT IN ({formatted_values})
|
|
357
|
+
LIMIT {limit}
|
|
358
|
+
"""
|
|
359
|
+
|
|
360
|
+
rows = self._dataset.engine.fetch_all(sql)
|
|
361
|
+
return [
|
|
362
|
+
FailedRow(
|
|
363
|
+
row_index=row[0],
|
|
364
|
+
column=self._name,
|
|
365
|
+
value=row[1],
|
|
366
|
+
expected=f"in {values}",
|
|
367
|
+
reason=f"Value '{row[1]}' is not in allowed set",
|
|
368
|
+
)
|
|
369
|
+
for row in rows
|
|
370
|
+
]
|
|
371
|
+
|
|
267
372
|
def has_no_duplicates(self) -> ValidationResult:
|
|
268
373
|
"""
|
|
269
374
|
Check that all values are unique (no duplicates).
|
duckguard/core/dataset.py
CHANGED
|
@@ -4,8 +4,8 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from duckguard.core.engine import DuckGuardEngine
|
|
8
7
|
from duckguard.core.column import Column
|
|
8
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
9
9
|
|
|
10
10
|
if TYPE_CHECKING:
|
|
11
11
|
from duckguard.core.scoring import QualityScore
|
|
@@ -230,7 +230,7 @@ class Dataset:
|
|
|
230
230
|
def score(
|
|
231
231
|
self,
|
|
232
232
|
weights: dict | None = None,
|
|
233
|
-
) ->
|
|
233
|
+
) -> QualityScore:
|
|
234
234
|
"""
|
|
235
235
|
Calculate data quality score for this dataset.
|
|
236
236
|
|
|
@@ -262,7 +262,7 @@ class Dataset:
|
|
|
262
262
|
'consistency': 0.1,
|
|
263
263
|
})
|
|
264
264
|
"""
|
|
265
|
-
from duckguard.core.scoring import
|
|
265
|
+
from duckguard.core.scoring import QualityDimension, QualityScorer
|
|
266
266
|
|
|
267
267
|
# Convert string keys to QualityDimension enums if needed
|
|
268
268
|
scorer_weights = None
|
duckguard/core/result.py
CHANGED
|
@@ -17,6 +17,30 @@ class CheckStatus(Enum):
|
|
|
17
17
|
ERROR = "error"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
+
@dataclass
|
|
21
|
+
class FailedRow:
|
|
22
|
+
"""Represents a single row that failed validation.
|
|
23
|
+
|
|
24
|
+
Attributes:
|
|
25
|
+
row_index: The 1-based row number in the source data
|
|
26
|
+
column: The column name that failed validation
|
|
27
|
+
value: The actual value that failed
|
|
28
|
+
expected: What was expected (e.g., "not null", "between 1-100")
|
|
29
|
+
reason: Human-readable explanation of why validation failed
|
|
30
|
+
context: Additional row data for context (optional)
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
row_index: int
|
|
34
|
+
column: str
|
|
35
|
+
value: Any
|
|
36
|
+
expected: str
|
|
37
|
+
reason: str = ""
|
|
38
|
+
context: dict[str, Any] = field(default_factory=dict)
|
|
39
|
+
|
|
40
|
+
def __repr__(self) -> str:
|
|
41
|
+
return f"FailedRow(row={self.row_index}, column='{self.column}', value={self.value!r})"
|
|
42
|
+
|
|
43
|
+
|
|
20
44
|
@dataclass
|
|
21
45
|
class CheckResult:
|
|
22
46
|
"""Result of a single validation check."""
|
|
@@ -46,13 +70,27 @@ class CheckResult:
|
|
|
46
70
|
|
|
47
71
|
@dataclass
|
|
48
72
|
class ValidationResult:
|
|
49
|
-
"""Result of a validation operation that can be used in assertions.
|
|
73
|
+
"""Result of a validation operation that can be used in assertions.
|
|
74
|
+
|
|
75
|
+
Enhanced with row-level error capture for debugging failed checks.
|
|
76
|
+
|
|
77
|
+
Attributes:
|
|
78
|
+
passed: Whether the validation passed
|
|
79
|
+
actual_value: The actual value found (e.g., count of failures)
|
|
80
|
+
expected_value: What was expected
|
|
81
|
+
message: Human-readable summary
|
|
82
|
+
details: Additional metadata
|
|
83
|
+
failed_rows: List of individual rows that failed validation
|
|
84
|
+
sample_size: How many failed rows to capture (default: 10)
|
|
85
|
+
"""
|
|
50
86
|
|
|
51
87
|
passed: bool
|
|
52
88
|
actual_value: Any
|
|
53
89
|
expected_value: Any | None = None
|
|
54
90
|
message: str = ""
|
|
55
91
|
details: dict[str, Any] = field(default_factory=dict)
|
|
92
|
+
failed_rows: list[FailedRow] = field(default_factory=list)
|
|
93
|
+
total_failures: int = 0
|
|
56
94
|
|
|
57
95
|
def __bool__(self) -> bool:
|
|
58
96
|
"""Allow using ValidationResult in boolean context for assertions."""
|
|
@@ -60,8 +98,61 @@ class ValidationResult:
|
|
|
60
98
|
|
|
61
99
|
def __repr__(self) -> str:
|
|
62
100
|
status = "PASSED" if self.passed else "FAILED"
|
|
101
|
+
if self.failed_rows:
|
|
102
|
+
return f"ValidationResult({status}, actual={self.actual_value}, failed_rows={len(self.failed_rows)})"
|
|
63
103
|
return f"ValidationResult({status}, actual={self.actual_value})"
|
|
64
104
|
|
|
105
|
+
def get_failed_values(self) -> list[Any]:
|
|
106
|
+
"""Get list of values that failed validation."""
|
|
107
|
+
return [row.value for row in self.failed_rows]
|
|
108
|
+
|
|
109
|
+
def get_failed_row_indices(self) -> list[int]:
|
|
110
|
+
"""Get list of row indices that failed validation."""
|
|
111
|
+
return [row.row_index for row in self.failed_rows]
|
|
112
|
+
|
|
113
|
+
def to_dataframe(self):
|
|
114
|
+
"""Convert failed rows to a pandas DataFrame (if pandas available).
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
pandas.DataFrame with failed row details
|
|
118
|
+
|
|
119
|
+
Raises:
|
|
120
|
+
ImportError: If pandas is not installed
|
|
121
|
+
"""
|
|
122
|
+
try:
|
|
123
|
+
import pandas as pd
|
|
124
|
+
|
|
125
|
+
if not self.failed_rows:
|
|
126
|
+
return pd.DataFrame(columns=["row_index", "column", "value", "expected", "reason"])
|
|
127
|
+
|
|
128
|
+
return pd.DataFrame([
|
|
129
|
+
{
|
|
130
|
+
"row_index": row.row_index,
|
|
131
|
+
"column": row.column,
|
|
132
|
+
"value": row.value,
|
|
133
|
+
"expected": row.expected,
|
|
134
|
+
"reason": row.reason,
|
|
135
|
+
**row.context,
|
|
136
|
+
}
|
|
137
|
+
for row in self.failed_rows
|
|
138
|
+
])
|
|
139
|
+
except ImportError:
|
|
140
|
+
raise ImportError("pandas is required for to_dataframe(). Install with: pip install pandas")
|
|
141
|
+
|
|
142
|
+
def summary(self) -> str:
|
|
143
|
+
"""Get a summary of the validation result with sample failures."""
|
|
144
|
+
lines = [self.message]
|
|
145
|
+
|
|
146
|
+
if self.failed_rows:
|
|
147
|
+
lines.append(f"\nSample of {len(self.failed_rows)} failing rows (total: {self.total_failures}):")
|
|
148
|
+
for row in self.failed_rows[:5]:
|
|
149
|
+
lines.append(f" Row {row.row_index}: {row.column}={row.value!r} - {row.reason or row.expected}")
|
|
150
|
+
|
|
151
|
+
if self.total_failures > 5:
|
|
152
|
+
lines.append(f" ... and {self.total_failures - 5} more failures")
|
|
153
|
+
|
|
154
|
+
return "\n".join(lines)
|
|
155
|
+
|
|
65
156
|
|
|
66
157
|
@dataclass
|
|
67
158
|
class ProfileResult:
|
duckguard/core/scoring.py
CHANGED
|
@@ -14,7 +14,7 @@ from __future__ import annotations
|
|
|
14
14
|
from dataclasses import dataclass, field
|
|
15
15
|
from datetime import datetime
|
|
16
16
|
from enum import Enum
|
|
17
|
-
from typing import
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
18
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
from duckguard.core.dataset import Dataset
|
|
@@ -302,7 +302,6 @@ class QualityScorer:
|
|
|
302
302
|
# Check for reasonable ranges on numeric columns
|
|
303
303
|
if numeric_stats.get("mean") is not None:
|
|
304
304
|
min_val = stats.get("min_value")
|
|
305
|
-
max_val = stats.get("max_value")
|
|
306
305
|
|
|
307
306
|
# Check for negative values in likely positive-only columns
|
|
308
307
|
is_likely_positive = any(
|