duckguard-2.0.0-py3-none-any.whl → duckguard-2.2.0-py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +55 -28
- duckguard/anomaly/__init__.py +1 -1
- duckguard/anomaly/detector.py +1 -5
- duckguard/anomaly/methods.py +1 -3
- duckguard/cli/main.py +304 -54
- duckguard/connectors/__init__.py +2 -2
- duckguard/connectors/bigquery.py +1 -1
- duckguard/connectors/databricks.py +1 -1
- duckguard/connectors/factory.py +2 -3
- duckguard/connectors/files.py +1 -1
- duckguard/connectors/kafka.py +2 -2
- duckguard/connectors/mongodb.py +1 -1
- duckguard/connectors/mysql.py +1 -1
- duckguard/connectors/oracle.py +1 -1
- duckguard/connectors/postgres.py +1 -2
- duckguard/connectors/redshift.py +1 -1
- duckguard/connectors/snowflake.py +1 -2
- duckguard/connectors/sqlite.py +1 -1
- duckguard/connectors/sqlserver.py +10 -13
- duckguard/contracts/__init__.py +6 -6
- duckguard/contracts/diff.py +1 -1
- duckguard/contracts/generator.py +5 -6
- duckguard/contracts/loader.py +4 -4
- duckguard/contracts/validator.py +3 -4
- duckguard/core/__init__.py +3 -3
- duckguard/core/column.py +110 -5
- duckguard/core/dataset.py +3 -3
- duckguard/core/result.py +92 -1
- duckguard/core/scoring.py +1 -2
- duckguard/errors.py +362 -0
- duckguard/history/__init__.py +44 -0
- duckguard/history/schema.py +183 -0
- duckguard/history/storage.py +479 -0
- duckguard/history/trends.py +348 -0
- duckguard/integrations/__init__.py +31 -0
- duckguard/integrations/airflow.py +387 -0
- duckguard/integrations/dbt.py +458 -0
- duckguard/notifications/__init__.py +43 -0
- duckguard/notifications/formatter.py +118 -0
- duckguard/notifications/notifiers.py +357 -0
- duckguard/profiler/auto_profile.py +3 -3
- duckguard/pytest_plugin/__init__.py +1 -1
- duckguard/pytest_plugin/plugin.py +1 -1
- duckguard/reporting/console.py +2 -2
- duckguard/reports/__init__.py +42 -0
- duckguard/reports/html_reporter.py +515 -0
- duckguard/reports/pdf_reporter.py +114 -0
- duckguard/rules/__init__.py +3 -3
- duckguard/rules/executor.py +3 -4
- duckguard/rules/generator.py +4 -4
- duckguard/rules/loader.py +5 -5
- duckguard/semantic/__init__.py +1 -1
- duckguard/semantic/analyzer.py +0 -2
- duckguard/semantic/validators.py +2 -1
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/METADATA +135 -5
- duckguard-2.2.0.dist-info/RECORD +69 -0
- duckguard-2.0.0.dist-info/RECORD +0 -55
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/WHEEL +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.0.0.dist-info → duckguard-2.2.0.dist-info}/licenses/LICENSE +0 -0
duckguard/__init__.py
CHANGED
|
@@ -3,7 +3,7 @@ DuckGuard - Data quality that just works.
|
|
|
3
3
|
|
|
4
4
|
A Python-native data quality tool built on DuckDB for speed.
|
|
5
5
|
Features YAML-based rules, semantic type detection, data contracts,
|
|
6
|
-
and
|
|
6
|
+
anomaly detection, notifications, and dbt integration.
|
|
7
7
|
|
|
8
8
|
Quick Start:
|
|
9
9
|
# Python API
|
|
@@ -12,61 +12,80 @@ Quick Start:
|
|
|
12
12
|
assert orders.row_count > 0
|
|
13
13
|
assert orders.customer_id.null_percent == 0
|
|
14
14
|
|
|
15
|
+
# With row-level error capture
|
|
16
|
+
result = orders.quantity.between(1, 100)
|
|
17
|
+
if not result:
|
|
18
|
+
print(result.summary()) # See which rows failed
|
|
19
|
+
|
|
20
|
+
# Notifications
|
|
21
|
+
from duckguard.notifications import SlackNotifier
|
|
22
|
+
slack = SlackNotifier(webhook_url="...")
|
|
23
|
+
slack.send_failure_alert(result)
|
|
24
|
+
|
|
15
25
|
# CLI
|
|
16
26
|
$ duckguard check data.csv
|
|
17
27
|
$ duckguard discover data.csv --output duckguard.yaml
|
|
18
28
|
$ duckguard contract generate data.csv
|
|
19
29
|
|
|
20
|
-
Documentation: https://github.com/
|
|
30
|
+
Documentation: https://github.com/XDataHubAI/duckguard
|
|
21
31
|
"""
|
|
22
32
|
|
|
23
33
|
# Core classes
|
|
24
|
-
|
|
34
|
+
# Anomaly detection
|
|
35
|
+
from duckguard.anomaly import (
|
|
36
|
+
AnomalyDetector,
|
|
37
|
+
AnomalyResult,
|
|
38
|
+
detect_anomalies,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Connectors
|
|
42
|
+
from duckguard.connectors import connect
|
|
43
|
+
|
|
44
|
+
# Data contracts
|
|
45
|
+
from duckguard.contracts import (
|
|
46
|
+
DataContract,
|
|
47
|
+
diff_contracts,
|
|
48
|
+
generate_contract,
|
|
49
|
+
load_contract,
|
|
50
|
+
validate_contract,
|
|
51
|
+
)
|
|
25
52
|
from duckguard.core.column import Column
|
|
53
|
+
from duckguard.core.dataset import Dataset
|
|
26
54
|
from duckguard.core.engine import DuckGuardEngine
|
|
27
|
-
from duckguard.core.result import
|
|
55
|
+
from duckguard.core.result import CheckResult, FailedRow, ValidationResult
|
|
28
56
|
from duckguard.core.scoring import QualityScore, QualityScorer, score
|
|
29
57
|
|
|
30
|
-
#
|
|
31
|
-
from duckguard.
|
|
58
|
+
# Error classes
|
|
59
|
+
from duckguard.errors import (
|
|
60
|
+
ColumnNotFoundError,
|
|
61
|
+
ContractViolationError,
|
|
62
|
+
DuckGuardError,
|
|
63
|
+
RuleParseError,
|
|
64
|
+
UnsupportedConnectorError,
|
|
65
|
+
ValidationError,
|
|
66
|
+
)
|
|
32
67
|
|
|
33
68
|
# Profiling
|
|
34
|
-
from duckguard.profiler import
|
|
69
|
+
from duckguard.profiler import AutoProfiler, profile
|
|
35
70
|
|
|
36
71
|
# Rules (YAML-based)
|
|
37
72
|
from duckguard.rules import (
|
|
38
|
-
|
|
39
|
-
load_rules_from_string,
|
|
73
|
+
RuleSet,
|
|
40
74
|
execute_rules,
|
|
41
75
|
generate_rules,
|
|
42
|
-
|
|
76
|
+
load_rules,
|
|
77
|
+
load_rules_from_string,
|
|
43
78
|
)
|
|
44
79
|
|
|
45
80
|
# Semantic type detection
|
|
46
81
|
from duckguard.semantic import (
|
|
47
|
-
SemanticType,
|
|
48
82
|
SemanticAnalyzer,
|
|
83
|
+
SemanticType,
|
|
49
84
|
detect_type,
|
|
50
85
|
detect_types_for_dataset,
|
|
51
86
|
)
|
|
52
87
|
|
|
53
|
-
|
|
54
|
-
from duckguard.contracts import (
|
|
55
|
-
DataContract,
|
|
56
|
-
load_contract,
|
|
57
|
-
validate_contract,
|
|
58
|
-
generate_contract,
|
|
59
|
-
diff_contracts,
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# Anomaly detection
|
|
63
|
-
from duckguard.anomaly import (
|
|
64
|
-
AnomalyDetector,
|
|
65
|
-
AnomalyResult,
|
|
66
|
-
detect_anomalies,
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
__version__ = "2.0.0"
|
|
88
|
+
__version__ = "2.2.0"
|
|
70
89
|
|
|
71
90
|
__all__ = [
|
|
72
91
|
# Core classes
|
|
@@ -75,6 +94,7 @@ __all__ = [
|
|
|
75
94
|
"DuckGuardEngine",
|
|
76
95
|
"ValidationResult",
|
|
77
96
|
"CheckResult",
|
|
97
|
+
"FailedRow",
|
|
78
98
|
# Scoring
|
|
79
99
|
"QualityScore",
|
|
80
100
|
"QualityScorer",
|
|
@@ -105,6 +125,13 @@ __all__ = [
|
|
|
105
125
|
"AnomalyDetector",
|
|
106
126
|
"AnomalyResult",
|
|
107
127
|
"detect_anomalies",
|
|
128
|
+
# Errors
|
|
129
|
+
"DuckGuardError",
|
|
130
|
+
"ColumnNotFoundError",
|
|
131
|
+
"ContractViolationError",
|
|
132
|
+
"RuleParseError",
|
|
133
|
+
"UnsupportedConnectorError",
|
|
134
|
+
"ValidationError",
|
|
108
135
|
# Version
|
|
109
136
|
"__version__",
|
|
110
137
|
]
|
duckguard/anomaly/__init__.py
CHANGED
duckguard/anomaly/detector.py
CHANGED
|
@@ -10,15 +10,11 @@ from datetime import datetime
|
|
|
10
10
|
from enum import Enum
|
|
11
11
|
from typing import Any
|
|
12
12
|
|
|
13
|
-
from duckguard.core.dataset import Dataset
|
|
14
13
|
from duckguard.anomaly.methods import (
|
|
15
|
-
AnomalyMethod,
|
|
16
14
|
AnomalyScore,
|
|
17
|
-
ZScoreMethod,
|
|
18
|
-
IQRMethod,
|
|
19
|
-
PercentChangeMethod,
|
|
20
15
|
create_method,
|
|
21
16
|
)
|
|
17
|
+
from duckguard.core.dataset import Dataset
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
class AnomalyType(Enum):
|
duckguard/anomaly/methods.py
CHANGED
|
@@ -5,10 +5,10 @@ Implements various statistical methods for detecting anomalies in data.
|
|
|
5
5
|
|
|
6
6
|
from __future__ import annotations
|
|
7
7
|
|
|
8
|
+
import math
|
|
8
9
|
from abc import ABC, abstractmethod
|
|
9
10
|
from dataclasses import dataclass, field
|
|
10
11
|
from typing import Any
|
|
11
|
-
import math
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
@@ -177,8 +177,6 @@ class IQRMethod(AnomalyMethod):
|
|
|
177
177
|
if not clean_values:
|
|
178
178
|
return
|
|
179
179
|
|
|
180
|
-
n = len(clean_values)
|
|
181
|
-
|
|
182
180
|
# Calculate Q1 and Q3
|
|
183
181
|
self._q1 = self._percentile(clean_values, 25)
|
|
184
182
|
self._q3 = self._percentile(clean_values, 75)
|