duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,284 @@
1
+ """Dataset class representing a data source for validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from duckguard.core.engine import DuckGuardEngine
8
+ from duckguard.core.column import Column
9
+
10
+ if TYPE_CHECKING:
11
+ from duckguard.core.scoring import QualityScore
12
+
13
+
14
class Dataset:
    """
    Wraps a data source (file, database table, etc.) and exposes a
    Pythonic interface for column access and validation.

    Example:
        orders = Dataset("data/orders.csv")
        assert orders.row_count > 0
        assert orders.customer_id.null_percent < 5
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine | None = None,
        name: str | None = None,
    ):
        """
        Create a Dataset.

        Args:
            source: Path to a file or a connection string.
            engine: Engine to use; falls back to the shared singleton.
            name: Display name for the dataset; defaults to ``source``.
        """
        self._source = source
        self._engine = engine or DuckGuardEngine.get_instance()
        self._name = name or source
        # Lazily populated; invalidated via clear_cache().
        self._columns_cache: list[str] | None = None
        self._row_count_cache: int | None = None

    @property
    def source(self) -> str:
        """The source path or connection string."""
        return self._source

    @property
    def name(self) -> str:
        """The dataset's display name."""
        return self._name

    @property
    def engine(self) -> DuckGuardEngine:
        """The engine backing this dataset."""
        return self._engine

    @property
    def row_count(self) -> int:
        """Number of rows in the dataset (queried once, then cached)."""
        if self._row_count_cache is None:
            self._row_count_cache = self._engine.get_row_count(self._source)
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Column names of the dataset (queried once, then cached)."""
        if self._columns_cache is None:
            self._columns_cache = self._engine.get_columns(self._source)
        return self._columns_cache

    @property
    def column_count(self) -> int:
        """Number of columns."""
        return len(self.columns)

    def __getattr__(self, name: str) -> Column:
        """
        Resolve unknown attributes as columns, e.g. ``dataset.customer_id``.

        Raises:
            AttributeError: If ``name`` is private or is not a column.
        """
        # Private names must never consult self.columns, otherwise attribute
        # lookups during __init__ would recurse indefinitely.
        if name.startswith("_"):
            raise AttributeError(f"'{type(self).__name__}' has no attribute '{name}'")

        if name in self.columns:
            return Column(name, self)

        raise AttributeError(
            f"Column '{name}' not found. Available columns: {', '.join(self.columns)}"
        )

    def __getitem__(self, name: str) -> Column:
        """
        Resolve bracket access as columns, e.g. ``dataset["customer_id"]``.

        Raises:
            KeyError: If the column doesn't exist.
        """
        if name not in self.columns:
            raise KeyError(
                f"Column '{name}' not found. Available columns: {', '.join(self.columns)}"
            )
        return Column(name, self)

    def column(self, name: str) -> Column:
        """Return the Column object for ``name``."""
        return self[name]

    def has_column(self, name: str) -> bool:
        """Return True if ``name`` is one of this dataset's columns."""
        return name in self.columns

    def sample(self, n: int = 10) -> list[dict[str, Any]]:
        """
        Return up to ``n`` rows as a list of dictionaries.

        Args:
            n: Number of rows to fetch.
        """
        ref = self._engine.get_source_reference(self._source)
        result = self._engine.execute(f"SELECT * FROM {ref} LIMIT {n}")
        names = [desc[0] for desc in result.description]
        return [dict(zip(names, values)) for values in result.fetchall()]

    def head(self, n: int = 5) -> list[dict[str, Any]]:
        """Return the first ``n`` rows as a list of dictionaries."""
        return self.sample(n)

    def execute_sql(self, sql: str) -> list[tuple[Any, ...]]:
        """
        Run a custom SQL query against this dataset.

        The ``{source}`` placeholder in ``sql`` expands to the dataset's
        SQL reference.

        Args:
            sql: SQL text, optionally containing ``{source}``.

        Returns:
            All result rows as tuples.
        """
        ref = self._engine.get_source_reference(self._source)
        return self._engine.fetch_all(sql.format(source=ref))

    def clear_cache(self) -> None:
        """Drop the cached row count and column list."""
        self._row_count_cache = None
        self._columns_cache = None

    def __repr__(self) -> str:
        return f"Dataset('{self._source}', rows={self.row_count}, columns={self.column_count})"

    def __str__(self) -> str:
        return f"Dataset: {self._name} ({self.row_count} rows, {self.column_count} columns)"

    def __len__(self) -> int:
        """Row count, so ``len(dataset)`` works."""
        return self.row_count

    def __contains__(self, column: str) -> bool:
        """Membership test against column names."""
        return column in self.columns

    def __iter__(self):
        """Iterate over column names."""
        return iter(self.columns)

    def score(
        self,
        weights: dict | None = None,
    ) -> "QualityScore":
        """
        Compute a data quality score for this dataset.

        Evaluates the standard quality dimensions: completeness,
        uniqueness, validity, and consistency.

        Args:
            weights: Optional custom dimension weights. Keys may be the
                strings 'completeness', 'uniqueness', 'validity',
                'consistency' (or QualityDimension members); values must
                sum to 1.0.

        Returns:
            QualityScore with overall score, grade, and dimension
            breakdowns.

        Example:
            score = orders.score()
            print(score.overall)       # 87.5
            print(score.grade)         # 'B'
            print(score.completeness)  # 95.0
        """
        from duckguard.core.scoring import QualityScorer, QualityDimension

        scorer_weights = None
        if weights:
            name_to_dimension = {
                "completeness": QualityDimension.COMPLETENESS,
                "uniqueness": QualityDimension.UNIQUENESS,
                "validity": QualityDimension.VALIDITY,
                "consistency": QualityDimension.CONSISTENCY,
            }
            # Accept both string keys and QualityDimension keys.
            scorer_weights = {
                (name_to_dimension[key] if isinstance(key, str) else key): value
                for key, value in weights.items()
            }

        return QualityScorer(weights=scorer_weights).score(self)
@@ -0,0 +1,261 @@
1
+ """DuckDB-based execution engine for DuckGuard."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import duckdb
8
+
9
+
10
class DuckGuardEngine:
    """
    Central DuckDB execution engine for DuckGuard.

    Owns a single in-memory DuckDB connection and handles all database
    operations, providing a fast, memory-efficient way to validate data
    from various sources (files, DataFrames, tables).
    """

    # Process-wide singleton managed by get_instance()/reset_instance().
    _instance: DuckGuardEngine | None = None

    def __init__(self, memory_limit: str | None = None):
        """
        Initialize the DuckGuard engine.

        Args:
            memory_limit: Optional memory limit for DuckDB (e.g., "4GB")
        """
        self.conn = duckdb.connect(":memory:")

        # Configure DuckDB for optimal performance.  Individual settings may
        # be unsupported by older DuckDB versions, so failures are ignored.
        try:
            self.conn.execute("SET enable_progress_bar = false")
        except duckdb.InvalidInputException:
            pass

        if memory_limit:
            try:
                self.conn.execute(f"SET memory_limit = '{memory_limit}'")
            except duckdb.InvalidInputException:
                pass

        # Registered sources: name -> file path, or "registered:<name>"
        # marker for DataFrames registered directly on the connection.
        self._sources: dict[str, str] = {}

    @classmethod
    def get_instance(cls) -> DuckGuardEngine:
        """Get or create the singleton engine instance."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def reset_instance(cls) -> None:
        """Reset the singleton instance (useful for testing)."""
        if cls._instance is not None:
            cls._instance.close()
        cls._instance = None

    def execute(self, sql: str, params: list[Any] | None = None) -> duckdb.DuckDBPyRelation:
        """
        Execute a SQL query and return the result.

        Args:
            sql: The SQL query to execute
            params: Optional parameters for the query

        Returns:
            DuckDB relation with query results
        """
        if params:
            return self.conn.execute(sql, params)
        return self.conn.execute(sql)

    def fetch_one(self, sql: str, params: list[Any] | None = None) -> tuple[Any, ...] | None:
        """Execute a query and fetch one row (None if no rows)."""
        result = self.execute(sql, params)
        return result.fetchone()

    def fetch_all(self, sql: str, params: list[Any] | None = None) -> list[tuple[Any, ...]]:
        """Execute a query and fetch all rows."""
        result = self.execute(sql, params)
        return result.fetchall()

    def fetch_value(self, sql: str, params: list[Any] | None = None) -> Any:
        """Execute a query and fetch the first column of the first row."""
        row = self.fetch_one(sql, params)
        return row[0] if row else None

    def register_file(self, name: str, path: str) -> None:
        """
        Register a file as a named source.

        Args:
            name: Name to reference the source
            path: Path to the file (CSV, Parquet, JSON)
        """
        # DuckDB auto-detects file type from extension, so only the
        # mapping needs to be remembered here.
        self._sources[name] = path

    def register_dataframe(self, name: str, df: Any) -> None:
        """
        Register a DataFrame (pandas, polars, or pyarrow) as a named source.

        Args:
            name: Name to reference the source
            df: DataFrame to register
        """
        self.conn.register(name, df)
        self._sources[name] = f"registered:{name}"

    def get_source_reference(self, name: str) -> str:
        """
        Get the SQL reference for a registered source.

        Registered DataFrames are referenced by name; file paths are
        returned as single-quoted SQL string literals.  Embedded single
        quotes are doubled so paths like "it's.csv" cannot break the
        generated SQL.

        Args:
            name: Name of the registered source

        Returns:
            SQL-safe reference to the source
        """
        if name in self._sources:
            source = self._sources[name]
            if source.startswith("registered:"):
                return name
            # Quoted path literal; escape ' as '' per SQL string rules.
            return "'" + source.replace("'", "''") + "'"
        # Unregistered: anything that looks like a path is treated as a
        # file literal; otherwise assume it is a table name.
        if "." in name or "/" in name or "\\" in name:
            return "'" + name.replace("'", "''") + "'"
        return name

    def table_exists(self, name: str) -> bool:
        """Check if a table or source exists by probing it with a query."""
        try:
            self.execute(f"SELECT 1 FROM {self.get_source_reference(name)} LIMIT 1")
            return True
        except duckdb.Error:
            return False

    def get_columns(self, source: str) -> list[str]:
        """
        Get column names for a source.

        Args:
            source: Source reference (file path or registered name)

        Returns:
            List of column names
        """
        ref = self.get_source_reference(source)
        result = self.execute(f"DESCRIBE SELECT * FROM {ref}")
        # DESCRIBE returns one row per column; first field is the name.
        return [row[0] for row in result.fetchall()]

    def get_row_count(self, source: str) -> int:
        """
        Get row count for a source.

        Args:
            source: Source reference

        Returns:
            Number of rows
        """
        ref = self.get_source_reference(source)
        return self.fetch_value(f"SELECT COUNT(*) FROM {ref}") or 0

    def get_column_stats(self, source: str, column: str) -> dict[str, Any]:
        """
        Get basic statistics for a column.

        Args:
            source: Source reference
            column: Column name

        Returns:
            Dictionary with total/non-null/null/unique counts, null and
            unique percentages, and min/max values; empty if no row came
            back.
        """
        ref = self.get_source_reference(source)
        col = f'"{column}"'

        sql = f"""
            SELECT
                COUNT(*) as total_count,
                COUNT({col}) as non_null_count,
                COUNT(*) - COUNT({col}) as null_count,
                COUNT(DISTINCT {col}) as unique_count,
                MIN({col}) as min_value,
                MAX({col}) as max_value
            FROM {ref}
        """

        row = self.fetch_one(sql)
        if not row:
            return {}

        total = row[0] or 0
        non_null = row[1] or 0
        null_count = row[2] or 0
        unique_count = row[3] or 0

        return {
            "total_count": total,
            "non_null_count": non_null,
            "null_count": null_count,
            "null_percent": (null_count / total * 100) if total > 0 else 0.0,
            "unique_count": unique_count,
            "unique_percent": (unique_count / total * 100) if total > 0 else 0.0,
            "min_value": row[4],
            "max_value": row[5],
        }

    def get_numeric_stats(self, source: str, column: str) -> dict[str, Any]:
        """
        Get numeric statistics for a column.

        Args:
            source: Source reference
            column: Column name

        Returns:
            Dictionary with mean/stddev/median/p25/p75; empty if the
            column is not numeric or no row came back.
        """
        ref = self.get_source_reference(source)
        col = f'"{column}"'

        sql = f"""
            SELECT
                AVG({col}::DOUBLE) as mean_value,
                STDDEV({col}::DOUBLE) as stddev_value,
                MEDIAN({col}::DOUBLE) as median_value,
                PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {col}::DOUBLE) as p25,
                PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {col}::DOUBLE) as p75
            FROM {ref}
            WHERE {col} IS NOT NULL
        """

        try:
            row = self.fetch_one(sql)
            if not row:
                return {}

            return {
                "mean": row[0],
                "stddev": row[1],
                "median": row[2],
                "p25": row[3],
                "p75": row[4],
            }
        except duckdb.Error:
            # Column might not be numeric; the cast then fails.
            return {}

    def close(self) -> None:
        """Close the database connection."""
        if self.conn:
            self.conn.close()

    def __enter__(self) -> DuckGuardEngine:
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.close()
@@ -0,0 +1,119 @@
1
+ """Result types for validation operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from enum import Enum
8
+ from typing import Any
9
+
10
+
11
class CheckStatus(Enum):
    """Possible outcomes of a validation check."""

    PASSED = "passed"
    FAILED = "failed"
    WARNING = "warning"
    ERROR = "error"


@dataclass
class CheckResult:
    """Outcome of a single validation check, with context for reporting."""

    name: str  # identifier of the check that ran
    status: CheckStatus  # outcome of the check
    actual_value: Any  # observed value
    expected_value: Any | None = None  # expected value, when applicable
    message: str = ""  # human-readable detail
    column: str | None = None  # column the check targeted, if any
    timestamp: datetime = field(default_factory=datetime.now)  # when recorded

    @property
    def passed(self) -> bool:
        """True when the check passed."""
        return self.status is CheckStatus.PASSED

    @property
    def failed(self) -> bool:
        """True when the check failed."""
        return self.status is CheckStatus.FAILED

    def __bool__(self) -> bool:
        """Truthiness mirrors `passed`, so results work in assertions."""
        return self.passed
45
+
46
+
47
@dataclass
class ValidationResult:
    """Assertion-friendly outcome of a single validation operation."""

    passed: bool  # whether the validation succeeded
    actual_value: Any  # observed value
    expected_value: Any | None = None  # expected value, when applicable
    message: str = ""  # human-readable detail
    details: dict[str, Any] = field(default_factory=dict)  # extra context

    def __bool__(self) -> bool:
        """Truthiness follows `passed`, enabling ``assert result``."""
        return self.passed

    def __repr__(self) -> str:
        outcome = "PASSED" if self.passed else "FAILED"
        return f"ValidationResult({outcome}, actual={self.actual_value})"
64
+
65
+
66
@dataclass
class ProfileResult:
    """Summary produced by profiling a dataset."""

    source: str  # profiled source path or name
    row_count: int  # total rows in the source
    column_count: int  # total columns in the source
    columns: list[ColumnProfile]  # per-column profiles
    suggested_rules: list[str] = field(default_factory=list)  # rule suggestions
    timestamp: datetime = field(default_factory=datetime.now)  # when profiled
76
+
77
+
78
@dataclass
class ColumnProfile:
    """Profile information for one column of a dataset."""

    name: str  # column name
    dtype: str  # column data type as reported by the engine
    null_count: int  # number of NULL values
    null_percent: float  # NULLs as a percentage of all rows
    unique_count: int  # number of distinct values
    unique_percent: float  # distinct values as a percentage of all rows
    min_value: Any | None = None  # smallest observed value, if known
    max_value: Any | None = None  # largest observed value, if known
    mean_value: float | None = None  # mean (numeric columns only)
    stddev_value: float | None = None  # standard deviation (numeric only)
    sample_values: list[Any] = field(default_factory=list)  # example values
    suggested_rules: list[str] = field(default_factory=list)  # rule suggestions
94
+
95
+
96
@dataclass
class ScanResult:
    """Aggregate outcome of scanning a dataset for issues."""

    source: str  # scanned source path or name
    row_count: int  # rows in the scanned source
    checks_run: int  # total checks executed
    checks_passed: int  # checks that passed
    checks_failed: int  # checks that failed
    checks_warned: int  # checks that produced warnings
    results: list[CheckResult] = field(default_factory=list)  # per-check results
    timestamp: datetime = field(default_factory=datetime.now)  # when scanned

    @property
    def passed(self) -> bool:
        """True when no check failed."""
        return not self.checks_failed

    @property
    def pass_rate(self) -> float:
        """Percentage of checks that passed (100.0 when none were run)."""
        if not self.checks_run:
            return 100.0
        return self.checks_passed / self.checks_run * 100