duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,112 @@
1
+ """SQLite connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
13
class SQLiteConnector(Connector):
    """
    Connector for SQLite databases.

    DuckDB has native SQLite support, making this connector very efficient.
    The database file is attached to the engine under a per-file alias and the
    requested table is exposed through a DuckDB view that backs the returned
    Dataset.

    Examples:
        # Connect to SQLite file
        data = connect("sqlite:///path/to/database.db", table="users")

        # Or directly with .db/.sqlite extension
        data = connect("database.sqlite", table="orders")
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the SQLite extension (best effort)."""
        # Each statement is attempted on its own: INSTALL can fail (for
        # example when the extension cannot be downloaded) while LOAD still
        # succeeds because the extension is already available locally.
        for statement in ("INSTALL sqlite", "LOAD sqlite"):
            try:
                self.engine.execute(statement)
            except Exception:
                # Extension might already be installed/loaded; a genuinely
                # missing extension surfaces later when ATTACH is executed.
                continue

    @staticmethod
    def _safe_identifier(text: str) -> str:
        """Replace every character outside [A-Za-z0-9_] with an underscore."""
        return "".join(
            ch if ch.isascii() and (ch.isalnum() or ch == "_") else "_"
            for ch in text
        )

    @staticmethod
    def _quote_table(table: str) -> str:
        """Double-quote each dot-separated part of a table reference."""
        return ".".join(
            '"' + part.replace('"', '""') + '"' for part in table.split(".")
        )

    def _make_alias(self, path: str) -> str:
        """
        Build the ATTACH alias for a database file.

        The alias combines the sanitized file stem with a checksum of the
        resolved path, so two different files sharing a stem get different
        aliases while the same file always maps to the same alias.
        """
        import zlib

        resolved = str(Path(path).resolve())
        checksum = zlib.crc32(os.fsencode(resolved)) & 0xFFFFFFFF
        stem = self._safe_identifier(Path(path).stem)
        return f"sqlite_{stem}_{checksum:08x}"

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to SQLite database and return a Dataset.

        Args:
            config: Connection configuration; ``config.source`` is the file
                path or ``sqlite://`` URL and ``config.table`` the table name.

        Returns:
            Dataset object backed by a view over the requested table.

        Raises:
            ValueError: If no table name is configured.
            FileNotFoundError: If the database file does not exist.
            RuntimeError: If the table cannot be exposed as a view.
        """
        if not config.table:
            raise ValueError("Table name is required for SQLite connections")

        # Parse the path
        path = self._parse_path(config.source)

        # Validate file exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"SQLite database not found: {path}")

        table = config.table

        # Alias is unique per database file and always a valid identifier
        alias = self._make_alias(path)

        # Attach the SQLite database; single quotes in the path are doubled
        # so the path stays inside the SQL string literal.
        escaped_path = path.replace("'", "''")
        attach_sql = f"ATTACH '{escaped_path}' AS {alias} (TYPE sqlite)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            # Re-attaching the same file under the same alias is harmless.
            if "already exists" not in str(e).lower():
                raise

        # The source reference for DuckDB
        source_ref = f"{alias}.{self._quote_table(table)}"

        # Register as a view for easier access. For ordinary table names the
        # view name is unchanged; unusual characters are mapped to "_" so the
        # name is usable unquoted.
        view_name = f"_duckguard_sqlite_{self._safe_identifier(table)}"
        try:
            self.engine.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}"
            )
        except Exception as exc:
            # The Dataset is created with this view as its source, so fail
            # here with context instead of returning a dangling reference.
            raise RuntimeError(
                f"Could not expose table '{table}' from SQLite database "
                f"'{path}' as view '{view_name}'"
            ) from exc

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_path(self, source: str) -> str:
        """Parse SQLite connection string to get file path."""
        lowered = source.lower()
        # Longest prefix first so "sqlite:///" is not cut as "sqlite://".
        for prefix in ("sqlite:///", "sqlite://"):
            if lowered.startswith(prefix):
                return source[len(prefix):]
        return source

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a SQLite database."""
        source_lower = source.lower()

        # Check for sqlite:// protocol
        if source_lower.startswith("sqlite://"):
            return True

        # Check for common SQLite file extensions
        return source_lower.endswith((".db", ".sqlite", ".sqlite3"))

    @classmethod
    def get_priority(cls) -> int:
        """SQLite connector has medium-high priority."""
        return 40
@@ -0,0 +1,242 @@
1
+ """Microsoft SQL Server connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+ from urllib.parse import parse_qs, urlparse
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
13
class SQLServerConnector(Connector):
    """
    Connector for Microsoft SQL Server.

    Uses pyodbc or pymssql to connect to SQL Server.

    Examples:
        # Using connection string
        data = connect(
            "mssql://user:pass@server/database",
            table="orders"
        )

        # Using options
        data = connect(
            "sqlserver://server/database",
            table="orders",
            user="myuser",
            password="mypass",
            schema="dbo"
        )

        # Using trusted connection (Windows auth)
        data = connect(
            "mssql://server/database",
            table="orders",
            trusted_connection=True
        )
    """

    # URL schemes accepted by can_handle() and parsed by
    # _parse_connection_string(); kept in one place so the two cannot drift.
    _URL_SCHEMES = ("mssql", "sqlserver", "mssql+pyodbc")

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)
        self._connection = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to SQL Server and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If neither pyodbc nor pymssql is installed.
            ValueError: If no table name is configured.
        """
        # Try pyodbc first, then pymssql
        try:
            import pyodbc  # noqa: F401

            driver_module = "pyodbc"
        except ImportError:
            try:
                import pymssql  # noqa: F401

                driver_module = "pymssql"
            except ImportError:
                # "from None" hides the two probe failures from the traceback.
                raise ImportError(
                    "SQL Server support requires pyodbc or pymssql. "
                    "Install with: pip install duckguard[sqlserver]"
                ) from None

        if not config.table:
            raise ValueError("Table name is required for SQL Server connections")

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        # Connect using the available driver
        if driver_module == "pyodbc":
            self._connection = self._connect_pyodbc(conn_params)
        else:
            self._connection = self._connect_pymssql(conn_params)

        table = config.table
        schema = config.schema or conn_params.get("schema", "dbo")

        # Build fully qualified table name
        fq_table = f"{self._quote_name(schema)}.{self._quote_name(table)}"

        return SQLServerDataset(
            source=fq_table,
            engine=self.engine,
            name=table,
            connection=self._connection,
        )

    @staticmethod
    def _quote_name(name: str) -> str:
        """Bracket-quote an identifier, doubling any closing bracket."""
        return "[" + str(name).replace("]", "]]") + "]"

    @staticmethod
    def _as_bool(value: Any) -> bool:
        """Interpret a flag that may arrive as a bool or as URL query text."""
        if isinstance(value, str):
            return value.strip().lower() in {"1", "true", "yes", "y", "on"}
        return bool(value)

    @staticmethod
    def _odbc_value(value: Any) -> str:
        """
        Format a value for an ODBC connection string.

        Values containing ";", "{" or "}" (or surrounding whitespace) are
        wrapped in braces with "}" doubled; other values are returned as-is.
        """
        text = str(value)
        if any(ch in text for ch in ";{}") or text != text.strip():
            return "{" + text.replace("}", "}}") + "}"
        return text

    def _connect_pyodbc(self, params: dict[str, Any]) -> Any:
        """Connect using pyodbc."""
        import pyodbc

        driver = params.get("driver", "ODBC Driver 17 for SQL Server")

        # SERVER takes the optional port as "host,port".
        server = params.get("host", "localhost")
        if params.get("port"):
            server = f"{server},{params['port']}"

        conn_str_parts = [
            f"DRIVER={{{driver}}}",
            f"SERVER={server}",
            f"DATABASE={self._odbc_value(params.get('database', ''))}",
        ]

        if self._as_bool(params.get("trusted_connection")):
            conn_str_parts.append("Trusted_Connection=yes")
        else:
            conn_str_parts.append(f"UID={self._odbc_value(params.get('user', ''))}")
            conn_str_parts.append(f"PWD={self._odbc_value(params.get('password', ''))}")

        return pyodbc.connect(";".join(conn_str_parts))

    def _connect_pymssql(self, params: dict[str, Any]) -> Any:
        """Connect using pymssql."""
        import pymssql

        return pymssql.connect(
            server=params.get("host", "localhost"),
            port=params.get("port", "1433"),
            user=params.get("user", ""),
            password=params.get("password", ""),
            database=params.get("database", ""),
        )

    def _parse_connection_string(
        self, conn_string: str, config: ConnectionConfig
    ) -> dict[str, Any]:
        """
        Parse SQL Server connection string.

        Values are taken from the URL first, then overridden by matching keys
        in ``config.options``, then by ``config.database`` / ``config.schema``.

        Args:
            conn_string: Connection URL using one of the supported schemes.
            config: Connection configuration supplying overrides.

        Returns:
            Dictionary of connection parameters.
        """
        from urllib.parse import unquote

        params: dict[str, Any] = {}

        # Normalize every supported scheme to "mssql://" before parsing
        scheme, separator, remainder = conn_string.partition("://")
        if separator and scheme.lower() in self._URL_SCHEMES:
            parsed = urlparse("mssql://" + remainder)

            params["host"] = parsed.hostname or "localhost"
            params["port"] = str(parsed.port) if parsed.port else "1433"
            params["database"] = parsed.path.lstrip("/") if parsed.path else ""
            # urlparse leaves credentials percent-encoded; decode them so
            # characters such as "@" or ":" can be used in a password.
            params["user"] = unquote(parsed.username) if parsed.username else ""
            params["password"] = unquote(parsed.password) if parsed.password else ""

            # Parse query parameters
            if parsed.query:
                for key, values in parse_qs(parsed.query).items():
                    params[key] = values[0] if len(values) == 1 else values

        # Override with config options
        options = config.options or {}
        for key in (
            "user",
            "password",
            "host",
            "port",
            "database",
            "schema",
            "driver",
            "trusted_connection",
        ):
            if key in options:
                params[key] = options[key]

        if config.database:
            params["database"] = config.database
        if config.schema:
            params["schema"] = config.schema

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a SQL Server connection string."""
        source_lower = source.lower()
        return source_lower.startswith(("mssql://", "sqlserver://", "mssql+pyodbc://"))

    @classmethod
    def get_priority(cls) -> int:
        """SQL Server connector has high priority."""
        return 55
195
+
196
+
197
class SQLServerDataset(Dataset):
    """Dataset whose row count and column names are read from SQL Server."""

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        connection: Any,
    ):
        super().__init__(source=source, engine=engine, name=name)
        # Open driver connection (as created by SQLServerConnector.connect).
        self._mssql_connection = connection

    def _execute_query(self, sql: str) -> list[tuple[Any, ...]]:
        """Run *sql* on the SQL Server connection and return all rows."""
        cur = self._mssql_connection.cursor()
        try:
            cur.execute(sql)
            fetched = cur.fetchall()
        finally:
            # Always release the cursor, even when the query fails.
            cur.close()
        return fetched

    def _fetch_value(self, sql: str) -> Any:
        """Return the first column of the first row, or None when no rows."""
        result = self._execute_query(sql)
        if not result:
            return None
        return result[0][0]

    @property
    def row_count(self) -> int:
        """Number of rows in the source table (queried once, then cached)."""
        # NOTE(review): _row_count_cache and _source are presumably set up by
        # Dataset.__init__ -- confirm against the base class.
        if self._row_count_cache is not None:
            return self._row_count_cache

        counted = self._fetch_value(f"SELECT COUNT(*) FROM {self._source}")
        self._row_count_cache = counted or 0
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Column names of the source table (queried once, then cached)."""
        if self._columns_cache is not None:
            return self._columns_cache

        cur = self._mssql_connection.cursor()
        try:
            # TOP 0 returns no rows; only the cursor description is needed.
            cur.execute(f"SELECT TOP 0 * FROM {self._source}")
            self._columns_cache = [column[0] for column in cur.description]
        finally:
            cur.close()
        return self._columns_cache
@@ -0,0 +1,48 @@
1
+ """Data Contracts for DuckGuard.
2
+
3
+ Data contracts define the expected schema, quality SLAs, and ownership
4
+ for data sources. They enable producer-consumer agreements and
5
+ breaking change detection.
6
+
7
+ Example:
8
+ from duckguard.contracts import load_contract, validate_contract
9
+
10
+ contract = load_contract("contracts/orders.contract.yaml")
11
+ result = validate_contract(contract, "data/orders.csv")
12
+
13
+ if not result.passed:
14
+ print(f"Contract violations: {result.violations}")
15
+ """
16
+
17
+ from duckguard.contracts.schema import (
18
+ DataContract,
19
+ SchemaField,
20
+ FieldType,
21
+ QualitySLA,
22
+ ContractMetadata,
23
+ )
24
+ from duckguard.contracts.loader import load_contract, load_contract_from_string, contract_to_yaml
25
+ from duckguard.contracts.validator import validate_contract, ContractValidationResult
26
+ from duckguard.contracts.generator import generate_contract
27
+ from duckguard.contracts.diff import diff_contracts, SchemaDiff
28
+
29
+ __all__ = [
30
+ # Schema: contract model types re-exported from duckguard.contracts.schema
31
+ "DataContract",
32
+ "SchemaField",
33
+ "FieldType",
34
+ "QualitySLA",
35
+ "ContractMetadata",
36
+ # Loading: helpers re-exported from duckguard.contracts.loader
37
+ "load_contract",
38
+ "load_contract_from_string",
39
+ "contract_to_yaml",
40
+ # Validation: re-exported from duckguard.contracts.validator
41
+ "validate_contract",
42
+ "ContractValidationResult",
43
+ # Generation: re-exported from duckguard.contracts.generator
44
+ "generate_contract",
45
+ # Diff: re-exported from duckguard.contracts.diff
46
+ "diff_contracts",
47
+ "SchemaDiff",
48
+ ]