duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""SQLite connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from duckguard.connectors.base import Connector, ConnectionConfig
|
|
9
|
+
from duckguard.core.dataset import Dataset
|
|
10
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SQLiteConnector(Connector):
    """
    Connector for SQLite databases.

    DuckDB has native SQLite support, making this connector very efficient.
    The database file is attached to the engine under an alias derived from
    the file name, and the requested table is exposed through a view.

    Examples:
        # Connect to SQLite file
        data = connect("sqlite:///path/to/database.db", table="users")

        # Or directly with .db/.sqlite extension
        data = connect("database.sqlite", table="orders")
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Create the connector and try to make the SQLite extension available.

        Args:
            engine: Engine forwarded to the base ``Connector`` initializer.
        """
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the SQLite extension (best effort)."""
        try:
            self.engine.execute("INSTALL sqlite")
            self.engine.execute("LOAD sqlite")
        except Exception:
            # Extension might already be loaded; a genuinely missing
            # extension will make the later ATTACH fail instead.
            pass

    @staticmethod
    def _safe_identifier(text: str) -> str:
        """Return *text* with every character outside ``[A-Za-z0-9_]`` replaced by ``_``.

        Used for names that are interpolated unquoted into SQL, so that file
        names such as ``my-data.db`` still yield a valid identifier.
        """
        return "".join(
            ch if ch == "_" or (ch.isascii() and ch.isalnum()) else "_"
            for ch in text
        )

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Wrap *name* in double quotes, doubling any embedded double quote."""
        return '"' + name.replace('"', '""') + '"'

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to SQLite database and return a Dataset.

        Args:
            config: Connection configuration. ``config.source`` is the file
                path or ``sqlite://`` URL, ``config.table`` the table to read.

        Returns:
            Dataset object backed by a view over the attached table.

        Raises:
            ValueError: If no table name is configured.
            FileNotFoundError: If the database file does not exist.
        """
        if not config.table:
            raise ValueError("Table name is required for SQLite connections")

        # Parse the path
        path = self._parse_path(config.source)

        # Validate file exists
        if not os.path.exists(path):
            raise FileNotFoundError(f"SQLite database not found: {path}")

        table = config.table

        # Alias for this attachment. The stem is sanitised because it is used
        # as an unquoted identifier; plain names keep their previous alias.
        # NOTE(review): two different files with the same stem still map to
        # the same alias, and the "already exists" branch below would then
        # keep the first attachment -- confirm whether that matters here.
        alias = f"sqlite_{self._safe_identifier(Path(path).stem)}"

        # Attach the SQLite database; single quotes in the path are doubled
        # so they cannot terminate the SQL string literal.
        escaped_path = path.replace("'", "''")
        attach_sql = f"ATTACH '{escaped_path}' AS {alias} (TYPE sqlite)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            # Re-attaching under an existing alias is tolerated; anything
            # else is a real failure.
            if "already exists" not in str(e).lower():
                raise

        # The source reference for DuckDB (table name quoted so reserved
        # words and special characters are accepted).
        source_ref = f"{alias}.{self._quote_identifier(table)}"

        # Register as a view for easier access. The view name is used
        # unquoted, hence the sanitised table name.
        view_name = f"_duckguard_sqlite_{self._safe_identifier(table)}"
        try:
            self.engine.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}"
            )
        except Exception:
            # NOTE(review): failures are swallowed here, so a missing table
            # presumably only surfaces once the dataset is queried.
            pass

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_path(self, source: str) -> str:
        """Parse SQLite connection string to get file path.

        ``sqlite:///`` and ``sqlite://`` prefixes (case-insensitive) are
        stripped; any other value is returned unchanged.
        """
        lowered = source.lower()
        if lowered.startswith("sqlite:///"):
            return source[10:]  # Remove 'sqlite:///'
        if lowered.startswith("sqlite://"):
            return source[9:]  # Remove 'sqlite://'
        return source

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a SQLite database.

        True for ``sqlite://`` URLs and for paths ending in ``.db``,
        ``.sqlite`` or ``.sqlite3`` (case-insensitive).
        """
        source_lower = source.lower()

        # Check for sqlite:// protocol
        if source_lower.startswith("sqlite://"):
            return True

        # Check for common SQLite file extensions
        return source_lower.endswith((".db", ".sqlite", ".sqlite3"))

    @classmethod
    def get_priority(cls) -> int:
        """SQLite connector has medium-high priority."""
        return 40
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Microsoft SQL Server connector."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import parse_qs, urlparse
|
|
7
|
+
|
|
8
|
+
from duckguard.connectors.base import Connector, ConnectionConfig
|
|
9
|
+
from duckguard.core.dataset import Dataset
|
|
10
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SQLServerConnector(Connector):
    """
    Connector for Microsoft SQL Server.

    Uses pyodbc or pymssql to connect to SQL Server.

    Examples:
        # Using connection string
        data = connect(
            "mssql://user:pass@server/database",
            table="orders"
        )

        # Using options
        data = connect(
            "sqlserver://server/database",
            table="orders",
            user="myuser",
            password="mypass",
            schema="dbo"
        )

        # Using trusted connection (Windows auth)
        data = connect(
            "mssql://server/database",
            table="orders",
            trusted_connection=True
        )
    """

    # URL prefixes understood by both can_handle() and the parser, so the
    # two can never disagree about which sources are supported.
    _URL_PREFIXES = ("mssql+pyodbc://", "sqlserver://", "mssql://")

    # String spellings that switch a boolean option off.
    _FALSE_STRINGS = ("", "0", "false", "no", "off")

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Create the connector; no database connection is opened yet.

        Args:
            engine: Engine forwarded to the base ``Connector`` initializer.
        """
        super().__init__(engine)
        self._connection = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to SQL Server and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If neither pyodbc nor pymssql can be imported.
            ValueError: If no table name is configured.
        """
        # Try pyodbc first, then pymssql. The imports are only availability
        # probes; the actual modules are imported again in the helpers.
        try:
            import pyodbc  # noqa: F401

            driver_module = "pyodbc"
        except ImportError:
            try:
                import pymssql  # noqa: F401

                driver_module = "pymssql"
            except ImportError:
                raise ImportError(
                    "SQL Server support requires pyodbc or pymssql. "
                    "Install with: pip install duckguard[sqlserver]"
                )

        if not config.table:
            raise ValueError("Table name is required for SQL Server connections")

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        # Connect using the available driver
        if driver_module == "pyodbc":
            self._connection = self._connect_pyodbc(conn_params)
        else:
            self._connection = self._connect_pymssql(conn_params)

        table = config.table
        schema = config.schema or conn_params.get("schema", "dbo")

        # Build fully qualified table name; a literal "]" inside a bracketed
        # identifier has to be doubled.
        fq_table = f"{self._bracket(schema)}.{self._bracket(table)}"

        return SQLServerDataset(
            source=fq_table,
            engine=self.engine,
            name=table,
            connection=self._connection,
        )

    @staticmethod
    def _bracket(name: str) -> str:
        """Return *name* as a bracket-quoted identifier with ``]`` escaped."""
        return "[" + str(name).replace("]", "]]") + "]"

    @classmethod
    def _as_bool(cls, value: Any) -> bool:
        """Interpret an option value as a boolean.

        Strings such as ``"no"``, ``"false"`` or ``"0"`` (which arrive from
        URL query parameters) count as False; every other value follows
        normal truthiness.
        """
        if isinstance(value, str):
            return value.strip().lower() not in cls._FALSE_STRINGS
        return bool(value)

    def _connect_pyodbc(self, params: dict) -> Any:
        """Connect using pyodbc.

        Args:
            params: Parameters as produced by ``_parse_connection_string``.

        Returns:
            The object returned by ``pyodbc.connect``.
        """
        import pyodbc

        driver = params.get("driver", "ODBC Driver 17 for SQL Server")

        # "SERVER=host,port" is a single connection-string entry.
        server = f"SERVER={params.get('host', 'localhost')}"
        if params.get("port"):
            server += f",{params['port']}"

        conn_str_parts = [
            f"DRIVER={{{driver}}}",
            server,
            f"DATABASE={params.get('database', '')}",
        ]

        if self._as_bool(params.get("trusted_connection")):
            conn_str_parts.append("Trusted_Connection=yes")
        else:
            conn_str_parts.append(f"UID={params.get('user', '')}")
            conn_str_parts.append(f"PWD={params.get('password', '')}")

        return pyodbc.connect(";".join(conn_str_parts))

    def _connect_pymssql(self, params: dict) -> Any:
        """Connect using pymssql.

        Args:
            params: Parameters as produced by ``_parse_connection_string``.

        Returns:
            The object returned by ``pymssql.connect``.
        """
        import pymssql

        return pymssql.connect(
            server=params.get("host", "localhost"),
            port=params.get("port", "1433"),
            user=params.get("user", ""),
            password=params.get("password", ""),
            database=params.get("database", ""),
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """Parse SQL Server connection string.

        Values are taken from the URL first, then overridden by
        ``config.options`` and finally by ``config.database`` and
        ``config.schema``.

        Args:
            conn_string: Source string, e.g. ``mssql://user:pass@host/db``.
            config: Connection configuration supplying overrides.

        Returns:
            Dict of connection parameters. For recognised URLs it always has
            ``host``, ``port``, ``database``, ``user`` and ``password``.
        """
        params: dict[str, Any] = {}

        lowered = conn_string.lower()
        for prefix in self._URL_PREFIXES:
            if not lowered.startswith(prefix):
                continue

            # Normalise every accepted prefix to one scheme before parsing.
            parsed = urlparse("mssql://" + conn_string[len(prefix):])

            params["host"] = parsed.hostname or "localhost"
            params["port"] = str(parsed.port) if parsed.port else "1433"
            params["database"] = parsed.path.lstrip("/") if parsed.path else ""
            params["user"] = parsed.username or ""
            params["password"] = parsed.password or ""

            # Parse query parameters (repeated keys stay as a list)
            for key, values in parse_qs(parsed.query).items():
                params[key] = values[0] if len(values) == 1 else values
            break

        # Override with config options
        options = config.options or {}
        for key in (
            "user",
            "password",
            "host",
            "port",
            "database",
            "schema",
            "driver",
            "trusted_connection",
        ):
            if key in options:
                params[key] = options[key]

        if config.database:
            params["database"] = config.database
        if config.schema:
            params["schema"] = config.schema

        # A query-string value such as "no" is a non-empty string and would
        # otherwise be treated as enabled.
        if "trusted_connection" in params:
            params["trusted_connection"] = self._as_bool(params["trusted_connection"])

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a SQL Server connection string."""
        return source.lower().startswith(cls._URL_PREFIXES)

    @classmethod
    def get_priority(cls) -> int:
        """SQL Server connector has high priority."""
        return 55
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class SQLServerDataset(Dataset):
    """Dataset whose queries run on a SQL Server connection."""

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        connection: Any,
    ):
        """Initialise the base dataset and keep the driver connection.

        Args:
            source: Table reference interpolated into the generated SQL.
            engine: Engine forwarded to the base ``Dataset``.
            name: Dataset name forwarded to the base ``Dataset``.
            connection: Open connection object exposing ``cursor()``.
        """
        super().__init__(source=source, engine=engine, name=name)
        self._mssql_connection = connection

    def _execute_query(self, sql: str) -> list[tuple[Any, ...]]:
        """Run *sql* on the SQL Server connection and return all rows."""
        cur = self._mssql_connection.cursor()
        try:
            cur.execute(sql)
            fetched = cur.fetchall()
        finally:
            # The cursor is closed whether or not the query succeeded.
            cur.close()
        return fetched

    def _fetch_value(self, sql: str) -> Any:
        """Return the first column of the first row, or None if no rows."""
        result = self._execute_query(sql)
        if not result:
            return None
        return result[0][0]

    @property
    def row_count(self) -> int:
        """Row count of the source table, computed once and then cached."""
        # _row_count_cache is presumably initialised by the base Dataset.
        if self._row_count_cache is not None:
            return self._row_count_cache

        counted = self._fetch_value(f"SELECT COUNT(*) FROM {self._source}")
        self._row_count_cache = counted or 0
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Column names of the source table, computed once and then cached."""
        # _columns_cache is presumably initialised by the base Dataset.
        if self._columns_cache is not None:
            return self._columns_cache

        cur = self._mssql_connection.cursor()
        try:
            # TOP 0 returns no rows but still populates cursor.description.
            cur.execute(f"SELECT TOP 0 * FROM {self._source}")
            self._columns_cache = [column[0] for column in cur.description]
        finally:
            cur.close()
        return self._columns_cache
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Data Contracts for DuckGuard.
|
|
2
|
+
|
|
3
|
+
Data contracts define the expected schema, quality SLAs, and ownership
|
|
4
|
+
for data sources. They enable producer-consumer agreements and
|
|
5
|
+
breaking change detection.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
from duckguard.contracts import load_contract, validate_contract
|
|
9
|
+
|
|
10
|
+
contract = load_contract("contracts/orders.contract.yaml")
|
|
11
|
+
result = validate_contract(contract, "data/orders.csv")
|
|
12
|
+
|
|
13
|
+
if not result.passed:
|
|
14
|
+
print(f"Contract violations: {result.violations}")
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from duckguard.contracts.schema import (
|
|
18
|
+
DataContract,
|
|
19
|
+
SchemaField,
|
|
20
|
+
FieldType,
|
|
21
|
+
QualitySLA,
|
|
22
|
+
ContractMetadata,
|
|
23
|
+
)
|
|
24
|
+
from duckguard.contracts.loader import load_contract, load_contract_from_string, contract_to_yaml
|
|
25
|
+
from duckguard.contracts.validator import validate_contract, ContractValidationResult
|
|
26
|
+
from duckguard.contracts.generator import generate_contract
|
|
27
|
+
from duckguard.contracts.diff import diff_contracts, SchemaDiff
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
# Schema
|
|
31
|
+
"DataContract",
|
|
32
|
+
"SchemaField",
|
|
33
|
+
"FieldType",
|
|
34
|
+
"QualitySLA",
|
|
35
|
+
"ContractMetadata",
|
|
36
|
+
# Loading
|
|
37
|
+
"load_contract",
|
|
38
|
+
"load_contract_from_string",
|
|
39
|
+
"contract_to_yaml",
|
|
40
|
+
# Validation
|
|
41
|
+
"validate_contract",
|
|
42
|
+
"ContractValidationResult",
|
|
43
|
+
# Generation
|
|
44
|
+
"generate_contract",
|
|
45
|
+
# Diff
|
|
46
|
+
"diff_contracts",
|
|
47
|
+
"SchemaDiff",
|
|
48
|
+
]
|