duckguard 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +110 -0
- duckguard/anomaly/__init__.py +34 -0
- duckguard/anomaly/detector.py +394 -0
- duckguard/anomaly/methods.py +432 -0
- duckguard/cli/__init__.py +5 -0
- duckguard/cli/main.py +706 -0
- duckguard/connectors/__init__.py +58 -0
- duckguard/connectors/base.py +80 -0
- duckguard/connectors/bigquery.py +171 -0
- duckguard/connectors/databricks.py +201 -0
- duckguard/connectors/factory.py +292 -0
- duckguard/connectors/files.py +135 -0
- duckguard/connectors/kafka.py +343 -0
- duckguard/connectors/mongodb.py +236 -0
- duckguard/connectors/mysql.py +121 -0
- duckguard/connectors/oracle.py +196 -0
- duckguard/connectors/postgres.py +99 -0
- duckguard/connectors/redshift.py +154 -0
- duckguard/connectors/snowflake.py +226 -0
- duckguard/connectors/sqlite.py +112 -0
- duckguard/connectors/sqlserver.py +242 -0
- duckguard/contracts/__init__.py +48 -0
- duckguard/contracts/diff.py +432 -0
- duckguard/contracts/generator.py +334 -0
- duckguard/contracts/loader.py +367 -0
- duckguard/contracts/schema.py +242 -0
- duckguard/contracts/validator.py +453 -0
- duckguard/core/__init__.py +8 -0
- duckguard/core/column.py +437 -0
- duckguard/core/dataset.py +284 -0
- duckguard/core/engine.py +261 -0
- duckguard/core/result.py +119 -0
- duckguard/core/scoring.py +508 -0
- duckguard/profiler/__init__.py +5 -0
- duckguard/profiler/auto_profile.py +350 -0
- duckguard/pytest_plugin/__init__.py +5 -0
- duckguard/pytest_plugin/plugin.py +161 -0
- duckguard/reporting/__init__.py +6 -0
- duckguard/reporting/console.py +88 -0
- duckguard/reporting/json_report.py +96 -0
- duckguard/rules/__init__.py +28 -0
- duckguard/rules/executor.py +616 -0
- duckguard/rules/generator.py +341 -0
- duckguard/rules/loader.py +483 -0
- duckguard/rules/schema.py +289 -0
- duckguard/semantic/__init__.py +31 -0
- duckguard/semantic/analyzer.py +270 -0
- duckguard/semantic/detector.py +459 -0
- duckguard/semantic/validators.py +354 -0
- duckguard/validators/__init__.py +7 -0
- duckguard-2.0.0.dist-info/METADATA +221 -0
- duckguard-2.0.0.dist-info/RECORD +55 -0
- duckguard-2.0.0.dist-info/WHEEL +4 -0
- duckguard-2.0.0.dist-info/entry_points.txt +5 -0
- duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""Factory function for creating connections."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from duckguard.connectors.base import Connector, ConnectionConfig
|
|
8
|
+
from duckguard.connectors.files import FileConnector, S3Connector, GCSConnector, AzureConnector
|
|
9
|
+
from duckguard.core.dataset import Dataset
|
|
10
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Registry of built-in connectors, kept in priority order (highest first):
# the three remote-storage connectors report priority 20, the generic
# FileConnector 10. `connect` probes these in order; `register_connector`
# re-sorts after every addition so the invariant holds for custom connectors.
_CONNECTORS: list[type[Connector]] = [
    S3Connector,
    GCSConnector,
    AzureConnector,
    FileConnector,
]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def register_connector(connector_class: type[Connector]) -> None:
    """
    Register a custom connector with the global registry.

    The registry is re-sorted after every registration so that
    higher-priority connectors are always probed first by ``connect``.

    Args:
        connector_class: Connector class to register
    """
    _CONNECTORS.append(connector_class)
    # Negated key == descending priority; stable sort keeps insertion
    # order among connectors that share the same priority.
    _CONNECTORS.sort(key=lambda cls: -cls.get_priority())
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def connect(
    source: str,
    *,
    table: str | None = None,
    schema: str | None = None,
    database: str | None = None,
    engine: DuckGuardEngine | None = None,
    **options: Any,
) -> Dataset:
    """
    Connect to a data source and return a Dataset.

    This is the main entry point for connecting to data sources.
    It automatically detects the source type and uses the appropriate connector.

    Args:
        source: Path to file, connection string, or URL
        table: Table name (for database connections)
        schema: Schema name (for database connections)
        database: Database name (for database connections)
        engine: Optional DuckGuardEngine instance
        **options: Additional options passed to the connector

    Returns:
        Dataset object ready for validation

    Examples:
        # Connect to a CSV file
        orders = connect("data/orders.csv")

        # Connect to a Parquet file on S3
        orders = connect("s3://bucket/orders.parquet")

        # Connect to PostgreSQL
        orders = connect("postgres://localhost/mydb", table="orders")

        # Connect to Snowflake
        orders = connect("snowflake://account/db", table="orders", schema="public")

    Raises:
        ValueError: If no connector can handle the source
    """
    config = ConnectionConfig(
        source=source,
        table=table,
        schema=schema,
        database=database,
        options=options,
    )

    # Registered connectors are tried first, in priority order
    # (register_connector keeps _CONNECTORS sorted highest-priority first).
    for connector_class in _CONNECTORS:
        if connector_class.can_handle(source):
            connector = connector_class(engine=engine)
            return connector.connect(config)

    # Fall back to database connection strings.
    if _is_database_connection(source):
        return _handle_database_connection(source, config, engine)

    # Fixed: the error previously advertised only postgres:// and mysql://,
    # although _handle_database_connection supports many more protocols.
    raise ValueError(
        f"No connector found for source: {source}\n"
        f"Supported formats: CSV, Parquet, JSON, Excel\n"
        f"Supported protocols: s3://, gs://, az://, postgres://, mysql://, "
        f"sqlite://, snowflake://, bigquery://, redshift://, mssql://, "
        f"databricks://, oracle://, mongodb://, kafka://"
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _is_database_connection(source: str) -> bool:
|
|
102
|
+
"""Check if source is a database connection string."""
|
|
103
|
+
db_prefixes = (
|
|
104
|
+
"postgres://",
|
|
105
|
+
"postgresql://",
|
|
106
|
+
"mysql://",
|
|
107
|
+
"mysql+pymysql://",
|
|
108
|
+
"sqlite://",
|
|
109
|
+
"snowflake://",
|
|
110
|
+
"bigquery://",
|
|
111
|
+
"redshift://",
|
|
112
|
+
"mssql://",
|
|
113
|
+
"sqlserver://",
|
|
114
|
+
"databricks://",
|
|
115
|
+
"oracle://",
|
|
116
|
+
"mongodb://",
|
|
117
|
+
"mongodb+srv://",
|
|
118
|
+
"kafka://",
|
|
119
|
+
)
|
|
120
|
+
source_lower = source.lower()
|
|
121
|
+
|
|
122
|
+
# Check prefixes
|
|
123
|
+
if source_lower.startswith(db_prefixes):
|
|
124
|
+
return True
|
|
125
|
+
|
|
126
|
+
# Check for SQLite file extensions
|
|
127
|
+
if source_lower.endswith((".db", ".sqlite", ".sqlite3")):
|
|
128
|
+
return True
|
|
129
|
+
|
|
130
|
+
# Check for Redshift hostname
|
|
131
|
+
if "redshift.amazonaws.com" in source_lower:
|
|
132
|
+
return True
|
|
133
|
+
|
|
134
|
+
# Check for Databricks hostname
|
|
135
|
+
if ".databricks.com" in source_lower:
|
|
136
|
+
return True
|
|
137
|
+
|
|
138
|
+
return False
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _handle_database_connection(
|
|
142
|
+
source: str,
|
|
143
|
+
config: ConnectionConfig,
|
|
144
|
+
engine: DuckGuardEngine | None,
|
|
145
|
+
) -> Dataset:
|
|
146
|
+
"""Handle database connection strings."""
|
|
147
|
+
source_lower = source.lower()
|
|
148
|
+
|
|
149
|
+
# PostgreSQL
|
|
150
|
+
if source_lower.startswith(("postgres://", "postgresql://")):
|
|
151
|
+
try:
|
|
152
|
+
from duckguard.connectors.postgres import PostgresConnector
|
|
153
|
+
|
|
154
|
+
connector = PostgresConnector(engine=engine)
|
|
155
|
+
return connector.connect(config)
|
|
156
|
+
except ImportError:
|
|
157
|
+
raise ImportError(
|
|
158
|
+
"PostgreSQL support requires psycopg2. "
|
|
159
|
+
"Install with: pip install duckguard[postgres]"
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
# MySQL
|
|
163
|
+
if source_lower.startswith(("mysql://", "mysql+pymysql://")):
|
|
164
|
+
try:
|
|
165
|
+
from duckguard.connectors.mysql import MySQLConnector
|
|
166
|
+
|
|
167
|
+
connector = MySQLConnector(engine=engine)
|
|
168
|
+
return connector.connect(config)
|
|
169
|
+
except ImportError:
|
|
170
|
+
raise ImportError(
|
|
171
|
+
"MySQL support requires pymysql. "
|
|
172
|
+
"Install with: pip install duckguard[mysql]"
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
# SQLite
|
|
176
|
+
if source_lower.startswith("sqlite://") or source_lower.endswith(
|
|
177
|
+
(".db", ".sqlite", ".sqlite3")
|
|
178
|
+
):
|
|
179
|
+
from duckguard.connectors.sqlite import SQLiteConnector
|
|
180
|
+
|
|
181
|
+
connector = SQLiteConnector(engine=engine)
|
|
182
|
+
return connector.connect(config)
|
|
183
|
+
|
|
184
|
+
# Snowflake
|
|
185
|
+
if source_lower.startswith("snowflake://"):
|
|
186
|
+
try:
|
|
187
|
+
from duckguard.connectors.snowflake import SnowflakeConnector
|
|
188
|
+
|
|
189
|
+
connector = SnowflakeConnector(engine=engine)
|
|
190
|
+
return connector.connect(config)
|
|
191
|
+
except ImportError:
|
|
192
|
+
raise ImportError(
|
|
193
|
+
"Snowflake support requires snowflake-connector-python. "
|
|
194
|
+
"Install with: pip install duckguard[snowflake]"
|
|
195
|
+
)
|
|
196
|
+
|
|
197
|
+
# BigQuery
|
|
198
|
+
if source_lower.startswith("bigquery://"):
|
|
199
|
+
try:
|
|
200
|
+
from duckguard.connectors.bigquery import BigQueryConnector
|
|
201
|
+
|
|
202
|
+
connector = BigQueryConnector(engine=engine)
|
|
203
|
+
return connector.connect(config)
|
|
204
|
+
except ImportError:
|
|
205
|
+
raise ImportError(
|
|
206
|
+
"BigQuery support requires google-cloud-bigquery. "
|
|
207
|
+
"Install with: pip install duckguard[bigquery]"
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
# Redshift
|
|
211
|
+
if source_lower.startswith("redshift://") or "redshift.amazonaws.com" in source_lower:
|
|
212
|
+
from duckguard.connectors.redshift import RedshiftConnector
|
|
213
|
+
|
|
214
|
+
connector = RedshiftConnector(engine=engine)
|
|
215
|
+
return connector.connect(config)
|
|
216
|
+
|
|
217
|
+
# SQL Server
|
|
218
|
+
if source_lower.startswith(("mssql://", "sqlserver://", "mssql+pyodbc://")):
|
|
219
|
+
try:
|
|
220
|
+
from duckguard.connectors.sqlserver import SQLServerConnector
|
|
221
|
+
|
|
222
|
+
connector = SQLServerConnector(engine=engine)
|
|
223
|
+
return connector.connect(config)
|
|
224
|
+
except ImportError:
|
|
225
|
+
raise ImportError(
|
|
226
|
+
"SQL Server support requires pyodbc or pymssql. "
|
|
227
|
+
"Install with: pip install duckguard[sqlserver]"
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# Databricks
|
|
231
|
+
if source_lower.startswith("databricks://") or ".databricks.com" in source_lower:
|
|
232
|
+
try:
|
|
233
|
+
from duckguard.connectors.databricks import DatabricksConnector
|
|
234
|
+
|
|
235
|
+
connector = DatabricksConnector(engine=engine)
|
|
236
|
+
return connector.connect(config)
|
|
237
|
+
except ImportError:
|
|
238
|
+
raise ImportError(
|
|
239
|
+
"Databricks support requires databricks-sql-connector. "
|
|
240
|
+
"Install with: pip install duckguard[databricks]"
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Oracle
|
|
244
|
+
if source_lower.startswith("oracle://"):
|
|
245
|
+
try:
|
|
246
|
+
from duckguard.connectors.oracle import OracleConnector
|
|
247
|
+
|
|
248
|
+
connector = OracleConnector(engine=engine)
|
|
249
|
+
return connector.connect(config)
|
|
250
|
+
except ImportError:
|
|
251
|
+
raise ImportError(
|
|
252
|
+
"Oracle support requires oracledb. "
|
|
253
|
+
"Install with: pip install duckguard[oracle]"
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
# MongoDB
|
|
257
|
+
if source_lower.startswith(("mongodb://", "mongodb+srv://")):
|
|
258
|
+
try:
|
|
259
|
+
from duckguard.connectors.mongodb import MongoDBConnector
|
|
260
|
+
|
|
261
|
+
connector = MongoDBConnector(engine=engine)
|
|
262
|
+
return connector.connect(config)
|
|
263
|
+
except ImportError:
|
|
264
|
+
raise ImportError(
|
|
265
|
+
"MongoDB support requires pymongo. "
|
|
266
|
+
"Install with: pip install duckguard[mongodb]"
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
# Kafka
|
|
270
|
+
if source_lower.startswith("kafka://"):
|
|
271
|
+
try:
|
|
272
|
+
from duckguard.connectors.kafka import KafkaConnector
|
|
273
|
+
|
|
274
|
+
connector = KafkaConnector(engine=engine)
|
|
275
|
+
return connector.connect(config)
|
|
276
|
+
except ImportError:
|
|
277
|
+
raise ImportError(
|
|
278
|
+
"Kafka support requires kafka-python. "
|
|
279
|
+
"Install with: pip install duckguard[kafka]"
|
|
280
|
+
)
|
|
281
|
+
|
|
282
|
+
# For other databases, raise helpful error
|
|
283
|
+
raise ValueError(
|
|
284
|
+
f"Database connector not yet implemented for: {source}\n"
|
|
285
|
+
f"Currently supported: postgres://, mysql://, sqlite://, snowflake://, "
|
|
286
|
+
f"bigquery://, redshift://, mssql://, databricks://, oracle://, "
|
|
287
|
+
f"mongodb://, kafka://"
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
# Alias for backwards compatibility: earlier releases exposed ``load``;
# it behaves identically to ``connect``.
load = connect
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""File-based connectors (CSV, Parquet, JSON, Excel)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from duckguard.connectors.base import Connector, ConnectionConfig
|
|
9
|
+
from duckguard.core.dataset import Dataset
|
|
10
|
+
from duckguard.core.engine import DuckGuardEngine
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FileConnector(Connector):
    """
    Connector for file-based data sources.

    Supports:
    - CSV files (.csv)
    - Parquet files (.parquet, .pq)
    - JSON files (.json, .jsonl, .ndjson)
    - Excel files (.xlsx, .xls) - requires additional setup
    """

    # Maps recognized extensions to a logical file-type tag.
    SUPPORTED_EXTENSIONS = {
        ".csv": "csv",
        ".parquet": "parquet",
        ".pq": "parquet",
        ".json": "json",
        ".jsonl": "json",
        ".ndjson": "json",
        ".xlsx": "excel",
        ".xls": "excel",
    }

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Open the configured file and wrap it in a Dataset.

        Args:
            config: Connection configuration with file path

        Returns:
            Dataset object

        Raises:
            ValueError: If the extension is not a supported file type
            FileNotFoundError: If a local path does not exist
        """
        source_path = config.source
        suffix = Path(source_path).suffix.lower()

        if suffix not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unsupported file type: {suffix}")

        # Only local paths can be checked for existence up front.
        if not (self._is_remote_path(source_path) or os.path.exists(source_path)):
            raise FileNotFoundError(f"File not found: {source_path}")

        # The dataset is named after the file's stem (filename sans extension).
        return Dataset(
            source=source_path,
            engine=self.engine,
            name=Path(source_path).stem,
        )

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Return True when the source has a supported file extension."""
        if Path(source).suffix.lower() in cls.SUPPORTED_EXTENSIONS:
            return True

        # Remote paths (s3://, gs://, ...) may not parse cleanly through
        # Path, so also accept them by raw suffix match.
        lowered = source.lower()
        return cls._is_remote_path(source) and any(
            lowered.endswith(extension) for extension in cls.SUPPORTED_EXTENSIONS
        )

    @staticmethod
    def _is_remote_path(path: str) -> bool:
        """Return True when path points at remote/object storage or HTTP."""
        return path.lower().startswith(
            ("s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://")
        )

    @classmethod
    def get_priority(cls) -> int:
        """Generic file handling sits below scheme-specific connectors."""
        return 10
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class S3Connector(FileConnector):
    """File connector specialized for Amazon S3 (``s3://``) paths."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Accept only sources using the s3:// scheme."""
        lowered = source.lower()
        return lowered.startswith("s3://")

    @classmethod
    def get_priority(cls) -> int:
        """Scheme-specific connectors outrank the generic file connector."""
        return 20
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class GCSConnector(FileConnector):
    """File connector specialized for Google Cloud Storage paths."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Accept only sources using the gs:// or gcs:// schemes."""
        lowered = source.lower()
        return lowered.startswith(("gs://", "gcs://"))

    @classmethod
    def get_priority(cls) -> int:
        """Scheme-specific connectors outrank the generic file connector."""
        return 20
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class AzureConnector(FileConnector):
    """File connector specialized for Azure Blob Storage paths."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Accept only sources using the az:// or abfs:// schemes."""
        lowered = source.lower()
        return lowered.startswith(("az://", "abfs://"))

    @classmethod
    def get_priority(cls) -> int:
        """Scheme-specific connectors outrank the generic file connector."""
        return 20
|