duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,292 @@
1
+ """Factory function for creating connections."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from duckguard.connectors.base import Connector, ConnectionConfig
8
+ from duckguard.connectors.files import FileConnector, S3Connector, GCSConnector, AzureConnector
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
13
# Registry of available connectors, consulted in order by connect().
# Listed most-specific first (the cloud-storage connectors report
# get_priority() == 20, the generic FileConnector 10); register_connector()
# re-sorts the list by priority whenever a new connector is added.
_CONNECTORS: list[type[Connector]] = [
    S3Connector,
    GCSConnector,
    AzureConnector,
    FileConnector,
]
20
+
21
+
22
def register_connector(connector_class: type[Connector]) -> None:
    """
    Register a custom connector.

    The class is appended to the module-level registry, which is then
    re-sorted so that connectors with the highest ``get_priority()`` are
    tried first by ``connect``.

    Args:
        connector_class: Connector class to register
    """
    _CONNECTORS.append(connector_class)
    _CONNECTORS.sort(key=lambda cls: cls.get_priority(), reverse=True)
32
+
33
+
34
def connect(
    source: str,
    *,
    table: str | None = None,
    schema: str | None = None,
    database: str | None = None,
    engine: DuckGuardEngine | None = None,
    **options: Any,
) -> Dataset:
    """
    Connect to a data source and return a Dataset.

    This is the main entry point for connecting to data sources.
    It automatically detects the source type and uses the appropriate connector.

    Args:
        source: Path to file, connection string, or URL
        table: Table name (for database connections)
        schema: Schema name (for database connections)
        database: Database name (for database connections)
        engine: Optional DuckGuardEngine instance
        **options: Additional options passed to the connector

    Returns:
        Dataset object ready for validation

    Examples:
        # Connect to a CSV file
        orders = connect("data/orders.csv")

        # Connect to a Parquet file on S3
        orders = connect("s3://bucket/orders.parquet")

        # Connect to PostgreSQL
        orders = connect("postgres://localhost/mydb", table="orders")

        # Connect to Snowflake
        orders = connect("snowflake://account/db", table="orders", schema="public")

    Raises:
        ValueError: If no connector can handle the source
    """
    config = ConnectionConfig(
        source=source,
        table=table,
        schema=schema,
        database=database,
        options=options,
    )

    # Registered connectors are kept sorted by priority (highest first),
    # so the first match wins.
    for connector_class in _CONNECTORS:
        if connector_class.can_handle(source):
            connector = connector_class(engine=engine)
            return connector.connect(config)

    # Fall back to database connection-string handling.
    if _is_database_connection(source):
        return _handle_database_connection(source, config, engine)

    # Keep this listing in sync with _is_database_connection /
    # _handle_database_connection so the error does not understate support.
    raise ValueError(
        f"No connector found for source: {source}\n"
        f"Supported formats: CSV, Parquet, JSON, Excel\n"
        f"Supported protocols: s3://, gs://, az://, postgres://, mysql://, "
        f"sqlite://, snowflake://, bigquery://, redshift://, mssql://, "
        f"databricks://, oracle://, mongodb://, kafka://"
    )
99
+
100
+
101
+ def _is_database_connection(source: str) -> bool:
102
+ """Check if source is a database connection string."""
103
+ db_prefixes = (
104
+ "postgres://",
105
+ "postgresql://",
106
+ "mysql://",
107
+ "mysql+pymysql://",
108
+ "sqlite://",
109
+ "snowflake://",
110
+ "bigquery://",
111
+ "redshift://",
112
+ "mssql://",
113
+ "sqlserver://",
114
+ "databricks://",
115
+ "oracle://",
116
+ "mongodb://",
117
+ "mongodb+srv://",
118
+ "kafka://",
119
+ )
120
+ source_lower = source.lower()
121
+
122
+ # Check prefixes
123
+ if source_lower.startswith(db_prefixes):
124
+ return True
125
+
126
+ # Check for SQLite file extensions
127
+ if source_lower.endswith((".db", ".sqlite", ".sqlite3")):
128
+ return True
129
+
130
+ # Check for Redshift hostname
131
+ if "redshift.amazonaws.com" in source_lower:
132
+ return True
133
+
134
+ # Check for Databricks hostname
135
+ if ".databricks.com" in source_lower:
136
+ return True
137
+
138
+ return False
139
+
140
+
141
def _import_connector(
    extra: str, class_name: str, display_name: str, requirement: str
) -> type:
    """Import a connector class whose module requires an optional dependency.

    Args:
        extra: Module name under ``duckguard.connectors`` (also the pip extra).
        class_name: Name of the connector class inside that module.
        display_name: Human-readable database name for the error message.
        requirement: Package(s) the user must install.

    Returns:
        The connector class.

    Raises:
        ImportError: With installation instructions when the optional
            dependency is missing.
    """
    from importlib import import_module

    try:
        module = import_module(f"duckguard.connectors.{extra}")
    except ImportError as exc:
        raise ImportError(
            f"{display_name} support requires {requirement}. "
            f"Install with: pip install duckguard[{extra}]"
        ) from exc
    return getattr(module, class_name)


def _handle_database_connection(
    source: str,
    config: ConnectionConfig,
    engine: DuckGuardEngine | None,
) -> Dataset:
    """Handle database connection strings.

    Dispatches *source* to the matching connector, importing optional
    dependencies lazily so unused database drivers are never required.

    Raises:
        ImportError: When the matched connector's optional dependency is
            not installed.
        ValueError: When no known database scheme matches.
    """
    source_lower = source.lower()
    connector_class: type | None = None

    if source_lower.startswith(("postgres://", "postgresql://")):
        connector_class = _import_connector(
            "postgres", "PostgresConnector", "PostgreSQL", "psycopg2"
        )
    elif source_lower.startswith(("mysql://", "mysql+pymysql://")):
        connector_class = _import_connector(
            "mysql", "MySQLConnector", "MySQL", "pymysql"
        )
    elif source_lower.startswith("sqlite://") or source_lower.endswith(
        (".db", ".sqlite", ".sqlite3")
    ):
        # No ImportError guard here (matches original behavior): any import
        # failure propagates as-is — presumably no optional dependency.
        from duckguard.connectors.sqlite import SQLiteConnector

        connector_class = SQLiteConnector
    elif source_lower.startswith("snowflake://"):
        connector_class = _import_connector(
            "snowflake", "SnowflakeConnector", "Snowflake",
            "snowflake-connector-python",
        )
    elif source_lower.startswith("bigquery://"):
        connector_class = _import_connector(
            "bigquery", "BigQueryConnector", "BigQuery", "google-cloud-bigquery"
        )
    elif source_lower.startswith("redshift://") or "redshift.amazonaws.com" in source_lower:
        # No ImportError guard here either (matches original behavior).
        from duckguard.connectors.redshift import RedshiftConnector

        connector_class = RedshiftConnector
    elif source_lower.startswith(("mssql://", "sqlserver://", "mssql+pyodbc://")):
        connector_class = _import_connector(
            "sqlserver", "SQLServerConnector", "SQL Server", "pyodbc or pymssql"
        )
    elif source_lower.startswith("databricks://") or ".databricks.com" in source_lower:
        connector_class = _import_connector(
            "databricks", "DatabricksConnector", "Databricks",
            "databricks-sql-connector",
        )
    elif source_lower.startswith("oracle://"):
        connector_class = _import_connector(
            "oracle", "OracleConnector", "Oracle", "oracledb"
        )
    elif source_lower.startswith(("mongodb://", "mongodb+srv://")):
        connector_class = _import_connector(
            "mongodb", "MongoDBConnector", "MongoDB", "pymongo"
        )
    elif source_lower.startswith("kafka://"):
        connector_class = _import_connector(
            "kafka", "KafkaConnector", "Kafka", "kafka-python"
        )

    if connector_class is None:
        # For other databases, raise helpful error
        raise ValueError(
            f"Database connector not yet implemented for: {source}\n"
            f"Currently supported: postgres://, mysql://, sqlite://, snowflake://, "
            f"bigquery://, redshift://, mssql://, databricks://, oracle://, "
            f"mongodb://, kafka://"
        )

    return connector_class(engine=engine).connect(config)
289
+
290
+
291
# Backwards-compatibility alias: ``load`` behaves exactly like ``connect``.
load = connect
@@ -0,0 +1,135 @@
1
+ """File-based connectors (CSV, Parquet, JSON, Excel)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
13
class FileConnector(Connector):
    """
    Connector for file-based data sources.

    Supports:
    - CSV files (.csv)
    - Parquet files (.parquet, .pq)
    - JSON files (.json, .jsonl, .ndjson)
    - Excel files (.xlsx, .xls) - requires additional setup
    """

    # Maps a lowercase file extension to its logical file type.
    SUPPORTED_EXTENSIONS = {
        ".csv": "csv",
        ".parquet": "parquet",
        ".pq": "parquet",
        ".json": "json",
        ".jsonl": "json",
        ".ndjson": "json",
        ".xlsx": "excel",
        ".xls": "excel",
    }

    def __init__(self, engine: DuckGuardEngine | None = None):
        super().__init__(engine)

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to a file and return a Dataset.

        Args:
            config: Connection configuration with file path

        Returns:
            Dataset object

        Raises:
            ValueError: If the file extension is not supported.
            FileNotFoundError: If a local file does not exist.
        """
        path = config.source
        path_obj = Path(path)  # hoisted: used for suffix and stem below
        ext = path_obj.suffix.lower()

        # Validate the extension up front; the Dataset reads the raw path,
        # so only membership in the supported set matters here.
        if ext not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(f"Unsupported file type: {ext}")

        # Only local paths can be checked for existence; remote object-store
        # paths are validated when the data is actually read.
        if not self._is_remote_path(path) and not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

        # The dataset is named after the file's stem (filename sans extension).
        return Dataset(source=path, engine=self.engine, name=path_obj.stem)

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Return True when *source* carries a supported file extension."""
        # Local (or cleanly parseable) paths: check the suffix directly.
        if Path(source).suffix.lower() in cls.SUPPORTED_EXTENSIONS:
            return True

        # Remote URLs may not parse cleanly through Path; fall back to a
        # single tuple-based suffix check over all supported extensions.
        if cls._is_remote_path(source):
            return source.lower().endswith(tuple(cls.SUPPORTED_EXTENSIONS))

        return False

    @staticmethod
    def _is_remote_path(path: str) -> bool:
        """Check if path is a remote storage path (object store or HTTP)."""
        remote_prefixes = ("s3://", "gs://", "gcs://", "az://", "abfs://", "http://", "https://")
        return path.lower().startswith(remote_prefixes)

    @classmethod
    def get_priority(cls) -> int:
        """Default priority; scheme-specific subclasses report higher (20)."""
        return 10
94
+
95
+
96
class S3Connector(FileConnector):
    """Connector for S3 paths (``s3://`` URIs)."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Return True for sources using the ``s3://`` scheme."""
        s3_scheme = "s3://"
        return source.lower().startswith(s3_scheme)

    @classmethod
    def get_priority(cls) -> int:
        """Higher than the generic FileConnector so S3 URIs match first."""
        return 20
108
+
109
+
110
class GCSConnector(FileConnector):
    """Connector for Google Cloud Storage paths (``gs://`` / ``gcs://``)."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Return True for sources using a GCS URI scheme."""
        gcs_schemes = ("gs://", "gcs://")
        return source.lower().startswith(gcs_schemes)

    @classmethod
    def get_priority(cls) -> int:
        """Higher than the generic FileConnector so GCS URIs match first."""
        return 20
122
+
123
+
124
class AzureConnector(FileConnector):
    """Connector for Azure Blob Storage paths (``az://`` / ``abfs://``)."""

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Return True for sources using an Azure storage URI scheme."""
        azure_schemes = ("az://", "abfs://")
        return source.lower().startswith(azure_schemes)

    @classmethod
    def get_priority(cls) -> int:
        """Higher than the generic FileConnector so Azure URIs match first."""
        return 20