duckguard-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
duckguard/connectors/__init__.py
@@ -0,0 +1,58 @@
+ """Connectors for various data sources."""
+
+ from duckguard.connectors.base import Connector, ConnectionConfig
+ from duckguard.connectors.files import FileConnector, S3Connector, GCSConnector, AzureConnector
+ from duckguard.connectors.factory import connect, register_connector
+
+ # Database connectors (imported lazily to avoid import errors)
+ __all__ = [
+     # Base classes
+     "Connector",
+     "ConnectionConfig",
+     # File connectors
+     "FileConnector",
+     "S3Connector",
+     "GCSConnector",
+     "AzureConnector",
+     # Factory functions
+     "connect",
+     "register_connector",
+ ]
+
+
+ def __getattr__(name: str):
+     """Lazy import database connectors to avoid import errors."""
+     if name == "PostgresConnector":
+         from duckguard.connectors.postgres import PostgresConnector
+         return PostgresConnector
+     if name == "MySQLConnector":
+         from duckguard.connectors.mysql import MySQLConnector
+         return MySQLConnector
+     if name == "SQLiteConnector":
+         from duckguard.connectors.sqlite import SQLiteConnector
+         return SQLiteConnector
+     if name == "SnowflakeConnector":
+         from duckguard.connectors.snowflake import SnowflakeConnector
+         return SnowflakeConnector
+     if name == "BigQueryConnector":
+         from duckguard.connectors.bigquery import BigQueryConnector
+         return BigQueryConnector
+     if name == "RedshiftConnector":
+         from duckguard.connectors.redshift import RedshiftConnector
+         return RedshiftConnector
+     if name == "SQLServerConnector":
+         from duckguard.connectors.sqlserver import SQLServerConnector
+         return SQLServerConnector
+     if name == "DatabricksConnector":
+         from duckguard.connectors.databricks import DatabricksConnector
+         return DatabricksConnector
+     if name == "OracleConnector":
+         from duckguard.connectors.oracle import OracleConnector
+         return OracleConnector
+     if name == "MongoDBConnector":
+         from duckguard.connectors.mongodb import MongoDBConnector
+         return MongoDBConnector
+     if name == "KafkaConnector":
+         from duckguard.connectors.kafka import KafkaConnector
+         return KafkaConnector
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
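The __getattr__ hook at the end of this module is the PEP 562 module-level lazy-import pattern: the names in __all__ are imported eagerly, while database connectors are imported only on first attribute access, so a missing optional driver does not break "import duckguard.connectors". A minimal caller-side sketch, assuming duckguard is installed as packaged above:

    from duckguard import connectors

    print(connectors.Connector)             # eager: imported at module load
    pg_cls = connectors.PostgresConnector   # lazy: first access imports duckguard.connectors.postgres
    print(pg_cls.__name__)                  # "PostgresConnector"
    # connectors.NoSuchConnector            # would raise AttributeError via the fallthrough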
duckguard/connectors/base.py
@@ -0,0 +1,80 @@
+ """Base connector interface."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any
+
+ from duckguard.core.dataset import Dataset
+ from duckguard.core.engine import DuckGuardEngine
+
+
+ @dataclass
+ class ConnectionConfig:
+     """Configuration for a data source connection."""
+
+     source: str
+     table: str | None = None
+     schema: str | None = None
+     database: str | None = None
+     options: dict[str, Any] | None = None
+
+     def __post_init__(self) -> None:
+         if self.options is None:
+             self.options = {}
+
+
+ class Connector(ABC):
+     """
+     Base class for data source connectors.
+
+     Connectors handle the logic of connecting to different data sources
+     and creating Dataset objects.
+     """
+
+     def __init__(self, engine: DuckGuardEngine | None = None):
+         """
+         Initialize the connector.
+
+         Args:
+             engine: Optional DuckGuardEngine instance
+         """
+         self.engine = engine or DuckGuardEngine.get_instance()
+
+     @abstractmethod
+     def connect(self, config: ConnectionConfig) -> Dataset:
+         """
+         Connect to a data source and return a Dataset.
+
+         Args:
+             config: Connection configuration
+
+         Returns:
+             Dataset object
+         """
+         pass
+
+     @classmethod
+     @abstractmethod
+     def can_handle(cls, source: str) -> bool:
+         """
+         Check if this connector can handle the given source.
+
+         Args:
+             source: Source string (path, URL, connection string)
+
+         Returns:
+             True if this connector can handle the source
+         """
+         pass
+
+     @classmethod
+     def get_priority(cls) -> int:
+         """
+         Get the priority of this connector (higher = checked first).
+
+         Returns:
+             Priority value
+         """
+         return 0
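Connector is a small ABC: subclasses must implement connect() and can_handle(), and may override get_priority() to control the order in which the factory probes candidates. A hedged sketch of a third-party connector; the class name and source scheme are hypothetical, and register_connector's exact signature is not shown in this diff, so the final call is an assumption:

    from duckguard.connectors import ConnectionConfig, Connector, register_connector


    class HTTPCSVConnector(Connector):
        """Hypothetical connector for CSV files served over HTTPS."""

        def connect(self, config: ConnectionConfig):
            # A real implementation would hand config.source to self.engine
            # and return a Dataset; elided in this sketch.
            raise NotImplementedError

        @classmethod
        def can_handle(cls, source: str) -> bool:
            s = source.lower()
            return s.startswith("https://") and s.endswith(".csv")

        @classmethod
        def get_priority(cls) -> int:
            return 10  # checked before the default-priority (0) connectors


    register_connector(HTTPCSVConnector)  # assumed to take the class; signature not in this diff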
duckguard/connectors/bigquery.py
@@ -0,0 +1,171 @@
+ """BigQuery connector."""
+
+ from __future__ import annotations
+
+ from typing import Any
+ from urllib.parse import urlparse
+
+ from duckguard.connectors.base import Connector, ConnectionConfig
+ from duckguard.core.dataset import Dataset
+ from duckguard.core.engine import DuckGuardEngine
+
+
+ class BigQueryConnector(Connector):
+     """
+     Connector for Google BigQuery.
+
+     Uses the google-cloud-bigquery package to connect and query,
+     then processes results with DuckDB for validation.
+
+     Examples:
+         # Using connection string
+         data = connect(
+             "bigquery://project-id/dataset",
+             table="orders"
+         )
+
+         # Using options with service account
+         data = connect(
+             "bigquery://project-id",
+             table="orders",
+             dataset="my_dataset",
+             credentials_path="/path/to/service-account.json"
+         )
+     """
+
+     def __init__(self, engine: DuckGuardEngine | None = None):
+         super().__init__(engine)
+         self._client = None
+
+     def connect(self, config: ConnectionConfig) -> Dataset:
+         """
+         Connect to BigQuery and return a Dataset.
+
+         Args:
+             config: Connection configuration
+
+         Returns:
+             Dataset object
+         """
+         try:
+             from google.cloud import bigquery
+         except ImportError:
+             raise ImportError(
+                 "BigQuery support requires google-cloud-bigquery. "
+                 "Install with: pip install duckguard[bigquery]"
+             )
+
+         if not config.table:
+             raise ValueError("Table name is required for BigQuery connections")
+
+         # Parse connection parameters
+         conn_params = self._parse_connection_string(config.source, config)
+
+         # Initialize BigQuery client
+         if conn_params.get("credentials_path"):
+             self._client = bigquery.Client.from_service_account_json(
+                 conn_params["credentials_path"]
+             )
+         else:
+             self._client = bigquery.Client(project=conn_params.get("project"))
+
+         table = config.table
+         dataset = conn_params.get("dataset", "")
+         project = conn_params.get("project", self._client.project)
+
+         # Build fully qualified table name
+         if project and dataset:
+             fq_table = f"`{project}.{dataset}.{table}`"
+         elif dataset:
+             fq_table = f"`{dataset}.{table}`"
+         else:
+             fq_table = f"`{table}`"
+
+         return BigQueryDataset(
+             source=fq_table,
+             engine=self.engine,
+             name=table,
+             client=self._client,
+         )
+
+     def _parse_connection_string(
+         self, conn_string: str, config: ConnectionConfig
+     ) -> dict[str, Any]:
+         """Parse BigQuery connection string and merge with config options."""
+         params: dict[str, Any] = {}
+
+         # Parse URL format: bigquery://project-id/dataset
+         if conn_string.lower().startswith("bigquery://"):
+             parsed = urlparse(conn_string)
+
+             params["project"] = parsed.hostname or ""
+
+             # Parse path for dataset
+             path_parts = [p for p in parsed.path.split("/") if p]
+             if len(path_parts) >= 1:
+                 params["dataset"] = path_parts[0]
+
+         # Override with config options
+         options = config.options or {}
+         for key in ["project", "dataset", "credentials_path", "location"]:
+             if key in options:
+                 params[key] = options[key]
+
+         if config.database:
+             params["dataset"] = config.database
+         if config.schema:
+             params["dataset"] = config.schema
+
+         return params
+
+     @classmethod
+     def can_handle(cls, source: str) -> bool:
+         """Check if this is a BigQuery connection string."""
+         return source.lower().startswith("bigquery://")
+
+     @classmethod
+     def get_priority(cls) -> int:
+         """BigQuery connector has high priority."""
+         return 60
+
+
+ class BigQueryDataset(Dataset):
+     """Dataset that queries BigQuery directly."""
+
+     def __init__(
+         self,
+         source: str,
+         engine: DuckGuardEngine,
+         name: str,
+         client: Any,
+     ):
+         super().__init__(source=source, engine=engine, name=name)
+         self._bq_client = client
+
+     def _execute_bq_query(self, sql: str) -> list[Any]:
+         """Execute a query on BigQuery."""
+         query_job = self._bq_client.query(sql)
+         return list(query_job.result())
+
+     def _fetch_bq_value(self, sql: str) -> Any:
+         """Execute query and return single value."""
+         rows = self._execute_bq_query(sql)
+         return rows[0][0] if rows else None
+
+     @property
+     def row_count(self) -> int:
+         """Get row count from BigQuery."""
+         if self._row_count_cache is None:
+             sql = f"SELECT COUNT(*) FROM {self._source}"
+             self._row_count_cache = self._fetch_bq_value(sql) or 0
+         return self._row_count_cache
+
+     @property
+     def columns(self) -> list[str]:
+         """Get column names from BigQuery."""
+         if self._columns_cache is None:
+             sql = f"SELECT * FROM {self._source} LIMIT 0"
+             query_job = self._bq_client.query(sql)
+             result = query_job.result()
+             self._columns_cache = [field.name for field in result.schema]
+         return self._columns_cache
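The connection-string handling in _parse_connection_string is plain urllib.parse work: the project ID comes from the URL host, the dataset from the first path segment, and config options plus config.schema/config.database override both. A stdlib-only sketch of that decomposition (no BigQuery client required):

    from urllib.parse import urlparse

    parsed = urlparse("bigquery://my-project/analytics")
    project = parsed.hostname or ""                        # "my-project"
    path_parts = [p for p in parsed.path.split("/") if p]
    dataset = path_parts[0] if path_parts else ""          # "analytics"

    # With table="orders", connect() would target:
    print(f"`{project}.{dataset}.orders`")                 # `my-project.analytics.orders`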
duckguard/connectors/databricks.py
@@ -0,0 +1,201 @@
+ """Databricks connector."""
+
+ from __future__ import annotations
+
+ from typing import Any
+ from urllib.parse import parse_qs, urlparse
+
+ from duckguard.connectors.base import Connector, ConnectionConfig
+ from duckguard.core.dataset import Dataset
+ from duckguard.core.engine import DuckGuardEngine
+
+
+ class DatabricksConnector(Connector):
+     """
+     Connector for Databricks SQL Warehouse and Unity Catalog.
+
+     Uses the databricks-sql-connector package for efficient querying.
+
+     Examples:
+         # Using connection string
+         data = connect(
+             "databricks://workspace.cloud.databricks.com/catalog/schema",
+             table="orders",
+             token="dapi..."
+         )
+
+         # Using options
+         data = connect(
+             "databricks://workspace.cloud.databricks.com",
+             table="orders",
+             catalog="main",
+             schema="default",
+             http_path="/sql/1.0/warehouses/abc123",
+             token="dapi..."
+         )
+     """
+
+     def __init__(self, engine: DuckGuardEngine | None = None):
+         super().__init__(engine)
+         self._connection = None
+
+     def connect(self, config: ConnectionConfig) -> Dataset:
+         """
+         Connect to Databricks and return a Dataset.
+
+         Args:
+             config: Connection configuration
+
+         Returns:
+             Dataset object
+         """
+         try:
+             from databricks import sql as databricks_sql
+         except ImportError:
+             raise ImportError(
+                 "Databricks support requires databricks-sql-connector. "
+                 "Install with: pip install duckguard[databricks]"
+             )
+
+         if not config.table:
+             raise ValueError("Table name is required for Databricks connections")
+
+         # Parse connection parameters
+         conn_params = self._parse_connection_string(config.source, config)
+
+         # Validate required parameters
+         if not conn_params.get("server_hostname"):
+             raise ValueError("Databricks server hostname is required")
+         if not conn_params.get("http_path"):
+             raise ValueError("Databricks http_path is required (SQL Warehouse path)")
+         if not conn_params.get("access_token"):
+             raise ValueError("Databricks access token is required")
+
+         # Connect to Databricks
+         self._connection = databricks_sql.connect(
+             server_hostname=conn_params["server_hostname"],
+             http_path=conn_params["http_path"],
+             access_token=conn_params["access_token"],
+         )
+
+         table = config.table
+         catalog = conn_params.get("catalog", "main")
+         schema = config.schema or conn_params.get("schema", "default")
+
+         # Build fully qualified table name
+         fq_table = f"`{catalog}`.`{schema}`.`{table}`"
+
+         return DatabricksDataset(
+             source=fq_table,
+             engine=self.engine,
+             name=table,
+             connection=self._connection,
+         )
+
+     def _parse_connection_string(
+         self, conn_string: str, config: ConnectionConfig
+     ) -> dict[str, Any]:
+         """Parse Databricks connection string and merge with config options."""
+         params: dict[str, Any] = {}
+
+         # Parse URL format: databricks://workspace.cloud.databricks.com/catalog/schema
+         if conn_string.lower().startswith("databricks://"):
+             parsed = urlparse(conn_string)
+
+             params["server_hostname"] = parsed.hostname or ""
+
+             # Parse path for catalog/schema
+             path_parts = [p for p in parsed.path.split("/") if p]
+             if len(path_parts) >= 1:
+                 params["catalog"] = path_parts[0]
+             if len(path_parts) >= 2:
+                 params["schema"] = path_parts[1]
+
+             # Parse query parameters
+             if parsed.query:
+                 query_params = parse_qs(parsed.query)
+                 for key, values in query_params.items():
+                     params[key] = values[0] if len(values) == 1 else values
+
+         # Override with config options
+         options = config.options or {}
+         for key in [
+             "server_hostname",
+             "http_path",
+             "access_token",
+             "token",
+             "catalog",
+             "schema",
+         ]:
+             if key in options:
+                 # Handle token alias
+                 if key == "token":
+                     params["access_token"] = options[key]
+                 else:
+                     params[key] = options[key]
+
+         if config.database:
+             params["catalog"] = config.database
+         if config.schema:
+             params["schema"] = config.schema
+
+         return params
+
+     @classmethod
+     def can_handle(cls, source: str) -> bool:
+         """Check if this is a Databricks connection string."""
+         source_lower = source.lower()
+         return source_lower.startswith("databricks://") or ".databricks.com" in source_lower
+
+     @classmethod
+     def get_priority(cls) -> int:
+         """Databricks connector has high priority."""
+         return 60
+
+
+ class DatabricksDataset(Dataset):
+     """Dataset that queries Databricks directly."""
+
+     def __init__(
+         self,
+         source: str,
+         engine: DuckGuardEngine,
+         name: str,
+         connection: Any,
+     ):
+         super().__init__(source=source, engine=engine, name=name)
+         self._db_connection = connection
+
+     def _execute_query(self, sql: str) -> list[tuple[Any, ...]]:
+         """Execute a query on Databricks."""
+         cursor = self._db_connection.cursor()
+         try:
+             cursor.execute(sql)
+             return cursor.fetchall()
+         finally:
+             cursor.close()
+
+     def _fetch_value(self, sql: str) -> Any:
+         """Execute query and return single value."""
+         rows = self._execute_query(sql)
+         return rows[0][0] if rows else None
+
+     @property
+     def row_count(self) -> int:
+         """Get row count from Databricks."""
+         if self._row_count_cache is None:
+             sql = f"SELECT COUNT(*) FROM {self._source}"
+             self._row_count_cache = self._fetch_value(sql) or 0
+         return self._row_count_cache
+
+     @property
+     def columns(self) -> list[str]:
+         """Get column names from Databricks."""
+         if self._columns_cache is None:
+             cursor = self._db_connection.cursor()
+             try:
+                 cursor.execute(f"SELECT * FROM {self._source} LIMIT 0")
+                 self._columns_cache = [desc[0] for desc in cursor.description]
+             finally:
+                 cursor.close()
+         return self._columns_cache
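Putting the Databricks pieces together: can_handle() routes the URL to this connector, _parse_connection_string() extracts catalog and schema from the path, and the three required parameters are validated before databricks_sql.connect() is called. A hedged usage sketch; the hostname, warehouse path, and token are placeholders, and running it requires pip install duckguard[databricks]:

    from duckguard.connectors import connect

    orders = connect(
        "databricks://my-workspace.cloud.databricks.com/main/default",
        table="orders",
        http_path="/sql/1.0/warehouses/abc123",  # required: SQL Warehouse path
        token="dapi...",                         # aliased to access_token by the options loop
    )
    print(orders.row_count)  # runs SELECT COUNT(*) on Databricks via DatabricksDataset
    print(orders.columns)    # runs SELECT * ... LIMIT 0 and reads cursor.description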