duckguard 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. duckguard/__init__.py +110 -0
  2. duckguard/anomaly/__init__.py +34 -0
  3. duckguard/anomaly/detector.py +394 -0
  4. duckguard/anomaly/methods.py +432 -0
  5. duckguard/cli/__init__.py +5 -0
  6. duckguard/cli/main.py +706 -0
  7. duckguard/connectors/__init__.py +58 -0
  8. duckguard/connectors/base.py +80 -0
  9. duckguard/connectors/bigquery.py +171 -0
  10. duckguard/connectors/databricks.py +201 -0
  11. duckguard/connectors/factory.py +292 -0
  12. duckguard/connectors/files.py +135 -0
  13. duckguard/connectors/kafka.py +343 -0
  14. duckguard/connectors/mongodb.py +236 -0
  15. duckguard/connectors/mysql.py +121 -0
  16. duckguard/connectors/oracle.py +196 -0
  17. duckguard/connectors/postgres.py +99 -0
  18. duckguard/connectors/redshift.py +154 -0
  19. duckguard/connectors/snowflake.py +226 -0
  20. duckguard/connectors/sqlite.py +112 -0
  21. duckguard/connectors/sqlserver.py +242 -0
  22. duckguard/contracts/__init__.py +48 -0
  23. duckguard/contracts/diff.py +432 -0
  24. duckguard/contracts/generator.py +334 -0
  25. duckguard/contracts/loader.py +367 -0
  26. duckguard/contracts/schema.py +242 -0
  27. duckguard/contracts/validator.py +453 -0
  28. duckguard/core/__init__.py +8 -0
  29. duckguard/core/column.py +437 -0
  30. duckguard/core/dataset.py +284 -0
  31. duckguard/core/engine.py +261 -0
  32. duckguard/core/result.py +119 -0
  33. duckguard/core/scoring.py +508 -0
  34. duckguard/profiler/__init__.py +5 -0
  35. duckguard/profiler/auto_profile.py +350 -0
  36. duckguard/pytest_plugin/__init__.py +5 -0
  37. duckguard/pytest_plugin/plugin.py +161 -0
  38. duckguard/reporting/__init__.py +6 -0
  39. duckguard/reporting/console.py +88 -0
  40. duckguard/reporting/json_report.py +96 -0
  41. duckguard/rules/__init__.py +28 -0
  42. duckguard/rules/executor.py +616 -0
  43. duckguard/rules/generator.py +341 -0
  44. duckguard/rules/loader.py +483 -0
  45. duckguard/rules/schema.py +289 -0
  46. duckguard/semantic/__init__.py +31 -0
  47. duckguard/semantic/analyzer.py +270 -0
  48. duckguard/semantic/detector.py +459 -0
  49. duckguard/semantic/validators.py +354 -0
  50. duckguard/validators/__init__.py +7 -0
  51. duckguard-2.0.0.dist-info/METADATA +221 -0
  52. duckguard-2.0.0.dist-info/RECORD +55 -0
  53. duckguard-2.0.0.dist-info/WHEEL +4 -0
  54. duckguard-2.0.0.dist-info/entry_points.txt +5 -0
  55. duckguard-2.0.0.dist-info/licenses/LICENSE +55 -0
@@ -0,0 +1,196 @@
1
+ """Oracle Database connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+ from urllib.parse import urlparse
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
class OracleConnector(Connector):
    """
    Connector for Oracle Database.

    Uses the oracledb (python-oracledb) package for connectivity.

    Examples:
        # Using connection string
        data = connect(
            "oracle://user:pass@host:1521/service_name",
            table="orders"
        )

        # Using TNS alias
        data = connect(
            "oracle://user:pass@tns_alias",
            table="orders"
        )

        # Using options
        data = connect(
            "oracle://host:1521/service_name",
            table="orders",
            user="myuser",
            password="mypass",
            schema="HR"
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Initialise the connector; no Oracle connection is opened yet."""
        super().__init__(engine)
        self._connection = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to Oracle and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If the oracledb package is not installed.
            ValueError: If no table name is given in the configuration.
        """
        try:
            import oracledb
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Oracle support requires oracledb. "
                "Install with: pip install duckguard[oracle]"
            ) from err

        if not config.table:
            raise ValueError("Table name is required for Oracle connections")

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        # Build connection
        if conn_params.get("dsn"):
            # Using DSN/TNS
            self._connection = oracledb.connect(
                user=conn_params.get("user"),
                password=conn_params.get("password"),
                dsn=conn_params["dsn"],
            )
        else:
            # Using host/port/service
            self._connection = oracledb.connect(
                user=conn_params.get("user"),
                password=conn_params.get("password"),
                host=conn_params.get("host", "localhost"),
                port=int(conn_params.get("port", 1521)),
                service_name=conn_params.get("service_name"),
            )

        table = config.table
        # Fall back to the upper-cased user name as schema; tolerate a None
        # user coming from the options instead of failing on .upper().
        default_schema = (conn_params.get("user") or "").upper()
        schema = config.schema or conn_params.get("schema", default_schema)

        # Build fully qualified table name
        if schema:
            fq_table = f'"{schema}"."{table.upper()}"'
        else:
            fq_table = f'"{table.upper()}"'

        return OracleDataset(
            source=fq_table,
            engine=self.engine,
            name=table,
            connection=self._connection,
        )

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """
        Parse an Oracle connection string and merge it with config overrides.

        Args:
            conn_string: URL of the form oracle://user:pass@host:port/service_name
                or oracle://user:pass@tns_alias
            config: Connection configuration whose options, database and schema
                take precedence over values found in the URL

        Returns:
            Dict that may contain user, password, host, port, service_name,
            dsn and schema.
        """
        # Local import: only urlparse is imported at module level.
        from urllib.parse import unquote

        params: dict[str, Any] = {}

        # Parse URL format: oracle://user:pass@host:port/service_name
        if conn_string.lower().startswith("oracle://"):
            parsed = urlparse(conn_string)

            # urlparse leaves userinfo percent-encoded; decode it so that
            # credentials containing reserved characters work.
            params["user"] = unquote(parsed.username or "")
            params["password"] = unquote(parsed.password or "")
            params["host"] = parsed.hostname or "localhost"
            params["port"] = str(parsed.port) if parsed.port else "1521"

            # Path is service name or SID
            service = parsed.path.lstrip("/")
            if service:
                params["service_name"] = service

            # Treat the host as a TNS alias only when the URL has no port,
            # no service path and no dots in the hostname. A URL such as
            # oracle://localhost/XEPDB1 names a host plus a service, so it
            # must not be collapsed into a bare DSN.
            if (
                not parsed.port
                and not service
                and parsed.hostname
                and "." not in parsed.hostname
            ):
                params["dsn"] = parsed.hostname

        # Override with config options
        options = config.options or {}
        for key in ["user", "password", "host", "port", "service_name", "dsn", "schema"]:
            if key in options:
                params[key] = options[key]

        if config.database:
            params["service_name"] = config.database
        if config.schema:
            params["schema"] = config.schema

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is an Oracle connection string."""
        return source.lower().startswith("oracle://")

    @classmethod
    def get_priority(cls) -> int:
        """Oracle connector has high priority."""
        return 55
149
+
150
+
class OracleDataset(Dataset):
    """Dataset whose row count and column list are read straight from Oracle."""

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        connection: Any,
    ):
        """
        Args:
            source: Table reference used in the FROM clause of generated SQL
            engine: DuckGuard engine handed to the base Dataset
            name: Dataset name handed to the base Dataset
            connection: Open connection object used to create cursors
        """
        super().__init__(source=source, engine=engine, name=name)
        self._ora_connection = connection

    def _execute_query(self, sql: str) -> list[tuple[Any, ...]]:
        """Run the statement on Oracle and return every fetched row."""
        cur = self._ora_connection.cursor()
        try:
            cur.execute(sql)
            fetched = cur.fetchall()
        finally:
            # Always release the cursor, even when the statement fails.
            cur.close()
        return fetched

    def _fetch_value(self, sql: str) -> Any:
        """Run the statement and return the first column of the first row, or None."""
        result = self._execute_query(sql)
        if not result:
            return None
        return result[0][0]

    @property
    def row_count(self) -> int:
        """Row count of the table, computed on Oracle once and then cached."""
        if self._row_count_cache is not None:
            return self._row_count_cache
        counted = self._fetch_value(f"SELECT COUNT(*) FROM {self._source}")
        self._row_count_cache = counted or 0
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Column names of the table, read from cursor metadata once and cached."""
        if self._columns_cache is not None:
            return self._columns_cache
        cur = self._ora_connection.cursor()
        try:
            # ROWNUM = 0 matches no rows; only the cursor description is used.
            cur.execute(f"SELECT * FROM {self._source} WHERE ROWNUM = 0")
            names = []
            for column_meta in cur.description:
                names.append(column_meta[0])
            self._columns_cache = names
        finally:
            cur.close()
        return self._columns_cache
@@ -0,0 +1,99 @@
1
+ """PostgreSQL connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from urllib.parse import urlparse
7
+
8
+ from duckguard.connectors.base import Connector, ConnectionConfig
9
+ from duckguard.core.dataset import Dataset
10
+ from duckguard.core.engine import DuckGuardEngine
11
+
12
+
class PostgresConnector(Connector):
    """
    Connector for PostgreSQL databases.

    Uses DuckDB's postgres extension for efficient query pushdown.
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Initialise the connector and try to load the postgres extension."""
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the postgres extension (best effort)."""
        try:
            self.engine.execute("INSTALL postgres")
            self.engine.execute("LOAD postgres")
        except Exception:
            # Deliberately best effort: the extension might already be loaded.
            # A genuinely missing extension surfaces later when ATTACH runs.
            pass

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to PostgreSQL and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ValueError: If no table name is given in the configuration.
            Exception: Any engine error from ATTACH other than the alias
                already existing is re-raised unchanged.
        """
        if not config.table:
            raise ValueError("Table name is required for PostgreSQL connections")

        # Parse connection string
        conn_info = self._parse_connection_string(config.source)

        # Build the full table reference
        schema = config.schema or conn_info.get("schema", "public")
        table = config.table
        full_table = f"{schema}.{table}"

        # Alias for the attached database.
        # NOTE(review): the alias is derived from the table name only, so two
        # different databases exposing the same table name share one alias.
        alias = f"pg_{table}"

        # The connection string is embedded in a SQL string literal; double any
        # single quote (e.g. inside a password) so it cannot end the literal.
        escaped_source = config.source.replace("'", "''")
        attach_sql = f"ATTACH '{escaped_source}' AS {alias} (TYPE postgres)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            # Re-attaching under an existing alias is fine; anything else is not.
            if "already exists" not in str(e).lower():
                raise

        # The source reference for DuckDB
        source_ref = f"{alias}.{full_table}"

        # Register as a view for easier access
        view_name = f"_duckguard_{table}"
        try:
            self.engine.execute(f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}")
        except Exception:
            # The view is only a convenience. If it could not be created, do
            # not hand back a Dataset pointing at a nonexistent view; query
            # the attached table directly instead.
            return Dataset(source=source_ref, engine=self.engine, name=table)

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_connection_string(self, conn_string: str) -> dict[str, str]:
        """
        Parse PostgreSQL connection string.

        Args:
            conn_string: URL of the form postgresql://user:pass@host:port/database

        Returns:
            Dict with host, port, database, user, password and schema, where
            schema is always "public" and missing host/port default to
            "localhost"/"5432".
        """
        parsed = urlparse(conn_string)

        return {
            "host": parsed.hostname or "localhost",
            "port": str(parsed.port or 5432),
            "database": parsed.path.lstrip("/") if parsed.path else "",
            "user": parsed.username or "",
            "password": parsed.password or "",
            "schema": "public",
        }

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a PostgreSQL connection string."""
        return source.lower().startswith(("postgres://", "postgresql://"))

    @classmethod
    def get_priority(cls) -> int:
        """Database connectors have high priority."""
        return 50
@@ -0,0 +1,154 @@
1
+ """Amazon Redshift connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from urllib.parse import urlparse
6
+
7
+ from duckguard.connectors.base import Connector, ConnectionConfig
8
+ from duckguard.core.dataset import Dataset
9
+ from duckguard.core.engine import DuckGuardEngine
10
+
11
+
class RedshiftConnector(Connector):
    """
    Connector for Amazon Redshift.

    Redshift is PostgreSQL-compatible, so we can use the PostgreSQL
    extension in DuckDB or the redshift_connector package.

    Examples:
        # Using connection string
        data = connect(
            "redshift://user:pass@cluster.region.redshift.amazonaws.com:5439/database",
            table="orders"
        )

        # Using options
        data = connect(
            "redshift://cluster.region.redshift.amazonaws.com:5439/database",
            table="orders",
            user="myuser",
            password="mypass",
            schema="public"
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Initialise the connector and try to load the postgres extension."""
        super().__init__(engine)
        self._setup_extension()

    def _setup_extension(self) -> None:
        """Install and load the postgres extension (Redshift compatible)."""
        try:
            self.engine.execute("INSTALL postgres")
            self.engine.execute("LOAD postgres")
        except Exception:
            # Deliberately best effort; a missing extension surfaces at ATTACH.
            pass

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to Redshift and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ValueError: If no table name is given in the configuration.
            Exception: Any engine error from ATTACH other than the alias
                already existing is re-raised unchanged.
        """
        if not config.table:
            raise ValueError("Table name is required for Redshift connections")

        # Parse connection string
        conn_info = self._parse_connection_string(config.source, config)

        table = config.table
        schema = config.schema or conn_info.get("schema", "public")

        # Alias for the attached database (derived from the table name)
        alias = f"redshift_{table}"

        # Build PostgreSQL-compatible connection string for DuckDB
        pg_conn = self._build_connection_string(conn_info)

        # The connection string sits inside a SQL string literal, so double
        # any single quote it contains (quoted values produce them).
        escaped_conn = pg_conn.replace("'", "''")
        attach_sql = f"ATTACH '{escaped_conn}' AS {alias} (TYPE postgres)"

        try:
            self.engine.execute(attach_sql)
        except Exception as e:
            # Re-attaching under an existing alias is fine; anything else is not.
            if "already exists" not in str(e).lower():
                raise

        # Build source reference
        source_ref = f"{alias}.{schema}.{table}"

        # Register as a view
        view_name = f"_duckguard_redshift_{table}"
        try:
            self.engine.execute(f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {source_ref}")
        except Exception:
            # The view is only a convenience. If it could not be created, do
            # not return a Dataset pointing at a nonexistent view; query the
            # attached table directly instead.
            return Dataset(source=source_ref, engine=self.engine, name=table)

        return Dataset(source=view_name, engine=self.engine, name=table)

    def _parse_connection_string(self, conn_string: str, config: ConnectionConfig) -> dict:
        """
        Parse a Redshift connection string and merge it with config overrides.

        Args:
            conn_string: URL such as redshift://user:pass@host:5439/database
            config: Connection configuration whose options, database and schema
                take precedence over values found in the URL

        Returns:
            Dict with host, port, database, user and password, plus schema and
            sslmode when supplied through the configuration.
        """
        # Local import: only urlparse is imported at module level.
        from urllib.parse import unquote

        # Handle redshift:// prefix
        if conn_string.lower().startswith("redshift://"):
            conn_string = "postgresql://" + conn_string[len("redshift://"):]

        parsed = urlparse(conn_string)

        params = {
            "host": parsed.hostname or "",
            "port": str(parsed.port or 5439),
            "database": parsed.path.lstrip("/") if parsed.path else "",
            # urlparse leaves userinfo percent-encoded; decode it because the
            # values are re-emitted as plain keyword/value pairs.
            "user": unquote(parsed.username or ""),
            "password": unquote(parsed.password or ""),
        }

        # Override with config options
        options = config.options or {}
        for key in ["user", "password", "host", "port", "database", "schema", "sslmode"]:
            if key in options:
                params[key] = options[key]

        if config.database:
            params["database"] = config.database
        if config.schema:
            params["schema"] = config.schema

        return params

    @staticmethod
    def _quote_conninfo_value(value: object) -> str:
        """Quote one keyword/value connection-string value when it needs it.

        Simple values are returned unchanged. Empty values and values with
        whitespace, single quotes or backslashes are wrapped in single quotes
        with quotes and backslashes backslash-escaped.
        """
        text = str(value)
        if text and not any(ch.isspace() or ch in "'\\" for ch in text):
            return text
        escaped = text.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def _build_connection_string(self, conn_info: dict) -> str:
        """
        Build connection string for DuckDB PostgreSQL extension.

        Args:
            conn_info: Parameters as returned by _parse_connection_string

        Returns:
            Space-separated keyword=value string. Falsy parameters are omitted
            and sslmode defaults to "require" when none was supplied.
        """
        quote = self._quote_conninfo_value
        # (conn_info key, connection-string keyword) in output order
        keywords = [
            ("host", "host"),
            ("port", "port"),
            ("user", "user"),
            ("password", "password"),
            ("database", "dbname"),
        ]
        parts = [
            f"{keyword}={quote(conn_info[key])}"
            for key, keyword in keywords
            if conn_info.get(key)
        ]

        # Redshift requires SSL, so default to "require", but honour an
        # explicit sslmode option (e.g. verify-full) instead of discarding it.
        sslmode = conn_info.get("sslmode") or "require"
        parts.append(f"sslmode={quote(sslmode)}")

        return " ".join(parts)

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a Redshift connection string."""
        source_lower = source.lower()
        return source_lower.startswith("redshift://") or (
            "redshift.amazonaws.com" in source_lower
        )

    @classmethod
    def get_priority(cls) -> int:
        """Redshift connector has high priority."""
        return 55
@@ -0,0 +1,226 @@
1
+ """Snowflake connector."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+ from urllib.parse import parse_qs, urlparse
8
+
9
+ from duckguard.connectors.base import Connector, ConnectionConfig
10
+ from duckguard.core.dataset import Dataset
11
+ from duckguard.core.engine import DuckGuardEngine
12
+
13
+
class SnowflakeConnector(Connector):
    """
    Connector for Snowflake data warehouse.

    Uses the snowflake-connector-python package to connect and query,
    then processes results with DuckDB for validation.

    Examples:
        # Using connection string
        data = connect(
            "snowflake://user:pass@account/database/schema",
            table="orders"
        )

        # Using options
        data = connect(
            "snowflake://account",
            table="orders",
            user="myuser",
            password="mypass",
            database="mydb",
            schema="public",
            warehouse="compute_wh"
        )
    """

    def __init__(self, engine: DuckGuardEngine | None = None):
        """Initialise the connector; no Snowflake connection is opened yet."""
        super().__init__(engine)
        self._connection = None

    def connect(self, config: ConnectionConfig) -> Dataset:
        """
        Connect to Snowflake and return a Dataset.

        Args:
            config: Connection configuration

        Returns:
            Dataset object

        Raises:
            ImportError: If snowflake-connector-python is not installed.
            ValueError: If no table name is given in the configuration.
        """
        try:
            import snowflake.connector
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Snowflake support requires snowflake-connector-python. "
                "Install with: pip install duckguard[snowflake]"
            ) from err

        if not config.table:
            raise ValueError("Table name is required for Snowflake connections")

        # Parse connection parameters
        conn_params = self._parse_connection_string(config.source, config)

        # Connect to Snowflake
        self._connection = snowflake.connector.connect(**conn_params)

        table = config.table
        schema = config.schema or conn_params.get("schema", "PUBLIC")
        database = config.database or conn_params.get("database", "")

        # Build fully qualified table name
        if database and schema:
            fq_table = f"{database}.{schema}.{table}"
        elif schema:
            fq_table = f"{schema}.{table}"
        else:
            fq_table = table

        # Create a wrapper dataset that uses Snowflake for queries
        return SnowflakeDataset(
            source=fq_table,
            engine=self.engine,
            name=table,
            connection=self._connection,
            conn_params=conn_params,
        )

    def _parse_connection_string(
        self, conn_string: str, config: ConnectionConfig
    ) -> dict[str, Any]:
        """
        Parse Snowflake connection string and merge with config options.

        Args:
            conn_string: URL of the form
                snowflake://user:pass@account/database/schema?key=value
            config: Connection configuration whose options, database and schema
                take precedence over values found in the URL

        Returns:
            Keyword arguments that connect() passes to snowflake.connector.connect.
        """
        # Local import: unquote is not imported at module level.
        from urllib.parse import unquote

        params: dict[str, Any] = {}

        # Parse URL format: snowflake://user:pass@account/database/schema
        if conn_string.lower().startswith("snowflake://"):
            parsed = urlparse(conn_string)

            params["account"] = parsed.hostname or ""
            # urlparse leaves userinfo percent-encoded; decode it so that
            # credentials containing reserved characters work.
            if parsed.username:
                params["user"] = unquote(parsed.username)
            if parsed.password:
                params["password"] = unquote(parsed.password)

            # Parse path for database/schema
            path_parts = [p for p in parsed.path.split("/") if p]
            if len(path_parts) >= 1:
                params["database"] = path_parts[0]
            if len(path_parts) >= 2:
                params["schema"] = path_parts[1]

            # Parse query parameters; repeated keys keep their list of values
            if parsed.query:
                query_params = parse_qs(parsed.query)
                for key, values in query_params.items():
                    params[key] = values[0] if len(values) == 1 else values

        # Override with config options
        options = config.options or {}
        for key in ["user", "password", "account", "warehouse", "role", "database", "schema"]:
            if key in options:
                params[key] = options[key]

        if config.database:
            params["database"] = config.database
        if config.schema:
            params["schema"] = config.schema

        return params

    @classmethod
    def can_handle(cls, source: str) -> bool:
        """Check if this is a Snowflake connection string."""
        return source.lower().startswith("snowflake://")

    @classmethod
    def get_priority(cls) -> int:
        """Snowflake connector has high priority."""
        return 60
143
+
144
+
class SnowflakeDataset(Dataset):
    """
    Dataset whose row count and column list are read straight from Snowflake.

    Queries are sent to Snowflake through the stored connection rather than
    being evaluated locally.
    """

    def __init__(
        self,
        source: str,
        engine: DuckGuardEngine,
        name: str,
        connection: Any,
        conn_params: dict[str, Any],
    ):
        """
        Args:
            source: Table reference used in the FROM clause of generated SQL
            engine: DuckGuard engine handed to the base Dataset
            name: Dataset name handed to the base Dataset
            connection: Open connection object used to create cursors
            conn_params: Parameters the connection was created with
        """
        super().__init__(source=source, engine=engine, name=name)
        self._sf_connection = connection
        self._sf_params = conn_params

    def _execute_sf_query(self, sql: str) -> list[tuple[Any, ...]]:
        """Run the statement on Snowflake and return every fetched row."""
        cur = self._sf_connection.cursor()
        try:
            cur.execute(sql)
            fetched = cur.fetchall()
        finally:
            # Always release the cursor, even when the statement fails.
            cur.close()
        return fetched

    def _fetch_sf_value(self, sql: str) -> Any:
        """Run the statement and return the first column of the first row, or None."""
        result = self._execute_sf_query(sql)
        if not result:
            return None
        return result[0][0]

    @property
    def row_count(self) -> int:
        """Row count of the table, computed on Snowflake once and then cached."""
        if self._row_count_cache is not None:
            return self._row_count_cache
        counted = self._fetch_sf_value(f"SELECT COUNT(*) FROM {self._source}")
        self._row_count_cache = counted or 0
        return self._row_count_cache

    @property
    def columns(self) -> list[str]:
        """Column names of the table, read from cursor metadata once and cached."""
        if self._columns_cache is not None:
            return self._columns_cache
        # LIMIT 0 asks for no rows; only the cursor description is used.
        probe_sql = f"SELECT * FROM {self._source} LIMIT 0"
        cur = self._sf_connection.cursor()
        try:
            cur.execute(probe_sql)
            names = []
            for column_meta in cur.description:
                names.append(column_meta[0])
            self._columns_cache = names
        finally:
            cur.close()
        return self._columns_cache
199
+
200
+
class SnowflakeColumn:
    """Column that queries Snowflake directly."""

    def __init__(self, name: str, dataset: SnowflakeDataset):
        """
        Args:
            name: Column name, used as a double-quoted identifier in SQL
            dataset: Dataset providing the table reference (_source) and the
                single-value query helper (_fetch_sf_value)
        """
        self._name = name
        self._dataset = dataset

    def _quoted_name(self) -> str:
        """Return the column name as a double-quoted SQL identifier."""
        # Double any embedded quote so the name cannot end the identifier.
        escaped = self._name.replace('"', '""')
        return f'"{escaped}"'

    def _fetch_percent(self, numerator: str) -> float:
        """Return 100 * numerator / COUNT(*) rounded to 2 places, as a float."""
        # NULLIF turns a zero row count into NULL, so an empty table yields
        # NULL (mapped to 0.0 below) instead of a division-by-zero error.
        sql = f"""
            SELECT
                ROUND(100.0 * {numerator} / NULLIF(COUNT(*), 0), 2)
            FROM {self._dataset._source}
        """
        value = self._dataset._fetch_sf_value(sql)
        # The driver may hand back a non-float numeric; normalise to float.
        return float(value) if value is not None else 0.0

    @property
    def null_percent(self) -> float:
        """Get null percentage from Snowflake (0.0 for an empty table)."""
        column = self._quoted_name()
        return self._fetch_percent(
            f"SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END)"
        )

    @property
    def unique_percent(self) -> float:
        """Get unique percentage from Snowflake (0.0 for an empty table)."""
        return self._fetch_percent(f"COUNT(DISTINCT {self._quoted_name()})")