kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0

@@ -0,0 +1,226 @@
# src/kontra/connectors/sqlserver.py
"""
SQL Server connection utilities for Kontra.

Supports multiple authentication methods:
1. Full URI: mssql://user:pass@host:port/database/schema.table
2. Environment variables: MSSQL_HOST, MSSQL_PORT, MSSQL_USER, MSSQL_PASSWORD, MSSQL_DATABASE
3. SQLSERVER_URL (similar to the DATABASE_URL pattern)

Priority: URI values > SQLSERVER_URL > MSSQL_XXX env vars > defaults
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Optional

from .db_utils import (
    DbConnectionConfig,
    resolve_connection_params as _resolve_params,
)


# SQL Server-specific configuration for parameter resolution
_MSSQL_CONFIG = DbConnectionConfig(
    default_host="localhost",
    default_port=1433,
    default_user="sa",
    default_schema="dbo",
    env_host="MSSQL_HOST",
    env_port="MSSQL_PORT",
    env_user="MSSQL_USER",
    env_password="MSSQL_PASSWORD",
    env_database="MSSQL_DATABASE",
    env_url="SQLSERVER_URL",
    db_name="SQL Server",
    uri_example="mssql://user:pass@host:1433/database/schema.table",
    env_example="MSSQL_DATABASE",
)


@dataclass
class SqlServerConnectionParams:
    """Resolved SQL Server connection parameters."""

    host: str
    port: int
    user: str
    password: Optional[str]
    database: str
    schema: str
    table: str

    def to_dict(self) -> Dict[str, Any]:
        """Return connection kwargs for pymssql.connect()."""
        return {
            "server": self.host,
            "port": self.port,
            "user": self.user,
            "password": self.password,
            "database": self.database,
        }

    @property
    def qualified_table(self) -> str:
        """Return the schema.table identifier."""
        return f"{self.schema}.{self.table}"


def resolve_connection_params(uri: str) -> SqlServerConnectionParams:
    """
    Resolve SQL Server connection parameters from URI + environment.

    URI formats:
        mssql://user:pass@host:port/database/schema.table
        mssql:///dbo.users   (uses env vars for connection)
        sqlserver://...      (alias for mssql://)

    Priority: URI values > SQLSERVER_URL > MSSQL_XXX env vars > defaults

    Raises:
        ValueError: If required parameters (database, table) cannot be resolved.
    """
    resolved = _resolve_params(uri, _MSSQL_CONFIG)

    return SqlServerConnectionParams(
        host=resolved.host,
        port=resolved.port,
        user=resolved.user,
        password=resolved.password,
        database=resolved.database,  # type: ignore (validated in _resolve_params)
        schema=resolved.schema,
        table=resolved.table,  # type: ignore (validated in _resolve_params)
    )
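
For illustration (not part of the file): with the documented URI format, a fully-qualified URI supplies every value, while a bare table reference defers to the environment. A minimal sketch, assuming the fallback behavior implemented in db_utils (not shown in this diff); the host and credentials are hypothetical:

    # Fully-qualified URI: all values come from the URI itself.
    params = resolve_connection_params(
        "mssql://analyst:s3cret@db.example.com:1433/sales/dbo.orders"
    )
    assert params.database == "sales"
    assert params.qualified_table == "dbo.orders"

    # Bare table reference: connection details come from SQLSERVER_URL
    # or the MSSQL_* environment variables.
    params = resolve_connection_params("mssql:///dbo.orders")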


def get_connection(params: SqlServerConnectionParams):
    """
    Create a pymssql connection from resolved parameters.

    Returns:
        pymssql.Connection
    """
    try:
        import pymssql
    except ImportError as e:
        raise ImportError(
            "pymssql is required for SQL Server support.\n"
            "Install with: pip install pymssql"
        ) from e

    try:
        return pymssql.connect(**params.to_dict())
    except pymssql.OperationalError as e:
        raise ConnectionError(
            f"SQL Server connection failed: {e}\n\n"
            "Connection details:\n"
            f"  Host: {params.host}:{params.port}\n"
            f"  Database: {params.database}\n"
            f"  User: {params.user}\n\n"
            "Check your connection settings or set environment variables:\n"
            "  export MSSQL_HOST=localhost\n"
            "  export MSSQL_PORT=1433\n"
            "  export MSSQL_USER=your_user\n"
            "  export MSSQL_PASSWORD=your_password\n"
            "  export MSSQL_DATABASE=your_database"
        ) from e
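
Putting resolution and connection together, a sketch against the same hypothetical server as above:

    params = resolve_connection_params(
        "mssql://analyst:s3cret@db.example.com:1433/sales/dbo.orders"
    )
    conn = get_connection(params)
    try:
        with conn.cursor() as cursor:
            cursor.execute(f"SELECT COUNT(*) FROM {params.qualified_table}")
            print(cursor.fetchone()[0])
    finally:
        conn.close()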


def fetch_sqlserver_stats(params: SqlServerConnectionParams) -> Dict[str, Dict[str, Any]]:
    """
    Fetch SQL Server statistics from sys.dm_db_stats_properties and related DMVs.

    Returns a dict keyed by column name with stats:
        {
            "column_name": {
                "null_frac": 0.02,   # Estimated fraction of nulls
                "n_distinct": 1000,  # Estimated distinct values (-1 = unique)
                "rows": 10000,       # Rows when stats were computed
            },
            "__table__": {
                "row_estimate": 10000,
                "page_count": 100,
            },
        }
    """
    with get_connection(params) as conn:
        with conn.cursor() as cursor:
            # Table-level stats from sys.dm_db_partition_stats
            cursor.execute(
                """
                SELECT SUM(row_count) AS row_estimate,
                       SUM(used_page_count) AS page_count
                FROM sys.dm_db_partition_stats ps
                JOIN sys.objects o ON ps.object_id = o.object_id
                JOIN sys.schemas s ON o.schema_id = s.schema_id
                WHERE s.name = %s AND o.name = %s AND ps.index_id IN (0, 1)
                """,
                (params.schema, params.table),
            )
            row = cursor.fetchone()
            table_stats = {
                "row_estimate": row[0] if row and row[0] else 0,
                "page_count": row[1] if row and row[1] else 0,
            }

            # Column-level stats from sys.dm_db_stats_properties
            cursor.execute(
                """
                SELECT
                    c.name AS column_name,
                    s.name AS stat_name,
                    sp.rows,
                    sp.modification_counter
                FROM sys.stats s
                JOIN sys.stats_columns sc ON s.stats_id = sc.stats_id AND s.object_id = sc.object_id
                JOIN sys.columns c ON sc.column_id = c.column_id AND sc.object_id = c.object_id
                CROSS APPLY sys.dm_db_stats_properties(s.object_id, s.stats_id) sp
                WHERE s.object_id = OBJECT_ID(%s)
                """,
                (f"{params.schema}.{params.table}",),
            )

            result: Dict[str, Dict[str, Any]] = {"__table__": table_stats}

            for row in cursor.fetchall():
                col_name, stat_name, rows, mod_counter = row
                if col_name not in result:
                    result[col_name] = {
                        "rows": rows,
                        "modification_counter": mod_counter,
                        "stat_name": stat_name,
                    }

            # Getting per-column density (1/distinct) from DBCC SHOW_STATISTICS
            # requires complex parsing, so take a simpler approach: query actual
            # distinct counts for the columns that have stats (more reliable for preplan).
            for col_name in list(result.keys()):
                if col_name == "__table__":
                    continue
                try:
                    # Null fraction and distinct count in one scan
                    cursor.execute(
                        f"""
                        SELECT
                            CAST(SUM(CASE WHEN [{col_name}] IS NULL THEN 1 ELSE 0 END) AS FLOAT)
                                / NULLIF(COUNT(*), 0) AS null_frac,
                            COUNT(DISTINCT [{col_name}]) AS n_distinct
                        FROM [{params.schema}].[{params.table}]
                        """
                    )
                    stats_row = cursor.fetchone()
                    if stats_row:
                        result[col_name]["null_frac"] = stats_row[0] or 0.0
                        result[col_name]["n_distinct"] = stats_row[1] or 0
                        # Mark as unique if distinct == row count
                        if result[col_name]["n_distinct"] == table_stats["row_estimate"]:
                            result[col_name]["n_distinct"] = -1  # convention: -1 = all unique
                except Exception:
                    # Stats query failed; leave partial data
                    pass

    return result
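
A sketch of consuming the stats payload downstream (values hypothetical); the n_distinct == -1 convention lets a caller treat a column as unique without running an exact check:

    stats = fetch_sqlserver_stats(params)

    table = stats["__table__"]
    print(f"~{table['row_estimate']} rows over {table['page_count']} pages")

    for col, col_stats in stats.items():
        if col == "__table__":
            continue
        if col_stats.get("n_distinct") == -1:  # all values unique, per the convention above
            print(f"{col}: unique, null_frac={col_stats.get('null_frac', 0.0):.3f}")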

@@ -0,0 +1,227 @@
# src/kontra/engine/backends/duckdb_session.py
from __future__ import annotations

import os
from typing import Any, Dict
from urllib.parse import urlparse

import duckdb

from kontra.connectors.handle import DatasetHandle

# --- Public API ---


def create_duckdb_connection(handle: DatasetHandle) -> duckdb.DuckDBPyConnection:
    """
    Create a DuckDB connection configured specifically for the given DatasetHandle.

    This is the centralized factory for all DuckDB instances in Kontra.
    It inspects the handle's scheme and fs_opts to load the correct
    extensions (httpfs, azure) and apply the necessary configuration
    (e.g., S3 endpoints, credentials, region) for I/O.

    Args:
        handle: The DatasetHandle containing the URI and filesystem options.

    Returns:
        A configured duckdb.DuckDBPyConnection.
    """
    con = duckdb.connect()

    # Apply performance/threading tweaks (reads env vars, but for runtime, not I/O)
    _configure_threads(con)

    # Apply I/O and credential configuration based on the data source
    match handle.scheme:
        case "s3":
            _configure_s3(con, handle.fs_opts)
        case "abfs" | "abfss" | "az":
            _configure_azure(con, handle.fs_opts)
        case "http" | "https":
            _configure_http(con, handle.fs_opts)
        case "file" | "":
            # Local files need no special I/O config
            pass
        case _:
            # Best effort for unknown schemes: load httpfs just in case
            try:
                _configure_http(con, handle.fs_opts)
            except Exception:
                pass  # Ignore if httpfs fails to load

    return con
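
A minimal sketch of the factory in use. The real DatasetHandle comes from kontra/connectors/handle.py (not shown in this section), so the stand-in below only assumes the two attributes the factory reads; the endpoint, credentials, and path are hypothetical:

    from dataclasses import dataclass, field
    from typing import Dict

    @dataclass
    class _FakeHandle:  # stand-in for kontra.connectors.handle.DatasetHandle
        scheme: str = "s3"
        fs_opts: Dict[str, str] = field(default_factory=lambda: {
            "s3_endpoint": "http://localhost:9000",
            "s3_access_key_id": "minio",
            "s3_secret_access_key": "minio123",
        })

    con = create_duckdb_connection(_FakeHandle())
    n = con.execute(
        "SELECT COUNT(*) FROM read_parquet('s3://bucket/data.parquet')"
    ).fetchone()[0]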


# --- Internal Helpers ---


def _safe_set(con: duckdb.DuckDBPyConnection, key: str, value: Any) -> None:
    """Safely execute a DuckDB SET command, ignoring errors."""
    try:
        con.execute(f"SET {key} = ?", [str(value)])
    except Exception:
        # Fail gracefully if the setting doesn't exist (e.g., wrong DuckDB version)
        pass


def _configure_threads(con: duckdb.DuckDBPyConnection) -> None:
    """
    Configure the DuckDB thread count from DUCKDB_THREADS or the CPU count.
    This is a performance tweak, not an I/O secret.
    """
    env_threads = os.getenv("DUCKDB_THREADS")
    try:
        nthreads = int(env_threads) if env_threads else (os.cpu_count() or 4)
    except Exception:
        nthreads = os.cpu_count() or 4

    # Try both PRAGMA (older) and SET (newer) for compatibility
    for sql in (f"PRAGMA threads={int(nthreads)};", f"SET threads = {int(nthreads)};"):
        try:
            con.execute(sql)
            break
        except Exception:
            continue


def _configure_http(con: duckdb.DuckDBPyConnection, fs_opts: Dict[str, str]) -> None:
    """Install and load the httpfs extension for reading http(s):// files."""
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")
    _safe_set(con, "enable_object_cache", "true")


def _configure_s3(con: duckdb.DuckDBPyConnection, fs_opts: Dict[str, str]) -> None:
    """
    Configure the httpfs extension for S3-compatible storage (AWS, MinIO, R2).

    Expected fs_opts keys:
      - s3_endpoint
      - s3_region
      - s3_url_style ('path' | 'host')
      - s3_use_ssl ('true' | 'false')
      - s3_access_key_id
      - s3_secret_access_key
      - s3_session_token
      - s3_max_connections
    """
    _configure_http(con, fs_opts)  # S3 depends on httpfs

    # Credentials
    if ak := fs_opts.get("s3_access_key_id"):
        _safe_set(con, "s3_access_key_id", ak)
    if sk := fs_opts.get("s3_secret_access_key"):
        _safe_set(con, "s3_secret_access_key", sk)
    if st := fs_opts.get("s3_session_token"):
        _safe_set(con, "s3_session_token", st)

    # Region
    if region := fs_opts.get("s3_region"):
        _safe_set(con, "s3_region", region)

    # Endpoint (MinIO/S3-compatible)
    endpoint = fs_opts.get("s3_endpoint")
    url_style = fs_opts.get("s3_url_style")
    use_ssl = fs_opts.get("s3_use_ssl")

    if endpoint:
        # Parse "http://host:port" or just "host:port"
        parsed = urlparse(endpoint)
        hostport = parsed.netloc or parsed.path or endpoint
        _safe_set(con, "s3_endpoint", hostport)

        # Infer SSL from the endpoint scheme if not explicitly set
        if use_ssl is None:
            use_ssl = "true" if parsed.scheme == "https" else "false"
        _safe_set(con, "s3_use_ssl", use_ssl)

        # Default to path-style for custom endpoints (MinIO-friendly)
        if url_style is None:
            url_style = "path"

    if url_style:
        _safe_set(con, "s3_url_style", url_style)

    # Performance and reliability for large files over S3/HTTP.
    # http_timeout is in seconds (default 30s); increase for large files.
    _safe_set(con, "http_timeout", "600")  # 10 minutes for large files
    _safe_set(con, "http_retries", "5")  # More retries for reliability
    _safe_set(con, "http_retry_wait_ms", "2000")  # 2s between retries
    # Disable keep-alive for MinIO/S3-compatible stores; connection pooling can cause issues
    _safe_set(con, "http_keep_alive", "false")


def _configure_azure(con: duckdb.DuckDBPyConnection, fs_opts: Dict[str, str]) -> None:
    """
    Configure the Azure extension for ADLS Gen2 (abfs://, abfss://) and Azure Blob (az://).

    DuckDB 0.10+ has native Azure support via the 'azure' extension,
    which handles authentication and endpoint configuration.

    Expected fs_opts keys:
      - azure_account_name: Storage account name
      - azure_account_key: Storage account key
      - azure_sas_token: SAS token (alternative to key)
      - azure_connection_string: Full connection string (alternative)
      - azure_tenant_id: For OAuth/service principal
      - azure_client_id: For OAuth/service principal
      - azure_client_secret: For OAuth/service principal
      - azure_endpoint: Custom endpoint (Databricks, sovereign clouds, Azurite)

    Raises:
        RuntimeError: If the Azure extension is not available (DuckDB < 0.10.0).
    """
    # Install and load the Azure extension
    try:
        con.execute("INSTALL azure;")
        con.execute("LOAD azure;")
    except Exception as e:
        raise RuntimeError(
            f"Azure extension not available. DuckDB >= 0.10.0 is required for Azure support. "
            f"Error: {e}"
        ) from e

    # Account name (required for key/SAS auth)
    if account_name := fs_opts.get("azure_account_name"):
        _safe_set(con, "azure_storage_account_name", account_name)

    # Account key auth
    if account_key := fs_opts.get("azure_account_key"):
        _safe_set(con, "azure_account_key", account_key)

    # SAS token auth (alternative to account key).
    # Note: DuckDB expects the token without a leading '?'.
    if sas_token := fs_opts.get("azure_sas_token"):
        if sas_token.startswith("?"):
            sas_token = sas_token[1:]
        _safe_set(con, "azure_sas_token", sas_token)

    # Connection string auth
    if conn_string := fs_opts.get("azure_connection_string"):
        _safe_set(con, "azure_storage_connection_string", conn_string)

    # OAuth / Service Principal auth
    if tenant_id := fs_opts.get("azure_tenant_id"):
        _safe_set(con, "azure_tenant_id", tenant_id)
    if client_id := fs_opts.get("azure_client_id"):
        _safe_set(con, "azure_client_id", client_id)
    if client_secret := fs_opts.get("azure_client_secret"):
        _safe_set(con, "azure_client_secret", client_secret)

    # Custom endpoint (for Databricks, sovereign clouds, Azurite emulator)
    if endpoint := fs_opts.get("azure_endpoint"):
        _safe_set(con, "azure_endpoint", endpoint)

    # Performance settings (same as S3)
    _safe_set(con, "http_timeout", "600")  # 10 minutes for large files
    _safe_set(con, "http_retries", "5")
    _safe_set(con, "http_retry_wait_ms", "2000")

@@ -0,0 +1,18 @@
# src/kontra/engine/backends/duckdb_utils.py
from __future__ import annotations


def esc_ident(name: str) -> str:
    """
    Quote an identifier for DuckDB (double quotes, escape internal quotes).
    This is a centralized helper used by executors and materializers.
    """
    return '"' + name.replace('"', '""') + '"'


def lit_str(s: str) -> str:
    """
    Return a single-quoted SQL string literal with internal quotes escaped.
    This is a centralized helper used by executors and materializers.
    """
    return "'" + s.replace("'", "''") + "'"

@@ -0,0 +1,47 @@
# src/kontra/engine/backends/polars_backend.py
"""
Polars Backend (Adapter)

Thin adapter that defers execution to the RuleExecutionPlan's compiled executor.
Keeps the backend boundary explicit and behavior deterministic.
"""

from __future__ import annotations

from typing import Any, Callable, Dict, List

import polars as pl


class PolarsBackend:
    name = "polars"

    def __init__(self, executor: Callable[[pl.DataFrame, Any], List[Dict[str, Any]]]):
        """
        Parameters
        ----------
        executor : callable
            Function that evaluates the compiled plan against a materialized
            Polars DataFrame (typically RuleExecutionPlan.execute_compiled).
        """
        self._executor = executor

    def supports(self, connector_caps: int) -> bool:
        """Capability hook reserved for the future; always True for local DataFrames."""
        return True

    def compile(self, compiled_plan: Any) -> Any:
        """No-op for Polars: pass through the compiled plan."""
        return compiled_plan

    def execute(self, df: pl.DataFrame, compiled_artifact: Any) -> Dict[str, Any]:
        """Execute the compiled artifact against `df` and wrap the results."""
        results = self._executor(df, compiled_artifact)
        return {"results": results}

    def introspect(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Basic observability: row count and available columns."""
        return {
            "row_count": int(df.height),
            "available_cols": list(df.columns),
        }
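
A minimal sketch of the adapter wired to a toy executor (a stand-in for RuleExecutionPlan.execute_compiled, which lives elsewhere in the package):

    from typing import Any, Dict, List
    import polars as pl

    def toy_executor(df: pl.DataFrame, plan: Any) -> List[Dict[str, Any]]:
        # A real executor evaluates compiled rules; this one checks non-emptiness.
        return [{"rule": "min_rows", "passed": df.height > 0}]

    backend = PolarsBackend(executor=toy_executor)
    df = pl.DataFrame({"id": [1, 2, 3]})
    artifact = backend.compile(compiled_plan=None)  # no-op pass-through
    backend.execute(df, artifact)   # -> {"results": [{"rule": "min_rows", "passed": True}]}
    backend.introspect(df)          # -> {"row_count": 3, "available_cols": ["id"]}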