kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# src/kontra/engine/executors/postgres_sql.py
|
|
2
|
+
"""
|
|
3
|
+
PostgreSQL SQL Executor - executes validation rules via SQL pushdown.
|
|
4
|
+
|
|
5
|
+
Uses DatabaseSqlExecutor base class for shared compile/execute logic.
|
|
6
|
+
|
|
7
|
+
Supports rules:
|
|
8
|
+
- not_null(column) - uses EXISTS (fast early termination)
|
|
9
|
+
- unique(column) - uses COUNT DISTINCT
|
|
10
|
+
- min_rows(threshold) - uses COUNT
|
|
11
|
+
- max_rows(threshold) - uses COUNT
|
|
12
|
+
- allowed_values(column, values) - uses SUM CASE
|
|
13
|
+
- freshness(column, max_age) - uses MAX
|
|
14
|
+
- range(column, min, max) - uses SUM CASE
|
|
15
|
+
- regex(column, pattern) - uses ~ operator
|
|
16
|
+
- compare(left, right, op) - uses SUM CASE
|
|
17
|
+
- conditional_not_null(column, when) - uses SUM CASE
|
|
18
|
+
- conditional_range(column, when, min, max) - uses SUM CASE
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from contextlib import contextmanager
|
|
24
|
+
from typing import Any, Dict, List, Tuple
|
|
25
|
+
|
|
26
|
+
from kontra.connectors.handle import DatasetHandle
|
|
27
|
+
from kontra.connectors.postgres import PostgresConnectionParams, get_connection
|
|
28
|
+
from kontra.connectors.detection import parse_table_reference, get_default_schema, POSTGRESQL
|
|
29
|
+
|
|
30
|
+
from .database_base import DatabaseSqlExecutor
|
|
31
|
+
from .registry import register_executor
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@register_executor("postgres")
class PostgresSqlExecutor(DatabaseSqlExecutor):
    """
    PostgreSQL SQL pushdown executor.

    Inherits compile() and execute() from DatabaseSqlExecutor.
    Provides PostgreSQL-specific connection and table handling.
    """

    DIALECT = "postgres"
    SUPPORTED_RULES = {
        "not_null", "unique", "min_rows", "max_rows",
        "allowed_values", "disallowed_values",
        "freshness", "range", "length",
        "regex", "contains", "starts_with", "ends_with",
        "compare", "conditional_not_null", "conditional_range",
        "custom_sql_check", "custom_agg"
    }

    @property
    def name(self) -> str:
        return "postgres"

    def _supports_scheme(self, scheme: str, handle: DatasetHandle) -> bool:
        """Check if this executor supports the given URI scheme."""
        # BYOC: check dialect for external connections
        if scheme == "byoc" and handle.dialect == "postgresql":
            return handle.external_conn is not None

        # URI-based: handle postgres:// URIs
        return scheme in {"postgres", "postgresql"}

    @contextmanager
    def _get_connection_ctx(self, handle: DatasetHandle):
        """
        Get a PostgreSQL connection context.

        For BYOC, yields the external connection directly (not owned by us,
        so it is not closed here).
        For URI-based, yields a new connection (owned and closed by the
        underlying get_connection context manager).

        Raises:
            ValueError: If the handle carries neither an external connection
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.external_conn is not None:
            yield handle.external_conn
        elif handle.db_params:
            with get_connection(handle.db_params) as conn:
                yield conn
        else:
            raise ValueError("Handle has neither external_conn nor db_params")

    def _get_table_reference(self, handle: DatasetHandle) -> str:
        """
        Get the fully-qualified table reference for PostgreSQL.

        Returns: "schema"."table" format.

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        # Delegate so the schema/table resolution logic lives in one place.
        schema, table = self._get_schema_and_table(handle)
        return f"{self._esc(schema)}.{self._esc(table)}"

    def _get_schema_and_table(self, handle: DatasetHandle) -> Tuple[str, str]:
        """
        Get schema and table name separately for custom SQL placeholder replacement.

        Returns: Tuple of (schema, table_name)

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.table_ref:
            _db, schema, table = parse_table_reference(handle.table_ref)
            # BYOC references may omit the schema; fall back to the dialect default.
            schema = schema or get_default_schema(POSTGRESQL)
            return schema, table
        elif handle.db_params:
            params: PostgresConnectionParams = handle.db_params
            return params.schema, params.table
        else:
            raise ValueError("Handle has neither table_ref nor db_params")

    def _get_cursor(self, conn):
        """Return a new cursor for the connection (caller is responsible for closing)."""
        return conn.cursor()

    def _close_cursor(self, cursor):
        """Close the cursor."""
        cursor.close()

    def introspect(self, handle: DatasetHandle, **kwargs) -> Dict[str, Any]:
        """
        Introspect the PostgreSQL table for metadata.

        Returns:
            {"row_count": int, "available_cols": [...], "staging": None}

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        # Escaped reference used inline in SQL; unescaped parts are bound as
        # parameters for the information_schema lookup below.
        table = self._get_table_reference(handle)
        schema, table_name = self._get_schema_and_table(handle)

        with self._get_connection_ctx(handle) as conn:
            with conn.cursor() as cur:
                # Get row count
                cur.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cur.fetchone()
                n = int(row_count[0]) if row_count else 0

                # Get column names
                cur.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (schema, table_name),
                )
                cols = [row[0] for row in cur.fetchall()]

        return {"row_count": n, "available_cols": cols, "staging": None}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# src/kontra/engine/executors/registry.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from kontra.connectors.handle import DatasetHandle
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from .base import SqlExecutor
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Global registry: maps "executor_name" -> constructor()
_EXECUTORS: Dict[str, Callable[[], SqlExecutor]] = {}


def register_executor(name: str):
    """
    Decorator that registers a SQL executor class under *name*.

    Raises:
        ValueError: If *name* was already registered.
    """

    def deco(cls: Callable[[], SqlExecutor]) -> Callable[[], SqlExecutor]:
        if name in _EXECUTORS:
            raise ValueError(f"Executor '{name}' is already registered.")
        _EXECUTORS[name] = cls
        return cls

    return deco


def pick_executor(
    handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
) -> Optional[SqlExecutor]:
    """
    Return the first registered executor that supports the handle and rules.

    Executors whose supports() check raises are skipped rather than allowed
    to break the selection process.
    """
    if not sql_specs:
        # Nothing to push down.
        return None

    for ctor in _EXECUTORS.values():
        candidate = ctor()  # Instantiate the executor
        try:
            supported = candidate.supports(handle, sql_specs)
        except Exception:
            # Be conservative: ignore faulty executors.
            continue
        if supported:
            return candidate
    return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def register_default_executors() -> None:
    """
    Eagerly import built-in executors so their @register_executor
    decorators run and populate the registry.

    Import order matters: it fixes registry insertion order, which is the
    order pick_executor() tries candidates in.
    """
    # Local import triggers decorator side-effect; DuckDB is always available.
    from . import duckdb_sql  # noqa: F401

    # PostgreSQL executor (optional - requires psycopg)
    try:
        from . import postgres_sql  # noqa: F401
    except ImportError:
        pass  # psycopg not installed, skip postgres executor

    # SQL Server executor (optional - requires pymssql)
    try:
        from . import sqlserver_sql  # noqa: F401
    except ImportError:
        pass  # pymssql not installed, skip sqlserver executor
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# src/kontra/engine/executors/sqlserver_sql.py
|
|
2
|
+
"""
|
|
3
|
+
SQL Server SQL Executor - executes validation rules via SQL pushdown.
|
|
4
|
+
|
|
5
|
+
Uses DatabaseSqlExecutor base class for shared compile/execute logic.
|
|
6
|
+
|
|
7
|
+
Supports rules:
|
|
8
|
+
- not_null(column) - uses EXISTS (fast early termination)
|
|
9
|
+
- unique(column) - uses COUNT DISTINCT
|
|
10
|
+
- min_rows(threshold) - uses COUNT
|
|
11
|
+
- max_rows(threshold) - uses COUNT
|
|
12
|
+
- allowed_values(column, values) - uses SUM CASE
|
|
13
|
+
- freshness(column, max_age) - uses MAX
|
|
14
|
+
- range(column, min, max) - uses SUM CASE
|
|
15
|
+
- compare(left, right, op) - uses SUM CASE
|
|
16
|
+
- conditional_not_null(column, when) - uses SUM CASE
|
|
17
|
+
- conditional_range(column, when, min, max) - uses SUM CASE
|
|
18
|
+
|
|
19
|
+
NOT supported (falls back to Polars):
|
|
20
|
+
- regex(column, pattern) - PATINDEX uses LIKE wildcards, not regex
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from contextlib import contextmanager
|
|
26
|
+
from typing import Any, Dict, List, Tuple
|
|
27
|
+
|
|
28
|
+
from kontra.connectors.handle import DatasetHandle
|
|
29
|
+
from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
|
|
30
|
+
from kontra.connectors.detection import parse_table_reference, get_default_schema, SQLSERVER
|
|
31
|
+
|
|
32
|
+
from .database_base import DatabaseSqlExecutor
|
|
33
|
+
from .registry import register_executor
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@register_executor("sqlserver")
class SqlServerSqlExecutor(DatabaseSqlExecutor):
    """
    SQL Server SQL pushdown executor.

    Inherits compile() and execute() from DatabaseSqlExecutor.
    Provides SQL Server-specific connection and table handling.

    Note: regex is NOT supported because PATINDEX uses LIKE-style wildcards.
    Regex rules fall back to Polars execution.
    """

    DIALECT = "sqlserver"
    # Note: regex is NOT supported - PATINDEX uses LIKE wildcards, not regex.
    # But contains/starts_with/ends_with use LIKE, so they work!
    SUPPORTED_RULES = {
        "not_null", "unique", "min_rows", "max_rows",
        "allowed_values", "disallowed_values",
        "freshness", "range", "length",
        "contains", "starts_with", "ends_with",  # LIKE-based, works on SQL Server
        "compare", "conditional_not_null", "conditional_range",
        "custom_sql_check", "custom_agg"
    }

    @property
    def name(self) -> str:
        return "sqlserver"

    def _supports_scheme(self, scheme: str, handle: DatasetHandle) -> bool:
        """Check if this executor supports the given URI scheme."""
        # BYOC: check dialect for external connections
        if scheme == "byoc" and handle.dialect == "sqlserver":
            return handle.external_conn is not None

        # URI-based: handle mssql:// or sqlserver:// URIs
        return scheme in {"mssql", "sqlserver"}

    @contextmanager
    def _get_connection_ctx(self, handle: DatasetHandle):
        """
        Get a SQL Server connection context.

        For BYOC, yields the external connection directly (not owned by us,
        so it is not closed here).
        For URI-based, yields a new connection (owned and closed by the
        underlying get_connection context manager).

        Raises:
            ValueError: If the handle carries neither an external connection
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.external_conn is not None:
            yield handle.external_conn
        elif handle.db_params:
            with get_connection(handle.db_params) as conn:
                yield conn
        else:
            raise ValueError("Handle has neither external_conn nor db_params")

    def _get_table_reference(self, handle: DatasetHandle) -> str:
        """
        Get the fully-qualified table reference for SQL Server.

        Returns: [schema].[table] format.

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        # Delegate so the schema/table resolution logic lives in one place.
        schema, table = self._get_schema_and_table(handle)
        return f"{self._esc(schema)}.{self._esc(table)}"

    def _get_schema_and_table(self, handle: DatasetHandle) -> Tuple[str, str]:
        """
        Get schema and table name separately for custom SQL placeholder replacement.

        Returns: Tuple of (schema, table_name)

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.table_ref:
            _db, schema, table = parse_table_reference(handle.table_ref)
            # BYOC references may omit the schema; fall back to the dialect default.
            schema = schema or get_default_schema(SQLSERVER)
            return schema, table
        elif handle.db_params:
            params: SqlServerConnectionParams = handle.db_params
            return params.schema, params.table
        else:
            raise ValueError("Handle has neither table_ref nor db_params")

    def introspect(self, handle: DatasetHandle, **kwargs) -> Dict[str, Any]:
        """
        Introspect the SQL Server table for metadata.

        Returns:
            {"row_count": int, "available_cols": [...], "staging": None}

        Raises:
            ValueError: If the handle carries neither a table reference
                nor connection parameters.
        """
        # Escaped reference used inline in SQL; unescaped parts are bound as
        # parameters for the information_schema lookup below.
        table = self._get_table_reference(handle)
        schema, table_name = self._get_schema_and_table(handle)

        with self._get_connection_ctx(handle) as conn:
            cursor = conn.cursor()
            try:
                # Get row count
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cursor.fetchone()
                n = int(row_count[0]) if row_count else 0

                # Get column names (pymssql uses %s for parameters)
                cursor.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (schema, table_name),
                )
                cols = [row[0] for row in cursor.fetchall()]
            finally:
                cursor.close()

        return {"row_count": n, "available_cols": cols, "staging": None}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/__init__.py
|
|
2
|
+
from .base import BaseMaterializer
|
|
3
|
+
from .registry import (
|
|
4
|
+
pick_materializer,
|
|
5
|
+
register_default_materializers,
|
|
6
|
+
register_materializer,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"BaseMaterializer",
|
|
11
|
+
"pick_materializer",
|
|
12
|
+
"register_materializer",
|
|
13
|
+
"register_default_materializers",
|
|
14
|
+
]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/base.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from kontra.connectors.handle import DatasetHandle
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseMaterializer:
    """
    Minimal base class for materializers.

    A materializer knows how to:
      - load its source into a Polars DataFrame, optionally projecting columns;
      - report the source's column names without materializing data;
      - surface I/O diagnostics for observability.

    Subclasses implement schema() and to_polars(); io_debug() is optional.
    """

    # Identifier overridden by each concrete materializer.
    materializer_name: str = "unknown"

    def __init__(self, handle: DatasetHandle):
        """
        Initialize the materializer with a data source handle.

        Args:
            handle: The DatasetHandle containing the URI and fs_opts.
        """
        self.handle = handle

    def schema(self) -> List[str]:
        """Return column names without materializing data (best effort)."""
        raise NotImplementedError

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """Materialize directly as a Polars DataFrame."""
        raise NotImplementedError

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Return last I/O diagnostics for observability (or None)."""
        # Base implementation reports nothing; subclasses may override.
        return None
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/duckdb.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
import duckdb
|
|
10
|
+
|
|
11
|
+
# --- Kontra Imports ---
|
|
12
|
+
from kontra.engine.backends.duckdb_session import create_duckdb_connection
|
|
13
|
+
from kontra.engine.backends.duckdb_utils import (
|
|
14
|
+
esc_ident,
|
|
15
|
+
lit_str,
|
|
16
|
+
)
|
|
17
|
+
from kontra.connectors.handle import DatasetHandle
|
|
18
|
+
|
|
19
|
+
from .base import BaseMaterializer # Import from new base file
|
|
20
|
+
from .registry import register_materializer
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@register_materializer("duckdb")
class DuckDBMaterializer(BaseMaterializer):
    """
    Column-pruned materialization via DuckDB httpfs → Arrow → Polars.

    Guarantees:
    - **Format aware**: Parquet via read_parquet(), CSV via read_csv_auto().
    - **Projection**: SELECT only requested columns at source (true pruning).
    - **Low copy**: Arrow table handoff → Polars DataFrame.
    - **Remote support**: S3/HTTP via DuckDB httpfs (loaded in session factory).

    Scope:
    - I/O only. Does NOT execute rule SQL; the SQL executor handles pushdown.
    """

    def __init__(self, handle: DatasetHandle):
        super().__init__(handle)
        self.source = handle.uri
        # Diagnostics are captured only when KONTRA_IO_DEBUG is set.
        self._io_debug_enabled = bool(os.getenv("KONTRA_IO_DEBUG"))
        self._last_io_debug: Optional[Dict[str, Any]] = None
        self.con = create_duckdb_connection(self.handle)

    # ---------- Materializer API ----------

    def schema(self) -> List[str]:
        """
        Return column names without materializing data (best effort, format-aware).
        """
        reader = self._get_read_function()
        # LIMIT 0 lets DuckDB resolve the schema without reading rows.
        cursor = self.con.execute(
            f"SELECT * FROM {reader}({lit_str(self.source)}) LIMIT 0"
        )
        description = cursor.description
        if not description:
            return []
        return [entry[0] for entry in description]

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """
        Materialize the requested columns as a Polars DataFrame via Arrow.
        """
        # Route through Arrow for consistent, low-copy materialization;
        # the import doubles as an early availability check.
        import pyarrow as pa  # noqa: F401

        requested = list(columns or [])
        projection = (
            ", ".join(esc_ident(c) for c in requested) if columns else "*"
        )
        reader = self._get_read_function()

        started = time.perf_counter()
        sql = f"SELECT {projection} FROM {reader}({lit_str(self.source)})"
        arrow_table = self.con.execute(sql).fetch_arrow_table()
        finished = time.perf_counter()

        if self._io_debug_enabled:
            self._last_io_debug = {
                "materializer": "duckdb",
                "mode": "duckdb_project_to_arrow",
                "columns_requested": requested,
                "column_count": len(requested),
                "elapsed_ms": int((finished - started) * 1000),
            }
        else:
            self._last_io_debug = None

        return pl.from_arrow(arrow_table)

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Expose the diagnostics captured by the most recent to_polars() call."""
        return self._last_io_debug

    # ---------- Internals ----------

    def _get_read_function(self) -> str:
        """
        Return the correct DuckDB read function based on file format.

        Notes:
        - For CSV, we prefer DuckDB's read_csv_auto() for performance.
        - CSV options (delimiter, header, etc.) can be threaded from
          connector handle in future (TODO), but auto inference is robust
          for most standardized data lake dumps.
        """
        fmt = (self.handle.format or "").lower()
        if fmt == "csv":
            return "read_csv_auto"
        if fmt == "parquet":
            return "read_parquet"
        # Fallback: attempt format autodetection; Parquet is most common.
        return "read_parquet"
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from ...connectors.capabilities import ConnectorCapabilities as CC # re-export path differs when imported from engine
|
|
4
|
+
from .duckdb import DuckDBMaterializer
|
|
5
|
+
from .polars_connector import PolarsConnectorMaterializer
|
|
6
|
+
|
|
7
|
+
def is_s3_uri(val: str | None) -> bool:
    """Return True when *val* is a string with an ``s3://`` prefix (case-insensitive)."""
    if not isinstance(val, str):
        return False
    return val.lower().startswith("s3://")
|
|
9
|
+
|
|
10
|
+
class MaterializerFactory:
    """Selects a concrete materializer for a source based on connector capabilities."""

    @staticmethod
    def from_source(source: str, connector, caps: int, prefer_remote_pruning: bool):
        """
        Choose the best materializer for a given source and connector capabilities.

        Strategy (v1):
        - If remote S3 and we prefer pruning → DuckDBMaterializer (httpfs + Arrow)
        - Else → PolarsConnectorMaterializer (direct connector.load)

        Args:
            source: Source URI or path; only an "s3://" prefix triggers pruning.
            connector: Connector forwarded to the Polars fallback materializer.
            caps: Bitmask of ConnectorCapabilities flags.
            prefer_remote_pruning: Whether remote column pruning is preferred.
        """
        # NOTE(review): DuckDBMaterializer.__init__ expects a DatasetHandle
        # (it reads handle.uri and handle.format), but this factory passes the
        # raw `source` string — confirm whether callers pass a handle-like
        # object here or whether this path is dead code.
        if is_s3_uri(source) and prefer_remote_pruning and (caps & (CC.PUSHDOWN | CC.REMOTE_PARTIAL)):
            return DuckDBMaterializer(source)
        return PolarsConnectorMaterializer(source, connector)
|