kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/polars_connector.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
PolarsConnectorMaterializer
|
|
6
|
+
|
|
7
|
+
Purpose
|
|
8
|
+
-------
|
|
9
|
+
Local, dependency-light materializer that produces a Polars DataFrame from
|
|
10
|
+
file-based datasets (Parquet/CSV). Supports column projection. Does *not*
|
|
11
|
+
require the legacy connectors package.
|
|
12
|
+
|
|
13
|
+
Design
|
|
14
|
+
------
|
|
15
|
+
- First tries legacy `ConnectorFactory` (for back-compat if present).
|
|
16
|
+
- Otherwise uses native Polars lazy scans:
|
|
17
|
+
- scan_* → optional .select(projection) → collect()
|
|
18
|
+
|
|
19
|
+
Notes
|
|
20
|
+
-----
|
|
21
|
+
Polars ≥ 1.34 removed `columns=` from scan_*; apply projection via `.select()`.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from typing import Any, Dict, List, Optional
|
|
25
|
+
|
|
26
|
+
import polars as pl
|
|
27
|
+
|
|
28
|
+
from kontra.connectors.handle import DatasetHandle
|
|
29
|
+
from .base import BaseMaterializer
|
|
30
|
+
from .registry import register_materializer
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _infer_format(uri: str, explicit: Optional[str]) -> str:
|
|
34
|
+
"""Resolve file format from explicit handle.format or file extension."""
|
|
35
|
+
if explicit:
|
|
36
|
+
return explicit.lower()
|
|
37
|
+
low = uri.lower()
|
|
38
|
+
if low.endswith(".parquet"):
|
|
39
|
+
return "parquet"
|
|
40
|
+
if low.endswith(".csv"):
|
|
41
|
+
return "csv"
|
|
42
|
+
return ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@register_materializer("polars-connector")
class PolarsConnectorMaterializer(BaseMaterializer):
    """
    Minimal, deterministic materializer for local Parquet/CSV files.

    Responsibilities
    ----------------
    - Cheap schema peek (column names only)
    - DataFrame materialization with optional column projection
    - Stateless: no side effects beyond reading the file
    """

    name = "polars-connector"

    def __init__(self, handle: DatasetHandle):
        super().__init__(handle)
        # Kept only so this materializer mirrors the duckdb one's attributes.
        self._io_debug: Optional[Dict[str, Any]] = None

    # ------------------------------------------------------------------ #
    # Introspection
    # ------------------------------------------------------------------ #

    def schema(self) -> List[str]:
        """
        Return column names using a lazy scan. Never raises — empty list on failure.
        """
        uri = self.handle.uri
        fmt = _infer_format(uri, getattr(self.handle, "format", None))
        scanners = {"parquet": pl.scan_parquet, "csv": pl.scan_csv}
        scan = scanners.get(fmt)
        if scan is None:
            return []
        try:
            return list(scan(uri).collect_schema().names())
        except Exception:
            return []

    # ------------------------------------------------------------------ #
    # Materialization
    # ------------------------------------------------------------------ #

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """
        Materialize the dataset into a Polars DataFrame.

        Strategy
        --------
        1) Attempt the legacy connectors path (if installed) to preserve behavior.
        2) Otherwise, native Polars scan with projection via `.select()`.
        """
        # --- Legacy path (optional/back-compat) --------------------------------
        try:
            from kontra.connectors.factory import ConnectorFactory  # type: ignore

            legacy = ConnectorFactory.from_source(self.handle.uri)
            # The legacy API accepts `columns=` (best-effort).
            return legacy.load(self.handle.uri, columns=columns)
        except (ImportError, ModuleNotFoundError):
            pass  # legacy connectors unavailable -> fall through to native path

        # --- Native Polars path -------------------------------------------------
        uri = self.handle.uri
        fmt = _infer_format(uri, getattr(self.handle, "format", None))

        if fmt == "parquet":
            frame = pl.scan_parquet(uri)
        elif fmt == "csv":
            # Extend with CSV options here if the data requires them
            # (delimiter, nulls, dtypes).
            frame = pl.scan_csv(uri)
        else:
            raise IOError(f"Unsupported format for PolarsConnectorMaterializer: {uri}")

        if columns:
            frame = frame.select([pl.col(c) for c in columns])

        # NOTE: streaming=True is deprecated; the default engine suffices here.
        return frame.collect()

    # ------------------------------------------------------------------ #
    # Diagnostics
    # ------------------------------------------------------------------ #

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Reserved hook for I/O diagnostics (none for this materializer)."""
        return None
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/postgres.py
|
|
2
|
+
"""
|
|
3
|
+
PostgreSQL Materializer - loads PostgreSQL tables to Polars DataFrames.
|
|
4
|
+
|
|
5
|
+
Uses psycopg3's efficient binary COPY protocol for streaming data.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
import polars as pl
|
|
15
|
+
|
|
16
|
+
from kontra.connectors.handle import DatasetHandle
|
|
17
|
+
from kontra.connectors.postgres import PostgresConnectionParams, get_connection
|
|
18
|
+
from kontra.connectors.detection import parse_table_reference, get_default_schema, POSTGRESQL
|
|
19
|
+
from contextlib import contextmanager
|
|
20
|
+
|
|
21
|
+
from .base import BaseMaterializer
|
|
22
|
+
from .registry import register_materializer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@contextmanager
def _get_connection_ctx(handle: DatasetHandle):
    """
    Yield a live connection for either a BYOC or a URI-based handle.

    BYOC handles yield the caller-supplied connection directly (its
    lifecycle stays with the caller); URI-based handles open a fresh
    connection owned by this context manager.
    """
    external = handle.external_conn if handle.scheme == "byoc" else None
    if external is not None:
        # Not ours to close: the external connection belongs to the caller.
        yield external
    elif handle.db_params:
        # Connection lifetime is scoped to this context.
        with get_connection(handle.db_params) as conn:
            yield conn
    else:
        raise ValueError("Handle has neither external_conn nor db_params")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@register_materializer("postgres")
class PostgresMaterializer(BaseMaterializer):
    """
    Materialize PostgreSQL tables as Polars DataFrames with column projection.

    Features:
    - Efficient data loading via psycopg3
    - Column projection at source (SELECT only needed columns)
    - Binary protocol for faster transfers (when available)
    - BYOC (Bring Your Own Connection) support
    """

    def __init__(self, handle: DatasetHandle):
        super().__init__(handle)

        self._is_byoc = handle.scheme == "byoc" and handle.external_conn is not None

        if self._is_byoc:
            # BYOC: the handle's table reference tells us where to read.
            if not handle.table_ref:
                raise ValueError("BYOC handle missing table_ref")
            _db, schema, table = parse_table_reference(handle.table_ref)
            self._schema_name = schema or get_default_schema(POSTGRESQL)
            self._table_name = table
            self._qualified_table = (
                f"{_esc_ident(self._schema_name)}.{_esc_ident(self._table_name)}"
            )
        elif handle.db_params:
            # URI-based: connection params carry schema/table/qualified name.
            self.params: PostgresConnectionParams = handle.db_params
            self._schema_name = self.params.schema
            self._table_name = self.params.table
            self._qualified_table = self.params.qualified_table
        else:
            raise ValueError("PostgreSQL handle missing db_params or external_conn")

        self._io_debug_enabled = bool(os.getenv("KONTRA_IO_DEBUG"))
        self._last_io_debug: Optional[Dict[str, Any]] = None

    def schema(self) -> List[str]:
        """Return column names without loading data."""
        with _get_connection_ctx(self.handle) as conn:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (self._schema_name, self._table_name),
                )
                return [row[0] for row in cur.fetchall()]

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """
        Load table data as a Polars DataFrame with optional column projection.

        Supports both URI-based connections (handle.db_params) and
        BYOC connections (handle.external_conn).

        Args:
            columns: List of columns to load. If None, loads all columns.

        Returns:
            Polars DataFrame with the requested columns.
        """
        started = time.perf_counter()

        # Project at source: SELECT only what was asked for.
        cols_sql = ", ".join(_esc_ident(c) for c in columns) if columns else "*"
        query = f"SELECT {cols_sql} FROM {self._qualified_table}"

        with _get_connection_ctx(self.handle) as conn:
            with conn.cursor() as cur:
                cur.execute(query)
                # Full fetch; chunked loading would be needed for huge tables.
                rows = cur.fetchall()
                col_names = [d[0] for d in cur.description] if cur.description else []

        fetched = time.perf_counter()

        if rows:
            df = pl.DataFrame(rows, schema=col_names, orient="row")
        else:
            # Zero rows: keep the column names; dtypes default to Utf8 since
            # nothing can be inferred from an empty result here.
            df = pl.DataFrame(schema={name: pl.Utf8 for name in col_names})

        if self._io_debug_enabled:
            self._last_io_debug = {
                "materializer": "postgres",
                "mode": "psycopg_fetch" if not self._is_byoc else "byoc_fetch",
                "table": self._qualified_table,
                "columns_requested": list(columns or []),
                "column_count": len(columns or col_names),
                "row_count": len(rows) if rows else 0,
                "elapsed_ms": int((fetched - started) * 1000),
            }
        else:
            self._last_io_debug = None

        return df

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Timing/row-count diagnostics from the last to_polars call, or None."""
        return self._last_io_debug
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _esc_ident(name: str) -> str:
|
|
155
|
+
"""Escape a PostgreSQL identifier (column/table name)."""
|
|
156
|
+
# Double any internal quotes and wrap in quotes
|
|
157
|
+
return '"' + name.replace('"', '""') + '"'
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/registry.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import TYPE_CHECKING, Callable, Dict, List
|
|
5
|
+
|
|
6
|
+
from kontra.connectors.handle import DatasetHandle
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
# Import from the new base file
|
|
10
|
+
from .base import BaseMaterializer as Materializer
|
|
11
|
+
from .duckdb import DuckDBMaterializer # noqa: F401
|
|
12
|
+
from .polars_connector import PolarsConnectorMaterializer # noqa: F401
|
|
13
|
+
from .postgres import PostgresMaterializer # noqa: F401
|
|
14
|
+
from .sqlserver import SqlServerMaterializer # noqa: F401
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Registry: materializer_name -> ctor(handle) function.
# Populated as a side effect of the @register_materializer decorator below.
_MATS: Dict[str, Callable[[DatasetHandle], Materializer]] = {}
# Simple order for picking when multiple can handle a handle
# (append-only registration order; not consulted by pick_materializer in this module).
_ORDER: List[str] = []
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def register_materializer(name: str):
    """
    Class decorator that registers a materializer under a stable name.

    The decorated class must implement the Materializer protocol.
    Duplicate names are rejected loudly to catch accidental
    double-registration at import time.
    """

    def _register(
        cls: Callable[[DatasetHandle], Materializer]
    ) -> Callable[[DatasetHandle], Materializer]:
        if name in _MATS:
            raise ValueError(f"Materializer '{name}' is already registered.")
        _MATS[name] = cls
        if name not in _ORDER:
            _ORDER.append(name)
        # Friendly label surfaced in stats.io diagnostics.
        cls.materializer_name = name
        return cls

    return _register
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def pick_materializer(handle: DatasetHandle) -> Materializer:
    """
    Choose the best materializer for the given dataset handle.

    Policy (v1.4 - BYOC support):
    - BYOC handles use the materializer matching their dialect.
    - PostgreSQL URIs use the PostgreSQL materializer.
    - SQL Server URIs use the SQL Server materializer.
    - Remote files (s3, http) with known formats use DuckDB materializer.
    - Otherwise, fall back to PolarsConnector materializer.

    This logic is INDEPENDENT of the projection flag.

    Raises:
        RuntimeError: when the required materializer is not registered
            (with an install hint) or the BYOC dialect is unsupported.
    """

    def _require(key: str, hint: str) -> Materializer:
        # Look up a registered ctor or fail with an install hint.
        # (Previously this lookup-or-raise was copy-pasted three times.)
        ctor = _MATS.get(key)
        if ctor is None:
            raise RuntimeError(hint)
        return ctor(handle)

    pg_hint = (
        "PostgreSQL materializer not registered. "
        "Ensure psycopg is installed: pip install 'psycopg[binary]'"
    )
    mssql_hint = (
        "SQL Server materializer not registered. "
        "Ensure pymssql is installed: pip install pymssql"
    )

    # BYOC: route based on dialect
    if handle.scheme == "byoc":
        if handle.dialect == "postgresql":
            return _require("postgres", pg_hint)
        if handle.dialect == "sqlserver":
            return _require("sqlserver", mssql_hint)
        raise RuntimeError(
            f"Unsupported BYOC dialect: {handle.dialect}. "
            "Supported: postgresql, sqlserver"
        )

    # PostgreSQL / SQL Server URIs: dedicated materializers
    if handle.scheme in ("postgres", "postgresql"):
        return _require("postgres", pg_hint)
    if handle.scheme in ("mssql", "sqlserver"):
        return _require("sqlserver", mssql_hint)

    # Remote files with known formats: use DuckDB for efficient I/O.
    # Includes S3, HTTP(S), and Azure (ADLS Gen2, Azure Blob).
    # If duckdb is not registered we deliberately fall through to the
    # generic fallback below (preserves original behavior).
    is_remote = handle.scheme in ("s3", "http", "https", "abfs", "abfss", "az")
    is_known_format = handle.format in ("parquet", "csv")

    if is_remote and is_known_format:
        ctor = _MATS.get("duckdb")
        if ctor:
            return ctor(handle)

    # Fallback for local files or unknown formats
    ctor = _MATS.get("polars-connector")
    if not ctor:
        raise RuntimeError(
            "No default materializer registered (polars-connector missing)"
        )
    return ctor(handle)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def register_default_materializers() -> None:
    """
    Eagerly import built-in materializers so their @register_materializer
    decorators run and populate the registry.

    The duckdb and polars-connector materializers are imported
    unconditionally; the database materializers are optional and skipped
    silently when their driver package is not installed.
    """
    # Local imports to trigger decorator side-effects
    from . import duckdb  # noqa: F401
    from . import polars_connector  # noqa: F401

    # PostgreSQL materializer (optional - requires psycopg)
    try:
        from . import postgres  # noqa: F401
    except ImportError:
        pass  # psycopg not installed, skip postgres materializer

    # SQL Server materializer (optional - requires pymssql)
    try:
        from . import sqlserver  # noqa: F401
    except ImportError:
        pass  # pymssql not installed, skip sqlserver materializer
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# src/kontra/engine/materializers/sqlserver.py
|
|
2
|
+
"""
|
|
3
|
+
SQL Server Materializer - loads SQL Server tables to Polars DataFrames.
|
|
4
|
+
|
|
5
|
+
Uses pymssql for database connectivity.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
import polars as pl
|
|
15
|
+
|
|
16
|
+
from kontra.connectors.handle import DatasetHandle
|
|
17
|
+
from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
|
|
18
|
+
from kontra.connectors.detection import parse_table_reference, get_default_schema, SQLSERVER
|
|
19
|
+
from contextlib import contextmanager
|
|
20
|
+
|
|
21
|
+
from .base import BaseMaterializer
|
|
22
|
+
from .registry import register_materializer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@contextmanager
def _get_connection_ctx(handle: DatasetHandle):
    """
    Yield a live connection for either a BYOC or a URI-based handle.

    BYOC handles yield the caller-supplied connection directly (its
    lifecycle stays with the caller); URI-based handles open a fresh
    connection owned by this context manager.
    """
    external = handle.external_conn if handle.scheme == "byoc" else None
    if external is not None:
        # Not ours to close: the external connection belongs to the caller.
        yield external
    elif handle.db_params:
        # Connection lifetime is scoped to this context.
        with get_connection(handle.db_params) as conn:
            yield conn
    else:
        raise ValueError("Handle has neither external_conn nor db_params")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@register_materializer("sqlserver")
class SqlServerMaterializer(BaseMaterializer):
    """
    Materialize SQL Server tables as Polars DataFrames with column projection.

    Features:
    - Efficient data loading via pymssql
    - Column projection at source (SELECT only needed columns)
    - BYOC (Bring Your Own Connection) support
    """

    materializer_name = "sqlserver"

    def __init__(self, handle: DatasetHandle):
        super().__init__(handle)

        self._is_byoc = handle.scheme == "byoc" and handle.external_conn is not None

        if self._is_byoc:
            # BYOC: get table info from handle
            if not handle.table_ref:
                raise ValueError("BYOC handle missing table_ref")
            _db, schema, table = parse_table_reference(handle.table_ref)
            self._schema_name = schema or get_default_schema(SQLSERVER)
            self._table_name = table
        elif handle.db_params:
            # URI-based: use params
            self.params: SqlServerConnectionParams = handle.db_params
            self._schema_name = self.params.schema
            self._table_name = self.params.table
        else:
            raise ValueError("SQL Server handle missing db_params or external_conn")

        # FIX: build the qualified name via _esc_ident instead of raw
        # f-string brackets — a ']' inside an identifier would previously
        # have produced broken SQL. Output is identical for normal names
        # (e.g. [dbo].[orders]); this also matches how column names are
        # escaped in to_polars.
        self._qualified_table = (
            f"{_esc_ident(self._schema_name)}.{_esc_ident(self._table_name)}"
        )

        self._io_debug_enabled = bool(os.getenv("KONTRA_IO_DEBUG"))
        self._last_io_debug: Optional[Dict[str, Any]] = None

    def schema(self) -> List[str]:
        """Return column names without loading data."""
        with _get_connection_ctx(self.handle) as conn:
            cursor = conn.cursor()
            # pymssql uses %s as placeholder (pyodbc uses ?)
            cursor.execute(
                """
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                ORDER BY ordinal_position
                """,
                (self._schema_name, self._table_name),
            )
            return [row[0] for row in cursor.fetchall()]

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """
        Load table data as a Polars DataFrame with optional column projection.

        Supports both URI-based connections (handle.db_params) and
        BYOC connections (handle.external_conn).

        Args:
            columns: List of columns to load. If None, loads all columns.

        Returns:
            Polars DataFrame with the requested columns.
        """
        t0 = time.perf_counter()

        # Build column list for SELECT (identifiers are bracket-escaped)
        if columns:
            cols_sql = ", ".join(_esc_ident(c) for c in columns)
        else:
            cols_sql = "*"

        query = f"SELECT {cols_sql} FROM {self._qualified_table}"

        with _get_connection_ctx(self.handle) as conn:
            cursor = conn.cursor()
            cursor.execute(query)
            # Fetch all rows - for large tables, consider chunked loading
            rows = cursor.fetchall()
            col_names = [desc[0] for desc in cursor.description] if cursor.description else []

        t1 = time.perf_counter()

        # Convert to Polars DataFrame
        if rows:
            df = pl.DataFrame(rows, schema=col_names, orient="row")
        else:
            # Empty result: keep column names; dtypes default to Utf8 since
            # nothing can be inferred from zero rows here.
            df = pl.DataFrame(schema={name: pl.Utf8 for name in col_names})

        if self._io_debug_enabled:
            self._last_io_debug = {
                "materializer": "sqlserver",
                "mode": "pymssql_fetch" if not self._is_byoc else "byoc_fetch",
                "table": self._qualified_table,
                "columns_requested": list(columns or []),
                "column_count": len(columns or col_names),
                "row_count": len(rows) if rows else 0,
                "elapsed_ms": int((t1 - t0) * 1000),
            }
        else:
            self._last_io_debug = None

        return df

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Timing/row-count diagnostics from the last to_polars call, or None."""
        return self._last_io_debug
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _esc_ident(name: str) -> str:
|
|
157
|
+
"""Escape a SQL Server identifier (column/table name)."""
|
|
158
|
+
# SQL Server uses [brackets] for quoting identifiers
|
|
159
|
+
# Double any internal brackets
|
|
160
|
+
return "[" + name.replace("]", "]]") + "]"
|
kontra/engine/result.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# src/kontra/engine/result.py
|
|
2
|
+
from typing import List, Dict, Any
|
|
3
|
+
from dataclasses import dataclass, asdict
|
|
4
|
+
|
|
5
|
+
@dataclass
class ValidationResult:
    """Outcome of validating a single dataset."""

    # Name/identifier of the validated dataset.
    dataset: str
    # One entry per executed rule.
    results: List[Dict[str, Any]]
    # Aggregate run info; callers read a boolean "passed" key from it.
    summary: Dict[str, Any]

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict (JSON-serializable) view of this result."""
        return asdict(self)

    def passed(self) -> bool:
        """Whether the summary marks this run as passed (False when absent)."""
        return self.summary.get("passed", False)
|