kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,162 @@
1
+ # src/kontra/engine/executors/postgres_sql.py
2
+ """
3
+ PostgreSQL SQL Executor - executes validation rules via SQL pushdown.
4
+
5
+ Uses DatabaseSqlExecutor base class for shared compile/execute logic.
6
+
7
+ Supports rules:
8
+ - not_null(column) - uses EXISTS (fast early termination)
9
+ - unique(column) - uses COUNT DISTINCT
10
+ - min_rows(threshold) - uses COUNT
11
+ - max_rows(threshold) - uses COUNT
12
+ - allowed_values(column, values) - uses SUM CASE
13
+ - freshness(column, max_age) - uses MAX
14
+ - range(column, min, max) - uses SUM CASE
15
+ - regex(column, pattern) - uses ~ operator
16
+ - compare(left, right, op) - uses SUM CASE
17
+ - conditional_not_null(column, when) - uses SUM CASE
18
+ - conditional_range(column, when, min, max) - uses SUM CASE
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from contextlib import contextmanager
24
+ from typing import Any, Dict, List, Tuple
25
+
26
+ from kontra.connectors.handle import DatasetHandle
27
+ from kontra.connectors.postgres import PostgresConnectionParams, get_connection
28
+ from kontra.connectors.detection import parse_table_reference, get_default_schema, POSTGRESQL
29
+
30
+ from .database_base import DatabaseSqlExecutor
31
+ from .registry import register_executor
32
+
33
+
34
@register_executor("postgres")
class PostgresSqlExecutor(DatabaseSqlExecutor):
    """
    PostgreSQL SQL pushdown executor.

    Inherits compile() and execute() from DatabaseSqlExecutor.
    Provides PostgreSQL-specific connection and table handling.

    All schema/table resolution funnels through _get_schema_and_table()
    so BYOC and URI-based handles are interpreted in exactly one place.
    """

    DIALECT = "postgres"
    SUPPORTED_RULES = {
        "not_null", "unique", "min_rows", "max_rows",
        "allowed_values", "disallowed_values",
        "freshness", "range", "length",
        "regex", "contains", "starts_with", "ends_with",
        "compare", "conditional_not_null", "conditional_range",
        "custom_sql_check", "custom_agg"
    }

    @property
    def name(self) -> str:
        """Executor name used for registry lookups and diagnostics."""
        return "postgres"

    def _supports_scheme(self, scheme: str, handle: DatasetHandle) -> bool:
        """Check if this executor supports the given URI scheme."""
        # BYOC: check dialect for external connections
        if scheme == "byoc" and handle.dialect == "postgresql":
            return handle.external_conn is not None

        # URI-based: handle postgres:// URIs
        return scheme in {"postgres", "postgresql"}

    @contextmanager
    def _get_connection_ctx(self, handle: DatasetHandle):
        """
        Get a PostgreSQL connection context.

        For BYOC, yields the external connection directly (not owned by us,
        so it is not closed here).
        For URI-based, yields a new connection (owned by context manager).

        Raises:
            ValueError: if the handle carries neither an external connection
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.external_conn is not None:
            yield handle.external_conn
        elif handle.db_params:
            with get_connection(handle.db_params) as conn:
                yield conn
        else:
            raise ValueError("Handle has neither external_conn nor db_params")

    def _get_table_reference(self, handle: DatasetHandle) -> str:
        """
        Get the fully-qualified table reference for PostgreSQL.

        Returns: "schema"."table" format (identifiers escaped via _esc).
        """
        # Delegate to the single source of truth for schema/table resolution.
        schema, table = self._get_schema_and_table(handle)
        return f"{self._esc(schema)}.{self._esc(table)}"

    def _get_schema_and_table(self, handle: DatasetHandle) -> Tuple[str, str]:
        """
        Get schema and table name separately for custom SQL placeholder replacement.

        Returns: Tuple of (schema, table_name)

        Raises:
            ValueError: if the handle carries neither a table reference
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.table_ref:
            _db, schema, table = parse_table_reference(handle.table_ref)
            # BYOC references may omit the schema; fall back to the dialect default.
            schema = schema or get_default_schema(POSTGRESQL)
            return schema, table
        elif handle.db_params:
            params: PostgresConnectionParams = handle.db_params
            return params.schema, params.table
        else:
            raise ValueError("Handle has neither table_ref nor db_params")

    def _get_cursor(self, conn):
        """Return a new cursor from the connection."""
        return conn.cursor()

    def _close_cursor(self, cursor):
        """Close the cursor."""
        cursor.close()

    def introspect(self, handle: DatasetHandle, **kwargs) -> Dict[str, Any]:
        """
        Introspect the PostgreSQL table for metadata.

        Returns:
            {"row_count": int, "available_cols": [...], "staging": None}
        """
        # Resolve names once via the shared helpers instead of re-parsing
        # the handle inline (previously duplicated the BYOC/db_params branching).
        schema, table_name = self._get_schema_and_table(handle)
        table = self._get_table_reference(handle)

        with self._get_connection_ctx(handle) as conn:
            with conn.cursor() as cur:
                # Get row count
                cur.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cur.fetchone()
                n = int(row_count[0]) if row_count else 0

                # Get column names (parameterized to avoid SQL injection)
                cur.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (schema, table_name),
                )
                cols = [row[0] for row in cur.fetchall()]

        return {"row_count": n, "available_cols": cols, "staging": None}
@@ -0,0 +1,69 @@
1
+ # src/kontra/engine/executors/registry.py
2
+ from __future__ import annotations
3
+
4
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
5
+
6
+ from kontra.connectors.handle import DatasetHandle
7
+
8
+ if TYPE_CHECKING:
9
+ from .base import SqlExecutor
10
+
11
+
12
+ # Global registry: maps "executor_name" -> constructor()
13
+ _EXECUTORS: Dict[str, Callable[[], SqlExecutor]] = {}
14
+
15
+
16
+ def register_executor(name: str):
17
+ """
18
+ Decorator to register a SQL executor class.
19
+ """
20
+
21
+ def deco(cls: Callable[[], SqlExecutor]) -> Callable[[], SqlExecutor]:
22
+ if name in _EXECUTORS:
23
+ raise ValueError(f"Executor '{name}' is already registered.")
24
+ _EXECUTORS[name] = cls
25
+ return cls
26
+
27
+ return deco
28
+
29
+
30
def pick_executor(
    handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
) -> Optional[SqlExecutor]:
    """
    Return the first registered executor that supports *handle* and *sql_specs*.

    Args:
        handle: The dataset handle the rules would run against.
        sql_specs: Compiled rule specs that are candidates for SQL pushdown.

    Returns:
        An instantiated executor, or None when nothing can be pushed down
        or no registered executor accepts the handle.
    """
    if not sql_specs:
        # No rules eligible for pushdown — nothing to pick.
        return None

    for make_executor in _EXECUTORS.values():
        candidate = make_executor()
        try:
            supported = candidate.supports(handle, sql_specs)
        except Exception:
            # Be conservative: a misbehaving executor must not break selection.
            continue
        if supported:
            return candidate

    return None
49
+
50
+
51
def register_default_executors() -> None:
    """
    Import the built-in executor modules so their @register_executor
    decorators run and fill in the registry.

    DuckDB is always imported; the database executors are optional and
    skipped when their driver package is not installed.
    """
    # Always available: importing triggers the decorator side effect.
    from . import duckdb_sql  # noqa: F401

    # PostgreSQL executor (optional - requires psycopg)
    try:
        from . import postgres_sql  # noqa: F401
    except ImportError:
        # psycopg not installed; the postgres executor is simply never registered.
        pass

    # SQL Server executor (optional - requires pymssql)
    try:
        from . import sqlserver_sql  # noqa: F401
    except ImportError:
        # pymssql not installed; the sqlserver executor is simply never registered.
        pass
@@ -0,0 +1,163 @@
1
+ # src/kontra/engine/executors/sqlserver_sql.py
2
+ """
3
+ SQL Server SQL Executor - executes validation rules via SQL pushdown.
4
+
5
+ Uses DatabaseSqlExecutor base class for shared compile/execute logic.
6
+
7
+ Supports rules:
8
+ - not_null(column) - uses EXISTS (fast early termination)
9
+ - unique(column) - uses COUNT DISTINCT
10
+ - min_rows(threshold) - uses COUNT
11
+ - max_rows(threshold) - uses COUNT
12
+ - allowed_values(column, values) - uses SUM CASE
13
+ - freshness(column, max_age) - uses MAX
14
+ - range(column, min, max) - uses SUM CASE
15
+ - compare(left, right, op) - uses SUM CASE
16
+ - conditional_not_null(column, when) - uses SUM CASE
17
+ - conditional_range(column, when, min, max) - uses SUM CASE
18
+
19
+ NOT supported (falls back to Polars):
20
+ - regex(column, pattern) - PATINDEX uses LIKE wildcards, not regex
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from contextlib import contextmanager
26
+ from typing import Any, Dict, List, Tuple
27
+
28
+ from kontra.connectors.handle import DatasetHandle
29
+ from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
30
+ from kontra.connectors.detection import parse_table_reference, get_default_schema, SQLSERVER
31
+
32
+ from .database_base import DatabaseSqlExecutor
33
+ from .registry import register_executor
34
+
35
+
36
@register_executor("sqlserver")
class SqlServerSqlExecutor(DatabaseSqlExecutor):
    """
    SQL Server SQL pushdown executor.

    Inherits compile() and execute() from DatabaseSqlExecutor.
    Provides SQL Server-specific connection and table handling.

    All schema/table resolution funnels through _get_schema_and_table()
    so BYOC and URI-based handles are interpreted in exactly one place.

    Note: regex is NOT supported because PATINDEX uses LIKE-style wildcards.
    Regex rules fall back to Polars execution.
    """

    DIALECT = "sqlserver"
    # Note: regex is NOT supported - PATINDEX uses LIKE wildcards, not regex.
    # But contains/starts_with/ends_with use LIKE, so they work!
    SUPPORTED_RULES = {
        "not_null", "unique", "min_rows", "max_rows",
        "allowed_values", "disallowed_values",
        "freshness", "range", "length",
        "contains", "starts_with", "ends_with",  # LIKE-based, works on SQL Server
        "compare", "conditional_not_null", "conditional_range",
        "custom_sql_check", "custom_agg"
    }

    @property
    def name(self) -> str:
        """Executor name used for registry lookups and diagnostics."""
        return "sqlserver"

    def _supports_scheme(self, scheme: str, handle: DatasetHandle) -> bool:
        """Check if this executor supports the given URI scheme."""
        # BYOC: check dialect for external connections
        if scheme == "byoc" and handle.dialect == "sqlserver":
            return handle.external_conn is not None

        # URI-based: handle mssql:// or sqlserver:// URIs
        return scheme in {"mssql", "sqlserver"}

    @contextmanager
    def _get_connection_ctx(self, handle: DatasetHandle):
        """
        Get a SQL Server connection context.

        For BYOC, yields the external connection directly (not owned by us,
        so it is not closed here).
        For URI-based, yields a new connection (owned by context manager).

        Raises:
            ValueError: if the handle carries neither an external connection
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.external_conn is not None:
            yield handle.external_conn
        elif handle.db_params:
            with get_connection(handle.db_params) as conn:
                yield conn
        else:
            raise ValueError("Handle has neither external_conn nor db_params")

    def _get_table_reference(self, handle: DatasetHandle) -> str:
        """
        Get the fully-qualified table reference for SQL Server.

        Returns: [schema].[table] format (identifiers escaped via _esc).
        """
        # Delegate to the single source of truth for schema/table resolution.
        schema, table = self._get_schema_and_table(handle)
        return f"{self._esc(schema)}.{self._esc(table)}"

    def _get_schema_and_table(self, handle: DatasetHandle) -> Tuple[str, str]:
        """
        Get schema and table name separately for custom SQL placeholder replacement.

        Returns: Tuple of (schema, table_name)

        Raises:
            ValueError: if the handle carries neither a table reference
                nor connection parameters.
        """
        if handle.scheme == "byoc" and handle.table_ref:
            _db, schema, table = parse_table_reference(handle.table_ref)
            # BYOC references may omit the schema; fall back to the dialect default.
            schema = schema or get_default_schema(SQLSERVER)
            return schema, table
        elif handle.db_params:
            params: SqlServerConnectionParams = handle.db_params
            return params.schema, params.table
        else:
            raise ValueError("Handle has neither table_ref nor db_params")

    def introspect(self, handle: DatasetHandle, **kwargs) -> Dict[str, Any]:
        """
        Introspect the SQL Server table for metadata.

        Returns:
            {"row_count": int, "available_cols": [...], "staging": None}
        """
        # Resolve names once via the shared helpers instead of re-parsing
        # the handle inline (previously duplicated the BYOC/db_params branching).
        schema, table_name = self._get_schema_and_table(handle)
        table = self._get_table_reference(handle)

        with self._get_connection_ctx(handle) as conn:
            # NOTE: pymssql cursors are closed explicitly via try/finally
            # rather than `with`, matching the driver usage elsewhere here.
            cursor = conn.cursor()
            try:
                # Get row count
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row_count = cursor.fetchone()
                n = int(row_count[0]) if row_count else 0

                # Get column names (pymssql uses %s for parameters)
                cursor.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (schema, table_name),
                )
                cols = [row[0] for row in cursor.fetchall()]
            finally:
                cursor.close()

        return {"row_count": n, "available_cols": cols, "staging": None}
@@ -0,0 +1,14 @@
1
+ # src/kontra/engine/materializers/__init__.py
2
+ from .base import BaseMaterializer
3
+ from .registry import (
4
+ pick_materializer,
5
+ register_default_materializers,
6
+ register_materializer,
7
+ )
8
+
9
+ __all__ = [
10
+ "BaseMaterializer",
11
+ "pick_materializer",
12
+ "register_materializer",
13
+ "register_default_materializers",
14
+ ]
@@ -0,0 +1,42 @@
1
+ # src/kontra/engine/materializers/base.py
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import polars as pl
7
+
8
+ from kontra.connectors.handle import DatasetHandle
9
+
10
+
11
class BaseMaterializer:
    """
    Minimal base class for materializers.

    A materializer is responsible for:
    - Loading a source into a Polars DataFrame (with column projection)
    - Peeking the schema without materializing data
    - Reporting I/O diagnostics for observability
    """

    # Subclasses override this with their registry name.
    materializer_name: str = "unknown"

    def __init__(self, handle: DatasetHandle):
        """
        Store the data source handle for subclasses to consume.

        Args:
            handle: The DatasetHandle containing the URI and fs_opts.
        """
        self.handle = handle

    def schema(self) -> List[str]:
        """Return column names without materializing data (best effort)."""
        raise NotImplementedError

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """Materialize directly as a Polars DataFrame."""
        raise NotImplementedError

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Return last I/O diagnostics for observability (or None)."""
        # Base class has nothing to report; subclasses may override.
        return None
@@ -0,0 +1,110 @@
1
+ # src/kontra/engine/materializers/duckdb.py
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import time
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import polars as pl
9
+ import duckdb
10
+
11
+ # --- Kontra Imports ---
12
+ from kontra.engine.backends.duckdb_session import create_duckdb_connection
13
+ from kontra.engine.backends.duckdb_utils import (
14
+ esc_ident,
15
+ lit_str,
16
+ )
17
+ from kontra.connectors.handle import DatasetHandle
18
+
19
+ from .base import BaseMaterializer # Import from new base file
20
+ from .registry import register_materializer
21
+
22
+
23
@register_materializer("duckdb")
class DuckDBMaterializer(BaseMaterializer):
    """
    Column-pruned materialization via DuckDB httpfs → Arrow → Polars.

    Guarantees:
    - **Format aware**: Parquet via read_parquet(), CSV via read_csv_auto().
    - **Projection**: SELECT only requested columns at source (true pruning).
    - **Low copy**: Arrow table handoff → Polars DataFrame.
    - **Remote support**: S3/HTTP via DuckDB httpfs (loaded in session factory).

    Scope:
    - I/O only. Does NOT execute rule SQL; the SQL executor handles pushdown.
    """

    def __init__(self, handle: DatasetHandle):
        super().__init__(handle)
        self.source = handle.uri
        # Diagnostics are opt-in via the KONTRA_IO_DEBUG env var.
        self._io_debug_enabled = bool(os.getenv("KONTRA_IO_DEBUG"))
        self._last_io_debug: Optional[Dict[str, Any]] = None
        self.con = create_duckdb_connection(self.handle)

    # ---------- Materializer API ----------

    def schema(self) -> List[str]:
        """
        Return column names without materializing data (best effort, format-aware).
        """
        # LIMIT 0 probes the schema without fetching any rows.
        reader = self._get_read_function()
        cursor = self.con.execute(
            f"SELECT * FROM {reader}({lit_str(self.source)}) LIMIT 0"
        )
        if not cursor.description:
            return []
        return [descriptor[0] for descriptor in cursor.description]

    def to_polars(self, columns: Optional[List[str]]) -> pl.DataFrame:
        """
        Materialize the requested columns as a Polars DataFrame via Arrow.
        """
        # Route through Arrow for consistent, low-copy materialization.
        import pyarrow as pa  # noqa: F401

        if columns:
            projection = ", ".join(esc_ident(name) for name in columns)
        else:
            projection = "*"
        reader = self._get_read_function()

        started = time.perf_counter()
        sql = f"SELECT {projection} FROM {reader}({lit_str(self.source)})"
        arrow_table = self.con.execute(sql).fetch_arrow_table()
        finished = time.perf_counter()

        if not self._io_debug_enabled:
            self._last_io_debug = None
        else:
            self._last_io_debug = {
                "materializer": "duckdb",
                "mode": "duckdb_project_to_arrow",
                "columns_requested": list(columns or []),
                "column_count": len(columns or []),
                "elapsed_ms": int((finished - started) * 1000),
            }

        return pl.from_arrow(arrow_table)

    def io_debug(self) -> Optional[Dict[str, Any]]:
        """Return diagnostics from the last to_polars() call (or None)."""
        return self._last_io_debug

    # ---------- Internals ----------

    def _get_read_function(self) -> str:
        """
        Return the correct DuckDB read function based on file format.

        Notes:
            - For CSV, we prefer DuckDB's read_csv_auto() for performance.
            - CSV options (delimiter, header, etc.) can be threaded from
              connector handle in future (TODO), but auto inference is robust
              for most standardized data lake dumps.
        """
        fmt = (self.handle.format or "").lower()
        if fmt == "csv":
            return "read_csv_auto"
        # Parquet both for the explicit "parquet" format and as the
        # fallback for unknown formats (Parquet is most common).
        return "read_parquet"
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+ from typing import Optional
3
+ from ...connectors.capabilities import ConnectorCapabilities as CC # re-export path differs when imported from engine
4
+ from .duckdb import DuckDBMaterializer
5
+ from .polars_connector import PolarsConnectorMaterializer
6
+
7
def is_s3_uri(val: str | None) -> bool:
    """Return True when *val* is a string with an s3:// prefix (case-insensitive)."""
    if not isinstance(val, str):
        return False
    return val.lower().startswith("s3://")
9
+
10
class MaterializerFactory:
    @staticmethod
    def from_source(source: str, connector, caps: int, prefer_remote_pruning: bool):
        """
        Choose the best materializer for a given source and connector capabilities.

        Strategy (v1):
        - If remote S3 and we prefer pruning → DuckDBMaterializer (httpfs + Arrow)
        - Else → PolarsConnectorMaterializer (direct connector.load)

        Args:
            source: URI or path of the dataset to load.
            connector: Connector object handed to the Polars materializer.
            caps: Bitmask of ConnectorCapabilities flags.
            prefer_remote_pruning: When True, favor remote column pruning.
        """
        # NOTE(review): DuckDBMaterializer.__init__ expects a DatasetHandle
        # (it reads handle.uri), but here it receives the raw `source` string.
        # Confirm the intended argument — this likely needs a handle built
        # from `source` rather than the string itself.
        if is_s3_uri(source) and prefer_remote_pruning and (caps & (CC.PUSHDOWN | CC.REMOTE_PARTIAL)):
            return DuckDBMaterializer(source)
        return PolarsConnectorMaterializer(source, connector)