kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# src/kontra/engine/executors/__init__.py
# Public surface of the executors package: the SqlExecutor protocol plus the
# registry helpers used to register and pick concrete executor implementations.
from .base import SqlExecutor
from .registry import (
    pick_executor,
    register_default_executors,
    register_executor,
)

# Re-export for convenience
__all__ = [
    "SqlExecutor",
    "pick_executor",
    "register_executor",
    "register_default_executors",
]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# src/kontra/engine/executors/base.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, List, Protocol
|
|
5
|
+
|
|
6
|
+
from kontra.connectors.handle import DatasetHandle
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SqlExecutor(Protocol):
    """
    Protocol for a pluggable, SQL-based rule executor.

    An executor is responsible for:
    1. Reporting if it can handle a given data source and rule set.
    2. Compiling a list of Kontra rules into a single SQL query.
    3. Executing that query and returning results in the Kontra format.
    4. (Optional) Introspecting the data source for metadata.

    Implementations are matched structurally (``typing.Protocol``); they do
    not need to inherit from this class.
    """

    # Registry/display name; concrete executors override this.
    name: str = "sql_executor"

    def supports(
        self, handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
    ) -> bool:
        """
        Return True if this executor can run against the given handle
        and supports at least one of the provided SQL-compatible rules.
        """
        ...

    def compile(self, sql_specs: List[Dict[str, Any]]) -> Any:
        """
        Compile the list of rule specs into a native, executable query plan
        (e.g., a SQL string).

        The return value is opaque to callers and is passed unchanged to
        ``execute``.
        """
        ...

    def execute(self, handle: DatasetHandle, compiled_plan: Any) -> Dict[str, Any]:
        """
        Execute the compiled plan against the data in the handle.
        Must return a dict: {"results": [...]}
        """
        ...

    def introspect(self, handle: DatasetHandle) -> Dict[str, Any]:
        """
        Perform lightweight introspection (e.g., row count, column names).
        Must return a dict: {"row_count": int, "available_cols": list[str]}
        """
        ...
|
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
# src/kontra/engine/executors/database_base.py
|
|
2
|
+
"""
|
|
3
|
+
Base class for database SQL executors (PostgreSQL, SQL Server).
|
|
4
|
+
|
|
5
|
+
This module provides shared implementation for compile() and execute() methods,
|
|
6
|
+
reducing code duplication between database-specific executors.
|
|
7
|
+
|
|
8
|
+
Each subclass must define:
|
|
9
|
+
- DIALECT: "postgres" or "sqlserver"
|
|
10
|
+
- SUPPORTED_RULES: Set of rule kinds this executor supports
|
|
11
|
+
- _get_connection_ctx(): Connection context manager
|
|
12
|
+
- _get_table_reference(): Fully-qualified table reference
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from contextlib import contextmanager
|
|
19
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
20
|
+
|
|
21
|
+
from kontra.connectors.handle import DatasetHandle
|
|
22
|
+
from kontra.engine.sql_utils import (
|
|
23
|
+
esc_ident,
|
|
24
|
+
agg_unique,
|
|
25
|
+
agg_min_rows,
|
|
26
|
+
agg_max_rows,
|
|
27
|
+
agg_allowed_values,
|
|
28
|
+
agg_disallowed_values,
|
|
29
|
+
agg_freshness,
|
|
30
|
+
agg_range,
|
|
31
|
+
agg_length,
|
|
32
|
+
agg_regex,
|
|
33
|
+
agg_contains,
|
|
34
|
+
agg_starts_with,
|
|
35
|
+
agg_ends_with,
|
|
36
|
+
agg_compare,
|
|
37
|
+
agg_conditional_not_null,
|
|
38
|
+
agg_conditional_range,
|
|
39
|
+
exists_not_null,
|
|
40
|
+
results_from_row,
|
|
41
|
+
Dialect,
|
|
42
|
+
)
|
|
43
|
+
from kontra.engine.sql_validator import validate_sql, replace_table_placeholder, to_count_query
|
|
44
|
+
from kontra.logging import get_logger
|
|
45
|
+
|
|
46
|
+
from .base import SqlExecutor
|
|
47
|
+
|
|
48
|
+
_logger = get_logger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DatabaseSqlExecutor(SqlExecutor, ABC):
|
|
52
|
+
"""
|
|
53
|
+
Abstract base class for database-backed SQL executors.
|
|
54
|
+
|
|
55
|
+
Provides shared implementation for compile() and execute() methods.
|
|
56
|
+
Subclasses must implement dialect-specific connection and table handling.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
# Subclasses must define these
|
|
60
|
+
DIALECT: Dialect
|
|
61
|
+
SUPPORTED_RULES: Set[str]
|
|
62
|
+
|
|
63
|
+
    @property
    @abstractmethod
    def name(self) -> str:
        """Executor name for registry."""
        ...
|
|
68
|
+
|
|
69
|
+
    # NOTE(review): abc docs recommend @abstractmethod as the *innermost*
    # decorator when combined with other descriptors; this outer placement
    # still marks the method abstract in CPython — confirm intentional.
    @abstractmethod
    @contextmanager
    def _get_connection_ctx(self, handle: DatasetHandle):
        """
        Get a database connection context manager.

        For BYOC, yields the external connection directly.
        For URI-based, yields a new owned connection.
        """
        ...
|
|
79
|
+
|
|
80
|
+
    @abstractmethod
    def _get_table_reference(self, handle: DatasetHandle) -> str:
        """
        Get the fully-qualified table reference for the handle.

        Used by execute() to build the FROM clause of generated queries.

        Returns: "schema.table" format with proper escaping.
        """
        ...
|
|
88
|
+
|
|
89
|
+
    @abstractmethod
    def _supports_scheme(self, scheme: str, handle: DatasetHandle) -> bool:
        """
        Check if this executor supports the given URI scheme.

        Called by supports() before any rule-kind filtering.

        Args:
            scheme: The URI scheme (lowercase)
            handle: The dataset handle for additional context (e.g., dialect)

        Returns:
            True if this executor can handle the scheme
        """
        ...
|
|
102
|
+
|
|
103
|
+
    def _esc(self, name: str) -> str:
        """Escape an identifier for this dialect (delegates to esc_ident)."""
        return esc_ident(name, self.DIALECT)
|
|
106
|
+
|
|
107
|
+
    def _get_schema_and_table(self, handle: DatasetHandle) -> Tuple[str, str]:
        """
        Get schema and table name separately (for custom SQL placeholder replacement).

        Deliberately a soft hook rather than an @abstractmethod: callers
        (see _execute_custom_sql_queries) catch NotImplementedError and skip
        custom SQL rather than failing the whole run.

        Returns:
            Tuple of (schema, table_name)
        """
        # Default implementation - subclasses should override
        # This extracts from the table reference or connection params
        raise NotImplementedError("Subclass must implement _get_schema_and_table")
|
|
117
|
+
|
|
118
|
+
def _assemble_single_row(self, selects: List[str], table: str) -> str:
|
|
119
|
+
"""Build a single-row aggregate query from multiple SELECT expressions."""
|
|
120
|
+
if not selects:
|
|
121
|
+
return "SELECT 0 AS __no_sql_rules__;"
|
|
122
|
+
return f"SELECT {', '.join(selects)} FROM {table};"
|
|
123
|
+
|
|
124
|
+
def _assemble_exists_query(self, exists_exprs: List[str]) -> str:
|
|
125
|
+
"""Build a query with multiple EXISTS checks."""
|
|
126
|
+
if not exists_exprs:
|
|
127
|
+
return ""
|
|
128
|
+
return f"SELECT {', '.join(exists_exprs)};"
|
|
129
|
+
|
|
130
|
+
def supports(
|
|
131
|
+
self, handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
|
|
132
|
+
) -> bool:
|
|
133
|
+
"""Check if this executor can handle the given handle and rules."""
|
|
134
|
+
scheme = (handle.scheme or "").lower()
|
|
135
|
+
|
|
136
|
+
if not self._supports_scheme(scheme, handle):
|
|
137
|
+
return False
|
|
138
|
+
|
|
139
|
+
# Must have at least one supported rule
|
|
140
|
+
return any(
|
|
141
|
+
s.get("kind") in self.SUPPORTED_RULES
|
|
142
|
+
for s in (sql_specs or [])
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
    def compile(self, sql_specs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Compile rule specs into three-phase execution plan.

        Phase 1: EXISTS checks for not_null rules (fast, early-terminate)
        Phase 2: Aggregate query for most rules (batched into single query)
        Phase 3: Custom SQL queries (each executed individually)

        Args:
            sql_specs: Rule specs as dicts; each must carry "kind" and
                "rule_id" plus kind-specific keys (e.g. "column", "values").
                Specs that are malformed or of an unsupported kind are
                silently dropped from the plan.

        Returns:
            {
                "exists_specs": [...],  # Phase 1: not_null rules
                "aggregate_selects": [...],  # Phase 2: aggregate expressions
                "aggregate_specs": [...],  # Phase 2: specs for aggregates
                "custom_sql_specs": [...],  # Phase 3: custom SQL queries
                "supported_specs": [...],  # All supported specs
            }
        """
        exists_specs: List[Dict[str, Any]] = []
        aggregate_selects: List[str] = []
        aggregate_specs: List[Dict[str, Any]] = []
        custom_sql_specs: List[Dict[str, Any]] = []
        supported_specs: List[Dict[str, Any]] = []

        for spec in sql_specs or []:
            kind = spec.get("kind")
            rule_id = spec.get("rule_id")

            # Malformed specs (missing kind or rule_id) are dropped.
            if not (kind and rule_id):
                continue

            # Skip unsupported rules
            if kind not in self.SUPPORTED_RULES:
                continue

            if kind == "custom_sql_check":
                # Validate SQL is safe using sqlglot before accepting
                user_sql = spec.get("sql")
                if user_sql:
                    # Replace {table} with dummy name for validation
                    # (sqlglot can't parse {table} as valid SQL)
                    test_sql = user_sql.replace("{table}", "_validation_table_")
                    validation = validate_sql(test_sql, dialect=self.DIALECT)
                    if validation.is_safe:
                        custom_sql_specs.append(spec)
                        supported_specs.append(spec)
                    else:
                        _logger.warning(
                            f"custom_sql_check '{rule_id}' rejected for remote execution: "
                            f"{validation.reason}"
                        )
                continue

            if kind == "not_null":
                col = spec.get("column")
                if isinstance(col, str) and col:
                    # not_null runs as an EXISTS probe (Phase 1), not an aggregate.
                    exists_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "unique":
                col = spec.get("column")
                if isinstance(col, str) and col:
                    aggregate_selects.append(agg_unique(col, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "min_rows":
                threshold = spec.get("threshold", 0)
                aggregate_selects.append(agg_min_rows(int(threshold), rule_id, self.DIALECT))
                aggregate_specs.append(spec)
                supported_specs.append(spec)

            elif kind == "max_rows":
                threshold = spec.get("threshold", 0)
                aggregate_selects.append(agg_max_rows(int(threshold), rule_id, self.DIALECT))
                aggregate_specs.append(spec)
                supported_specs.append(spec)

            elif kind == "allowed_values":
                col = spec.get("column")
                values = spec.get("values", [])
                if isinstance(col, str) and col and values:
                    aggregate_selects.append(agg_allowed_values(col, values, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "disallowed_values":
                col = spec.get("column")
                values = spec.get("values", [])
                if isinstance(col, str) and col and values:
                    aggregate_selects.append(agg_disallowed_values(col, values, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "freshness":
                col = spec.get("column")
                max_age_seconds = spec.get("max_age_seconds")
                # NOTE(review): a float max_age_seconds fails this isinstance
                # check and the rule is silently dropped from the remote plan —
                # confirm that is intended.
                if isinstance(col, str) and col and isinstance(max_age_seconds, int):
                    aggregate_selects.append(agg_freshness(col, max_age_seconds, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "range":
                col = spec.get("column")
                min_val = spec.get("min")
                max_val = spec.get("max")
                if isinstance(col, str) and col and (min_val is not None or max_val is not None):
                    aggregate_selects.append(agg_range(col, min_val, max_val, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "length":
                col = spec.get("column")
                min_len = spec.get("min")
                max_len = spec.get("max")
                if isinstance(col, str) and col and (min_len is not None or max_len is not None):
                    aggregate_selects.append(agg_length(col, min_len, max_len, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "regex":
                col = spec.get("column")
                pattern = spec.get("pattern")
                if isinstance(col, str) and col and isinstance(pattern, str) and pattern:
                    aggregate_selects.append(agg_regex(col, pattern, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "contains":
                col = spec.get("column")
                substring = spec.get("substring")
                if isinstance(col, str) and col and isinstance(substring, str) and substring:
                    aggregate_selects.append(agg_contains(col, substring, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "starts_with":
                col = spec.get("column")
                prefix = spec.get("prefix")
                if isinstance(col, str) and col and isinstance(prefix, str) and prefix:
                    aggregate_selects.append(agg_starts_with(col, prefix, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "ends_with":
                col = spec.get("column")
                suffix = spec.get("suffix")
                if isinstance(col, str) and col and isinstance(suffix, str) and suffix:
                    aggregate_selects.append(agg_ends_with(col, suffix, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "compare":
                left = spec.get("left")
                right = spec.get("right")
                op = spec.get("op")
                if (isinstance(left, str) and left and
                    isinstance(right, str) and right and
                    isinstance(op, str) and op):
                    aggregate_selects.append(agg_compare(left, right, op, rule_id, self.DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "conditional_not_null":
                col = spec.get("column")
                when_column = spec.get("when_column")
                when_op = spec.get("when_op")
                when_value = spec.get("when_value")
                if (isinstance(col, str) and col and
                    isinstance(when_column, str) and when_column and
                    isinstance(when_op, str) and when_op):
                    aggregate_selects.append(
                        agg_conditional_not_null(col, when_column, when_op, when_value, rule_id, self.DIALECT)
                    )
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "conditional_range":
                col = spec.get("column")
                when_column = spec.get("when_column")
                when_op = spec.get("when_op")
                when_value = spec.get("when_value")
                min_val = spec.get("min")
                max_val = spec.get("max")
                if (isinstance(col, str) and col and
                    isinstance(when_column, str) and when_column and
                    isinstance(when_op, str) and when_op and
                    (min_val is not None or max_val is not None)):
                    aggregate_selects.append(
                        agg_conditional_range(col, when_column, when_op, when_value, min_val, max_val, rule_id, self.DIALECT)
                    )
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "custom_agg":
                # Custom rule with to_sql_agg() - use the pre-generated SQL
                sql_agg = spec.get("sql_agg", {})
                # Try exact dialect match first, then fallback for sqlserver/mssql naming
                agg_expr = sql_agg.get(self.DIALECT)
                if not agg_expr and self.DIALECT == "sqlserver":
                    agg_expr = sql_agg.get("mssql")  # Fallback: mssql -> sqlserver
                if agg_expr:
                    # NOTE(review): alias uses double quotes; on SQL Server this
                    # relies on QUOTED_IDENTIFIER being ON (the default) — confirm.
                    aggregate_selects.append(f'{agg_expr} AS "{rule_id}"')
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

        return {
            "exists_specs": exists_specs,
            "aggregate_selects": aggregate_selects,
            "aggregate_specs": aggregate_specs,
            "custom_sql_specs": custom_sql_specs,
            "supported_specs": supported_specs,
        }
|
|
357
|
+
|
|
358
|
+
    def execute(
        self,
        handle: DatasetHandle,
        compiled_plan: Dict[str, Any],
        **kwargs,  # unused here; accepted so callers may pass extra options
    ) -> Dict[str, Any]:
        """
        Execute the compiled plan in three phases.

        Phase 1: EXISTS checks for not_null (fast, can early-terminate)
        Phase 2: Aggregate query for most rules (batched)
        Phase 3: Custom SQL queries (each executed individually)

        Args:
            handle: Dataset handle describing the target database/table.
            compiled_plan: Output of compile().

        Returns:
            {"results": [...], "staging": None}
        """
        exists_specs = compiled_plan.get("exists_specs", [])
        aggregate_selects = compiled_plan.get("aggregate_selects", [])
        custom_sql_specs = compiled_plan.get("custom_sql_specs", [])

        # Nothing to run: return an empty result set without opening a connection.
        if not exists_specs and not aggregate_selects and not custom_sql_specs:
            return {"results": [], "staging": None}

        table = self._get_table_reference(handle)
        results: List[Dict[str, Any]] = []

        # Build rule_kinds mapping from specs
        rule_kinds = {}
        for spec in exists_specs:
            rule_kinds[spec["rule_id"]] = spec.get("kind")
        for spec in compiled_plan.get("aggregate_specs", []):
            rule_kinds[spec["rule_id"]] = spec.get("kind")
        for spec in custom_sql_specs:
            rule_kinds[spec["rule_id"]] = spec.get("kind")

        # One connection and one cursor serve all three phases.
        with self._get_connection_ctx(handle) as conn:
            cursor = self._get_cursor(conn)
            try:
                # Phase 1: EXISTS checks for not_null rules
                if exists_specs:
                    exists_exprs = [
                        exists_not_null(
                            spec["column"],
                            spec["rule_id"],
                            table,
                            self.DIALECT
                        )
                        for spec in exists_specs
                    ]
                    exists_sql = self._assemble_exists_query(exists_exprs)
                    cursor.execute(exists_sql)
                    row = cursor.fetchone()
                    # DB-API cursor.description yields (name, ...) tuples per column.
                    columns = [desc[0] for desc in cursor.description] if cursor.description else []

                    if row and columns:
                        exists_results = results_from_row(columns, row, is_exists=True, rule_kinds=rule_kinds)
                        results.extend(exists_results)

                # Phase 2: Aggregate query for remaining rules
                if aggregate_selects:
                    agg_sql = self._assemble_single_row(aggregate_selects, table)
                    cursor.execute(agg_sql)
                    row = cursor.fetchone()
                    columns = [desc[0] for desc in cursor.description] if cursor.description else []

                    if row and columns:
                        agg_results = results_from_row(columns, row, is_exists=False, rule_kinds=rule_kinds)
                        results.extend(agg_results)

                # Phase 3: Custom SQL queries (executed individually)
                if custom_sql_specs:
                    custom_results = self._execute_custom_sql_queries(
                        cursor, handle, custom_sql_specs
                    )
                    results.extend(custom_results)
            finally:
                # Cursor cleanup is dialect-specific; see _close_cursor.
                self._close_cursor(cursor)

        return {"results": results, "staging": None}
|
|
437
|
+
|
|
438
|
+
    def _execute_custom_sql_queries(
        self,
        cursor,  # open DB-API cursor, owned by the caller (execute())
        handle: DatasetHandle,
        custom_sql_specs: List[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """
        Execute custom SQL queries (Phase 3).

        Each custom_sql_check query is transformed to return a COUNT(*) and executed.
        The user writes a query that selects "violation rows", and we count them.

        Transformation strategy:
        - Simple SELECT: Rewrite to COUNT(*) directly
        - DISTINCT/GROUP BY/LIMIT: Wrap in SELECT COUNT(*) FROM (...) AS _v

        Returns:
            One result dict per spec; execution errors are reported as a
            failed rule rather than raised.
        """
        results: List[Dict[str, Any]] = []

        # Get schema and table for placeholder replacement
        try:
            schema, table_name = self._get_schema_and_table(handle)
        except NotImplementedError:
            # Fallback: extract from full table reference
            _logger.warning("_get_schema_and_table not implemented, custom SQL skipped")
            return results

        for spec in custom_sql_specs:
            rule_id = spec["rule_id"]
            user_sql = spec.get("sql", "")

            try:
                # Step 1: Replace {table} placeholder with properly formatted table reference
                formatted_sql = replace_table_placeholder(
                    sql=user_sql,
                    schema=schema,
                    table=table_name,
                    dialect=self.DIALECT,
                )

                # Step 2: Transform to COUNT(*) query
                success, count_sql = to_count_query(formatted_sql, dialect=self.DIALECT)
                if not success:
                    # On failure, to_count_query returns the reason in place of SQL.
                    raise ValueError(f"Failed to transform SQL: {count_sql}")

                # Step 3: Execute and read the count
                cursor.execute(count_sql)
                row = cursor.fetchone()

                if row is None or len(row) < 1:
                    raise ValueError("Query returned no result")

                failed_count = int(row[0]) if row[0] is not None else 0

                passed = failed_count == 0
                results.append({
                    "rule_id": rule_id,
                    "passed": passed,
                    "failed_count": failed_count,
                    "message": "Passed" if passed else f"Custom SQL check failed for {failed_count} rows",
                    "execution_source": self.DIALECT,
                })

            except Exception as e:
                # Broad catch is deliberate: one bad custom query must not
                # abort the remaining checks; report it as a failed rule.
                _logger.warning(f"Custom SQL execution failed for '{rule_id}': {e}")
                results.append({
                    "rule_id": rule_id,
                    "passed": False,
                    "failed_count": 1,  # Unknown, but at least 1 issue
                    "message": f"Custom SQL execution failed: {e}",
                    "execution_source": self.DIALECT,
                })

        return results
|
|
511
|
+
|
|
512
|
+
def _get_cursor(self, conn):
|
|
513
|
+
"""
|
|
514
|
+
Get a cursor from the connection.
|
|
515
|
+
|
|
516
|
+
Default implementation calls conn.cursor().
|
|
517
|
+
Subclasses can override for different behavior.
|
|
518
|
+
"""
|
|
519
|
+
return conn.cursor()
|
|
520
|
+
|
|
521
|
+
    def _close_cursor(self, cursor):
        """
        Close a cursor if needed.

        Default implementation does nothing (cursor closed by context manager).
        Subclasses can override for connections that don't use context managers.
        Called from the ``finally`` block of execute(), so it must not raise.
        """
        pass
|