kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/engine/executors/duckdb_sql.py
@@ -0,0 +1,607 @@
from __future__ import annotations

"""
DuckDB SQL Executor — format-aware with reliable CSV→Parquet staging.

- Parquet sources: read_parquet(...)
- CSV sources:
    csv_mode=auto    → try read_csv_auto(...); on failure stage to Parquet
    csv_mode=duckdb  → read_csv_auto(...) only (propagate errors)
    csv_mode=parquet → always stage CSV→Parquet via DuckDB COPY (forced execution)

Executor computes aggregate failure counts for SQL-capable rules and exposes
light introspection. The engine may reuse staged Parquet for materialization
to avoid a second CSV parse.
"""

import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import duckdb

# --- Kontra Imports ---
from kontra.engine.backends.duckdb_session import create_duckdb_connection
from kontra.engine.backends.duckdb_utils import esc_ident, lit_str
from kontra.connectors.handle import DatasetHandle
from kontra.engine.sql_utils import (
    esc_ident as sql_esc_ident,
    agg_min_rows,
    agg_max_rows,
    agg_freshness,
    agg_range,
    agg_length,
    agg_regex,
    agg_unique,
    agg_contains,
    agg_starts_with,
    agg_ends_with,
    agg_compare,
    agg_conditional_not_null,
    agg_conditional_range,
    agg_allowed_values,
    agg_disallowed_values,
    exists_not_null,
    results_from_row,
    SQL_OP_MAP,
    RULE_KIND_TO_FAILURE_MODE,
)

# Optional: s3fs + polars for fallback when DuckDB httpfs fails
try:
    import s3fs
    import polars as pl
    _HAS_S3FS = True
except ImportError:
    _HAS_S3FS = False

from .base import SqlExecutor
from .registry import register_executor


# ------------------------------- CSV helpers -------------------------------- #

def _is_csv(handle: DatasetHandle) -> bool:
    fmt = (getattr(handle, "format", "") or "").lower()
    if fmt:
        return fmt == "csv"
    uri = (handle.uri or "").lower().split("?", 1)[0]
    return uri.endswith(".csv") or uri.endswith(".csv.gz")


def _install_httpfs(con: duckdb.DuckDBPyConnection, handle: DatasetHandle) -> None:
    scheme = (handle.scheme or "").lower()
    if scheme in {"s3", "http", "https"}:
        con.execute("INSTALL httpfs;")
        con.execute("LOAD httpfs;")


def _stage_csv_to_parquet_with_duckdb(
    con: duckdb.DuckDBPyConnection, source_uri: str
) -> Tuple[str, tempfile.TemporaryDirectory]:
    """
    Force a real CSV scan and Parquet write using DuckDB COPY.

    Returns:
        (parquet_path, tmpdir) — tmpdir MUST be kept alive by the caller.
    """
    tmpdir = tempfile.TemporaryDirectory(prefix="kontra_csv_stage_")
    stage_path = Path(tmpdir.name) / "kontra_stage.parquet"

    # Ensure httpfs is loaded for remote URIs; COPY will stream CSV → Parquet.
    # We explicitly go through a SELECT to allow future CSV options if needed.
    con.execute(
        f"COPY (SELECT * FROM read_csv_auto({lit_str(source_uri)})) "
        f"TO {lit_str(str(stage_path))} (FORMAT PARQUET)"
    )
    return str(stage_path), tmpdir

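For illustration only (not part of the package): the staging helper above reduces to a single DuckDB COPY statement. A minimal standalone sketch, assuming a hypothetical local `orders.csv`:

```python
import duckdb
import tempfile
from pathlib import Path

con = duckdb.connect()
tmpdir = tempfile.TemporaryDirectory(prefix="stage_demo_")
stage = Path(tmpdir.name) / "stage.parquet"

# Stream the CSV through DuckDB and persist it as Parquet in one statement.
con.execute(
    f"COPY (SELECT * FROM read_csv_auto('orders.csv')) TO '{stage}' (FORMAT PARQUET)"
)

# The staged Parquet can now be scanned without re-parsing the CSV.
print(con.execute(f"SELECT COUNT(*) FROM read_parquet('{stage}')").fetchone())
```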

def _stage_csv_to_parquet_with_s3fs(
    handle: DatasetHandle,
) -> Tuple[str, tempfile.TemporaryDirectory]:
    """
    Fallback: Stage S3 CSV to Parquet using s3fs + Polars.
    Used when DuckDB httpfs fails with connection errors on large files.

    Returns:
        (parquet_path, tmpdir) — tmpdir MUST be kept alive by the caller.
    """
    if not _HAS_S3FS:
        raise ImportError("s3fs and polars required for S3 CSV fallback")

    tmpdir = tempfile.TemporaryDirectory(prefix="kontra_csv_stage_s3fs_")
    stage_path = Path(tmpdir.name) / "kontra_stage.parquet"

    # Build s3fs client from handle's fs_opts
    opts = handle.fs_opts or {}
    s3_kwargs: Dict[str, Any] = {}
    if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
        s3_kwargs["key"] = opts["s3_access_key_id"]
        s3_kwargs["secret"] = opts["s3_secret_access_key"]
    if opts.get("s3_endpoint"):
        endpoint = opts["s3_endpoint"]
        # s3fs expects endpoint_url with scheme
        if not endpoint.startswith(("http://", "https://")):
            # Infer scheme from s3_use_ssl or default to http for custom endpoints
            scheme = "https" if opts.get("s3_use_ssl", "").lower() == "true" else "http"
            endpoint = f"{scheme}://{endpoint}"
        s3_kwargs["endpoint_url"] = endpoint
    # Force path-style for custom endpoints (MinIO)
    s3_kwargs["client_kwargs"] = {"region_name": opts.get("s3_region", "us-east-1")}

    fs = s3fs.S3FileSystem(**s3_kwargs)

    # Strip s3:// prefix for s3fs
    s3_path = handle.uri
    if s3_path.lower().startswith("s3://"):
        s3_path = s3_path[5:]

    # Read CSV with s3fs → Polars → write Parquet
    with fs.open(s3_path, "rb") as f:
        df = pl.read_csv(f)
    df.write_parquet(str(stage_path))

    if os.getenv("KONTRA_VERBOSE"):
        print(f"[INFO] Staged S3 CSV via s3fs+Polars: {handle.uri} → {stage_path}")

    return str(stage_path), tmpdir

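For illustration only: the fallback reads its connection settings from `handle.fs_opts`. The keys below are the ones the function consults; the values are placeholders.

```python
# Keys consulted by _stage_csv_to_parquet_with_s3fs (placeholder values).
fs_opts = {
    "s3_access_key_id": "minio",
    "s3_secret_access_key": "minio123",
    "s3_endpoint": "localhost:9000",  # scheme is prepended if missing
    "s3_use_ssl": "false",            # "true" -> https, anything else -> http
    "s3_region": "us-east-1",
}
# These map onto:
#   s3fs.S3FileSystem(key=..., secret=..., endpoint_url=...,
#                     client_kwargs={"region_name": ...})
```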

def _create_source_view(
    con: duckdb.DuckDBPyConnection,
    handle: DatasetHandle,
    view: str,
    *,
    csv_mode: str = "auto",  # auto | duckdb | parquet
) -> Tuple[Optional[tempfile.TemporaryDirectory], Optional[str], str]:
    """
    Create a DuckDB view named `view` over the dataset (format-aware).

    Returns:
        (owned_tmpdir, staged_parquet_path, mode_used)
    """
    _install_httpfs(con, handle)

    if not _is_csv(handle):
        con.execute(
            f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
            f"SELECT * FROM read_parquet({lit_str(handle.uri)})"
        )
        return None, None, "parquet"

    mode = (csv_mode or "auto").lower()
    if mode not in {"auto", "duckdb", "parquet"}:
        mode = "auto"

    if mode in {"auto", "duckdb"}:
        try:
            con.execute(
                f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
                f"SELECT * FROM read_csv_auto({lit_str(handle.uri)})"
            )
            return None, None, "duckdb"
        except duckdb.Error:
            if mode == "duckdb":
                # Caller asked to use DuckDB CSV strictly; bubble up.
                raise
            con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)}")

    # Explicit staging path (or auto-fallback) using DuckDB COPY
    # For S3 CSV files, DuckDB httpfs can fail with connection errors on large files.
    # In that case, fall back to s3fs + Polars staging.
    try:
        staged_path, tmpdir = _stage_csv_to_parquet_with_duckdb(con, handle.uri)
    except duckdb.Error as e:
        err_str = str(e).lower()
        is_connection_error = (
            "connection error" in err_str
            or "failed to read" in err_str
            or "timeout" in err_str
            or "timed out" in err_str
        )
        is_s3 = (handle.scheme or "").lower() == "s3"

        if is_connection_error and is_s3 and _HAS_S3FS:
            if os.getenv("KONTRA_VERBOSE"):
                print(f"[INFO] DuckDB httpfs failed for S3 CSV, falling back to s3fs+Polars: {e}")
            staged_path, tmpdir = _stage_csv_to_parquet_with_s3fs(handle)
        else:
            raise

    con.execute(
        f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
        f"SELECT * FROM read_parquet({lit_str(staged_path)})"
    )
    return tmpdir, staged_path, "parquet"

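For illustration only: a usage sketch of the view helper. The `handle` here is assumed to be an already-built `DatasetHandle` for a CSV or Parquet URI; only the csv_mode semantics come from the code above.

```python
con = create_duckdb_connection(handle)
tmpdir, staged, mode = _create_source_view(con, handle, "_data", csv_mode="auto")
# mode == "duckdb"  -> the view reads the CSV directly via read_csv_auto
# mode == "parquet" -> the view reads Parquet (native Parquet, or CSV staged via COPY)
# tmpdir, if not None, owns the staged file and must be kept alive while "_data" is queried.
rows = con.execute('SELECT COUNT(*) FROM "_data"').fetchone()[0]
```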

# ------------------------------- SQL helpers -------------------------------- #

# DuckDB dialect constant
DIALECT = "duckdb"


def _assemble_single_row(selects: List[str]) -> str:
    if not selects:
        return "SELECT 0 AS __no_sql_rules__ LIMIT 1;"
    ctes, aliases = [], []
    for i, sel in enumerate(selects):
        nm = f"a{i}"
        ctes.append(f"{nm} AS (SELECT {sel} FROM _data)")
        aliases.append(nm)
    with_clause = "WITH " + ", ".join(ctes)
    cross = " CROSS JOIN ".join(aliases)
    return f"{with_clause} SELECT * FROM {cross};"

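For illustration only: what the assembler returns for two aggregate expressions. The expressions are hand-written stand-ins for what `kontra.engine.sql_utils` would generate.

```python
selects = [
    'SUM(CASE WHEN "email" IS NULL THEN 1 ELSE 0 END) AS "users.email.not_null"',
    'SUM(CASE WHEN "age" < 0 THEN 1 ELSE 0 END) AS "users.age.range"',
]
print(_assemble_single_row(selects))
# WITH a0 AS (SELECT ... FROM _data), a1 AS (SELECT ... FROM _data)
# SELECT * FROM a0 CROSS JOIN a1;
#
# Each CTE aggregates to exactly one row, so the CROSS JOIN still yields a
# single row with one failure-count column per rule.
```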

def _results_from_single_row_map(values: Dict[str, Any]) -> List[Dict[str, Any]]:
    out: List[Dict[str, Any]] = []
    for rule_id, failed in values.items():
        if rule_id == "__no_sql_rules__":
            continue
        failed_count = int(failed) if failed is not None else 0
        out.append(
            {
                "rule_id": rule_id,
                "passed": failed_count == 0,
                "failed_count": failed_count,
                "message": "Passed" if failed_count == 0 else "Failed",
                "severity": "ERROR",
                "actions_executed": [],
                "execution_source": "sql",
            }
        )
    return out

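For illustration only: the mapping this helper performs, with made-up rule IDs.

```python
row_map = {"users.email.not_null": 0, "users.age.range": 3, "__no_sql_rules__": 0}
for r in _results_from_single_row_map(row_map):
    print(r["rule_id"], r["passed"], r["failed_count"])
# users.email.not_null True 0
# users.age.range False 3
# The sentinel "__no_sql_rules__" column is skipped.
```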

# --------------------------- DuckDB SQL Executor ------------------------------


@register_executor("duckdb")
class DuckDBSqlExecutor(SqlExecutor):
    """
    DuckDB-based SQL pushdown executor:
      - not_null(column)
      - min_rows(threshold)
      - max_rows(threshold)
      - freshness(column, max_age_seconds)
      - range(column, min, max)
    """

    name = "duckdb"

    SUPPORTED_RULES = {
        "not_null", "unique", "min_rows", "max_rows", "freshness",
        "range", "length",
        "regex", "contains", "starts_with", "ends_with",
        "compare", "conditional_not_null", "conditional_range",
        "custom_agg", "allowed_values", "disallowed_values"
    }

    def supports(
        self, handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
    ) -> bool:
        scheme = (handle.scheme or "").lower()
        if scheme not in {"", "file", "s3", "http", "https"}:
            return False
        return any((s.get("kind") in self.SUPPORTED_RULES) for s in (sql_specs or []))

    def compile(self, sql_specs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Compile rule specs into a two-phase execution plan.

        Phase 1: EXISTS checks for not_null rules (fast, early-terminate)
        Phase 2: Aggregate query for remaining rules

        Returns:
            {
                "exists_specs": [...],       # Phase 1: not_null rules
                "aggregate_selects": [...],  # Phase 2: aggregate expressions
                "aggregate_specs": [...],    # Phase 2: specs for aggregates
                "supported_specs": [...],    # All supported specs
            }
        """
        exists_specs: List[Dict[str, Any]] = []
        aggregate_selects: List[str] = []
        aggregate_specs: List[Dict[str, Any]] = []
        supported_specs: List[Dict[str, Any]] = []

        for spec in sql_specs or []:
            kind = spec.get("kind")
            rid = spec.get("rule_id")
            if not (kind and rid):
                continue

            if kind == "not_null":
                # Phase 1: Use EXISTS for not_null (faster with early termination)
                col = spec.get("column")
                if isinstance(col, str) and col:
                    exists_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "unique":
                col = spec.get("column")
                if isinstance(col, str) and col:
                    aggregate_selects.append(agg_unique(col, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "min_rows":
                aggregate_selects.append(agg_min_rows(int(spec.get("threshold", 0)), rid, DIALECT))
                aggregate_specs.append(spec)
                supported_specs.append(spec)

            elif kind == "max_rows":
                aggregate_selects.append(agg_max_rows(int(spec.get("threshold", 0)), rid, DIALECT))
                aggregate_specs.append(spec)
                supported_specs.append(spec)

            elif kind == "freshness":
                col = spec.get("column")
                max_age_seconds = spec.get("max_age_seconds")
                if isinstance(col, str) and col and isinstance(max_age_seconds, int):
                    aggregate_selects.append(agg_freshness(col, max_age_seconds, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "range":
                col = spec.get("column")
                min_val = spec.get("min")
                max_val = spec.get("max")
                if isinstance(col, str) and col and (min_val is not None or max_val is not None):
                    aggregate_selects.append(agg_range(col, min_val, max_val, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "regex":
                col = spec.get("column")
                pattern = spec.get("pattern")
                if isinstance(col, str) and col and isinstance(pattern, str) and pattern:
                    aggregate_selects.append(agg_regex(col, pattern, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "allowed_values":
                col = spec.get("column")
                values = spec.get("values")
                if isinstance(col, str) and col and values is not None:
                    aggregate_selects.append(agg_allowed_values(col, values, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "disallowed_values":
                col = spec.get("column")
                values = spec.get("values")
                if isinstance(col, str) and col and values is not None:
                    aggregate_selects.append(agg_disallowed_values(col, values, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "length":
                col = spec.get("column")
                min_len = spec.get("min")
                max_len = spec.get("max")
                if isinstance(col, str) and col and (min_len is not None or max_len is not None):
                    aggregate_selects.append(agg_length(col, min_len, max_len, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "contains":
                col = spec.get("column")
                substring = spec.get("substring")
                if isinstance(col, str) and col and isinstance(substring, str) and substring:
                    aggregate_selects.append(agg_contains(col, substring, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "starts_with":
                col = spec.get("column")
                prefix = spec.get("prefix")
                if isinstance(col, str) and col and isinstance(prefix, str) and prefix:
                    aggregate_selects.append(agg_starts_with(col, prefix, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "ends_with":
                col = spec.get("column")
                suffix = spec.get("suffix")
                if isinstance(col, str) and col and isinstance(suffix, str) and suffix:
                    aggregate_selects.append(agg_ends_with(col, suffix, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "compare":
                left = spec.get("left")
                right = spec.get("right")
                op = spec.get("op")
                if (isinstance(left, str) and left and
                        isinstance(right, str) and right and
                        isinstance(op, str) and op in SQL_OP_MAP):
                    aggregate_selects.append(agg_compare(left, right, op, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "conditional_not_null":
                col = spec.get("column")
                when_column = spec.get("when_column")
                when_op = spec.get("when_op")
                when_value = spec.get("when_value")  # Can be None
                if (isinstance(col, str) and col and
                        isinstance(when_column, str) and when_column and
                        isinstance(when_op, str) and when_op in SQL_OP_MAP):
                    aggregate_selects.append(agg_conditional_not_null(col, when_column, when_op, when_value, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "conditional_range":
                col = spec.get("column")
                when_column = spec.get("when_column")
                when_op = spec.get("when_op")
                when_value = spec.get("when_value")  # Can be None
                min_val = spec.get("min")
                max_val = spec.get("max")
                if (isinstance(col, str) and col and
                        isinstance(when_column, str) and when_column and
                        isinstance(when_op, str) and when_op in SQL_OP_MAP and
                        (min_val is not None or max_val is not None)):
                    aggregate_selects.append(agg_conditional_range(col, when_column, when_op, when_value, min_val, max_val, rid, DIALECT))
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

            elif kind == "custom_agg":
                # Custom rule with to_sql_agg() - use the pre-generated SQL
                sql_agg = spec.get("sql_agg", {})
                agg_expr = sql_agg.get(DIALECT) or sql_agg.get("duckdb")
                if agg_expr:
                    aggregate_selects.append(f'{agg_expr} AS "{rid}"')
                    aggregate_specs.append(spec)
                    supported_specs.append(spec)

        return {
            "exists_specs": exists_specs,
            "aggregate_selects": aggregate_selects,
            "aggregate_specs": aggregate_specs,
            "supported_specs": supported_specs,
        }

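For illustration only: a compile-only sketch showing how specs split across the two phases. The spec dicts are hand-written to match the keys the method reads, and it is assumed the executor can be instantiated with no arguments.

```python
executor = DuckDBSqlExecutor()
plan = executor.compile([
    {"kind": "not_null", "rule_id": "users.email.not_null", "column": "email"},
    {"kind": "min_rows", "rule_id": "users.min_rows", "threshold": 100},
    {"kind": "range", "rule_id": "users.age.range", "column": "age", "min": 0, "max": 130},
])
print(len(plan["exists_specs"]))       # 1 -- not_null goes to the EXISTS phase
print(len(plan["aggregate_selects"]))  # 2 -- min_rows and range become aggregate expressions
print(len(plan["supported_specs"]))    # 3
```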

    def execute(
        self,
        handle: DatasetHandle,
        compiled_plan: Dict[str, Any],
        *,
        csv_mode: str = "auto",
    ) -> Dict[str, Any]:
        """
        Execute the compiled plan in two phases, honoring csv_mode for CSV URIs.

        Phase 1: EXISTS checks for not_null (fast, can early-terminate)
        Phase 2: Aggregate query for remaining rules

        Returns:
            {
                "results": [...],
                "staging": {"path": <parquet_path>|None, "tmpdir": <TemporaryDirectory>|None}
            }
        """
        exists_specs = compiled_plan.get("exists_specs", [])
        aggregate_selects = compiled_plan.get("aggregate_selects", [])

        if not exists_specs and not aggregate_selects:
            return {"results": [], "staging": {"path": None, "tmpdir": None}}

        con = create_duckdb_connection(handle)
        view = "_data"
        tmpdir: Optional[tempfile.TemporaryDirectory] = None
        staged_path: Optional[str] = None
        results: List[Dict[str, Any]] = []

        # Build rule_kinds mapping from specs
        rule_kinds = {}
        for spec in exists_specs:
            rule_kinds[spec["rule_id"]] = spec.get("kind")
        for spec in compiled_plan.get("aggregate_specs", []):
            rule_kinds[spec["rule_id"]] = spec.get("kind")

        try:
            tmpdir, staged_path, _ = _create_source_view(con, handle, view, csv_mode=csv_mode)

            # Phase 1: EXISTS checks for not_null rules
            if exists_specs:
                exists_exprs = [
                    exists_not_null(
                        spec["column"],
                        spec["rule_id"],
                        esc_ident(view),
                        "duckdb"
                    )
                    for spec in exists_specs
                ]
                exists_sql = f"SELECT {', '.join(exists_exprs)};"
                cur = con.execute(exists_sql)
                row = cur.fetchone()
                cols = [d[0] for d in cur.description] if (row and cur.description) else []

                if row and cols:
                    exists_results = results_from_row(cols, row, is_exists=True, rule_kinds=rule_kinds)
                    results.extend(exists_results)

            # Phase 2: Aggregate query for remaining rules
            if aggregate_selects:
                agg_sql = _assemble_single_row(aggregate_selects)
                cur = con.execute(agg_sql)
                row = cur.fetchone()
                cols = [d[0] for d in cur.description] if (row and cur.description) else []

                if row and cols:
                    agg_results = results_from_row(cols, row, is_exists=False, rule_kinds=rule_kinds)
                    results.extend(agg_results)

            # Get row count and column names (avoid separate introspect call)
            row_count = None
            available_cols = []
            try:
                nrow = con.execute(f"SELECT COUNT(*) FROM {esc_ident(view)}").fetchone()
                row_count = int(nrow[0]) if nrow and nrow[0] is not None else None
                cur = con.execute(f"SELECT * FROM {esc_ident(view)} LIMIT 0")
                available_cols = [d[0] for d in cur.description] if cur.description else []
            except Exception:
                pass  # Non-fatal - introspect can still be called

            return {
                "results": results,
                "staging": {"path": staged_path, "tmpdir": tmpdir},
                "row_count": row_count,
                "available_cols": available_cols,
            }
        except Exception:
            if tmpdir is not None:
                tmpdir.cleanup()
            raise
        finally:
            try:
                con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)};")
            except Exception:
                pass

    def introspect(
        self,
        handle: DatasetHandle,
        *,
        csv_mode: str = "auto",
    ) -> Dict[str, Any]:
        """
        Introspect row count and columns, honoring csv_mode.

        Returns:
            {
                "row_count": int,
                "available_cols": [...],
                "staging": {"path": <parquet_path>|None, "tmpdir": <TemporaryDirectory>|None}
            }
        """
        con = create_duckdb_connection(handle)
        view = "_data"
        tmpdir: Optional[tempfile.TemporaryDirectory] = None
        staged_path: Optional[str] = None

        try:
            tmpdir, staged_path, _ = _create_source_view(con, handle, view, csv_mode=csv_mode)
            nrow = con.execute(f"SELECT COUNT(*) AS n FROM {esc_ident(view)}").fetchone()
            n = int(nrow[0]) if nrow and nrow[0] is not None else 0
            cur = con.execute(f"SELECT * FROM {esc_ident(view)} LIMIT 0")
            cols = [d[0] for d in cur.description] if cur.description else []
            return {
                "row_count": n,
                "available_cols": cols,
                "staging": {"path": staged_path, "tmpdir": tmpdir},
            }
        except Exception:
            if tmpdir is not None:
                tmpdir.cleanup()
            raise
        finally:
            try:
                con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)};")
            except Exception:
                pass
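For illustration only: an end-to-end sketch tying the executor methods together. The direct instantiation and the pre-built `handle` are assumptions; in the package, the engine drives the executor through the registry.

```python
# Hypothetical driver code; Kontra's engine normally performs these steps.
specs = [
    {"kind": "not_null", "rule_id": "orders.id.not_null", "column": "id"},
    {"kind": "max_rows", "rule_id": "orders.max_rows", "threshold": 1_000_000},
]
executor = DuckDBSqlExecutor()
if executor.supports(handle, specs):  # handle: a DatasetHandle for a CSV/Parquet URI
    plan = executor.compile(specs)
    out = executor.execute(handle, plan, csv_mode="auto")
    for r in out["results"]:
        print(r["rule_id"], "PASS" if r["passed"] else f"FAIL ({r['failed_count']})")
    staging = out["staging"]  # staged Parquet, if any, may be reused for materialization
```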