kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/engine/executors/duckdb_sql.py
@@ -0,0 +1,607 @@
+ from __future__ import annotations
+
+ """
+ DuckDB SQL Executor — format-aware with reliable CSV→Parquet staging.
+
+ - Parquet sources: read_parquet(...)
+ - CSV sources:
+     csv_mode=auto → try read_csv_auto(...); on failure stage to Parquet
+     csv_mode=duckdb → read_csv_auto(...) only (propagate errors)
+     csv_mode=parquet → always stage CSV→Parquet via DuckDB COPY (forced execution)
+
+ Executor computes aggregate failure counts for SQL-capable rules and exposes
+ light introspection. The engine may reuse staged Parquet for materialization
+ to avoid a second CSV parse.
+ """
+
+ import os
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import duckdb
+
+ # --- Kontra Imports ---
+ from kontra.engine.backends.duckdb_session import create_duckdb_connection
+ from kontra.engine.backends.duckdb_utils import esc_ident, lit_str
+ from kontra.connectors.handle import DatasetHandle
+ from kontra.engine.sql_utils import (
+     esc_ident as sql_esc_ident,
+     agg_min_rows,
+     agg_max_rows,
+     agg_freshness,
+     agg_range,
+     agg_length,
+     agg_regex,
+     agg_unique,
+     agg_contains,
+     agg_starts_with,
+     agg_ends_with,
+     agg_compare,
+     agg_conditional_not_null,
+     agg_conditional_range,
+     agg_allowed_values,
+     agg_disallowed_values,
+     exists_not_null,
+     results_from_row,
+     SQL_OP_MAP,
+     RULE_KIND_TO_FAILURE_MODE,
+ )
+
+ # Optional: s3fs + polars for fallback when DuckDB httpfs fails
+ try:
+     import s3fs
+     import polars as pl
+     _HAS_S3FS = True
+ except ImportError:
+     _HAS_S3FS = False
+
+ from .base import SqlExecutor
+ from .registry import register_executor
+
+
+ # ------------------------------- CSV helpers -------------------------------- #
+
+ def _is_csv(handle: DatasetHandle) -> bool:
+     fmt = (getattr(handle, "format", "") or "").lower()
+     if fmt:
+         return fmt == "csv"
+     uri = (handle.uri or "").lower().split("?", 1)[0]
+     return uri.endswith(".csv") or uri.endswith(".csv.gz")
+
+
+ def _install_httpfs(con: duckdb.DuckDBPyConnection, handle: DatasetHandle) -> None:
+     scheme = (handle.scheme or "").lower()
+     if scheme in {"s3", "http", "https"}:
+         con.execute("INSTALL httpfs;")
+         con.execute("LOAD httpfs;")
+
+
+ def _stage_csv_to_parquet_with_duckdb(
+     con: duckdb.DuckDBPyConnection, source_uri: str
+ ) -> Tuple[str, tempfile.TemporaryDirectory]:
+     """
+     Force a real CSV scan and Parquet write using DuckDB COPY.
+
+     Returns:
+         (parquet_path, tmpdir) — tmpdir MUST be kept alive by the caller.
+     """
+     tmpdir = tempfile.TemporaryDirectory(prefix="kontra_csv_stage_")
+     stage_path = Path(tmpdir.name) / "kontra_stage.parquet"
+
+     # Ensure httpfs is loaded for remote URIs; COPY will stream CSV → Parquet.
+     # We explicitly go through a SELECT to allow future CSV options if needed.
+     con.execute(
+         f"COPY (SELECT * FROM read_csv_auto({lit_str(source_uri)})) "
+         f"TO {lit_str(str(stage_path))} (FORMAT PARQUET)"
+     )
+     return str(stage_path), tmpdir
+
+
+ def _stage_csv_to_parquet_with_s3fs(
+     handle: DatasetHandle,
+ ) -> Tuple[str, tempfile.TemporaryDirectory]:
+     """
+     Fallback: Stage S3 CSV to Parquet using s3fs + Polars.
+     Used when DuckDB httpfs fails with connection errors on large files.
+
+     Returns:
+         (parquet_path, tmpdir) — tmpdir MUST be kept alive by the caller.
+     """
+     if not _HAS_S3FS:
+         raise ImportError("s3fs and polars required for S3 CSV fallback")
+
+     tmpdir = tempfile.TemporaryDirectory(prefix="kontra_csv_stage_s3fs_")
+     stage_path = Path(tmpdir.name) / "kontra_stage.parquet"
+
+     # Build s3fs client from handle's fs_opts
+     opts = handle.fs_opts or {}
+     s3_kwargs: Dict[str, Any] = {}
+     if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
+         s3_kwargs["key"] = opts["s3_access_key_id"]
+         s3_kwargs["secret"] = opts["s3_secret_access_key"]
+     if opts.get("s3_endpoint"):
+         endpoint = opts["s3_endpoint"]
+         # s3fs expects endpoint_url with scheme
+         if not endpoint.startswith(("http://", "https://")):
+             # Infer scheme from s3_use_ssl or default to http for custom endpoints
+             scheme = "https" if opts.get("s3_use_ssl", "").lower() == "true" else "http"
+             endpoint = f"{scheme}://{endpoint}"
+         s3_kwargs["endpoint_url"] = endpoint
+     # Force path-style for custom endpoints (MinIO)
+     s3_kwargs["client_kwargs"] = {"region_name": opts.get("s3_region", "us-east-1")}
+
+     fs = s3fs.S3FileSystem(**s3_kwargs)
+
+     # Strip s3:// prefix for s3fs
+     s3_path = handle.uri
+     if s3_path.lower().startswith("s3://"):
+         s3_path = s3_path[5:]
+
+     # Read CSV with s3fs → Polars → write Parquet
+     with fs.open(s3_path, "rb") as f:
+         df = pl.read_csv(f)
+     df.write_parquet(str(stage_path))
+
+     if os.getenv("KONTRA_VERBOSE"):
+         print(f"[INFO] Staged S3 CSV via s3fs+Polars: {handle.uri} → {stage_path}")
+
+     return str(stage_path), tmpdir
+
+
+ def _create_source_view(
+     con: duckdb.DuckDBPyConnection,
+     handle: DatasetHandle,
+     view: str,
+     *,
+     csv_mode: str = "auto",  # auto | duckdb | parquet
+ ) -> Tuple[Optional[tempfile.TemporaryDirectory], Optional[str], str]:
+     """
+     Create a DuckDB view named `view` over the dataset (format-aware).
+
+     Returns:
+         (owned_tmpdir, staged_parquet_path, mode_used)
+     """
+     _install_httpfs(con, handle)
+
+     if not _is_csv(handle):
+         con.execute(
+             f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
+             f"SELECT * FROM read_parquet({lit_str(handle.uri)})"
+         )
+         return None, None, "parquet"
+
+     mode = (csv_mode or "auto").lower()
+     if mode not in {"auto", "duckdb", "parquet"}:
+         mode = "auto"
+
+     if mode in {"auto", "duckdb"}:
+         try:
+             con.execute(
+                 f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
+                 f"SELECT * FROM read_csv_auto({lit_str(handle.uri)})"
+             )
+             return None, None, "duckdb"
+         except duckdb.Error:
+             if mode == "duckdb":
+                 # Caller asked to use DuckDB CSV strictly; bubble up.
+                 raise
+             con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)}")
+
+     # Explicit staging path (or auto-fallback) using DuckDB COPY
+     # For S3 CSV files, DuckDB httpfs can fail with connection errors on large files.
+     # In that case, fall back to s3fs + Polars staging.
+     try:
+         staged_path, tmpdir = _stage_csv_to_parquet_with_duckdb(con, handle.uri)
+     except duckdb.Error as e:
+         err_str = str(e).lower()
+         is_connection_error = (
+             "connection error" in err_str
+             or "failed to read" in err_str
+             or "timeout" in err_str
+             or "timed out" in err_str
+         )
+         is_s3 = (handle.scheme or "").lower() == "s3"
+
+         if is_connection_error and is_s3 and _HAS_S3FS:
+             if os.getenv("KONTRA_VERBOSE"):
+                 print(f"[INFO] DuckDB httpfs failed for S3 CSV, falling back to s3fs+Polars: {e}")
+             staged_path, tmpdir = _stage_csv_to_parquet_with_s3fs(handle)
+         else:
+             raise
+
+     con.execute(
+         f"CREATE OR REPLACE VIEW {esc_ident(view)} AS "
+         f"SELECT * FROM read_parquet({lit_str(staged_path)})"
+     )
+     return tmpdir, staged_path, "parquet"
+
+
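The CSV staging that _create_source_view falls back to is just a DuckDB COPY round-trip. A minimal standalone sketch, using a toy local CSV (paths and data are made up; the real code quotes identifiers and literals via esc_ident/lit_str and also handles remote URIs):

import duckdb
import tempfile
from pathlib import Path

con = duckdb.connect()
tmpdir = tempfile.TemporaryDirectory(prefix="csv_stage_demo_")
csv_path = Path(tmpdir.name) / "input.csv"
csv_path.write_text("id,name\n1,a\n2,b\n")
staged = Path(tmpdir.name) / "staged.parquet"

# Stream the CSV into Parquet once, then build the view over the Parquet copy.
con.execute(f"COPY (SELECT * FROM read_csv_auto('{csv_path}')) TO '{staged}' (FORMAT PARQUET)")
con.execute(f"CREATE OR REPLACE VIEW _data AS SELECT * FROM read_parquet('{staged}')")
print(con.execute("SELECT COUNT(*) FROM _data").fetchone())  # (2,)

tmpdir.cleanup()  # as the docstrings above note, the caller owns the tmpdir's lifetime

The staged Parquet path is also what execute() hands back under "staging", so the engine can reuse it for materialization instead of parsing the CSV a second time.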
+ # ------------------------------- SQL helpers -------------------------------- #
+
+ # DuckDB dialect constant
+ DIALECT = "duckdb"
+
+
+ def _assemble_single_row(selects: List[str]) -> str:
+     if not selects:
+         return "SELECT 0 AS __no_sql_rules__ LIMIT 1;"
+     ctes, aliases = [], []
+     for i, sel in enumerate(selects):
+         nm = f"a{i}"
+         ctes.append(f"{nm} AS (SELECT {sel} FROM _data)")
+         aliases.append(nm)
+     with_clause = "WITH " + ", ".join(ctes)
+     cross = " CROSS JOIN ".join(aliases)
+     return f"{with_clause} SELECT * FROM {cross};"
+
+
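To make the assembly concrete, here is a small invocation with two made-up rule expressions (in real runs the fragments come from the agg_* helpers in kontra.engine.sql_utils); the output is a single line, wrapped here for readability:

from kontra.engine.executors.duckdb_sql import _assemble_single_row

selects = [
    'SUM(CASE WHEN "age" < 0 OR "age" > 120 THEN 1 ELSE 0 END) AS "age_range"',
    'COUNT(*) - COUNT(DISTINCT "id") AS "id_unique"',
]
print(_assemble_single_row(selects))
# WITH a0 AS (SELECT SUM(CASE WHEN "age" < 0 OR "age" > 120 THEN 1 ELSE 0 END) AS "age_range" FROM _data),
#      a1 AS (SELECT COUNT(*) - COUNT(DISTINCT "id") AS "id_unique" FROM _data)
# SELECT * FROM a0 CROSS JOIN a1;

Each fragment aggregates to a single row, so the CROSS JOIN of the CTEs yields exactly one combined row, one column per rule.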
+ def _results_from_single_row_map(values: Dict[str, Any]) -> List[Dict[str, Any]]:
+     out: List[Dict[str, Any]] = []
+     for rule_id, failed in values.items():
+         if rule_id == "__no_sql_rules__":
+             continue
+         failed_count = int(failed) if failed is not None else 0
+         out.append(
+             {
+                 "rule_id": rule_id,
+                 "passed": failed_count == 0,
+                 "failed_count": failed_count,
+                 "message": "Passed" if failed_count == 0 else "Failed",
+                 "severity": "ERROR",
+                 "actions_executed": [],
+                 "execution_source": "sql",
+             }
+         )
+     return out
+
+
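The mapping from the single result row to per-rule dicts is mechanical; with hypothetical rule IDs it behaves like this:

from kontra.engine.executors.duckdb_sql import _results_from_single_row_map

results = _results_from_single_row_map({"age_range": 3, "id_unique": 0})
# [{"rule_id": "age_range", "passed": False, "failed_count": 3, "message": "Failed", ...},
#  {"rule_id": "id_unique", "passed": True,  "failed_count": 0, "message": "Passed", ...}]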
+ # --------------------------- DuckDB SQL Executor ------------------------------
+
+
+ @register_executor("duckdb")
+ class DuckDBSqlExecutor(SqlExecutor):
+     """
+     DuckDB-based SQL pushdown executor:
+       - not_null(column)
+       - min_rows(threshold)
+       - max_rows(threshold)
+       - freshness(column, max_age_seconds)
+       - range(column, min, max)
+     """
+
+     name = "duckdb"
+
+     SUPPORTED_RULES = {
+         "not_null", "unique", "min_rows", "max_rows", "freshness",
+         "range", "length",
+         "regex", "contains", "starts_with", "ends_with",
+         "compare", "conditional_not_null", "conditional_range",
+         "custom_agg", "allowed_values", "disallowed_values"
+     }
+
+     def supports(
+         self, handle: DatasetHandle, sql_specs: List[Dict[str, Any]]
+     ) -> bool:
+         scheme = (handle.scheme or "").lower()
+         if scheme not in {"", "file", "s3", "http", "https"}:
+             return False
+         return any((s.get("kind") in self.SUPPORTED_RULES) for s in (sql_specs or []))
+
+     def compile(self, sql_specs: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """
+         Compile rule specs into two-phase execution plan.
+
+         Phase 1: EXISTS checks for not_null rules (fast, early-terminate)
+         Phase 2: Aggregate query for remaining rules
+
+         Returns:
+             {
+               "exists_specs": [...],       # Phase 1: not_null rules
+               "aggregate_selects": [...],  # Phase 2: aggregate expressions
+               "aggregate_specs": [...],    # Phase 2: specs for aggregates
+               "supported_specs": [...],    # All supported specs
+             }
+         """
+         exists_specs: List[Dict[str, Any]] = []
+         aggregate_selects: List[str] = []
+         aggregate_specs: List[Dict[str, Any]] = []
+         supported_specs: List[Dict[str, Any]] = []
+
+         for spec in sql_specs or []:
+             kind = spec.get("kind")
+             rid = spec.get("rule_id")
+             if not (kind and rid):
+                 continue
+
+             if kind == "not_null":
+                 # Phase 1: Use EXISTS for not_null (faster with early termination)
+                 col = spec.get("column")
+                 if isinstance(col, str) and col:
+                     exists_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "unique":
+                 col = spec.get("column")
+                 if isinstance(col, str) and col:
+                     aggregate_selects.append(agg_unique(col, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "min_rows":
+                 aggregate_selects.append(agg_min_rows(int(spec.get("threshold", 0)), rid, DIALECT))
+                 aggregate_specs.append(spec)
+                 supported_specs.append(spec)
+
+             elif kind == "max_rows":
+                 aggregate_selects.append(agg_max_rows(int(spec.get("threshold", 0)), rid, DIALECT))
+                 aggregate_specs.append(spec)
+                 supported_specs.append(spec)
+
+             elif kind == "freshness":
+                 col = spec.get("column")
+                 max_age_seconds = spec.get("max_age_seconds")
+                 if isinstance(col, str) and col and isinstance(max_age_seconds, int):
+                     aggregate_selects.append(agg_freshness(col, max_age_seconds, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "range":
+                 col = spec.get("column")
+                 min_val = spec.get("min")
+                 max_val = spec.get("max")
+                 if isinstance(col, str) and col and (min_val is not None or max_val is not None):
+                     aggregate_selects.append(agg_range(col, min_val, max_val, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "regex":
+                 col = spec.get("column")
+                 pattern = spec.get("pattern")
+                 if isinstance(col, str) and col and isinstance(pattern, str) and pattern:
+                     aggregate_selects.append(agg_regex(col, pattern, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "allowed_values":
+                 col = spec.get("column")
+                 values = spec.get("values")
+                 if isinstance(col, str) and col and values is not None:
+                     aggregate_selects.append(agg_allowed_values(col, values, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "disallowed_values":
+                 col = spec.get("column")
+                 values = spec.get("values")
+                 if isinstance(col, str) and col and values is not None:
+                     aggregate_selects.append(agg_disallowed_values(col, values, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "length":
+                 col = spec.get("column")
+                 min_len = spec.get("min")
+                 max_len = spec.get("max")
+                 if isinstance(col, str) and col and (min_len is not None or max_len is not None):
+                     aggregate_selects.append(agg_length(col, min_len, max_len, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "contains":
+                 col = spec.get("column")
+                 substring = spec.get("substring")
+                 if isinstance(col, str) and col and isinstance(substring, str) and substring:
+                     aggregate_selects.append(agg_contains(col, substring, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "starts_with":
+                 col = spec.get("column")
+                 prefix = spec.get("prefix")
+                 if isinstance(col, str) and col and isinstance(prefix, str) and prefix:
+                     aggregate_selects.append(agg_starts_with(col, prefix, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "ends_with":
+                 col = spec.get("column")
+                 suffix = spec.get("suffix")
+                 if isinstance(col, str) and col and isinstance(suffix, str) and suffix:
+                     aggregate_selects.append(agg_ends_with(col, suffix, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "compare":
+                 left = spec.get("left")
+                 right = spec.get("right")
+                 op = spec.get("op")
+                 if (isinstance(left, str) and left and
+                     isinstance(right, str) and right and
+                     isinstance(op, str) and op in SQL_OP_MAP):
+                     aggregate_selects.append(agg_compare(left, right, op, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "conditional_not_null":
+                 col = spec.get("column")
+                 when_column = spec.get("when_column")
+                 when_op = spec.get("when_op")
+                 when_value = spec.get("when_value")  # Can be None
+                 if (isinstance(col, str) and col and
+                     isinstance(when_column, str) and when_column and
+                     isinstance(when_op, str) and when_op in SQL_OP_MAP):
+                     aggregate_selects.append(agg_conditional_not_null(col, when_column, when_op, when_value, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "conditional_range":
+                 col = spec.get("column")
+                 when_column = spec.get("when_column")
+                 when_op = spec.get("when_op")
+                 when_value = spec.get("when_value")  # Can be None
+                 min_val = spec.get("min")
+                 max_val = spec.get("max")
+                 if (isinstance(col, str) and col and
+                     isinstance(when_column, str) and when_column and
+                     isinstance(when_op, str) and when_op in SQL_OP_MAP and
+                     (min_val is not None or max_val is not None)):
+                     aggregate_selects.append(agg_conditional_range(col, when_column, when_op, when_value, min_val, max_val, rid, DIALECT))
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+             elif kind == "custom_agg":
+                 # Custom rule with to_sql_agg() - use the pre-generated SQL
+                 sql_agg = spec.get("sql_agg", {})
+                 agg_expr = sql_agg.get(DIALECT) or sql_agg.get("duckdb")
+                 if agg_expr:
+                     aggregate_selects.append(f'{agg_expr} AS "{rid}"')
+                     aggregate_specs.append(spec)
+                     supported_specs.append(spec)
+
+         return {
+             "exists_specs": exists_specs,
+             "aggregate_selects": aggregate_selects,
+             "aggregate_specs": aggregate_specs,
+             "supported_specs": supported_specs,
+         }
+
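The shape of compile()'s input and output can be seen with a few illustrative specs. The field names follow what the branches above read; the rule IDs are made up, and it is assumed here that the executor needs no constructor arguments, which nothing in this file contradicts:

from kontra.engine.executors.duckdb_sql import DuckDBSqlExecutor

specs = [
    {"kind": "not_null", "rule_id": "id_not_null", "column": "id"},
    {"kind": "min_rows", "rule_id": "at_least_100_rows", "threshold": 100},
    {"kind": "range", "rule_id": "age_range", "column": "age", "min": 0, "max": 120},
]
plan = DuckDBSqlExecutor().compile(specs)
# plan["exists_specs"]      -> [the not_null spec]                   (Phase 1, EXISTS probe)
# plan["aggregate_selects"] -> SQL fragments for min_rows and range  (Phase 2, single-row aggregate)
# plan["aggregate_specs"]   -> [the min_rows spec, the range spec]
# plan["supported_specs"]   -> all three specs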
+     def execute(
+         self,
+         handle: DatasetHandle,
+         compiled_plan: Dict[str, Any],
+         *,
+         csv_mode: str = "auto",
+     ) -> Dict[str, Any]:
+         """
+         Execute the compiled plan in two phases, honoring csv_mode for CSV URIs.
+
+         Phase 1: EXISTS checks for not_null (fast, can early-terminate)
+         Phase 2: Aggregate query for remaining rules
+
+         Returns:
+             {
+               "results": [...],
+               "staging": {"path": <parquet_path>|None, "tmpdir": <TemporaryDirectory>|None}
+             }
+         """
+         exists_specs = compiled_plan.get("exists_specs", [])
+         aggregate_selects = compiled_plan.get("aggregate_selects", [])
+
+         if not exists_specs and not aggregate_selects:
+             return {"results": [], "staging": {"path": None, "tmpdir": None}}
+
+         con = create_duckdb_connection(handle)
+         view = "_data"
+         tmpdir: Optional[tempfile.TemporaryDirectory] = None
+         staged_path: Optional[str] = None
+         results: List[Dict[str, Any]] = []
+
+         # Build rule_kinds mapping from specs
+         rule_kinds = {}
+         for spec in exists_specs:
+             rule_kinds[spec["rule_id"]] = spec.get("kind")
+         for spec in compiled_plan.get("aggregate_specs", []):
+             rule_kinds[spec["rule_id"]] = spec.get("kind")
+
+         try:
+             tmpdir, staged_path, _ = _create_source_view(con, handle, view, csv_mode=csv_mode)
+
+             # Phase 1: EXISTS checks for not_null rules
+             if exists_specs:
+                 exists_exprs = [
+                     exists_not_null(
+                         spec["column"],
+                         spec["rule_id"],
+                         esc_ident(view),
+                         "duckdb"
+                     )
+                     for spec in exists_specs
+                 ]
+                 exists_sql = f"SELECT {', '.join(exists_exprs)};"
+                 cur = con.execute(exists_sql)
+                 row = cur.fetchone()
+                 cols = [d[0] for d in cur.description] if (row and cur.description) else []
+
+                 if row and cols:
+                     exists_results = results_from_row(cols, row, is_exists=True, rule_kinds=rule_kinds)
+                     results.extend(exists_results)
+
+             # Phase 2: Aggregate query for remaining rules
+             if aggregate_selects:
+                 agg_sql = _assemble_single_row(aggregate_selects)
+                 cur = con.execute(agg_sql)
+                 row = cur.fetchone()
+                 cols = [d[0] for d in cur.description] if (row and cur.description) else []
+
+                 if row and cols:
+                     agg_results = results_from_row(cols, row, is_exists=False, rule_kinds=rule_kinds)
+                     results.extend(agg_results)
+
+             # Get row count and column names (avoid separate introspect call)
+             row_count = None
+             available_cols = []
+             try:
+                 nrow = con.execute(f"SELECT COUNT(*) FROM {esc_ident(view)}").fetchone()
+                 row_count = int(nrow[0]) if nrow and nrow[0] is not None else None
+                 cur = con.execute(f"SELECT * FROM {esc_ident(view)} LIMIT 0")
+                 available_cols = [d[0] for d in cur.description] if cur.description else []
+             except Exception:
+                 pass  # Non-fatal - introspect can still be called
+
+             return {
+                 "results": results,
+                 "staging": {"path": staged_path, "tmpdir": tmpdir},
+                 "row_count": row_count,
+                 "available_cols": available_cols,
+             }
+         except Exception:
+             if tmpdir is not None:
+                 tmpdir.cleanup()
+             raise
+         finally:
+             try:
+                 con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)};")
+             except Exception:
+                 pass
+
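The two phases come down to two kinds of statements against the _data view. A standalone sketch with a throwaway view (column and rule names are made up; the real expressions come from exists_not_null and the agg_* helpers in kontra.engine.sql_utils):

import duckdb

con = duckdb.connect()
con.execute("CREATE VIEW _data AS SELECT * FROM (VALUES (1, 'a'), (NULL, 'b')) t(id, name)")

# Phase 1: EXISTS probe for not_null; DuckDB can stop at the first NULL it finds.
exists_row = con.execute(
    'SELECT EXISTS (SELECT 1 FROM _data WHERE "id" IS NULL) AS "id_not_null"'
).fetchone()   # (True,), meaning the rule failed

# Phase 2: one single-row aggregate query for the remaining rules.
agg_row = con.execute(
    'SELECT SUM(CASE WHEN "id" IS NULL THEN 1 ELSE 0 END) AS "id_nulls", '
    'COUNT(*) AS "row_count" FROM _data'
).fetchone()   # (1, 2)

results_from_row then turns the column names and values of each returned row into per-rule result dicts, as in the execute() body above.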
+     def introspect(
+         self,
+         handle: DatasetHandle,
+         *,
+         csv_mode: str = "auto",
+     ) -> Dict[str, Any]:
+         """
+         Introspect row count and columns, honoring csv_mode.
+         Returns:
+             {
+               "row_count": int,
+               "available_cols": [...],
+               "staging": {"path": <parquet_path>|None, "tmpdir": <TemporaryDirectory>|None}
+             }
+         """
+         con = create_duckdb_connection(handle)
+         view = "_data"
+         tmpdir: Optional[tempfile.TemporaryDirectory] = None
+         staged_path: Optional[str] = None
+
+         try:
+             tmpdir, staged_path, _ = _create_source_view(con, handle, view, csv_mode=csv_mode)
+             nrow = con.execute(f"SELECT COUNT(*) AS n FROM {esc_ident(view)}").fetchone()
+             n = int(nrow[0]) if nrow and nrow[0] is not None else 0
+             cur = con.execute(f"SELECT * FROM {esc_ident(view)} LIMIT 0")
+             cols = [d[0] for d in cur.description] if cur.description else []
+             return {
+                 "row_count": n,
+                 "available_cols": cols,
+                 "staging": {"path": staged_path, "tmpdir": tmpdir},
+             }
+         except Exception:
+             if tmpdir is not None:
+                 tmpdir.cleanup()
+             raise
+         finally:
+             try:
+                 con.execute(f"DROP VIEW IF EXISTS {esc_ident(view)};")
+             except Exception:
+                 pass
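The introspection itself is just two cheap statements: a COUNT(*) for the row count and a LIMIT 0 projection whose cursor description carries the column names. A standalone equivalent on toy in-memory data:

import duckdb

con = duckdb.connect()
con.execute("CREATE VIEW _data AS SELECT * FROM (VALUES (1, 'a'), (2, 'b')) t(id, name)")
row_count = con.execute("SELECT COUNT(*) FROM _data").fetchone()[0]   # 2
cur = con.execute("SELECT * FROM _data LIMIT 0")                      # returns no rows
available_cols = [d[0] for d in cur.description]                      # ['id', 'name']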