kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/engine/engine.py
@@ -0,0 +1,1205 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Validation Engine — preplan-aware, hybrid, projection-efficient, deterministic.
5
+
6
+ Flow
7
+ ----
8
+ 1) Load contract
9
+ 2) Build rules → compile plan (required columns + SQL-capable candidates)
10
+ 3) (Optional) Preplan (metadata-only, Parquet): prove PASS/FAIL, build scan manifest
11
+ 4) Pick materializer (e.g., DuckDB for S3 / staged CSV)
12
+ 5) (Optional) SQL pushdown for eligible *remaining* rules (may stage CSV → Parquet)
13
+ 6) Materialize residual slice for Polars (row-groups + projection)
14
+ 7) Execute residual rules in Polars
15
+ 8) Merge results (preplan → SQL → Polars), summarize, attach small stats dict
16
+
17
+ Principles
18
+ ----------
19
+ - Deterministic: identical inputs → identical outputs
20
+ - Layered & independent toggles:
21
+ * Preplan (metadata) — independent of pushdown/projection
22
+ * Pushdown (SQL execution) — independent of preplan/projection
23
+ * Projection (contract-driven columns) — independent of preplan/pushdown
24
+ - Performance-first: plan → prune → load minimal slice → execute
25
+ - Clear separation: engine orchestrates; preplan is a leaf; reporters format/print
26
+ """
27
+
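A minimal usage sketch of the flow and toggles described above, using the ValidationEngine constructor and run() result keys defined later in this file (contract and data paths are hypothetical):

from kontra.engine.engine import ValidationEngine

engine = ValidationEngine(
    contract_path="contract.yml",           # hypothetical contract
    data_path="s3://bucket/data.parquet",   # hypothetical dataset
    preplan="auto",           # metadata-only proofs + row-group pruning (Parquet)
    pushdown="auto",          # SQL execution for eligible rules
    enable_projection=True,   # load only contract-required columns
)
result = engine.run()
print(result["summary"]["passed"], result["summary"]["total_rows"])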
28
+ import os
29
+ from pathlib import Path
30
+ from typing import Any, Dict, List, Literal, Optional, Set, TYPE_CHECKING, Union
31
+
32
+ import polars as pl
33
+
34
+ if TYPE_CHECKING:
35
+ from kontra.state.backends.base import StateBackend
36
+ from kontra.state.types import ValidationState
+ import pandas as pd  # imported only for the pd.DataFrame type hint
37
+ import pyarrow as pa
38
+ import pyarrow.fs as pafs
39
+ import pyarrow.parquet as pq
40
+
41
+ from kontra.config.loader import ContractLoader
42
+ from kontra.config.models import Contract
43
+ from kontra.connectors.handle import DatasetHandle
44
+ from kontra.engine.backends.polars_backend import PolarsBackend
45
+ from kontra.engine.executors.registry import pick_executor, register_default_executors
46
+ from kontra.engine.materializers.registry import pick_materializer, register_default_materializers
47
+ from kontra.engine.stats import RunTimers, basic_summary, columns_touched, now_ms, profile_for
48
+ from kontra.reporters.rich_reporter import report_failure, report_success
49
+ from kontra.rules.execution_plan import RuleExecutionPlan
50
+ from kontra.rules.factory import RuleFactory
51
+ from kontra.logging import get_logger, log_exception
52
+
53
+ _logger = get_logger(__name__)
54
+
55
+ # Preplan (metadata-only) + static predicate extraction
56
+ from kontra.preplan.planner import preplan_single_parquet
57
+ from kontra.preplan.types import PrePlan
58
+ from kontra.rules.static_predicates import extract_static_predicates
59
+
60
+ # Built-ins (side-effect registration)
61
+ import kontra.rules.builtin.allowed_values # noqa: F401
62
+ import kontra.rules.builtin.disallowed_values # noqa: F401
63
+ import kontra.rules.builtin.custom_sql_check # noqa: F401
64
+ import kontra.rules.builtin.dtype # noqa: F401
65
+ import kontra.rules.builtin.freshness # noqa: F401
66
+ import kontra.rules.builtin.max_rows # noqa: F401
67
+ import kontra.rules.builtin.min_rows # noqa: F401
68
+ import kontra.rules.builtin.not_null # noqa: F401
69
+ import kontra.rules.builtin.range # noqa: F401
70
+ import kontra.rules.builtin.length # noqa: F401
71
+ import kontra.rules.builtin.regex # noqa: F401
72
+ import kontra.rules.builtin.contains # noqa: F401
73
+ import kontra.rules.builtin.starts_with # noqa: F401
74
+ import kontra.rules.builtin.ends_with # noqa: F401
75
+ import kontra.rules.builtin.unique # noqa: F401
76
+ import kontra.rules.builtin.compare # noqa: F401
77
+ import kontra.rules.builtin.conditional_not_null # noqa: F401
78
+ import kontra.rules.builtin.conditional_range # noqa: F401
79
+
80
+
81
+ # --------------------------------------------------------------------------- #
82
+ # Helpers
83
+ # --------------------------------------------------------------------------- #
84
+
85
+ def _resolve_datasource_uri(reference: str) -> str:
86
+ """
87
+ Resolve a datasource reference to a concrete URI.
88
+
89
+ Tries to resolve named datasources (e.g., "prod_db.users") through config.
90
+ Falls back to returning the reference as-is if not found in config.
91
+
92
+ Args:
93
+ reference: Named datasource ("prod_db.users") or direct URI/path
94
+
95
+ Returns:
96
+ Resolved URI (e.g., "postgres://host/db/public.users" or "./data.parquet")
97
+ """
98
+ try:
99
+ from kontra.config.settings import resolve_datasource
100
+ return resolve_datasource(reference)
101
+ except (ValueError, ImportError):
102
+ # Not a named datasource or config not available - use as-is
103
+ return reference
104
+
105
+
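Both resolution outcomes from the docstring, for illustration (URIs hypothetical):

_resolve_datasource_uri("prod_db.users")   # found in config -> e.g. "postgres://host/db/public.users"
_resolve_datasource_uri("./data.parquet")  # not a named datasource -> returned as-is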
106
+ def _is_s3_uri(val: str | None) -> bool:
107
+ return isinstance(val, str) and val.lower().startswith("s3://")
108
+
109
+
110
+ def _s3_uri_to_path(uri: str) -> str:
111
+ """Convert s3://bucket/key to bucket/key (PyArrow S3FileSystem format)."""
112
+ if uri.lower().startswith("s3://"):
113
+ return uri[5:] # Strip 's3://'
114
+ return uri
115
+
116
+
117
+ def _create_s3_filesystem(handle: DatasetHandle) -> pafs.S3FileSystem:
118
+ """
119
+ Create a PyArrow S3FileSystem from handle's fs_opts (populated from env vars).
120
+ Supports MinIO and other S3-compatible storage via custom endpoints.
121
+ """
122
+ opts = handle.fs_opts or {}
123
+
124
+ # Map our fs_opts keys to PyArrow S3FileSystem kwargs
125
+ kwargs: Dict[str, Any] = {}
126
+ if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
127
+ kwargs["access_key"] = opts["s3_access_key_id"]
128
+ kwargs["secret_key"] = opts["s3_secret_access_key"]
129
+ if opts.get("s3_session_token"):
130
+ kwargs["session_token"] = opts["s3_session_token"]
131
+ if opts.get("s3_region"):
132
+ kwargs["region"] = opts["s3_region"]
133
+ if opts.get("s3_endpoint"):
134
+ # PyArrow expects endpoint_override without the scheme
135
+ endpoint = opts["s3_endpoint"]
136
+ # Strip scheme if present and set scheme kwarg
137
+ if endpoint.startswith("http://"):
138
+ endpoint = endpoint[7:]
139
+ kwargs["scheme"] = "http"
140
+ elif endpoint.startswith("https://"):
141
+ endpoint = endpoint[8:]
142
+ kwargs["scheme"] = "https"
143
+ kwargs["endpoint_override"] = endpoint
144
+
145
+ # MinIO and some S3-compatible storage require path-style URLs (not virtual-hosted)
146
+ # DUCKDB_S3_URL_STYLE=path -> force_virtual_addressing=False
147
+ url_style = opts.get("s3_url_style", "").lower()
148
+ if url_style == "path":
149
+ kwargs["force_virtual_addressing"] = False
150
+ elif url_style == "host":
151
+ kwargs["force_virtual_addressing"] = True
152
+ # If endpoint is set but no url_style, default to path-style (common for MinIO)
153
+ elif opts.get("s3_endpoint"):
154
+ kwargs["force_virtual_addressing"] = False
155
+
156
+ return pafs.S3FileSystem(**kwargs)
157
+
158
+
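For illustration, a MinIO-style fs_opts dict (values hypothetical) and the S3FileSystem arguments the mapping above would produce:

# fs_opts = {"s3_access_key_id": "minio", "s3_secret_access_key": "miniosecret",
#            "s3_endpoint": "http://localhost:9000", "s3_url_style": "path"}
# -> pafs.S3FileSystem(access_key="minio", secret_key="miniosecret", scheme="http",
#                      endpoint_override="localhost:9000", force_virtual_addressing=False)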
159
+ def _is_parquet(path: str | None) -> bool:
160
+ return isinstance(path, str) and path.lower().endswith(".parquet")
161
+
162
+
163
+ # --------------------------------------------------------------------------- #
164
+ # Engine
165
+ # --------------------------------------------------------------------------- #
166
+
167
+ class ValidationEngine:
168
+ """
169
+ Orchestrates:
170
+ - Rule planning
171
+ - Preplan (metadata-only; Parquet) [independent]
172
+ - SQL pushdown (optional) [independent]
173
+ - Residual Polars execution
174
+ - Reporting + stats
175
+
176
+ Usage:
177
+ # From file paths
178
+ engine = ValidationEngine(contract_path="contract.yml")
179
+ result = engine.run()
180
+
181
+ # With DataFrame (skips preplan/pushdown, uses Polars directly)
182
+ import polars as pl
183
+ df = pl.read_parquet("data.parquet")
184
+ engine = ValidationEngine(contract_path="contract.yml", dataframe=df)
185
+ result = engine.run()
186
+
187
+ # With pandas DataFrame
188
+ import pandas as pd
189
+ pdf = pd.read_parquet("data.parquet")
190
+ engine = ValidationEngine(contract_path="contract.yml", dataframe=pdf)
191
+ result = engine.run()
192
+ """
193
+
194
+ def __init__(
195
+ self,
196
+ contract_path: Optional[str] = None,
197
+ data_path: Optional[str] = None,
198
+ dataframe: Optional[Union[pl.DataFrame, "pd.DataFrame"]] = None,
199
+ handle: Optional[DatasetHandle] = None, # BYOC: pre-built handle
200
+ emit_report: bool = True,
201
+ stats_mode: Literal["none", "summary", "profile"] = "none",
202
+ # Independent toggles
203
+ preplan: Literal["on", "off", "auto"] = "auto",
204
+ pushdown: Literal["on", "off", "auto"] = "auto",
205
+ enable_projection: bool = True,
206
+ csv_mode: Literal["auto", "duckdb", "parquet"] = "auto",
207
+ # Diagnostics
208
+ show_plan: bool = False,
209
+ explain_preplan: bool = False,
210
+ # State management
211
+ state_store: Optional["StateBackend"] = None,
212
+ save_state: bool = True,
213
+ # Inline rules (Python API)
214
+ inline_rules: Optional[List[Dict[str, Any]]] = None,
215
+ # Cloud storage credentials (S3, Azure, GCS)
216
+ storage_options: Optional[Dict[str, Any]] = None,
217
+ ):
218
+ # Validate inputs
219
+ if contract_path is None and inline_rules is None:
220
+ raise ValueError("Either contract_path or inline_rules must be provided")
221
+
222
+ # Validate toggle parameters
223
+ valid_csv_modes = {"auto", "duckdb", "parquet"}
224
+ if csv_mode not in valid_csv_modes:
225
+ raise ValueError(
226
+ f"Invalid csv_mode '{csv_mode}'. "
227
+ f"Must be one of: {', '.join(sorted(valid_csv_modes))}"
228
+ )
229
+
230
+ valid_toggles = {"on", "off", "auto"}
231
+ if preplan not in valid_toggles:
232
+ raise ValueError(
233
+ f"Invalid preplan '{preplan}'. "
234
+ f"Must be one of: {', '.join(sorted(valid_toggles))}"
235
+ )
236
+ if pushdown not in valid_toggles:
237
+ raise ValueError(
238
+ f"Invalid pushdown '{pushdown}'. "
239
+ f"Must be one of: {', '.join(sorted(valid_toggles))}"
240
+ )
241
+
242
+ valid_stats_modes = {"none", "summary", "profile"}
243
+ if stats_mode not in valid_stats_modes:
244
+ raise ValueError(
245
+ f"Invalid stats_mode '{stats_mode}'. "
246
+ f"Must be one of: {', '.join(sorted(valid_stats_modes))}"
247
+ )
248
+
249
+ self.contract_path = str(contract_path) if contract_path else None
250
+ self.data_path = data_path
251
+ self._input_dataframe = dataframe # Store user-provided DataFrame
252
+ self._inline_rules = inline_rules # Store inline rules for merging
253
+ self._inline_built_rules = [] # Populated in _load_contract() if BaseRule instances passed
254
+ self.emit_report = emit_report
255
+ self.stats_mode = stats_mode
256
+
257
+ self.preplan = preplan
258
+ self.pushdown = pushdown
259
+ self.enable_projection = bool(enable_projection)
260
+ self.csv_mode = csv_mode
261
+ self.show_plan = show_plan
262
+ self.explain_preplan = explain_preplan
263
+
264
+ # State management
265
+ self.state_store = state_store
266
+ self.save_state = save_state
267
+ self._last_state: Optional["ValidationState"] = None
268
+
269
+ self.contract: Optional[Contract] = None
270
+ self.df: Optional[pl.DataFrame] = None
271
+ self._handle: Optional[DatasetHandle] = handle # BYOC: pre-built handle
272
+ self._rules: Optional[List] = None # Built rules, for sample_failures()
273
+ self._storage_options = storage_options # Cloud storage credentials
274
+
275
+ register_default_materializers()
276
+ register_default_executors()
277
+
278
+ # --------------------------------------------------------------------- #
279
+
280
+ def run(self) -> Dict[str, Any]:
281
+ timers = RunTimers()
282
+ self._staging_tmpdir = None # Track for cleanup in finally block
283
+
284
+ try:
285
+ result = self._run_impl(timers)
286
+
287
+ # Save state if enabled
288
+ if self.save_state:
289
+ self._save_validation_state(result)
290
+
291
+ return result
292
+ finally:
293
+ # Cleanup staged temp directory (CSV -> Parquet staging)
294
+ if self._staging_tmpdir is not None:
295
+ try:
296
+ self._staging_tmpdir.cleanup()
297
+ except Exception as e:
298
+ log_exception(_logger, "Failed to cleanup staging directory", e)
299
+ self._staging_tmpdir = None
300
+
301
+ def _save_validation_state(self, result: Dict[str, Any]) -> None:
302
+ """Save validation state if a store is configured."""
303
+ try:
304
+ from kontra.state.types import ValidationState
305
+ from kontra.state.fingerprint import fingerprint_contract, fingerprint_dataset
306
+ from kontra.state.backends import get_default_store
307
+
308
+ # Get or create store
309
+ store = self.state_store
310
+ if store is None and self.save_state:
311
+ store = get_default_store()
312
+
313
+ if store is None:
314
+ return
315
+
316
+ # Generate fingerprints
317
+ contract_fp = fingerprint_contract(self.contract) if self.contract else "unknown"
318
+
319
+ source_ref = self.data_path or (self.contract.datasource if self.contract else "")
320
+ source_uri = _resolve_datasource_uri(source_ref) if source_ref else ""
321
+ dataset_fp = None
322
+ try:
323
+ handle = DatasetHandle.from_uri(source_uri, storage_options=self._storage_options)
324
+ dataset_fp = fingerprint_dataset(handle)
325
+ except Exception as e:
326
+ log_exception(_logger, "Could not fingerprint dataset", e)
327
+
328
+ # Derive contract name (from contract, or from path)
329
+ contract_name = "unknown"
330
+ if self.contract:
331
+ contract_name = self.contract.name or Path(self.contract_path).stem
332
+
333
+ # Create state from result
334
+ state = ValidationState.from_validation_result(
335
+ result=result,
336
+ contract_fingerprint=contract_fp,
337
+ dataset_fingerprint=dataset_fp,
338
+ contract_name=contract_name,
339
+ dataset_uri=source_uri,
340
+ )
341
+
342
+ # Save
343
+ store.save(state)
344
+ self._last_state = state
345
+
346
+ except Exception as e:
347
+ # Don't fail validation if state save fails
348
+ if os.getenv("KONTRA_VERBOSE"):
349
+ print(f"Warning: Failed to save validation state: {e}")
350
+
351
+ def get_last_state(self) -> Optional["ValidationState"]:
352
+ """Get the state from the last validation run."""
353
+ return self._last_state
354
+
355
+ def diff_from_last(self) -> Optional[Dict[str, Any]]:
356
+ """
357
+ Compare current state to previous state.
358
+
359
+ Returns a dict with changes, or None if no previous state exists.
360
+ """
361
+ if self._last_state is None:
362
+ return None
363
+
364
+ try:
365
+ from kontra.state.backends import get_default_store
366
+
367
+ store = self.state_store or get_default_store()
368
+ previous = store.get_previous(
369
+ self._last_state.contract_fingerprint,
370
+ before=self._last_state.run_at,
371
+ )
372
+
373
+ if previous is None:
374
+ return None
375
+
376
+ # Build simple diff
377
+ return self._build_diff(previous, self._last_state)
378
+
379
+ except Exception as e:
380
+ log_exception(_logger, "Failed to compute diff", e)
381
+ return None
382
+
383
+ def _build_diff(
384
+ self,
385
+ before: "ValidationState",
386
+ after: "ValidationState",
387
+ ) -> Dict[str, Any]:
388
+ """Build a diff between two validation states."""
389
+ diff: Dict[str, Any] = {
390
+ "before_run_at": before.run_at.isoformat(),
391
+ "after_run_at": after.run_at.isoformat(),
392
+ "summary_changed": before.summary.passed != after.summary.passed,
393
+ "rules_changed": [],
394
+ "new_failures": [],
395
+ "resolved_failures": [],
396
+ }
397
+
398
+ # Index before rules by ID
399
+ before_rules = {r.rule_id: r for r in before.rules}
400
+ after_rules = {r.rule_id: r for r in after.rules}
401
+
402
+ # Find changes
403
+ for rule_id, after_rule in after_rules.items():
404
+ before_rule = before_rules.get(rule_id)
405
+
406
+ if before_rule is None:
407
+ # New rule
408
+ if not after_rule.passed:
409
+ diff["new_failures"].append({
410
+ "rule_id": rule_id,
411
+ "failed_count": after_rule.failed_count,
412
+ })
413
+ elif before_rule.passed != after_rule.passed:
414
+ # Status changed
415
+ if after_rule.passed:
416
+ diff["resolved_failures"].append(rule_id)
417
+ else:
418
+ diff["new_failures"].append({
419
+ "rule_id": rule_id,
420
+ "failed_count": after_rule.failed_count,
421
+ "was_passing": True,
422
+ })
423
+ elif before_rule.failed_count != after_rule.failed_count:
424
+ # Count changed
425
+ diff["rules_changed"].append({
426
+ "rule_id": rule_id,
427
+ "before_count": before_rule.failed_count,
428
+ "after_count": after_rule.failed_count,
429
+ "delta": after_rule.failed_count - before_rule.failed_count,
430
+ })
431
+
432
+ diff["has_regressions"] = len(diff["new_failures"]) > 0 or any(
433
+ r["delta"] > 0 for r in diff["rules_changed"]
434
+ )
435
+
436
+ return diff
437
+
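Shape of the diff returned by _build_diff, with hypothetical rule IDs and counts:

# {
#   "before_run_at": "2025-01-01T00:00:00",
#   "after_run_at": "2025-01-02T00:00:00",
#   "summary_changed": True,
#   "new_failures": [{"rule_id": "not_null_email", "failed_count": 12, "was_passing": True}],
#   "resolved_failures": ["range_age"],
#   "rules_changed": [{"rule_id": "unique_id", "before_count": 3, "after_count": 7, "delta": 4}],
#   "has_regressions": True,
# }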
438
+ def _run_dataframe_mode(
439
+ self,
440
+ timers: RunTimers,
441
+ rules: List,
442
+ plan: "RuleExecutionPlan",
443
+ compiled_full,
444
+ rule_severity_map: Dict[str, str],
445
+ ) -> Dict[str, Any]:
446
+ """
447
+ Execute validation directly on a user-provided DataFrame.
448
+
449
+ This path:
450
+ - Skips preplan (no file metadata)
451
+ - Skips SQL pushdown (data already in memory)
452
+ - Uses Polars-only execution
453
+ """
454
+ t0 = now_ms()
455
+
456
+ # Convert pandas to polars if needed
457
+ df = self._input_dataframe
458
+ if not isinstance(df, pl.DataFrame):
459
+ try:
460
+ # Assume it's pandas-like
461
+ df = pl.from_pandas(df)
462
+ except Exception as e:
463
+ raise ValueError(
464
+ f"Could not convert DataFrame to Polars: {e}. "
465
+ "Pass a Polars DataFrame or a pandas DataFrame."
466
+ )
467
+
468
+ self.df = df
469
+ timers.data_load_ms = now_ms() - t0
470
+
471
+ # Execute all rules via Polars
472
+ t0 = now_ms()
473
+ polars_exec = PolarsBackend(executor=plan.execute_compiled)
474
+ exec_result = polars_exec.execute(self.df, compiled_full)
475
+ polars_results = exec_result.get("results", [])
476
+ timers.polars_ms = now_ms() - t0
477
+
478
+ # Merge results (all from Polars in this mode)
479
+ all_results: List[Dict[str, Any]] = []
480
+ for res in polars_results:
481
+ res["execution_source"] = "polars"
482
+ res["severity"] = rule_severity_map.get(res["rule_id"], "blocking")
483
+ all_results.append(res)
484
+
485
+ # Sort deterministically
486
+ all_results.sort(key=lambda r: r["rule_id"])
487
+
488
+ # Summary (use the plan's summary method for consistency)
489
+ summary = plan.summary(all_results)
490
+ summary["dataset_name"] = self.contract.datasource if self.contract else "dataframe"
491
+ summary["total_rows"] = int(self.df.height) if self.df is not None else 0
492
+ engine_label = "polars (dataframe mode)"
493
+
494
+ # Report
495
+ if self.emit_report:
496
+ if summary["passed"]:
497
+ report_success(
498
+ name=summary.get("dataset_name", "dataframe"),
499
+ results=all_results,
500
+ summary=summary,
501
+ )
502
+ else:
503
+ report_failure(
504
+ name=summary.get("dataset_name", "dataframe"),
505
+ results=all_results,
506
+ summary=summary,
507
+ )
508
+
509
+ result = {
510
+ "summary": summary,
511
+ "results": all_results,
512
+ }
513
+
514
+ # Stats
515
+ if self.stats_mode != "none":
516
+ stats: Dict[str, Any] = {
517
+ "run_meta": {
518
+ "contract_path": self.contract_path,
519
+ "engine": engine_label,
520
+ "materializer": "dataframe",
521
+ "preplan": "off",
522
+ "pushdown": "off",
523
+ },
524
+ "durations_ms": {
525
+ "contract_load": timers.contract_load_ms,
526
+ "compile": timers.compile_ms,
527
+ "data_load": timers.data_load_ms,
528
+ "polars": timers.polars_ms,
529
+ "total": timers.total_ms(),
530
+ },
531
+ }
532
+
533
+ if self.stats_mode == "summary":
534
+ stats["dataset"] = basic_summary(self.df)
535
+ elif self.stats_mode == "profile":
536
+ stats["dataset"] = profile_for(self.df, self.df.columns)
537
+
538
+ result["stats"] = stats
539
+
540
+ return result
541
+
542
+ def _load_contract(self) -> Contract:
543
+ """
544
+ Load contract from file and/or merge with inline rules.
545
+
546
+ Returns a Contract object with all rules to validate.
547
+ """
548
+ from kontra.config.models import RuleSpec
549
+
550
+ # Convert inline rules to RuleSpec objects (or pass through BaseRule instances)
551
+ inline_specs = []
552
+ inline_built_rules = [] # Already-built BaseRule instances
553
+ if self._inline_rules:
554
+ from kontra.rules.base import BaseRule as BaseRuleType
555
+ for rule in self._inline_rules:
556
+ if isinstance(rule, BaseRuleType):
557
+ # Already a rule instance - use directly
558
+ inline_built_rules.append(rule)
559
+ elif isinstance(rule, dict):
560
+ # Dict format - convert to RuleSpec
561
+ spec = RuleSpec(
562
+ name=rule.get("name", ""),
563
+ id=rule.get("id"),
564
+ params=rule.get("params", {}),
565
+ severity=rule.get("severity", "blocking"),
566
+ )
567
+ inline_specs.append(spec)
568
+ else:
569
+ raise ValueError(
570
+ f"Invalid rule type: {type(rule).__name__}. "
571
+ f"Expected dict or BaseRule instance."
572
+ )
573
+
574
+ # Store built rules to merge with factory-built rules later
575
+ self._inline_built_rules = inline_built_rules
576
+
577
+ # Load from file if path provided
578
+ if self.contract_path:
579
+ contract = (
580
+ ContractLoader.from_s3(self.contract_path)
581
+ if _is_s3_uri(self.contract_path)
582
+ else ContractLoader.from_path(self.contract_path)
583
+ )
584
+ # Merge inline rules with contract rules
585
+ if inline_specs:
586
+ contract.rules = list(contract.rules) + inline_specs
587
+ return contract
588
+
589
+ # No contract file - create synthetic contract from inline rules
590
+ dataset = self.data_path or "inline_validation"
591
+ return Contract(
592
+ name="inline_contract",
593
+ dataset=dataset,
594
+ rules=inline_specs,
595
+ )
596
+
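A sketch of the dict form accepted by inline_rules (param names are assumptions; rule names must match rules registered under kontra.rules.builtin):

engine = ValidationEngine(
    data_path="users.parquet",
    inline_rules=[
        {"name": "not_null", "params": {"column": "id"}},  # severity defaults to "blocking"
        {"name": "range", "params": {"column": "age", "min": 0, "max": 120}, "severity": "warning"},
    ],
)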
597
+ def _run_impl(self, timers: RunTimers) -> Dict[str, Any]:
598
+ # 1) Contract (load from file and/or inline rules)
599
+ t0 = now_ms()
600
+ self.contract = self._load_contract()
601
+ timers.contract_load_ms = now_ms() - t0
602
+
603
+ # 2) Rules & plan
604
+ t0 = now_ms()
605
+ rules = RuleFactory(self.contract.rules).build_rules()
606
+ # Merge with any pre-built rule instances passed directly
607
+ if self._inline_built_rules:
608
+ rules = rules + self._inline_built_rules
609
+ self._rules = rules # Store for sample_failures()
610
+ plan = RuleExecutionPlan(rules)
611
+ compiled_full = plan.compile()
612
+ timers.compile_ms = now_ms() - t0
613
+
614
+ # Build rule_id -> severity mapping for injecting into preplan/SQL results
615
+ rule_severity_map = {r.rule_id: r.severity for r in rules}
616
+
617
+ # ------------------------------------------------------------------ #
618
+ # DataFrame mode: If user provided a DataFrame, use Polars-only path
619
+ # ------------------------------------------------------------------ #
620
+ if self._input_dataframe is not None:
621
+ return self._run_dataframe_mode(timers, rules, plan, compiled_full, rule_severity_map)
622
+
623
+ # Dataset handle (used across phases)
624
+ # BYOC: if a pre-built handle was provided, use it directly
625
+ if self._handle is not None:
626
+ handle = self._handle
627
+ source_uri = handle.uri
628
+ else:
629
+ source_ref = self.data_path or self.contract.datasource
630
+ source_uri = _resolve_datasource_uri(source_ref)
631
+ handle = DatasetHandle.from_uri(source_uri, storage_options=self._storage_options)
632
+
633
+ # ------------------------------------------------------------------ #
634
+ # 3) Preplan (metadata-only; independent of pushdown/projection)
635
+ preplan_effective = False
636
+ handled_ids_meta: Set[str] = set()
637
+ meta_results_by_id: Dict[str, Dict[str, Any]] = {}
638
+ preplan_row_groups: Optional[List[int]] = None
639
+ preplan_columns: Optional[List[str]] = None
640
+ preplan_analyze_ms = 0
641
+ preplan_total_rows: Optional[int] = None # Track row count from preplan metadata
642
+ preplan_summary: Dict[str, Any] = {
643
+ "enabled": self.preplan in {"on", "auto"},
644
+ "effective": False,
645
+ "rules_pass_meta": 0,
646
+ "rules_fail_meta": 0,
647
+ "rules_unknown": len(compiled_full.required_cols or []),
648
+ "row_groups_kept": None,
649
+ "row_groups_total": None,
650
+ "row_groups_pruned": None,
651
+ }
652
+
653
+ # Get filesystem from handle; preplan needs this for S3/remote access.
654
+ preplan_fs: pafs.FileSystem | None = None
655
+ if _is_s3_uri(handle.uri):
656
+ try:
657
+ preplan_fs = _create_s3_filesystem(handle)
658
+ except Exception as e:
659
+ # If S3 libs aren't installed, this will fail.
660
+ # We'll let the ParquetFile call fail below and be caught.
661
+ log_exception(_logger, "Could not create S3 filesystem for preplan", e)
662
+
663
+ if self.preplan in {"on", "auto"} and _is_parquet(handle.uri):
664
+ try:
665
+ t0 = now_ms()
666
+ static_preds = extract_static_predicates(rules=rules)
667
+ # PyArrow S3FileSystem expects 'bucket/key' format, not 's3://bucket/key'
668
+ preplan_path = _s3_uri_to_path(handle.uri) if preplan_fs else handle.uri
669
+ pre: PrePlan = preplan_single_parquet(
670
+ path=preplan_path,
671
+ required_columns=compiled_full.required_cols, # DC-driven columns
672
+ predicates=static_preds,
673
+ filesystem=preplan_fs,
674
+ )
675
+ preplan_analyze_ms = now_ms() - t0
676
+
677
+ # Register metadata-based rule decisions (pass/fail), unknowns remain
678
+ pass_meta = fail_meta = unknown = 0
679
+ for rid, decision in pre.rule_decisions.items():
680
+ if decision == "pass_meta":
681
+ meta_results_by_id[rid] = {
682
+ "rule_id": rid,
683
+ "passed": True,
684
+ "failed_count": 0,
685
+ "message": "Proven by metadata (Parquet stats)",
686
+ "execution_source": "metadata",
687
+ "severity": rule_severity_map.get(rid, "blocking"),
688
+ }
689
+ handled_ids_meta.add(rid)
690
+ pass_meta += 1
691
+ elif decision == "fail_meta":
692
+ meta_results_by_id[rid] = {
693
+ "rule_id": rid,
694
+ "passed": False,
695
+ "failed_count": 1,
696
+ "message": "Failed: violation proven by Parquet metadata (null values detected)",
697
+ "execution_source": "metadata",
698
+ "severity": rule_severity_map.get(rid, "blocking"),
699
+ }
700
+ handled_ids_meta.add(rid)
701
+ fail_meta += 1
702
+ else:
703
+ unknown += 1
704
+
705
+ preplan_row_groups = list(pre.manifest_row_groups or [])
706
+ preplan_columns = list(pre.manifest_columns or [])
707
+ preplan_effective = True
708
+ preplan_total_rows = pre.stats.get("total_rows")
709
+
710
+ rg_total = pre.stats.get("rg_total", None)
711
+ rg_kept = len(preplan_row_groups)
712
+ preplan_summary.update({
713
+ "effective": True,
714
+ "rules_pass_meta": pass_meta,
715
+ "rules_fail_meta": fail_meta,
716
+ "rules_unknown": unknown,
717
+ "row_groups_kept": rg_kept if rg_total is not None else None,
718
+ "row_groups_total": rg_total,
719
+ "row_groups_pruned": (rg_total - rg_kept) if (rg_total is not None) else None,
720
+ })
721
+
722
+ if self.explain_preplan:
723
+ print(
724
+ "\n-- PREPLAN (metadata) --"
725
+ f"\n Row-groups kept: {preplan_summary.get('row_groups_kept')}/{preplan_summary.get('row_groups_total')}"
726
+ f"\n Rules: {pass_meta} pass, {fail_meta} fail, {unknown} unknown\n"
727
+ )
728
+
729
+ except Exception as e:
730
+ # Distinguish between "preplan not available" vs "real errors"
731
+ err_str = str(e).lower()
732
+ err_type = type(e).__name__
733
+
734
+ # Re-raise errors that indicate real problems (auth, file not found, etc.)
735
+ is_auth_error = (
736
+ "access denied" in err_str
737
+ or "forbidden" in err_str
738
+ or "unauthorized" in err_str
739
+ or "credentials" in err_str
740
+ or "authentication" in err_str
741
+ )
742
+ is_not_found = (
743
+ isinstance(e, FileNotFoundError)
744
+ or "not found" in err_str
745
+ or "no such file" in err_str
746
+ or "does not exist" in err_str
747
+ )
748
+ is_permission = isinstance(e, PermissionError)
749
+
750
+ if is_auth_error or is_not_found or is_permission:
751
+ # These are real errors - don't silently skip
752
+ raise RuntimeError(
753
+ f"Preplan failed due to {err_type}: {e}. "
754
+ "Check file path and credentials."
755
+ ) from e
756
+
757
+ # Otherwise, preplan optimization just isn't available (e.g., no stats)
758
+ if os.getenv("KONTRA_VERBOSE"):
759
+ print(f"[INFO] Preplan skipped ({err_type}): {e}")
760
+ preplan_effective = False # leave summary with effective=False
761
+
762
+ # PostgreSQL preplan (uses pg_stats metadata)
763
+ elif self.preplan in {"on", "auto"} and handle.scheme in ("postgres", "postgresql"):
764
+ try:
765
+ from kontra.preplan.postgres import preplan_postgres, can_preplan_postgres
766
+ if can_preplan_postgres(handle):
767
+ t0 = now_ms()
768
+ static_preds = extract_static_predicates(rules=rules)
769
+ pre: PrePlan = preplan_postgres(
770
+ handle=handle,
771
+ required_columns=compiled_full.required_cols,
772
+ predicates=static_preds,
773
+ )
774
+ preplan_analyze_ms = now_ms() - t0
775
+
776
+ pass_meta = fail_meta = unknown = 0
777
+ for rid, decision in pre.rule_decisions.items():
778
+ if decision == "pass_meta":
779
+ meta_results_by_id[rid] = {
780
+ "rule_id": rid,
781
+ "passed": True,
782
+ "failed_count": 0,
783
+ "message": "Proven by metadata (pg_stats)",
784
+ "execution_source": "metadata",
785
+ "severity": rule_severity_map.get(rid, "blocking"),
786
+ }
787
+ handled_ids_meta.add(rid)
788
+ pass_meta += 1
789
+ else:
790
+ unknown += 1
791
+
792
+ preplan_effective = True
793
+ preplan_summary.update({
794
+ "effective": True,
795
+ "rules_pass_meta": pass_meta,
796
+ "rules_fail_meta": fail_meta,
797
+ "rules_unknown": unknown,
798
+ })
799
+ except Exception as e:
800
+ if os.getenv("KONTRA_VERBOSE"):
801
+ print(f"[INFO] PostgreSQL preplan skipped: {e}")
802
+
803
+ # SQL Server preplan (uses sys.columns metadata)
804
+ elif self.preplan in {"on", "auto"} and handle.scheme in ("mssql", "sqlserver"):
805
+ try:
806
+ from kontra.preplan.sqlserver import preplan_sqlserver, can_preplan_sqlserver
807
+ if can_preplan_sqlserver(handle):
808
+ t0 = now_ms()
809
+ static_preds = extract_static_predicates(rules=rules)
810
+ pre: PrePlan = preplan_sqlserver(
811
+ handle=handle,
812
+ required_columns=compiled_full.required_cols,
813
+ predicates=static_preds,
814
+ )
815
+ preplan_analyze_ms = now_ms() - t0
816
+
817
+ pass_meta = fail_meta = unknown = 0
818
+ for rid, decision in pre.rule_decisions.items():
819
+ if decision == "pass_meta":
820
+ meta_results_by_id[rid] = {
821
+ "rule_id": rid,
822
+ "passed": True,
823
+ "failed_count": 0,
824
+ "message": "Proven by metadata (SQL Server constraints)",
825
+ "execution_source": "metadata",
826
+ "severity": rule_severity_map.get(rid, "blocking"),
827
+ }
828
+ handled_ids_meta.add(rid)
829
+ pass_meta += 1
830
+ else:
831
+ unknown += 1
832
+
833
+ preplan_effective = True
834
+ preplan_summary.update({
835
+ "effective": True,
836
+ "rules_pass_meta": pass_meta,
837
+ "rules_fail_meta": fail_meta,
838
+ "rules_unknown": unknown,
839
+ })
840
+ except Exception as e:
841
+ if os.getenv("KONTRA_VERBOSE"):
842
+ print(f"[INFO] SQL Server preplan skipped: {e}")
843
+
844
+ # ------------------------------------------------------------------ #
845
+ # 4) Materializer setup (orthogonal)
846
+ materializer = pick_materializer(handle)
847
+ materializer_name = getattr(materializer, "name", "duckdb")
848
+ _staged_override_uri: Optional[str] = None
849
+
850
+ # ------------------------------------------------------------------ #
851
+ # 5) SQL pushdown (independent of preplan/projection)
852
+ sql_results_by_id: Dict[str, Dict[str, Any]] = {}
853
+ handled_ids_sql: Set[str] = set()
854
+ available_cols: List[str] = []
855
+ sql_row_count: Optional[int] = None
856
+ executor_name = "none"
857
+ pushdown_effective = False
858
+ push_compile_ms = push_execute_ms = push_introspect_ms = 0
859
+
860
+ executor = None
861
+ if self.pushdown in {"on", "auto"}:
862
+ # Exclude rules already decided by preplan
863
+ sql_rules_remaining = [s for s in compiled_full.sql_rules if s.get("rule_id") not in handled_ids_meta]
864
+ executor = pick_executor(handle, sql_rules_remaining)
865
+
866
+ if executor:
867
+ try:
868
+ # Compile
869
+ t0 = now_ms()
870
+ executor_name = getattr(executor, "name", "sql")
871
+ sql_plan_str = executor.compile(sql_rules_remaining)
872
+ push_compile_ms = now_ms() - t0
873
+ if self.show_plan and sql_plan_str:
874
+ print(f"\n-- {executor_name.upper()} SQL PLAN --\n{sql_plan_str}\n")
875
+
876
+ # Execute
877
+ t0 = now_ms()
878
+ duck_out = executor.execute(handle, sql_plan_str, csv_mode=self.csv_mode)
879
+ push_execute_ms = now_ms() - t0
880
+
881
+ # Inject severity into SQL results
882
+ sql_results_raw = duck_out.get("results", [])
883
+ for r in sql_results_raw:
884
+ r["severity"] = rule_severity_map.get(r.get("rule_id"), "blocking")
885
+ sql_results_by_id = {r["rule_id"]: r for r in sql_results_raw}
886
+ handled_ids_sql = set(sql_results_by_id.keys())
887
+
888
+ # Get row count and cols from execute result (avoids separate introspect call)
889
+ t0 = now_ms()
890
+ sql_row_count = duck_out.get("row_count")
891
+ available_cols = duck_out.get("available_cols") or []
892
+
893
+ # Fallback to introspect if execute didn't return these
894
+ if sql_row_count is None or not available_cols:
895
+ info = executor.introspect(handle, csv_mode=self.csv_mode)
896
+ push_introspect_ms = now_ms() - t0
897
+ sql_row_count = info.get("row_count") if sql_row_count is None else sql_row_count
898
+ available_cols = info.get("available_cols") or available_cols
899
+ staging = info.get("staging") or duck_out.get("staging")
900
+ else:
901
+ push_introspect_ms = now_ms() - t0
902
+ staging = duck_out.get("staging")
903
+
904
+ # Reuse staged Parquet (if the executor staged CSV → Parquet)
905
+ staging = staging or duck_out.get("staging")
906
+ if staging and staging.get("path"):
907
+ _staged_override_uri = staging["path"]
908
+ self._staging_tmpdir = staging.get("tmpdir")
909
+ handle = DatasetHandle.from_uri(_staged_override_uri)
910
+ materializer = pick_materializer(handle)
911
+ materializer_name = getattr(materializer, "name", materializer_name)
912
+
913
+ pushdown_effective = True
914
+ except Exception as e:
915
+ if os.getenv("KONTRA_VERBOSE") or self.show_plan:
916
+ print(f"[WARN] SQL pushdown failed ({type(e).__name__}): {e}")
917
+ executor = None # fall back silently
918
+
919
+ # ------------------------------------------------------------------ #
920
+ # 6) Residual Polars execution (projection independent; manifest optional)
921
+ handled_all = handled_ids_meta | handled_ids_sql
922
+ compiled_residual = plan.without_ids(compiled_full, handled_all)
923
+
924
+ # Projection is DC-driven; independent of preplan/pushdown
925
+ required_cols_full = compiled_full.required_cols if self.enable_projection else []
926
+ required_cols_residual = compiled_residual.required_cols if self.enable_projection else []
927
+
928
+ if not compiled_residual.predicates and not compiled_residual.fallback_rules:
929
+ self.df = None
930
+ polars_out = {"results": []}
931
+ timers.data_load_ms = timers.execute_ms = 0
932
+ else:
933
+ # Materialize minimal slice:
934
+ # If preplan produced a row-group manifest, honor it — otherwise let the materializer decide.
935
+ t0 = now_ms()
936
+ if preplan_effective and _is_parquet(handle.uri) and preplan_row_groups:
937
+ cols = (required_cols_residual or None) if self.enable_projection else None
938
+
939
+ # Reuse preplan filesystem if available, otherwise create from handle
940
+ residual_fs = preplan_fs
941
+ if residual_fs is None and _is_s3_uri(handle.uri):
942
+ try:
943
+ residual_fs = _create_s3_filesystem(handle)
944
+ except Exception as e:
945
+ # Let ParquetFile try default credentials
946
+ log_exception(_logger, "Could not create S3 filesystem for residual load", e)
947
+
948
+ # PyArrow S3FileSystem expects 'bucket/key' format, not 's3://bucket/key'
949
+ residual_path = _s3_uri_to_path(handle.uri) if residual_fs else handle.uri
950
+ pf = pq.ParquetFile(residual_path, filesystem=residual_fs)
951
+
952
+ pa_cols = cols if cols else None
953
+ rg_tables = [pf.read_row_group(i, columns=pa_cols) for i in preplan_row_groups]
954
+ pa_tbl = pa.concat_tables(rg_tables) if len(rg_tables) > 1 else rg_tables[0]
955
+ self.df = pl.from_arrow(pa_tbl)
956
+ else:
957
+ # Materializer respects projection (engine passes residual required cols)
958
+ self.df = materializer.to_polars(required_cols_residual or None)
959
+ timers.data_load_ms = now_ms() - t0
960
+
961
+ # Execute residual rules in Polars
962
+ t0 = now_ms()
963
+ polars_exec = PolarsBackend(executor=plan.execute_compiled)
964
+ polars_art = polars_exec.compile(compiled_residual)
965
+ polars_out = polars_exec.execute(self.df, polars_art)
966
+ timers.execute_ms = now_ms() - t0
967
+
968
+ # ------------------------------------------------------------------ #
969
+ # 7) Merge results — deterministic order: preplan → SQL → Polars
970
+ results: List[Dict[str, Any]] = list(meta_results_by_id.values())
971
+ results += [r for r in sql_results_by_id.values() if r["rule_id"] not in meta_results_by_id]
972
+ results += [r for r in polars_out["results"] if r["rule_id"] not in meta_results_by_id and r["rule_id"] not in sql_results_by_id]
973
+
974
+ # 8) Summary
975
+ summary = plan.summary(results)
976
+ summary["dataset_name"] = self.contract.datasource
977
+ # Row count priority: SQL executor > DataFrame > preplan metadata > 0
978
+ if sql_row_count is not None:
979
+ summary["total_rows"] = int(sql_row_count)
980
+ elif self.df is not None:
981
+ summary["total_rows"] = int(self.df.height)
982
+ elif preplan_total_rows is not None:
983
+ summary["total_rows"] = int(preplan_total_rows)
984
+ else:
985
+ summary["total_rows"] = 0
986
+ engine_label = (
987
+ f"{materializer_name}+polars "
988
+ f"(preplan:{'on' if preplan_effective else 'off'}, "
989
+ f"pushdown:{'on' if pushdown_effective else 'off'}, "
990
+ f"projection:{'on' if self.enable_projection else 'off'})"
991
+ )
992
+
993
+ if self.emit_report:
994
+ t0 = now_ms()
995
+ self._report(summary, results)
996
+ timers.report_ms = now_ms() - t0
997
+
998
+ # ------------------------------------------------------------------ #
999
+ # 9) Stats (feature-attributed)
1000
+ stats: Optional[Dict[str, Any]] = None
1001
+ if self.stats_mode != "none":
1002
+ if not available_cols:
1003
+ available_cols = self._peek_available_columns(handle.uri)
1004
+
1005
+ ds_summary = basic_summary(self.df, available_cols=available_cols, nrows_override=sql_row_count)
1006
+
1007
+ loaded_cols = list(self.df.columns) if self.df is not None else []
1008
+ proj = {
1009
+ "enabled": self.enable_projection,
1010
+ "available_count": len(available_cols or []) if available_cols is not None else len(loaded_cols),
1011
+ "full": {
1012
+ "required_columns": required_cols_full or [],
1013
+ "required_count": len(required_cols_full or []),
1014
+ },
1015
+ "residual": {
1016
+ "required_columns": required_cols_residual or [],
1017
+ "required_count": len(required_cols_residual or []),
1018
+ "loaded_count": len(loaded_cols),
1019
+ "effective": self.enable_projection and bool(required_cols_residual)
1020
+ and len(loaded_cols) <= len(required_cols_residual),
1021
+ },
1022
+ }
1023
+
1024
+ push = {
1025
+ "enabled": self.pushdown in {"on", "auto"},
1026
+ "effective": bool(pushdown_effective),
1027
+ "executor": executor_name,
1028
+ "rules_pushed": len(sql_results_by_id),
1029
+ "breakdown_ms": {
1030
+ "compile": push_compile_ms,
1031
+ "execute": push_execute_ms,
1032
+ "introspect": push_introspect_ms,
1033
+ },
1034
+ }
1035
+
1036
+ res = {
1037
+ "rules_local": len(polars_out["results"]) if "polars_out" in locals() else 0,
1038
+ }
1039
+
1040
+ phases_ms = {
1041
+ "contract_load": int(timers.contract_load_ms or 0),
1042
+ "compile": int(timers.compile_ms or 0),
1043
+ "preplan": int(preplan_analyze_ms or 0),
1044
+ "pushdown": int(push_compile_ms + push_execute_ms + push_introspect_ms),
1045
+ "data_load": int(timers.data_load_ms or 0),
1046
+ "execute": int(timers.execute_ms or 0),
1047
+ "report": int(timers.report_ms or 0),
1048
+ }
1049
+
1050
+ stats = {
1051
+ "stats_version": "2",
1052
+ "run_meta": {
1053
+ "phases_ms": phases_ms,
1054
+ "duration_ms_total": sum(phases_ms.values()),
1055
+ "dataset_path": self.data_path or self.contract.datasource,
1056
+ "contract_path": self.contract_path,
1057
+ "engine": engine_label,
1058
+ "materializer": materializer_name,
1059
+ "preplan_requested": self.preplan,
1060
+ "preplan": "on" if preplan_effective else "off",
1061
+ "pushdown_requested": self.pushdown,
1062
+ "pushdown": "on" if pushdown_effective else "off",
1063
+ "csv_mode": self.csv_mode,
1064
+ "staged_override": bool(_staged_override_uri),
1065
+ },
1066
+ "dataset": ds_summary,
1067
+ "preplan": preplan_summary,
1068
+ "pushdown": push,
1069
+ "projection": proj,
1070
+ "residual": res,
1071
+ "columns_touched": columns_touched([{"name": r.name, "params": r.params} for r in self.contract.rules]),
1072
+ "columns_validated": columns_touched([{"name": r.name, "params": r.params} for r in self.contract.rules]),
1073
+ "columns_loaded": loaded_cols,
1074
+ }
1075
+
1076
+ if self.stats_mode == "profile" and self.df is not None:
1077
+ stats["profile"] = profile_for(self.df, proj["residual"]["required_columns"])
1078
+
1079
+ if os.getenv("KONTRA_IO_DEBUG"):
1080
+ io_dbg = getattr(materializer, "io_debug", None)
1081
+ if callable(io_dbg):
1082
+ io = io_dbg()
1083
+ if io:
1084
+ stats["io"] = io
1085
+
1086
+ out: Dict[str, Any] = {
1087
+ "dataset": self.contract.datasource,
1088
+ "results": results,
1089
+ "summary": summary,
1090
+ }
1091
+ if stats is not None:
1092
+ out["stats"] = stats
1093
+ out.setdefault("run_meta", {})["engine_label"] = engine_label
1094
+
1095
+ # Staged tempdir (if any) is cleaned up in run()'s finally block
1096
+ return out
1097
+
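The top-level payload assembled by _run_impl, sketched with placeholder values (summary keys beyond dataset_name/total_rows come from RuleExecutionPlan.summary; execution_source is "metadata", "polars", or executor-specific):

# {
#   "dataset": "<contract.datasource>",
#   "results": [{"rule_id": "...", "passed": True, "failed_count": 0,
#                "execution_source": "metadata", "severity": "blocking"}, ...],
#   "summary": {"passed": True, "dataset_name": "...", "total_rows": 123, ...},
#   "stats": {...},   # present only when stats_mode != "none"
#   "run_meta": {"engine_label": "duckdb+polars (preplan:on, pushdown:off, projection:on)"},
# }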
1098
+ # --------------------------------------------------------------------- #
1099
+
1100
+ def _report(self, summary: Dict[str, Any], results: List[Dict[str, Any]]) -> None:
1101
+ if summary["passed"]:
1102
+ # Show warning/info counts if any
1103
+ warning_info = ""
1104
+ if summary.get("warning_failures", 0) > 0:
1105
+ warning_info = f" ({summary['warning_failures']} warnings)"
1106
+ elif summary.get("info_failures", 0) > 0:
1107
+ warning_info = f" ({summary['info_failures']} info)"
1108
+
1109
+ report_success(
1110
+ f"{summary['dataset_name']} — PASSED "
1111
+ f"({summary['rules_passed']} of {summary['total_rules']} rules){warning_info}"
1112
+ )
1113
+ else:
1114
+ # Show severity breakdown
1115
+ blocking = summary.get("blocking_failures", summary["rules_failed"])
1116
+ warning = summary.get("warning_failures", 0)
1117
+ info = summary.get("info_failures", 0)
1118
+
1119
+ severity_info = f" ({blocking} blocking"
1120
+ if warning > 0:
1121
+ severity_info += f", {warning} warnings"
1122
+ if info > 0:
1123
+ severity_info += f", {info} info"
1124
+ severity_info += ")"
1125
+
1126
+ report_failure(
1127
+ f"{summary['dataset_name']} — FAILED "
1128
+ f"({summary['rules_failed']} of {summary['total_rules']} rules){severity_info}"
1129
+ )
1130
+
1131
+ # Show all rule results with execution source
1132
+ for r in results:
1133
+ source = r.get("execution_source", "polars")
1134
+ source_tag = f" [{source}]" if source else ""
1135
+ rule_id = r.get("rule_id", "<unknown>")
1136
+ passed = r.get("passed", False)
1137
+ severity = r.get("severity", "blocking")
1138
+
1139
+ # Severity tag for non-blocking failures
1140
+ severity_tag = ""
1141
+ if not passed and severity != "blocking":
1142
+ severity_tag = f" [{severity}]"
1143
+
1144
+ if passed:
1145
+ print(f" ✅ {rule_id}{source_tag}")
1146
+ else:
1147
+ msg = r.get("message", "Failed")
1148
+ failed_count = r.get("failed_count", 0)
1149
+ # Include failure count if available
1150
+ detail = f": {msg}"
1151
+ if failed_count > 0:
1152
+ detail = f": {failed_count:,} failures"
1153
+
1154
+ # Use different icon for warning/info
1155
+ icon = "❌" if severity == "blocking" else ("⚠️" if severity == "warning" else "ℹ️")
1156
+ print(f" {icon} {rule_id}{source_tag}{severity_tag}{detail}")
1157
+
1158
+ # Show detailed explanation if available
1159
+ details = r.get("details")
1160
+ if details:
1161
+ self._print_failure_details(details)
1162
+
1163
+ def _print_failure_details(self, details: Dict[str, Any]) -> None:
1164
+ """Print detailed failure explanation."""
1165
+ # Expected values (for allowed_values rule)
1166
+ expected = details.get("expected")
1167
+ if expected:
1168
+ expected_preview = ", ".join(expected[:5])
1169
+ if len(expected) > 5:
1170
+ expected_preview += f" ... ({len(expected)} total)"
1171
+ print(f" Expected: {expected_preview}")
1172
+
1173
+ # Unexpected values (for allowed_values rule)
1174
+ unexpected = details.get("unexpected_values")
1175
+ if unexpected:
1176
+ print(" Unexpected values:")
1177
+ for uv in unexpected[:5]:
1178
+ val = uv.get("value", "?")
1179
+ count = uv.get("count", 0)
1180
+ print(f" - \"{val}\" ({count:,} rows)")
1181
+ if len(unexpected) > 5:
1182
+ print(f" ... and {len(unexpected) - 5} more")
1183
+
1184
+ # Suggestion
1185
+ suggestion = details.get("suggestion")
1186
+ if suggestion:
1187
+ print(f" Suggestion: {suggestion}")
1188
+
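An illustrative details payload handled by _print_failure_details (keys as read above, values hypothetical):

# details = {
#   "expected": ["active", "inactive", "pending"],
#   "unexpected_values": [{"value": "archived", "count": 1042}],
#   "suggestion": "Add 'archived' to the allowed values or fix the upstream feed.",
# }
# Prints the expected-values preview, each unexpected value with its row count, and the suggestion.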
1189
+ # --------------------------------------------------------------------- #
1190
+
1191
+ def _peek_available_columns(self, source: str) -> List[str]:
1192
+ """Cheap schema peek; used only for observability."""
1193
+ try:
1194
+ s = source.lower()
1195
+ # We can't easily peek S3 without a filesystem object,
1196
+ # so we'll just handle local files for now.
1197
+ if _is_s3_uri(s):
1198
+ return []
1199
+ if s.endswith(".parquet"):
1200
+ return list(pl.scan_parquet(source).collect_schema().names())
1201
+ if s.endswith(".csv"):
1202
+ return list(pl.scan_csv(source).collect_schema().names())
1203
+ except Exception as e:
1204
+ log_exception(_logger, f"Could not peek columns from {source}", e)
1205
+ return []