kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/engine/engine.py
ADDED
@@ -0,0 +1,1205 @@
from __future__ import annotations

"""
Validation Engine — preplan-aware, hybrid, projection-efficient, deterministic.

Flow
----
1) Load contract
2) Build rules → compile plan (required columns + SQL-capable candidates)
3) (Optional) Preplan (metadata-only, Parquet): prove PASS/FAIL, build scan manifest
4) Pick materializer (e.g., DuckDB for S3 / staged CSV)
5) (Optional) SQL pushdown for eligible *remaining* rules (may stage CSV → Parquet)
6) Materialize residual slice for Polars (row-groups + projection)
7) Execute residual rules in Polars
8) Merge results (preplan → SQL → Polars), summarize, attach small stats dict

Principles
----------
- Deterministic: identical inputs → identical outputs
- Layered & independent toggles:
  * Preplan (metadata) — independent of pushdown/projection
  * Pushdown (SQL execution) — independent of preplan/projection
  * Projection (contract-driven columns) — independent of preplan/pushdown
- Performance-first: plan → prune → load minimal slice → execute
- Clear separation: engine orchestrates; preplan is a leaf; reporters format/print
"""

import os
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, TYPE_CHECKING, Union

import polars as pl

if TYPE_CHECKING:
    from kontra.state.backends.base import StateBackend
    from kontra.state.types import ValidationState
import pyarrow as pa
import pyarrow.fs as pafs  # <-- Added
import pyarrow.parquet as pq

from kontra.config.loader import ContractLoader
from kontra.config.models import Contract
from kontra.connectors.handle import DatasetHandle
from kontra.engine.backends.polars_backend import PolarsBackend
from kontra.engine.executors.registry import pick_executor, register_default_executors
from kontra.engine.materializers.registry import pick_materializer, register_default_materializers
from kontra.engine.stats import RunTimers, basic_summary, columns_touched, now_ms, profile_for
from kontra.reporters.rich_reporter import report_failure, report_success
from kontra.rules.execution_plan import RuleExecutionPlan
from kontra.rules.factory import RuleFactory
from kontra.logging import get_logger, log_exception

_logger = get_logger(__name__)

# Preplan (metadata-only) + static predicate extraction
from kontra.preplan.planner import preplan_single_parquet
from kontra.preplan.types import PrePlan
from kontra.rules.static_predicates import extract_static_predicates

# Built-ins (side-effect registration)
import kontra.rules.builtin.allowed_values  # noqa: F401
import kontra.rules.builtin.disallowed_values  # noqa: F401
import kontra.rules.builtin.custom_sql_check  # noqa: F401
import kontra.rules.builtin.dtype  # noqa: F401
import kontra.rules.builtin.freshness  # noqa: F401
import kontra.rules.builtin.max_rows  # noqa: F401
import kontra.rules.builtin.min_rows  # noqa: F401
import kontra.rules.builtin.not_null  # noqa: F401
import kontra.rules.builtin.range  # noqa: F401
import kontra.rules.builtin.length  # noqa: F401
import kontra.rules.builtin.regex  # noqa: F401
import kontra.rules.builtin.contains  # noqa: F401
import kontra.rules.builtin.starts_with  # noqa: F401
import kontra.rules.builtin.ends_with  # noqa: F401
import kontra.rules.builtin.unique  # noqa: F401
import kontra.rules.builtin.compare  # noqa: F401
import kontra.rules.builtin.conditional_not_null  # noqa: F401
import kontra.rules.builtin.conditional_range  # noqa: F401


# --------------------------------------------------------------------------- #
# Helpers
# --------------------------------------------------------------------------- #

def _resolve_datasource_uri(reference: str) -> str:
    """
    Resolve a datasource reference to a concrete URI.

    Tries to resolve named datasources (e.g., "prod_db.users") through config.
    Falls back to returning the reference as-is if not found in config.

    Args:
        reference: Named datasource ("prod_db.users") or direct URI/path

    Returns:
        Resolved URI (e.g., "postgres://host/db/public.users" or "./data.parquet")
    """
    try:
        from kontra.config.settings import resolve_datasource
        return resolve_datasource(reference)
    except (ValueError, ImportError):
        # Not a named datasource or config not available - use as-is
        return reference


def _is_s3_uri(val: str | None) -> bool:
    return isinstance(val, str) and val.lower().startswith("s3://")


def _s3_uri_to_path(uri: str) -> str:
    """Convert s3://bucket/key to bucket/key (PyArrow S3FileSystem format)."""
    if uri.lower().startswith("s3://"):
        return uri[5:]  # Strip 's3://'
    return uri


def _create_s3_filesystem(handle: DatasetHandle) -> pafs.S3FileSystem:
    """
    Create a PyArrow S3FileSystem from handle's fs_opts (populated from env vars).
    Supports MinIO and other S3-compatible storage via custom endpoints.
    """
    opts = handle.fs_opts or {}

    # Map our fs_opts keys to PyArrow S3FileSystem kwargs
    kwargs: Dict[str, Any] = {}
    if opts.get("s3_access_key_id") and opts.get("s3_secret_access_key"):
        kwargs["access_key"] = opts["s3_access_key_id"]
        kwargs["secret_key"] = opts["s3_secret_access_key"]
    if opts.get("s3_session_token"):
        kwargs["session_token"] = opts["s3_session_token"]
    if opts.get("s3_region"):
        kwargs["region"] = opts["s3_region"]
    if opts.get("s3_endpoint"):
        # PyArrow expects endpoint_override without the scheme
        endpoint = opts["s3_endpoint"]
        # Strip scheme if present and set scheme kwarg
        if endpoint.startswith("http://"):
            endpoint = endpoint[7:]
            kwargs["scheme"] = "http"
        elif endpoint.startswith("https://"):
            endpoint = endpoint[8:]
            kwargs["scheme"] = "https"
        kwargs["endpoint_override"] = endpoint

    # MinIO and some S3-compatible storage require path-style URLs (not virtual-hosted)
    # DUCKDB_S3_URL_STYLE=path -> force_virtual_addressing=False
    url_style = opts.get("s3_url_style", "").lower()
    if url_style == "path":
        kwargs["force_virtual_addressing"] = False
    elif url_style == "host":
        kwargs["force_virtual_addressing"] = True
    # If endpoint is set but no url_style, default to path-style (common for MinIO)
    elif opts.get("s3_endpoint"):
        kwargs["force_virtual_addressing"] = False

    return pafs.S3FileSystem(**kwargs)


def _is_parquet(path: str | None) -> bool:
    return isinstance(path, str) and path.lower().endswith(".parquet")


# --------------------------------------------------------------------------- #
# Engine
# --------------------------------------------------------------------------- #

class ValidationEngine:
    """
    Orchestrates:
    - Rule planning
    - Preplan (metadata-only; Parquet) [independent]
    - SQL pushdown (optional) [independent]
    - Residual Polars execution
    - Reporting + stats

    Usage:
        # From file paths
        engine = ValidationEngine(contract_path="contract.yml")
        result = engine.run()

        # With DataFrame (skips preplan/pushdown, uses Polars directly)
        import polars as pl
        df = pl.read_parquet("data.parquet")
        engine = ValidationEngine(contract_path="contract.yml", dataframe=df)
        result = engine.run()

        # With pandas DataFrame
        import pandas as pd
        pdf = pd.read_parquet("data.parquet")
        engine = ValidationEngine(contract_path="contract.yml", dataframe=pdf)
        result = engine.run()
    """

    def __init__(
        self,
        contract_path: Optional[str] = None,
        data_path: Optional[str] = None,
        dataframe: Optional[Union[pl.DataFrame, "pd.DataFrame"]] = None,
        handle: Optional[DatasetHandle] = None,  # BYOC: pre-built handle
        emit_report: bool = True,
        stats_mode: Literal["none", "summary", "profile"] = "none",
        # Independent toggles
        preplan: Literal["on", "off", "auto"] = "auto",
        pushdown: Literal["on", "off", "auto"] = "auto",
        enable_projection: bool = True,
        csv_mode: Literal["auto", "duckdb", "parquet"] = "auto",
        # Diagnostics
        show_plan: bool = False,
        explain_preplan: bool = False,
        # State management
        state_store: Optional["StateBackend"] = None,
        save_state: bool = True,
        # Inline rules (Python API)
        inline_rules: Optional[List[Dict[str, Any]]] = None,
        # Cloud storage credentials (S3, Azure, GCS)
        storage_options: Optional[Dict[str, Any]] = None,
    ):
        # Validate inputs
        if contract_path is None and inline_rules is None:
            raise ValueError("Either contract_path or inline_rules must be provided")

        # Validate toggle parameters
        valid_csv_modes = {"auto", "duckdb", "parquet"}
        if csv_mode not in valid_csv_modes:
            raise ValueError(
                f"Invalid csv_mode '{csv_mode}'. "
                f"Must be one of: {', '.join(sorted(valid_csv_modes))}"
            )

        valid_toggles = {"on", "off", "auto"}
        if preplan not in valid_toggles:
            raise ValueError(
                f"Invalid preplan '{preplan}'. "
                f"Must be one of: {', '.join(sorted(valid_toggles))}"
            )
        if pushdown not in valid_toggles:
            raise ValueError(
                f"Invalid pushdown '{pushdown}'. "
                f"Must be one of: {', '.join(sorted(valid_toggles))}"
            )

        valid_stats_modes = {"none", "summary", "profile"}
        if stats_mode not in valid_stats_modes:
            raise ValueError(
                f"Invalid stats_mode '{stats_mode}'. "
                f"Must be one of: {', '.join(sorted(valid_stats_modes))}"
            )

        self.contract_path = str(contract_path) if contract_path else None
        self.data_path = data_path
        self._input_dataframe = dataframe  # Store user-provided DataFrame
        self._inline_rules = inline_rules  # Store inline rules for merging
        self._inline_built_rules = []  # Populated in _load_contract() if BaseRule instances passed
        self.emit_report = emit_report
        self.stats_mode = stats_mode

        self.preplan = preplan
        self.pushdown = pushdown
        self.enable_projection = bool(enable_projection)
        self.csv_mode = csv_mode
        self.show_plan = show_plan
        self.explain_preplan = explain_preplan

        # State management
        self.state_store = state_store
        self.save_state = save_state
        self._last_state: Optional["ValidationState"] = None

        self.contract: Optional[Contract] = None
        self.df: Optional[pl.DataFrame] = None
        self._handle: Optional[DatasetHandle] = handle  # BYOC: pre-built handle
        self._rules: Optional[List] = None  # Built rules, for sample_failures()
        self._storage_options = storage_options  # Cloud storage credentials

        register_default_materializers()
        register_default_executors()

    # --------------------------------------------------------------------- #

    def run(self) -> Dict[str, Any]:
        timers = RunTimers()
        self._staging_tmpdir = None  # Track for cleanup in finally block

        try:
            result = self._run_impl(timers)

            # Save state if enabled
            if self.save_state:
                self._save_validation_state(result)

            return result
        finally:
            # Cleanup staged temp directory (CSV -> Parquet staging)
            if self._staging_tmpdir is not None:
                try:
                    self._staging_tmpdir.cleanup()
                except Exception as e:
                    log_exception(_logger, "Failed to cleanup staging directory", e)
                self._staging_tmpdir = None

    def _save_validation_state(self, result: Dict[str, Any]) -> None:
        """Save validation state if a store is configured."""
        try:
            from kontra.state.types import ValidationState
            from kontra.state.fingerprint import fingerprint_contract, fingerprint_dataset
            from kontra.state.backends import get_default_store

            # Get or create store
            store = self.state_store
            if store is None and self.save_state:
                store = get_default_store()

            if store is None:
                return

            # Generate fingerprints
            contract_fp = fingerprint_contract(self.contract) if self.contract else "unknown"

            source_ref = self.data_path or (self.contract.datasource if self.contract else "")
            source_uri = _resolve_datasource_uri(source_ref) if source_ref else ""
            dataset_fp = None
            try:
                handle = DatasetHandle.from_uri(source_uri, storage_options=self._storage_options)
                dataset_fp = fingerprint_dataset(handle)
            except Exception as e:
                log_exception(_logger, "Could not fingerprint dataset", e)

            # Derive contract name (from contract, or from path)
            contract_name = "unknown"
            if self.contract:
                contract_name = self.contract.name or Path(self.contract_path).stem

            # Create state from result
            state = ValidationState.from_validation_result(
                result=result,
                contract_fingerprint=contract_fp,
                dataset_fingerprint=dataset_fp,
                contract_name=contract_name,
                dataset_uri=source_uri,
            )

            # Save
            store.save(state)
            self._last_state = state

        except Exception as e:
            # Don't fail validation if state save fails
            if os.getenv("KONTRA_VERBOSE"):
                print(f"Warning: Failed to save validation state: {e}")

    def get_last_state(self) -> Optional["ValidationState"]:
        """Get the state from the last validation run."""
        return self._last_state

    def diff_from_last(self) -> Optional[Dict[str, Any]]:
        """
        Compare current state to previous state.

        Returns a dict with changes, or None if no previous state exists.
        """
        if self._last_state is None:
            return None

        try:
            from kontra.state.backends import get_default_store

            store = self.state_store or get_default_store()
            previous = store.get_previous(
                self._last_state.contract_fingerprint,
                before=self._last_state.run_at,
            )

            if previous is None:
                return None

            # Build simple diff
            return self._build_diff(previous, self._last_state)

        except Exception as e:
            log_exception(_logger, "Failed to compute diff", e)
            return None

    def _build_diff(
        self,
        before: "ValidationState",
        after: "ValidationState",
    ) -> Dict[str, Any]:
        """Build a diff between two validation states."""
        diff: Dict[str, Any] = {
            "before_run_at": before.run_at.isoformat(),
            "after_run_at": after.run_at.isoformat(),
            "summary_changed": before.summary.passed != after.summary.passed,
            "rules_changed": [],
            "new_failures": [],
            "resolved_failures": [],
        }

        # Index before rules by ID
        before_rules = {r.rule_id: r for r in before.rules}
        after_rules = {r.rule_id: r for r in after.rules}

        # Find changes
        for rule_id, after_rule in after_rules.items():
            before_rule = before_rules.get(rule_id)

            if before_rule is None:
                # New rule
                if not after_rule.passed:
                    diff["new_failures"].append({
                        "rule_id": rule_id,
                        "failed_count": after_rule.failed_count,
                    })
            elif before_rule.passed != after_rule.passed:
                # Status changed
                if after_rule.passed:
                    diff["resolved_failures"].append(rule_id)
                else:
                    diff["new_failures"].append({
                        "rule_id": rule_id,
                        "failed_count": after_rule.failed_count,
                        "was_passing": True,
                    })
            elif before_rule.failed_count != after_rule.failed_count:
                # Count changed
                diff["rules_changed"].append({
                    "rule_id": rule_id,
                    "before_count": before_rule.failed_count,
                    "after_count": after_rule.failed_count,
                    "delta": after_rule.failed_count - before_rule.failed_count,
                })

        diff["has_regressions"] = len(diff["new_failures"]) > 0 or any(
            r["delta"] > 0 for r in diff["rules_changed"]
        )

        return diff

    def _run_dataframe_mode(
        self,
        timers: RunTimers,
        rules: List,
        plan: "RuleExecutionPlan",
        compiled_full,
        rule_severity_map: Dict[str, str],
    ) -> Dict[str, Any]:
        """
        Execute validation directly on a user-provided DataFrame.

        This path:
        - Skips preplan (no file metadata)
        - Skips SQL pushdown (data already in memory)
        - Uses Polars-only execution
        """
        t0 = now_ms()

        # Convert pandas to polars if needed
        df = self._input_dataframe
        if not isinstance(df, pl.DataFrame):
            try:
                # Assume it's pandas-like
                df = pl.from_pandas(df)
            except Exception as e:
                raise ValueError(
                    f"Could not convert DataFrame to Polars: {e}. "
                    "Pass a Polars DataFrame or a pandas DataFrame."
                )

        self.df = df
        timers.data_load_ms = now_ms() - t0

        # Execute all rules via Polars
        t0 = now_ms()
        polars_exec = PolarsBackend(executor=plan.execute_compiled)
        exec_result = polars_exec.execute(self.df, compiled_full)
        polars_results = exec_result.get("results", [])
        timers.polars_ms = now_ms() - t0

        # Merge results (all from Polars in this mode)
        all_results: List[Dict[str, Any]] = []
        for res in polars_results:
            res["execution_source"] = "polars"
            res["severity"] = rule_severity_map.get(res["rule_id"], "blocking")
            all_results.append(res)

        # Sort deterministically
        all_results.sort(key=lambda r: r["rule_id"])

        # Summary (use the plan's summary method for consistency)
        summary = plan.summary(all_results)
        summary["dataset_name"] = self.contract.datasource if self.contract else "dataframe"
        summary["total_rows"] = int(self.df.height) if self.df is not None else 0
        engine_label = "polars (dataframe mode)"

        # Report
        if self.emit_report:
            if summary["passed"]:
                report_success(
                    name=summary.get("dataset_name", "dataframe"),
                    results=all_results,
                    summary=summary,
                )
            else:
                report_failure(
                    name=summary.get("dataset_name", "dataframe"),
                    results=all_results,
                    summary=summary,
                )

        result = {
            "summary": summary,
            "results": all_results,
        }

        # Stats
        if self.stats_mode != "none":
            stats: Dict[str, Any] = {
                "run_meta": {
                    "contract_path": self.contract_path,
                    "engine": engine_label,
                    "materializer": "dataframe",
                    "preplan": "off",
                    "pushdown": "off",
                },
                "durations_ms": {
                    "contract_load": timers.contract_load_ms,
                    "compile": timers.compile_ms,
                    "data_load": timers.data_load_ms,
                    "polars": timers.polars_ms,
                    "total": timers.total_ms(),
                },
            }

            if self.stats_mode == "summary":
                stats["dataset"] = basic_summary(self.df)
            elif self.stats_mode == "profile":
                stats["dataset"] = profile_for(self.df, self.df.columns)

            result["stats"] = stats

        return result

    def _load_contract(self) -> Contract:
        """
        Load contract from file and/or merge with inline rules.

        Returns a Contract object with all rules to validate.
        """
        from kontra.config.models import RuleSpec

        # Convert inline rules to RuleSpec objects (or pass through BaseRule instances)
        inline_specs = []
        inline_built_rules = []  # Already-built BaseRule instances
        if self._inline_rules:
            from kontra.rules.base import BaseRule as BaseRuleType
            for rule in self._inline_rules:
                if isinstance(rule, BaseRuleType):
                    # Already a rule instance - use directly
                    inline_built_rules.append(rule)
                elif isinstance(rule, dict):
                    # Dict format - convert to RuleSpec
                    spec = RuleSpec(
                        name=rule.get("name", ""),
                        id=rule.get("id"),
                        params=rule.get("params", {}),
                        severity=rule.get("severity", "blocking"),
                    )
                    inline_specs.append(spec)
                else:
                    raise ValueError(
                        f"Invalid rule type: {type(rule).__name__}. "
                        f"Expected dict or BaseRule instance."
                    )

        # Store built rules to merge with factory-built rules later
        self._inline_built_rules = inline_built_rules

        # Load from file if path provided
        if self.contract_path:
            contract = (
                ContractLoader.from_s3(self.contract_path)
                if _is_s3_uri(self.contract_path)
                else ContractLoader.from_path(self.contract_path)
            )
            # Merge inline rules with contract rules
            if inline_specs:
                contract.rules = list(contract.rules) + inline_specs
            return contract

        # No contract file - create synthetic contract from inline rules
        dataset = self.data_path or "inline_validation"
        return Contract(
            name="inline_contract",
            dataset=dataset,
            rules=inline_specs,
        )

    def _run_impl(self, timers: RunTimers) -> Dict[str, Any]:
        # 1) Contract (load from file and/or inline rules)
        t0 = now_ms()
        self.contract = self._load_contract()
        timers.contract_load_ms = now_ms() - t0

        # 2) Rules & plan
        t0 = now_ms()
        rules = RuleFactory(self.contract.rules).build_rules()
        # Merge with any pre-built rule instances passed directly
        if self._inline_built_rules:
            rules = rules + self._inline_built_rules
        self._rules = rules  # Store for sample_failures()
        plan = RuleExecutionPlan(rules)
        compiled_full = plan.compile()
        timers.compile_ms = now_ms() - t0

        # Build rule_id -> severity mapping for injecting into preplan/SQL results
        rule_severity_map = {r.rule_id: r.severity for r in rules}

        # ------------------------------------------------------------------ #
        # DataFrame mode: If user provided a DataFrame, use Polars-only path
        # ------------------------------------------------------------------ #
        if self._input_dataframe is not None:
            return self._run_dataframe_mode(timers, rules, plan, compiled_full, rule_severity_map)

        # Dataset handle (used across phases)
        # BYOC: if a pre-built handle was provided, use it directly
        if self._handle is not None:
            handle = self._handle
            source_uri = handle.uri
        else:
            source_ref = self.data_path or self.contract.datasource
            source_uri = _resolve_datasource_uri(source_ref)
            handle = DatasetHandle.from_uri(source_uri, storage_options=self._storage_options)

        # ------------------------------------------------------------------ #
        # 3) Preplan (metadata-only; independent of pushdown/projection)
        preplan_effective = False
        handled_ids_meta: Set[str] = set()
        meta_results_by_id: Dict[str, Dict[str, Any]] = {}
        preplan_row_groups: Optional[List[int]] = None
        preplan_columns: Optional[List[str]] = None
        preplan_analyze_ms = 0
        preplan_total_rows: Optional[int] = None  # Track row count from preplan metadata
        preplan_summary: Dict[str, Any] = {
            "enabled": self.preplan in {"on", "auto"},
            "effective": False,
            "rules_pass_meta": 0,
            "rules_fail_meta": 0,
            "rules_unknown": len(compiled_full.required_cols or []),
            "row_groups_kept": None,
            "row_groups_total": None,
            "row_groups_pruned": None,
        }

        # Get filesystem from handle; preplan needs this for S3/remote access.
        preplan_fs: pafs.FileSystem | None = None
        if _is_s3_uri(handle.uri):
            try:
                preplan_fs = _create_s3_filesystem(handle)
            except Exception as e:
                # If S3 libs aren't installed, this will fail.
                # We'll let the ParquetFile call fail below and be caught.
                log_exception(_logger, "Could not create S3 filesystem for preplan", e)

        if self.preplan in {"on", "auto"} and _is_parquet(handle.uri):
            try:
                t0 = now_ms()
                static_preds = extract_static_predicates(rules=rules)
                # PyArrow S3FileSystem expects 'bucket/key' format, not 's3://bucket/key'
                preplan_path = _s3_uri_to_path(handle.uri) if preplan_fs else handle.uri
                pre: PrePlan = preplan_single_parquet(
                    path=preplan_path,
                    required_columns=compiled_full.required_cols,  # DC-driven columns
                    predicates=static_preds,
                    filesystem=preplan_fs,
                )
                preplan_analyze_ms = now_ms() - t0

                # Register metadata-based rule decisions (pass/fail), unknowns remain
                pass_meta = fail_meta = unknown = 0
                for rid, decision in pre.rule_decisions.items():
                    if decision == "pass_meta":
                        meta_results_by_id[rid] = {
                            "rule_id": rid,
                            "passed": True,
                            "failed_count": 0,
                            "message": "Proven by metadata (Parquet stats)",
                            "execution_source": "metadata",
                            "severity": rule_severity_map.get(rid, "blocking"),
                        }
                        handled_ids_meta.add(rid)
                        pass_meta += 1
                    elif decision == "fail_meta":
                        meta_results_by_id[rid] = {
                            "rule_id": rid,
                            "passed": False,
                            "failed_count": 1,
                            "message": "Failed: violation proven by Parquet metadata (null values detected)",
                            "execution_source": "metadata",
                            "severity": rule_severity_map.get(rid, "blocking"),
                        }
                        handled_ids_meta.add(rid)
                        fail_meta += 1
                    else:
                        unknown += 1

                preplan_row_groups = list(pre.manifest_row_groups or [])
                preplan_columns = list(pre.manifest_columns or [])
                preplan_effective = True
                preplan_total_rows = pre.stats.get("total_rows")

                rg_total = pre.stats.get("rg_total", None)
                rg_kept = len(preplan_row_groups)
                preplan_summary.update({
                    "effective": True,
                    "rules_pass_meta": pass_meta,
                    "rules_fail_meta": fail_meta,
                    "rules_unknown": unknown,
                    "row_groups_kept": rg_kept if rg_total is not None else None,
                    "row_groups_total": rg_total,
                    "row_groups_pruned": (rg_total - rg_kept) if (rg_total is not None) else None,
                })

                if self.explain_preplan:
                    print(
                        "\n-- PREPLAN (metadata) --"
                        f"\n Row-groups kept: {preplan_summary.get('row_groups_kept')}/{preplan_summary.get('row_groups_total')}"
                        f"\n Rules: {pass_meta} pass, {fail_meta} fail, {unknown} unknown\n"
                    )

            except Exception as e:
                # Distinguish between "preplan not available" vs "real errors"
                err_str = str(e).lower()
                err_type = type(e).__name__

                # Re-raise errors that indicate real problems (auth, file not found, etc.)
                is_auth_error = (
                    "access denied" in err_str
                    or "forbidden" in err_str
                    or "unauthorized" in err_str
                    or "credentials" in err_str
                    or "authentication" in err_str
                )
                is_not_found = (
                    isinstance(e, FileNotFoundError)
                    or "not found" in err_str
                    or "no such file" in err_str
                    or "does not exist" in err_str
                )
                is_permission = isinstance(e, PermissionError)

                if is_auth_error or is_not_found or is_permission:
                    # These are real errors - don't silently skip
                    raise RuntimeError(
                        f"Preplan failed due to {err_type}: {e}. "
                        "Check file path and credentials."
                    ) from e

                # Otherwise, preplan optimization just isn't available (e.g., no stats)
                if os.getenv("KONTRA_VERBOSE"):
                    print(f"[INFO] Preplan skipped ({err_type}): {e}")
                preplan_effective = False  # leave summary with effective=False

        # PostgreSQL preplan (uses pg_stats metadata)
        elif self.preplan in {"on", "auto"} and handle.scheme in ("postgres", "postgresql"):
            try:
                from kontra.preplan.postgres import preplan_postgres, can_preplan_postgres
                if can_preplan_postgres(handle):
                    t0 = now_ms()
                    static_preds = extract_static_predicates(rules=rules)
                    pre: PrePlan = preplan_postgres(
                        handle=handle,
                        required_columns=compiled_full.required_cols,
                        predicates=static_preds,
                    )
                    preplan_analyze_ms = now_ms() - t0

                    pass_meta = fail_meta = unknown = 0
                    for rid, decision in pre.rule_decisions.items():
                        if decision == "pass_meta":
                            meta_results_by_id[rid] = {
                                "rule_id": rid,
                                "passed": True,
                                "failed_count": 0,
                                "message": "Proven by metadata (pg_stats)",
                                "execution_source": "metadata",
                                "severity": rule_severity_map.get(rid, "blocking"),
                            }
                            handled_ids_meta.add(rid)
                            pass_meta += 1
                        else:
                            unknown += 1

                    preplan_effective = True
                    preplan_summary.update({
                        "effective": True,
                        "rules_pass_meta": pass_meta,
                        "rules_fail_meta": fail_meta,
                        "rules_unknown": unknown,
                    })
            except Exception as e:
                if os.getenv("KONTRA_VERBOSE"):
                    print(f"[INFO] PostgreSQL preplan skipped: {e}")

        # SQL Server preplan (uses sys.columns metadata)
        elif self.preplan in {"on", "auto"} and handle.scheme in ("mssql", "sqlserver"):
            try:
                from kontra.preplan.sqlserver import preplan_sqlserver, can_preplan_sqlserver
                if can_preplan_sqlserver(handle):
                    t0 = now_ms()
                    static_preds = extract_static_predicates(rules=rules)
                    pre: PrePlan = preplan_sqlserver(
                        handle=handle,
                        required_columns=compiled_full.required_cols,
                        predicates=static_preds,
                    )
                    preplan_analyze_ms = now_ms() - t0

                    pass_meta = fail_meta = unknown = 0
                    for rid, decision in pre.rule_decisions.items():
                        if decision == "pass_meta":
                            meta_results_by_id[rid] = {
                                "rule_id": rid,
                                "passed": True,
                                "failed_count": 0,
                                "message": "Proven by metadata (SQL Server constraints)",
                                "execution_source": "metadata",
                                "severity": rule_severity_map.get(rid, "blocking"),
                            }
                            handled_ids_meta.add(rid)
                            pass_meta += 1
                        else:
                            unknown += 1

                    preplan_effective = True
                    preplan_summary.update({
                        "effective": True,
                        "rules_pass_meta": pass_meta,
                        "rules_fail_meta": fail_meta,
                        "rules_unknown": unknown,
                    })
            except Exception as e:
                if os.getenv("KONTRA_VERBOSE"):
                    print(f"[INFO] SQL Server preplan skipped: {e}")

        # ------------------------------------------------------------------ #
        # 4) Materializer setup (orthogonal)
        materializer = pick_materializer(handle)
        materializer_name = getattr(materializer, "name", "duckdb")
        _staged_override_uri: Optional[str] = None

        # ------------------------------------------------------------------ #
        # 5) SQL pushdown (independent of preplan/projection)
        sql_results_by_id: Dict[str, Dict[str, Any]] = {}
        handled_ids_sql: Set[str] = set()
        available_cols: List[str] = []
        sql_row_count: Optional[int] = None
        executor_name = "none"
        pushdown_effective = False
        push_compile_ms = push_execute_ms = push_introspect_ms = 0

        executor = None
        if self.pushdown in {"on", "auto"}:
            # Exclude rules already decided by preplan
            sql_rules_remaining = [s for s in compiled_full.sql_rules if s.get("rule_id") not in handled_ids_meta]
            executor = pick_executor(handle, sql_rules_remaining)

            if executor:
                try:
                    # Compile
                    t0 = now_ms()
                    executor_name = getattr(executor, "name", "sql")
                    sql_plan_str = executor.compile([s for s in compiled_full.sql_rules if s.get("rule_id") not in handled_ids_meta])
                    push_compile_ms = now_ms() - t0
                    if self.show_plan and sql_plan_str:
                        print(f"\n-- {executor_name.upper()} SQL PLAN --\n{sql_plan_str}\n")

                    # Execute
                    t0 = now_ms()
                    duck_out = executor.execute(handle, sql_plan_str, csv_mode=self.csv_mode)
                    push_execute_ms = now_ms() - t0

                    # Inject severity into SQL results
                    sql_results_raw = duck_out.get("results", [])
                    for r in sql_results_raw:
                        r["severity"] = rule_severity_map.get(r.get("rule_id"), "blocking")
                    sql_results_by_id = {r["rule_id"]: r for r in sql_results_raw}
                    handled_ids_sql = set(sql_results_by_id.keys())

                    # Get row count and cols from execute result (avoids separate introspect call)
                    t0 = now_ms()
                    sql_row_count = duck_out.get("row_count")
                    available_cols = duck_out.get("available_cols") or []

                    # Fallback to introspect if execute didn't return these
                    if sql_row_count is None or not available_cols:
                        info = executor.introspect(handle, csv_mode=self.csv_mode)
                        push_introspect_ms = now_ms() - t0
                        sql_row_count = info.get("row_count") if sql_row_count is None else sql_row_count
                        available_cols = info.get("available_cols") or available_cols
                        staging = info.get("staging") or duck_out.get("staging")
                    else:
                        push_introspect_ms = now_ms() - t0
                        staging = duck_out.get("staging")

                    # Reuse staged Parquet (if the executor staged CSV → Parquet)
                    staging = staging or duck_out.get("staging")
                    if staging and staging.get("path"):
                        _staged_override_uri = staging["path"]
                        self._staging_tmpdir = staging.get("tmpdir")
                        handle = DatasetHandle.from_uri(_staged_override_uri)
                        materializer = pick_materializer(handle)
                        materializer_name = getattr(materializer, "name", materializer_name)

                    pushdown_effective = True
                except Exception as e:
                    if os.getenv("KONTRA_VERBOSE") or self.show_plan:
                        print(f"[WARN] SQL pushdown failed ({type(e).__name__}): {e}")
                    executor = None  # fall back silently

        # ------------------------------------------------------------------ #
        # 6) Residual Polars execution (projection independent; manifest optional)
        handled_all = handled_ids_meta | handled_ids_sql
        compiled_residual = plan.without_ids(compiled_full, handled_all)

        # Projection is DC-driven; independent of preplan/pushdown
        required_cols_full = compiled_full.required_cols if self.enable_projection else []
        required_cols_residual = compiled_residual.required_cols if self.enable_projection else []

        if not compiled_residual.predicates and not compiled_residual.fallback_rules:
            self.df = None
            polars_out = {"results": []}
            timers.data_load_ms = timers.execute_ms = 0
        else:
            # Materialize minimal slice:
            # If preplan produced a row-group manifest, honor it — otherwise let the materializer decide.
            t0 = now_ms()
            if preplan_effective and _is_parquet(handle.uri) and preplan_row_groups:
                cols = (required_cols_residual or None) if self.enable_projection else None

                # Reuse preplan filesystem if available, otherwise create from handle
                residual_fs = preplan_fs
                if residual_fs is None and _is_s3_uri(handle.uri):
                    try:
                        residual_fs = _create_s3_filesystem(handle)
                    except Exception as e:
                        # Let ParquetFile try default credentials
                        log_exception(_logger, "Could not create S3 filesystem for residual load", e)

                # PyArrow S3FileSystem expects 'bucket/key' format, not 's3://bucket/key'
                residual_path = _s3_uri_to_path(handle.uri) if residual_fs else handle.uri
                pf = pq.ParquetFile(residual_path, filesystem=residual_fs)

                pa_cols = cols if cols else None
                rg_tables = [pf.read_row_group(i, columns=pa_cols) for i in preplan_row_groups]
                pa_tbl = pa.concat_tables(rg_tables) if len(rg_tables) > 1 else rg_tables[0]
                self.df = pl.from_arrow(pa_tbl)
            else:
                # Materializer respects projection (engine passes residual required cols)
                self.df = materializer.to_polars(required_cols_residual or None)
            timers.data_load_ms = now_ms() - t0

            # Execute residual rules in Polars
            t0 = now_ms()
            polars_exec = PolarsBackend(executor=plan.execute_compiled)
            polars_art = polars_exec.compile(compiled_residual)
            polars_out = polars_exec.execute(self.df, polars_art)
            timers.execute_ms = now_ms() - t0

        # ------------------------------------------------------------------ #
        # 7) Merge results — deterministic order: preplan → SQL → Polars
        results: List[Dict[str, Any]] = list(meta_results_by_id.values())
        results += [r for r in sql_results_by_id.values() if r["rule_id"] not in meta_results_by_id]
        results += [r for r in polars_out["results"] if r["rule_id"] not in meta_results_by_id and r["rule_id"] not in sql_results_by_id]

        # 8) Summary
        summary = plan.summary(results)
        summary["dataset_name"] = self.contract.datasource
        # Row count priority: SQL executor > DataFrame > preplan metadata > 0
        if sql_row_count is not None:
            summary["total_rows"] = int(sql_row_count)
        elif self.df is not None:
            summary["total_rows"] = int(self.df.height)
        elif preplan_total_rows is not None:
            summary["total_rows"] = int(preplan_total_rows)
        else:
            summary["total_rows"] = 0
        engine_label = (
            f"{materializer_name}+polars "
            f"(preplan:{'on' if preplan_effective else 'off'}, "
            f"pushdown:{'on' if pushdown_effective else 'off'}, "
            f"projection:{'on' if self.enable_projection else 'off'})"
        )

        if self.emit_report:
            t0 = now_ms()
            self._report(summary, results)
            timers.report_ms = now_ms() - t0

        # ------------------------------------------------------------------ #
        # 9) Stats (feature-attributed)
        stats: Optional[Dict[str, Any]] = None
        if self.stats_mode != "none":
            if not available_cols:
                available_cols = self._peek_available_columns(handle.uri)

            ds_summary = basic_summary(self.df, available_cols=available_cols, nrows_override=sql_row_count)

            loaded_cols = list(self.df.columns) if self.df is not None else []
            proj = {
                "enabled": self.enable_projection,
                "available_count": len(available_cols or []) if available_cols is not None else len(loaded_cols),
                "full": {
                    "required_columns": required_cols_full or [],
                    "required_count": len(required_cols_full or []),
                },
                "residual": {
                    "required_columns": required_cols_residual or [],
                    "required_count": len(required_cols_residual or []),
                    "loaded_count": len(loaded_cols),
                    "effective": self.enable_projection and bool(required_cols_residual)
                    and len(loaded_cols) <= len(required_cols_residual),
                },
            }

            push = {
                "enabled": self.pushdown in {"on", "auto"},
                "effective": bool(pushdown_effective),
                "executor": executor_name,
                "rules_pushed": len(sql_results_by_id),
                "breakdown_ms": {
                    "compile": push_compile_ms,
                    "execute": push_execute_ms,
                    "introspect": push_introspect_ms,
                },
            }

            res = {
                "rules_local": len(polars_out["results"]) if "polars_out" in locals() else 0,
            }

            phases_ms = {
                "contract_load": int(timers.contract_load_ms or 0),
                "compile": int(timers.compile_ms or 0),
                "preplan": int(preplan_analyze_ms or 0),
                "pushdown": int(push_compile_ms + push_execute_ms + push_introspect_ms),
                "data_load": int(timers.data_load_ms or 0),
                "execute": int(timers.execute_ms or 0),
                "report": int(timers.report_ms or 0),
            }

            stats = {
                "stats_version": "2",
                "run_meta": {
                    "phases_ms": phases_ms,
                    "duration_ms_total": sum(phases_ms.values()),
                    "dataset_path": self.data_path or self.contract.datasource,
                    "contract_path": self.contract_path,
                    "engine": engine_label,
                    "materializer": materializer_name,
                    "preplan_requested": self.preplan,
                    "preplan": "on" if preplan_effective else "off",
                    "pushdown_requested": self.pushdown,
                    "pushdown": "on" if pushdown_effective else "off",
                    "csv_mode": self.csv_mode,
                    "staged_override": bool(_staged_override_uri),
                },
                "dataset": ds_summary,
                "preplan": preplan_summary,
                "pushdown": push,
                "projection": proj,
                "residual": res,
                "columns_touched": columns_touched([{"name": r.name, "params": r.params} for r in self.contract.rules]),
                "columns_validated": columns_touched([{"name": r.name, "params": r.params} for r in self.contract.rules]),
                "columns_loaded": loaded_cols,
            }

            if self.stats_mode == "profile" and self.df is not None:
                stats["profile"] = profile_for(self.df, proj["residual"]["required_columns"])

            if os.getenv("KONTRA_IO_DEBUG"):
                io_dbg = getattr(materializer, "io_debug", None)
                if callable(io_dbg):
                    io = io_dbg()
                    if io:
                        stats["io"] = io

        out: Dict[str, Any] = {
            "dataset": self.contract.datasource,
            "results": results,
            "summary": summary,
        }
        if stats is not None:
            out["stats"] = stats
        out.setdefault("run_meta", {})["engine_label"] = engine_label

        # Ensure staged tempdir (if any) is cleaned after the whole run
        return out

    # --------------------------------------------------------------------- #

    def _report(self, summary: Dict[str, Any], results: List[Dict[str, Any]]) -> None:
        if summary["passed"]:
            # Show warning/info counts if any
            warning_info = ""
            if summary.get("warning_failures", 0) > 0:
                warning_info = f" ({summary['warning_failures']} warnings)"
            elif summary.get("info_failures", 0) > 0:
                warning_info = f" ({summary['info_failures']} info)"

            report_success(
                f"{summary['dataset_name']} — PASSED "
                f"({summary['rules_passed']} of {summary['total_rules']} rules){warning_info}"
            )
        else:
            # Show severity breakdown
            blocking = summary.get("blocking_failures", summary["rules_failed"])
            warning = summary.get("warning_failures", 0)
            info = summary.get("info_failures", 0)

            severity_info = f" ({blocking} blocking"
            if warning > 0:
                severity_info += f", {warning} warnings"
            if info > 0:
                severity_info += f", {info} info"
            severity_info += ")"

            report_failure(
                f"{summary['dataset_name']} — FAILED "
                f"({summary['rules_failed']} of {summary['total_rules']} rules){severity_info}"
            )

        # Show all rule results with execution source
        for r in results:
            source = r.get("execution_source", "polars")
            source_tag = f" [{source}]" if source else ""
            rule_id = r.get("rule_id", "<unknown>")
            passed = r.get("passed", False)
            severity = r.get("severity", "blocking")

            # Severity tag for non-blocking failures
            severity_tag = ""
            if not passed and severity != "blocking":
                severity_tag = f" [{severity}]"

            if passed:
                print(f" ✅ {rule_id}{source_tag}")
            else:
                msg = r.get("message", "Failed")
                failed_count = r.get("failed_count", 0)
                # Include failure count if available
                detail = f": {msg}"
                if failed_count > 0:
                    detail = f": {failed_count:,} failures"

                # Use different icon for warning/info
                icon = "❌" if severity == "blocking" else ("⚠️" if severity == "warning" else "ℹ️")
                print(f" {icon} {rule_id}{source_tag}{severity_tag}{detail}")

            # Show detailed explanation if available
            details = r.get("details")
            if details:
                self._print_failure_details(details)

    def _print_failure_details(self, details: Dict[str, Any]) -> None:
        """Print detailed failure explanation."""
        # Expected values (for allowed_values rule)
        expected = details.get("expected")
        if expected:
            expected_preview = ", ".join(expected[:5])
            if len(expected) > 5:
                expected_preview += f" ... ({len(expected)} total)"
            print(f" Expected: {expected_preview}")

        # Unexpected values (for allowed_values rule)
        unexpected = details.get("unexpected_values")
        if unexpected:
            print(" Unexpected values:")
            for uv in unexpected[:5]:
                val = uv.get("value", "?")
                count = uv.get("count", 0)
                print(f" - \"{val}\" ({count:,} rows)")
            if len(unexpected) > 5:
                print(f" ... and {len(unexpected) - 5} more")

        # Suggestion
        suggestion = details.get("suggestion")
        if suggestion:
            print(f" Suggestion: {suggestion}")

    # --------------------------------------------------------------------- #

    def _peek_available_columns(self, source: str) -> List[str]:
        """Cheap schema peek; used only for observability."""
        try:
            s = source.lower()
            # We can't easily peek S3 without a filesystem object,
            # so we'll just handle local files for now.
            if _is_s3_uri(s):
                return []
            if s.endswith(".parquet"):
                return list(pl.scan_parquet(source).collect_schema().names())
            if s.endswith(".csv"):
                return list(pl.scan_csv(source).collect_schema().names())
        except Exception as e:
            log_exception(_logger, f"Could not peek columns from {source}", e)
        return []