kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/preplan/planner.py
ADDED
@@ -0,0 +1,253 @@
from __future__ import annotations

from dataclasses import dataclass
from datetime import date, datetime
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pyarrow.fs as pafs  # <-- Added
import pyarrow.parquet as pq

from .types import PrePlan, Decision

# NOTE: The preplan consumes simple, metadata-usable predicates only.
# Shape: (rule_id, column, op, value)
# op ∈ {"==","!=",">=",">","<=","<","^=","not_null"}
# "^=" means "string prefix"
Predicate = Tuple[str, str, str, Any]  # (rule_id, column, op, value)


# ---------- small helpers ----------

def _iso(v: Any) -> Any:
    if isinstance(v, (date, datetime)):
        return v.isoformat()
    return v


def _schema_names(md_schema) -> List[str]:
    # Compatible with various pyarrow versions
    try:
        return list(md_schema.names)
    except Exception:
        try:
            return [f.name for f in md_schema.to_arrow_schema()]
        except Exception:
            return []


def _rg_col_stats(rg, j) -> Optional[Dict[str, Any]]:
    """Return a safe dict of min/max/null_count for a row-group column j."""
    col = rg.column(j)
    stats = col.statistics
    if stats is None:
        return None
    out: Dict[str, Any] = {
        "min": _iso(getattr(stats, "min", None)) if getattr(stats, "has_min_max", True) else None,
        "max": _iso(getattr(stats, "max", None)) if getattr(stats, "has_min_max", True) else None,
    }
    if getattr(stats, "has_null_count", True):
        out["null_count"] = getattr(stats, "null_count", None)
    return out


def _name_for_rg_col(rg, j, fallback: str) -> str:
    try:
        # path_in_schema handles nested names properly
        return str(rg.column(j).path_in_schema)
    except Exception:
        return fallback


# ---------- metadata reasoning (per predicate, per row group) ----------

def _verdict_overlaps(op: str, val: Any, stats: Optional[Dict[str, Any]]) -> Optional[bool]:
    """
    Return:
      - True  -> group MAY satisfy the predicate (cannot be ruled out by min/max)
      - False -> group CANNOT satisfy predicate (disjoint by min/max)
      - None  -> unknown (no stats)
    """
    if not stats or (stats.get("min") is None and stats.get("max") is None):
        return None
    mn, mx = stats.get("min"), stats.get("max")

    # Normalize type for string columns
    if isinstance(mn, str) and not isinstance(val, str):
        val = str(val)

    if op == "==":
        if mn is not None and mx is not None and (val < mn or val > mx):
            return False
        return True
    if op == "!=":
        return True  # min/max alone cannot rule "!=" out
    if op == ">=":
        return False if (mx is not None and mx < val) else True
    if op == "<=":
        return False if (mn is not None and mn > val) else True
    if op == ">":
        return False if (mx is not None and mx <= val) else True
    if op == "<":
        return False if (mn is not None and mn >= val) else True
    if op == "^=":  # string prefix: keep if ranges overlap the prefix window
        if not isinstance(mn, str) or not isinstance(mx, str):
            return None
        upper = str(val) + "\uffff"
        return not (upper < mn or str(val) > mx)
    if op == "not_null":
        # Overlap sense isn't meaningful; we handle not_null via _decide_fail/_decide_pass
        return None
    return None


def _decide_pass(op: str, val: Any, rg_stats_iter: Iterable[Optional[Dict[str, Any]]]) -> bool:
    """
    Can we *prove* that EVERY row in the file satisfies the predicate using only RG stats?
    (dataset-level "PASS" for that rule)
    """
    # For >= c: if for all rgs min >= c → pass
    # For <= c: if for all rgs max <= c → pass
    # For == c: if for all rgs (min==max==c) → pass
    # For not_null: if for all rgs null_count == 0 → pass
    ok_all = True
    for s in rg_stats_iter:
        if s is None:
            return False
        mn, mx = s.get("min"), s.get("max")
        if op == ">=":
            if mn is None or mn < val:
                ok_all = False; break
        elif op == "<=":
            if mx is None or mx > val:
                ok_all = False; break
        elif op == "==":
            if mn is None or mx is None or not (mn == val and mx == val):
                ok_all = False; break
        elif op == "not_null":
            # Can only prove PASS if null_count is exactly 0 for all row groups
            # null_count > 0 means violations exist; None means unknown (can't prove)
            if s.get("null_count") != 0:
                ok_all = False; break
        else:
            # For >, <, !=, ^= we don't try to prove dataset-level PASS via min/max
            ok_all = False; break
    return ok_all


def _decide_fail(op: str, val: Any, rg_stats_iter: Iterable[Optional[Dict[str, Any]]]) -> bool:
    """
    Can we *prove* that AT LEAST ONE row violates the predicate using RG stats?
    (dataset-level "FAIL" for that rule)
    """
    for s in rg_stats_iter:
        if s is None:
            continue
        mn, mx = s.get("min"), s.get("max")
        if op == ">=":
            # If an RG has mx < val ⇒ all rows in that RG violate ⇒ dataset FAIL
            if mx is not None and mx < val:
                return True
        elif op == "<=":
            if mn is not None and mn > val:
                return True
        elif op == "==":
            # If an RG has range entirely not equal to val ⇒ all rows in that RG violate
            if mn is not None and mx is not None and (mx < val or mn > val or (mn == mx and mn != val)):
                return True
        elif op == "not_null":
            # Any rg with null_count > 0 proves at least one violation
            nulls = s.get("null_count")
            if isinstance(nulls, int) and nulls > 0:
                return True
    # For >, <, !=, ^= we typically cannot prove dataset-level FAIL with min/max alone.
    return False


# ---------- public API ----------

def preplan_single_parquet(
    path: str,
    required_columns: List[str],
    predicates: List[Predicate],
    filesystem: pafs.FileSystem | None = None,  # <-- Updated
) -> PrePlan:
    """
    Metadata-only pre-planner for a SINGLE Parquet file.

    Inputs:
      - path: Parquet file path/URI
      - required_columns: union of columns needed for *all* rules (from your CompiledPlan)
      - predicates: metadata-usable predicates -> List[(rule_id, column, op, value)]
      - filesystem: PyArrow filesystem object (e.g., for S3)

    Outputs (PrePlan):
      - manifest_row_groups: RG indices that STILL MATTER for remaining rules
      - manifest_columns: columns still needed (you can pass through required_columns)
      - rule_decisions: rule_id -> "pass_meta" | "fail_meta" | "unknown"
      - stats: {"rg_total": N, "rg_kept": K}
    """
    pf = pq.ParquetFile(path, filesystem=filesystem)  # <-- Updated
    md = pf.metadata
    schema_names = _schema_names(md.schema)

    # Pre-extract per-RG per-column stats into a simple map:
    # rg_stats[i][col_name] -> {"min":..., "max":..., "null_count":...}
    rg_stats: List[Dict[str, Dict[str, Any]]] = []
    for i in range(md.num_row_groups):
        rg = md.row_group(i)
        per_col: Dict[str, Dict[str, Any]] = {}
        for j in range(rg.num_columns):
            name = _name_for_rg_col(rg, j, schema_names[j] if j < len(schema_names) else f"col_{j}")
            s = _rg_col_stats(rg, j)
            if s is not None:
                per_col[name] = s
        rg_stats.append(per_col)

    # Decide each rule at dataset-level (PASS/FAIL/UNKNOWN by metadata)
    rule_decisions: Dict[str, Decision] = {}
    for rule_id, col, op, val in predicates:
        stats_iter = (rgc.get(col) for rgc in rg_stats)
        if _decide_fail(op, val, stats_iter):
            rule_decisions[rule_id] = "fail_meta"
            continue
        # need a fresh iterator
        stats_iter = (rgc.get(col) for rgc in rg_stats)
        if _decide_pass(op, val, stats_iter):
            rule_decisions[rule_id] = "pass_meta"
        else:
            rule_decisions[rule_id] = "unknown"

    # Determine which RGs we still need to scan (conservative):
    # - If no predicates at all -> keep ALL RGs.
    # - Else keep any RG that *might* be relevant for at least one UNKNOWN rule.
    keep_rg: List[int] = list(range(md.num_row_groups))
    unknown_preds = [(rid, col, op, val) for (rid, col, op, val) in predicates if rule_decisions.get(rid) == "unknown"]

    if unknown_preds:
        keep_rg = []
        for i, per_col in enumerate(rg_stats):
            # Keep if ANY unknown predicate "may overlap"
            keep = False
            for _, col, op, val in unknown_preds:
                verdict = _verdict_overlaps(op, val, per_col.get(col))
                # Verdict True -> overlaps; Verdict None -> unknown -> keep to be safe
                if verdict is True or verdict is None:
                    keep = True
                    break
            if keep:
                keep_rg.append(i)
        if not keep_rg:
            # Safety: if overlap logic ended up too strict, default to ALL
            keep_rg = list(range(md.num_row_groups))

    preplan = PrePlan(
        manifest_columns=list(required_columns) if required_columns else [],
        manifest_row_groups=keep_rg,
        rule_decisions=rule_decisions,
        stats={
            "rg_total": md.num_row_groups,
            "rg_kept": len(keep_rg),
            "total_rows": md.num_rows,
        },
    )
    return preplan
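
The docstring above spells out the inputs and outputs of the pre-planner; as an illustration, a minimal usage sketch exercising it with nothing but pyarrow (the file name, column names, and rule ids below are hypothetical, not part of the package):

    import pyarrow as pa
    import pyarrow.parquet as pq
    from kontra.preplan.planner import preplan_single_parquet

    # Tiny single-row-group file; pyarrow writes min/max/null_count statistics by default.
    pq.write_table(pa.table({"id": [1, 2, 3], "amount": [10.0, 25.5, 40.0]}), "orders.parquet")

    plan = preplan_single_parquet(
        path="orders.parquet",
        required_columns=["id", "amount"],
        predicates=[
            ("amount_non_negative", "amount", ">=", 0.0),  # provable from row-group min
            ("id_not_null", "id", "not_null", None),       # provable from null_count == 0
        ],
    )
    print(plan.rule_decisions)  # expected: both rules decided "pass_meta"
    print(plan.stats)           # e.g. {"rg_total": 1, "rg_kept": 1, "total_rows": 3}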

kontra/preplan/postgres.py
ADDED
@@ -0,0 +1,179 @@
# src/kontra/preplan/postgres.py
"""
PostgreSQL preplan - use pg_stats for metadata-only rule decisions.

Similar to Parquet metadata preplan, but uses PostgreSQL's statistics catalog.
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional, Tuple

from kontra.connectors.handle import DatasetHandle
from kontra.connectors.postgres import PostgresConnectionParams, get_connection

from .types import PrePlan, Decision


# Predicate format: (rule_id, column, op, value)
Predicate = Tuple[str, str, str, Any]


def fetch_pg_stats_for_preplan(
    params: PostgresConnectionParams,
) -> Dict[str, Dict[str, Any]]:
    """
    Fetch PostgreSQL statistics for preplan decisions.

    Returns:
        {
            "__table__": {"row_estimate": int, "page_count": int},
            "column_name": {
                "null_frac": float,        # 0.0-1.0 fraction of nulls
                "n_distinct": float,       # -1 = unique, >0 = count, <0 = fraction
                "most_common_vals": str,   # Array literal
            },
            ...
        }
    """
    with get_connection(params) as conn:
        with conn.cursor() as cur:
            # Table-level stats from pg_class
            cur.execute(
                """
                SELECT reltuples::bigint AS row_estimate,
                       relpages AS page_count
                FROM pg_class
                WHERE relname = %s
                  AND relnamespace = %s::regnamespace
                """,
                (params.table, params.schema),
            )
            row = cur.fetchone()
            table_stats = {
                "row_estimate": row[0] if row else 0,
                "page_count": row[1] if row else 0,
            }

            # Column-level stats from pg_stats
            cur.execute(
                """
                SELECT attname AS column_name,
                       null_frac,
                       n_distinct,
                       most_common_vals::text
                FROM pg_stats
                WHERE schemaname = %s AND tablename = %s
                """,
                (params.schema, params.table),
            )

            result: Dict[str, Dict[str, Any]] = {"__table__": table_stats}
            for col_row in cur.fetchall():
                col_name, null_frac, n_distinct, mcv = col_row
                result[col_name] = {
                    "null_frac": null_frac,
                    "n_distinct": n_distinct,
                    "most_common_vals": mcv,
                }

            return result


def preplan_postgres(
    handle: DatasetHandle,
    required_columns: List[str],
    predicates: List[Predicate],
) -> PrePlan:
    """
    Metadata-only pre-planner for PostgreSQL tables using pg_stats.

    Supports decisions for:
    - not_null: if null_frac == 0 -> pass_meta
    - unique: if n_distinct == -1 -> pass_meta (PostgreSQL's way of saying "all unique")

    Args:
        handle: DatasetHandle with db_params
        required_columns: Columns needed for validation
        predicates: List of (rule_id, column, op, value) tuples

    Returns:
        PrePlan with rule decisions based on pg_stats
    """
    if not handle.db_params:
        raise ValueError("PostgreSQL handle missing db_params")

    params: PostgresConnectionParams = handle.db_params
    pg_stats = fetch_pg_stats_for_preplan(params)

    table_stats = pg_stats.get("__table__", {})
    row_estimate = table_stats.get("row_estimate", 0)

    rule_decisions: Dict[str, Decision] = {}

    for rule_id, column, op, value in predicates:
        col_stats = pg_stats.get(column)

        if col_stats is None:
            # No stats for this column (ANALYZE not run or column doesn't exist)
            rule_decisions[rule_id] = "unknown"
            continue

        null_frac = col_stats.get("null_frac")
        n_distinct = col_stats.get("n_distinct")

        if op == "not_null":
            # If null_frac is exactly 0, the column has no nulls
            if null_frac is not None and null_frac == 0:
                rule_decisions[rule_id] = "pass_meta"
            elif null_frac is not None and null_frac > 0:
                # We know there ARE nulls, but pg_stats doesn't give exact count
                # So we can't prove pass, but we know the rule will likely fail
                # Be conservative: mark as unknown (actual execution will determine)
                rule_decisions[rule_id] = "unknown"
            else:
                rule_decisions[rule_id] = "unknown"

        elif op == "unique":
            # n_distinct == -1 means PostgreSQL detected all values are unique
            # n_distinct < 0 (other than -1) means n_distinct is a fraction of rows
            if n_distinct is not None:
                if n_distinct == -1:
                    # All values are unique AND no nulls (unique constraint behavior)
                    if null_frac == 0:
                        rule_decisions[rule_id] = "pass_meta"
                    else:
                        # Unique values but has nulls - need to check if nulls cause dups
                        rule_decisions[rule_id] = "unknown"
                elif n_distinct < 0:
                    # Fraction - close to -1 means high uniqueness but not guaranteed
                    rule_decisions[rule_id] = "unknown"
                else:
                    # n_distinct > 0: exact count or estimate
                    # If n_distinct equals row_estimate, likely unique
                    if row_estimate > 0 and n_distinct >= row_estimate:
                        rule_decisions[rule_id] = "pass_meta"
                    else:
                        rule_decisions[rule_id] = "unknown"
            else:
                rule_decisions[rule_id] = "unknown"

        else:
            # Other ops (>=, <=, ==, etc.) - pg_stats doesn't have min/max for general use
            # (histogram_bounds could be used but it's complex)
            rule_decisions[rule_id] = "unknown"

    return PrePlan(
        manifest_columns=list(required_columns) if required_columns else [],
        manifest_row_groups=[],  # Not applicable for PostgreSQL
        rule_decisions=rule_decisions,
        stats={
            "row_estimate": row_estimate,
            "columns_with_stats": len([k for k in pg_stats if k != "__table__"]),
        },
    )


def can_preplan_postgres(handle: DatasetHandle) -> bool:
    """Check if PostgreSQL preplan is applicable for this handle."""
    return handle.scheme in ("postgres", "postgresql") and handle.db_params is not None
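
The docstring above lists the decisions this pre-planner supports; a minimal sketch of the predicate shape preplan_postgres consumes and the outcomes it can reach from pg_stats (the rule ids and column names are hypothetical):

    # (rule_id, column, op, value) tuples, as defined by the Predicate alias above.
    predicates = [
        ("id_not_null", "id", "not_null", None),  # pass_meta when pg_stats null_frac == 0
        ("id_unique",   "id", "unique",   None),  # pass_meta when n_distinct == -1 and null_frac == 0
        ("total_min",   "total", ">=",    0),     # stays "unknown": min/max are not read from pg_stats here
    ]
    # All decisions depend on ANALYZE having populated pg_stats; with no stats,
    # every rule is reported as "unknown" and falls through to actual execution.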

kontra/preplan/sqlserver.py
ADDED
@@ -0,0 +1,191 @@
# src/kontra/preplan/sqlserver.py
"""
SQL Server preplan - use metadata for rule decisions.

Uses SQL Server system views:
- sys.dm_db_partition_stats for row count estimates
- sys.columns for nullability constraints
- sys.indexes + sys.index_columns for uniqueness constraints

Note: Unlike PostgreSQL's pg_stats, SQL Server doesn't expose null_frac directly.
We use constraint metadata instead (NOT NULL, UNIQUE indexes).
"""

from __future__ import annotations

from typing import Any, Dict, List, Tuple

from kontra.connectors.handle import DatasetHandle
from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection

from .types import PrePlan, Decision


# Predicate format: (rule_id, column, op, value)
Predicate = Tuple[str, str, str, Any]


def fetch_sqlserver_metadata(
    params: SqlServerConnectionParams,
) -> Dict[str, Dict[str, Any]]:
    """
    Fetch SQL Server metadata for preplan decisions.

    Returns:
        {
            "__table__": {"row_estimate": int, "page_count": int},
            "column_name": {
                "is_nullable": bool,            # From column definition
                "is_identity": bool,            # Identity column
                "has_unique_constraint": bool,  # Unique index or constraint
            },
            ...
        }
    """
    with get_connection(params) as conn:
        cursor = conn.cursor()

        # Table-level stats from sys.dm_db_partition_stats
        cursor.execute(
            """
            SELECT SUM(row_count) AS row_estimate,
                   SUM(used_page_count) AS page_count
            FROM sys.dm_db_partition_stats ps
            JOIN sys.objects o ON ps.object_id = o.object_id
            JOIN sys.schemas s ON o.schema_id = s.schema_id
            WHERE s.name = %s AND o.name = %s AND ps.index_id IN (0, 1)
            """,
            (params.schema, params.table),
        )
        row = cursor.fetchone()
        table_stats = {
            "row_estimate": int(row[0]) if row and row[0] else 0,
            "page_count": int(row[1]) if row and row[1] else 0,
        }

        # Column-level metadata
        cursor.execute(
            """
            SELECT
                c.name AS column_name,
                c.is_nullable,
                c.is_identity
            FROM sys.columns c
            JOIN sys.objects o ON c.object_id = o.object_id
            JOIN sys.schemas s ON o.schema_id = s.schema_id
            WHERE s.name = %s AND o.name = %s
            """,
            (params.schema, params.table),
        )

        result: Dict[str, Dict[str, Any]] = {"__table__": table_stats}
        for col_row in cursor.fetchall():
            col_name, is_nullable, is_identity = col_row
            result[col_name] = {
                "is_nullable": bool(is_nullable),
                "is_identity": bool(is_identity),
                "has_unique_constraint": False,  # Will be updated below
            }

        # Check for unique constraints/indexes
        cursor.execute(
            """
            SELECT c.name AS column_name
            FROM sys.indexes i
            JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
            JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
            JOIN sys.objects o ON i.object_id = o.object_id
            JOIN sys.schemas s ON o.schema_id = s.schema_id
            WHERE s.name = %s AND o.name = %s
              AND (i.is_unique = 1 OR i.is_primary_key = 1)
              AND ic.is_included_column = 0
            GROUP BY c.name
            HAVING COUNT(*) = 1  -- Single-column unique constraint
            """,
            (params.schema, params.table),
        )

        for row in cursor.fetchall():
            col_name = row[0]
            if col_name in result:
                result[col_name]["has_unique_constraint"] = True

        return result


def preplan_sqlserver(
    handle: DatasetHandle,
    required_columns: List[str],
    predicates: List[Predicate],
) -> PrePlan:
    """
    Metadata-only pre-planner for SQL Server tables.

    Supports decisions for:
    - not_null: if column is NOT NULL (is_nullable = 0) -> pass_meta
    - unique: if column has unique index/constraint -> pass_meta

    Args:
        handle: DatasetHandle with db_params
        required_columns: Columns needed for validation
        predicates: List of (rule_id, column, op, value) tuples

    Returns:
        PrePlan with rule decisions based on SQL Server metadata
    """
    if not handle.db_params:
        raise ValueError("SQL Server handle missing db_params")

    params: SqlServerConnectionParams = handle.db_params
    metadata = fetch_sqlserver_metadata(params)

    table_stats = metadata.get("__table__", {})
    row_estimate = table_stats.get("row_estimate", 0)

    rule_decisions: Dict[str, Decision] = {}

    for rule_id, column, op, value in predicates:
        col_meta = metadata.get(column)

        if col_meta is None:
            # Column not found in metadata
            rule_decisions[rule_id] = "unknown"
            continue

        is_nullable = col_meta.get("is_nullable", True)
        is_identity = col_meta.get("is_identity", False)
        has_unique = col_meta.get("has_unique_constraint", False)

        if op == "not_null":
            # If column is defined as NOT NULL, it definitely has no nulls
            if not is_nullable:
                rule_decisions[rule_id] = "pass_meta"
            else:
                # Column allows nulls - may or may not have any
                rule_decisions[rule_id] = "unknown"

        elif op == "unique":
            # If column has unique constraint or is identity, it's unique
            if has_unique or is_identity:
                rule_decisions[rule_id] = "pass_meta"
            else:
                rule_decisions[rule_id] = "unknown"

        else:
            # Other ops - would need actual data statistics
            rule_decisions[rule_id] = "unknown"

    return PrePlan(
        manifest_columns=list(required_columns) if required_columns else [],
        manifest_row_groups=[],  # Not applicable for SQL Server
        rule_decisions=rule_decisions,
        stats={
            "row_estimate": row_estimate,
            "columns_with_metadata": len([k for k in metadata if k != "__table__"]),
        },
    )


def can_preplan_sqlserver(handle: DatasetHandle) -> bool:
    """Check if SQL Server preplan is applicable for this handle."""
    return handle.scheme in ("mssql", "sqlserver") and handle.db_params is not None
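
As an illustration of the constraint-based decisions described above, a sketch of the metadata dict shape documented in fetch_sqlserver_metadata and the decisions preplan_sqlserver would derive from it (the table, columns, and values are hypothetical):

    metadata = {
        "__table__": {"row_estimate": 1_000_000, "page_count": 12345},
        "id":    {"is_nullable": False, "is_identity": True,  "has_unique_constraint": True},
        "email": {"is_nullable": True,  "is_identity": False, "has_unique_constraint": False},
    }
    # For predicates [("id_not_null", "id", "not_null", None), ("email_unique", "email", "unique", None)]
    # the decisions would be {"id_not_null": "pass_meta", "email_unique": "unknown"}:
    # the NOT NULL definition proves the first, while uniqueness of "email" needs actual data.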

kontra/preplan/types.py
ADDED
@@ -0,0 +1,24 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Literal

# A rule can be proven by metadata to pass, proven to fail, or remain unknown.
Decision = Literal["pass_meta", "fail_meta", "unknown"]


@dataclass
class PrePlan:
    """
    Result of the metadata-only pre-planning stage.

    - manifest_columns: union of columns still needed for SQL/Polars after metadata decisions.
    - manifest_row_groups: Parquet row-group indices that *may* affect remaining rules.
      (Single-file MVP; can evolve to a file list later.)
    - rule_decisions: rule_id -> Decision ("pass_meta" | "fail_meta" | "unknown").
    - stats: small numbers for observability (e.g., {"rg_total": 19, "rg_kept": 7}).
    """
    manifest_columns: List[str]
    manifest_row_groups: List[int]
    rule_decisions: Dict[str, Decision]
    stats: Dict[str, int]
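
For orientation, a sketch of a populated PrePlan as the Parquet pre-planner might return it (the values are illustrative only):

    from kontra.preplan.types import PrePlan

    plan = PrePlan(
        manifest_columns=["id", "amount"],
        manifest_row_groups=[0, 3, 7],  # only the row groups that may still matter
        rule_decisions={"id_not_null": "pass_meta", "amount_min": "unknown"},
        stats={"rg_total": 19, "rg_kept": 3},
    )
    # Downstream execution only scans plan.manifest_row_groups for rules left "unknown";
    # "pass_meta" and "fail_meta" rules are already settled without touching data.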

kontra/probes/__init__.py
ADDED
@@ -0,0 +1,20 @@
# src/kontra/probes/__init__.py
"""
Transformation probes for Kontra.

Probes measure the structural effects of data transformations without
assigning meaning or judgment. They provide deterministic, structured,
token-efficient measurements for agents to reason about.

Available probes:
- compare: Measure differences between before/after transformation
- profile_relationship: Measure JOIN viability between datasets
"""

from kontra.probes.compare import compare
from kontra.probes.relationship import profile_relationship

__all__ = [
    "compare",
    "profile_relationship",
]