kontra-0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/preplan/planner.py
@@ -0,0 +1,253 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from datetime import date, datetime
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ import pyarrow.fs as pafs # <-- Added
+ import pyarrow.parquet as pq
+
+ from .types import PrePlan, Decision
+
+ # NOTE: The preplan consumes simple, metadata-usable predicates only.
+ # Shape: (rule_id, column, op, value)
+ # op ∈ {"==","!=",">=",">","<=","<","^=","not_null"}
+ # "^=" means "string prefix"
+ Predicate = Tuple[str, str, str, Any] # (rule_id, column, op, value)
+
+
+ # ---------- small helpers ----------
+
+ def _iso(v: Any) -> Any:
+     if isinstance(v, (date, datetime)):
+         return v.isoformat()
+     return v
+
+
+ def _schema_names(md_schema) -> List[str]:
+     # Compatible with various pyarrow versions
+     try:
+         return list(md_schema.names)
+     except Exception:
+         try:
+             return [f.name for f in md_schema.to_arrow_schema()]
+         except Exception:
+             return []
+
+
+ def _rg_col_stats(rg, j) -> Optional[Dict[str, Any]]:
+     """Return a safe dict of min/max/null_count for a row-group column j."""
+     col = rg.column(j)
+     stats = col.statistics
+     if stats is None:
+         return None
+     out: Dict[str, Any] = {
+         "min": _iso(getattr(stats, "min", None)) if getattr(stats, "has_min_max", True) else None,
+         "max": _iso(getattr(stats, "max", None)) if getattr(stats, "has_min_max", True) else None,
+     }
+     if getattr(stats, "has_null_count", True):
+         out["null_count"] = getattr(stats, "null_count", None)
+     return out
+
+
+ def _name_for_rg_col(rg, j, fallback: str) -> str:
+     try:
+         # path_in_schema handles nested names properly
+         return str(rg.column(j).path_in_schema)
+     except Exception:
+         return fallback
+
+
+ # ---------- metadata reasoning (per predicate, per row group) ----------
+
+ def _verdict_overlaps(op: str, val: Any, stats: Optional[Dict[str, Any]]) -> Optional[bool]:
+     """
+     Return:
+       - True  -> group MAY satisfy the predicate (cannot be ruled out by min/max)
+       - False -> group CANNOT satisfy predicate (disjoint by min/max)
+       - None  -> unknown (no stats)
+     """
+     if not stats or (stats.get("min") is None and stats.get("max") is None):
+         return None
+     mn, mx = stats.get("min"), stats.get("max")
+
+     # Normalize type for string columns
+     if isinstance(mn, str) and not isinstance(val, str):
+         val = str(val)
+
+     if op == "==":
+         if mn is not None and mx is not None and (val < mn or val > mx):
+             return False
+         return True
+     if op == "!=":
+         return True # min/max alone cannot rule "!=" out
+     if op == ">=":
+         return False if (mx is not None and mx < val) else True
+     if op == "<=":
+         return False if (mn is not None and mn > val) else True
+     if op == ">":
+         return False if (mx is not None and mx <= val) else True
+     if op == "<":
+         return False if (mn is not None and mn >= val) else True
+     if op == "^=": # string prefix: keep if ranges overlap the prefix window
+         if not isinstance(mn, str) or not isinstance(mx, str):
+             return None
+         upper = str(val) + "\uffff"
+         return not (upper < mn or str(val) > mx)
+     if op == "not_null":
+         # Overlap sense isn't meaningful; we handle not_null via _decide_fail/_decide_pass
+         return None
+     return None
+
+
+ def _decide_pass(op: str, val: Any, rg_stats_iter: Iterable[Optional[Dict[str, Any]]]) -> bool:
+     """
+     Can we *prove* that EVERY row in the file satisfies the predicate using only RG stats?
+     (dataset-level "PASS" for that rule)
+     """
+     # For >= c: if for all rgs min >= c → pass
+     # For <= c: if for all rgs max <= c → pass
+     # For == c: if for all rgs (min==max==c) → pass
+     # For not_null: if for all rgs null_count == 0 → pass
+     ok_all = True
+     for s in rg_stats_iter:
+         if s is None:
+             return False
+         mn, mx = s.get("min"), s.get("max")
+         if op == ">=":
+             if mn is None or mn < val:
+                 ok_all = False; break
+         elif op == "<=":
+             if mx is None or mx > val:
+                 ok_all = False; break
+         elif op == "==":
+             if mn is None or mx is None or not (mn == val and mx == val):
+                 ok_all = False; break
+         elif op == "not_null":
+             # Can only prove PASS if null_count is exactly 0 for all row groups
+             # null_count > 0 means violations exist; None means unknown (can't prove)
+             if s.get("null_count") != 0:
+                 ok_all = False; break
+         else:
+             # For >, <, !=, ^= we don't try to prove dataset-level PASS via min/max
+             ok_all = False; break
+     return ok_all
+
+
+ def _decide_fail(op: str, val: Any, rg_stats_iter: Iterable[Optional[Dict[str, Any]]]) -> bool:
+     """
+     Can we *prove* that AT LEAST ONE row violates the predicate using RG stats?
+     (dataset-level "FAIL" for that rule)
+     """
+     for s in rg_stats_iter:
+         if s is None:
+             continue
+         mn, mx = s.get("min"), s.get("max")
+         if op == ">=":
+             # If an RG has mx < val ⇒ all rows in that RG violate ⇒ dataset FAIL
+             if mx is not None and mx < val:
+                 return True
+         elif op == "<=":
+             if mn is not None and mn > val:
+                 return True
+         elif op == "==":
+             # If an RG has range entirely not equal to val ⇒ all rows in that RG violate
+             if mn is not None and mx is not None and (mx < val or mn > val or (mn == mx and mn != val)):
+                 return True
+         elif op == "not_null":
+             # Any rg with null_count > 0 proves at least one violation
+             nulls = s.get("null_count")
+             if isinstance(nulls, int) and nulls > 0:
+                 return True
+     # For >, <, !=, ^= we typically cannot prove dataset-level FAIL with min/max alone.
+     return False
+
+
+ # ---------- public API ----------
+
+ def preplan_single_parquet(
+     path: str,
+     required_columns: List[str],
+     predicates: List[Predicate],
+     filesystem: pafs.FileSystem | None = None, # <-- Updated
+ ) -> PrePlan:
+     """
+     Metadata-only pre-planner for a SINGLE Parquet file.
+
+     Inputs:
+       - path: Parquet file path/URI
+       - required_columns: union of columns needed for *all* rules (from your CompiledPlan)
+       - predicates: metadata-usable predicates -> List[(rule_id, column, op, value)]
+       - filesystem: PyArrow filesystem object (e.g., for S3)
+
+     Outputs (PrePlan):
+       - manifest_row_groups: RG indices that STILL MATTER for remaining rules
+       - manifest_columns: columns still needed (you can pass through required_columns)
+       - rule_decisions: rule_id -> "pass_meta" | "fail_meta" | "unknown"
+       - stats: {"rg_total": N, "rg_kept": K}
+     """
+     pf = pq.ParquetFile(path, filesystem=filesystem) # <-- Updated
+     md = pf.metadata
+     schema_names = _schema_names(md.schema)
+
+     # Pre-extract per-RG per-column stats into a simple map:
+     # rg_stats[i][col_name] -> {"min":..., "max":..., "null_count":...}
+     rg_stats: List[Dict[str, Dict[str, Any]]] = []
+     for i in range(md.num_row_groups):
+         rg = md.row_group(i)
+         per_col: Dict[str, Dict[str, Any]] = {}
+         for j in range(rg.num_columns):
+             name = _name_for_rg_col(rg, j, schema_names[j] if j < len(schema_names) else f"col_{j}")
+             s = _rg_col_stats(rg, j)
+             if s is not None:
+                 per_col[name] = s
+         rg_stats.append(per_col)
+
+     # Decide each rule at dataset-level (PASS/FAIL/UNKNOWN by metadata)
+     rule_decisions: Dict[str, Decision] = {}
+     for rule_id, col, op, val in predicates:
+         stats_iter = (rgc.get(col) for rgc in rg_stats)
+         if _decide_fail(op, val, stats_iter):
+             rule_decisions[rule_id] = "fail_meta"
+             continue
+         # need a fresh iterator
+         stats_iter = (rgc.get(col) for rgc in rg_stats)
+         if _decide_pass(op, val, stats_iter):
+             rule_decisions[rule_id] = "pass_meta"
+         else:
+             rule_decisions[rule_id] = "unknown"
+
+     # Determine which RGs we still need to scan (conservative):
+     # - If no predicates at all -> keep ALL RGs.
+     # - Else keep any RG that *might* be relevant for at least one UNKNOWN rule.
+     keep_rg: List[int] = list(range(md.num_row_groups))
+     unknown_preds = [(rid, col, op, val) for (rid, col, op, val) in predicates if rule_decisions.get(rid) == "unknown"]
+
+     if unknown_preds:
+         keep_rg = []
+         for i, per_col in enumerate(rg_stats):
+             # Keep if ANY unknown predicate "may overlap"
+             keep = False
+             for _, col, op, val in unknown_preds:
+                 verdict = _verdict_overlaps(op, val, per_col.get(col))
+                 # Verdict True -> overlaps; Verdict None -> unknown -> keep to be safe
+                 if verdict is True or verdict is None:
+                     keep = True
+                     break
+             if keep:
+                 keep_rg.append(i)
+         if not keep_rg:
+             # Safety: if overlap logic ended up too strict, default to ALL
+             keep_rg = list(range(md.num_row_groups))
+
+     preplan = PrePlan(
+         manifest_columns=list(required_columns) if required_columns else [],
+         manifest_row_groups=keep_rg,
+         rule_decisions=rule_decisions,
+         stats={
+             "rg_total": md.num_row_groups,
+             "rg_kept": len(keep_rg),
+             "total_rows": md.num_rows,
+         },
+     )
+     return preplan
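
A minimal usage sketch of the Parquet pre-planner above. The module path (kontra.preplan.planner), the file path, and the predicate values are illustrative assumptions; only the function signature and the (rule_id, column, op, value) predicate shape come from the code itself:

from kontra.preplan.planner import preplan_single_parquet  # assumed module path

# Hypothetical predicates, as a rule compiler might emit for
# not_null(user_id), amount >= 0, country == "US".
predicates = [
    ("r1", "user_id", "not_null", None),
    ("r2", "amount", ">=", 0),
    ("r3", "country", "==", "US"),
]

plan = preplan_single_parquet(
    path="data/orders.parquet",  # or an S3 URI, passing filesystem=pyarrow.fs.S3FileSystem(...)
    required_columns=["user_id", "amount", "country"],
    predicates=predicates,
)

print(plan.rule_decisions)       # e.g. {"r1": "pass_meta", "r2": "unknown", "r3": "unknown"}
print(plan.stats)                # e.g. {"rg_total": 19, "rg_kept": 7, "total_rows": 1000000}
print(plan.manifest_row_groups)  # row groups that still need to be scanned

Rules decided as "pass_meta" or "fail_meta" never touch the data; only the "unknown" ones determine which row groups are kept for scanning.
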
kontra/preplan/postgres.py
@@ -0,0 +1,179 @@
+ # src/kontra/preplan/postgres.py
+ """
+ PostgreSQL preplan - use pg_stats for metadata-only rule decisions.
+
+ Similar to Parquet metadata preplan, but uses PostgreSQL's statistics catalog.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from kontra.connectors.handle import DatasetHandle
+ from kontra.connectors.postgres import PostgresConnectionParams, get_connection
+
+ from .types import PrePlan, Decision
+
+
+ # Predicate format: (rule_id, column, op, value)
+ Predicate = Tuple[str, str, str, Any]
+
+
+ def fetch_pg_stats_for_preplan(
+     params: PostgresConnectionParams,
+ ) -> Dict[str, Dict[str, Any]]:
+     """
+     Fetch PostgreSQL statistics for preplan decisions.
+
+     Returns:
+         {
+             "__table__": {"row_estimate": int, "page_count": int},
+             "column_name": {
+                 "null_frac": float,       # 0.0-1.0 fraction of nulls
+                 "n_distinct": float,      # -1 = unique, >0 = count, <0 = fraction
+                 "most_common_vals": str,  # Array literal
+             },
+             ...
+         }
+     """
+     with get_connection(params) as conn:
+         with conn.cursor() as cur:
+             # Table-level stats from pg_class
+             cur.execute(
+                 """
+                 SELECT reltuples::bigint AS row_estimate,
+                        relpages AS page_count
+                 FROM pg_class
+                 WHERE relname = %s
+                   AND relnamespace = %s::regnamespace
+                 """,
+                 (params.table, params.schema),
+             )
+             row = cur.fetchone()
+             table_stats = {
+                 "row_estimate": row[0] if row else 0,
+                 "page_count": row[1] if row else 0,
+             }
+
+             # Column-level stats from pg_stats
+             cur.execute(
+                 """
+                 SELECT attname AS column_name,
+                        null_frac,
+                        n_distinct,
+                        most_common_vals::text
+                 FROM pg_stats
+                 WHERE schemaname = %s AND tablename = %s
+                 """,
+                 (params.schema, params.table),
+             )
+
+             result: Dict[str, Dict[str, Any]] = {"__table__": table_stats}
+             for col_row in cur.fetchall():
+                 col_name, null_frac, n_distinct, mcv = col_row
+                 result[col_name] = {
+                     "null_frac": null_frac,
+                     "n_distinct": n_distinct,
+                     "most_common_vals": mcv,
+                 }
+
+             return result
+
+
+ def preplan_postgres(
+     handle: DatasetHandle,
+     required_columns: List[str],
+     predicates: List[Predicate],
+ ) -> PrePlan:
+     """
+     Metadata-only pre-planner for PostgreSQL tables using pg_stats.
+
+     Supports decisions for:
+       - not_null: if null_frac == 0 -> pass_meta
+       - unique: if n_distinct == -1 -> pass_meta (PostgreSQL's way of saying "all unique")
+
+     Args:
+         handle: DatasetHandle with db_params
+         required_columns: Columns needed for validation
+         predicates: List of (rule_id, column, op, value) tuples
+
+     Returns:
+         PrePlan with rule decisions based on pg_stats
+     """
+     if not handle.db_params:
+         raise ValueError("PostgreSQL handle missing db_params")
+
+     params: PostgresConnectionParams = handle.db_params
+     pg_stats = fetch_pg_stats_for_preplan(params)
+
+     table_stats = pg_stats.get("__table__", {})
+     row_estimate = table_stats.get("row_estimate", 0)
+
+     rule_decisions: Dict[str, Decision] = {}
+
+     for rule_id, column, op, value in predicates:
+         col_stats = pg_stats.get(column)
+
+         if col_stats is None:
+             # No stats for this column (ANALYZE not run or column doesn't exist)
+             rule_decisions[rule_id] = "unknown"
+             continue
+
+         null_frac = col_stats.get("null_frac")
+         n_distinct = col_stats.get("n_distinct")
+
+         if op == "not_null":
+             # If null_frac is exactly 0, the column has no nulls
+             if null_frac is not None and null_frac == 0:
+                 rule_decisions[rule_id] = "pass_meta"
+             elif null_frac is not None and null_frac > 0:
+                 # We know there ARE nulls, but pg_stats doesn't give exact count
+                 # So we can't prove pass, but we know the rule will likely fail
+                 # Be conservative: mark as unknown (actual execution will determine)
+                 rule_decisions[rule_id] = "unknown"
+             else:
+                 rule_decisions[rule_id] = "unknown"
+
+         elif op == "unique":
+             # n_distinct == -1 means PostgreSQL detected all values are unique
+             # n_distinct < 0 (other than -1) means n_distinct is a fraction of rows
+             if n_distinct is not None:
+                 if n_distinct == -1:
+                     # All values are unique AND no nulls (unique constraint behavior)
+                     if null_frac == 0:
+                         rule_decisions[rule_id] = "pass_meta"
+                     else:
+                         # Unique values but has nulls - need to check if nulls cause dups
+                         rule_decisions[rule_id] = "unknown"
+                 elif n_distinct < 0:
+                     # Fraction - close to -1 means high uniqueness but not guaranteed
+                     rule_decisions[rule_id] = "unknown"
+                 else:
+                     # n_distinct > 0: exact count or estimate
+                     # If n_distinct equals row_estimate, likely unique
+                     if row_estimate > 0 and n_distinct >= row_estimate:
+                         rule_decisions[rule_id] = "pass_meta"
+                     else:
+                         rule_decisions[rule_id] = "unknown"
+             else:
+                 rule_decisions[rule_id] = "unknown"
+
+         else:
+             # Other ops (>=, <=, ==, etc.) - pg_stats doesn't have min/max for general use
+             # (histogram_bounds could be used but it's complex)
+             rule_decisions[rule_id] = "unknown"
+
+     return PrePlan(
+         manifest_columns=list(required_columns) if required_columns else [],
+         manifest_row_groups=[], # Not applicable for PostgreSQL
+         rule_decisions=rule_decisions,
+         stats={
+             "row_estimate": row_estimate,
+             "columns_with_stats": len([k for k in pg_stats if k != "__table__"]),
+         },
+     )
+
+
+ def can_preplan_postgres(handle: DatasetHandle) -> bool:
+     """Check if PostgreSQL preplan is applicable for this handle."""
+     return handle.scheme in ("postgres", "postgresql") and handle.db_params is not None
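
To make the decision logic concrete, here is a small, invented illustration of the dictionary returned by fetch_pg_stats_for_preplan and the decisions preplan_postgres would derive from it (the values are hypothetical; the mapping follows the branches above):

# Shape of fetch_pg_stats_for_preplan(...) output -- values invented for illustration:
pg_stats = {
    "__table__": {"row_estimate": 1_000_000, "page_count": 12345},
    "id":    {"null_frac": 0.0,  "n_distinct": -1.0, "most_common_vals": None},
    "email": {"null_frac": 0.02, "n_distinct": -0.9, "most_common_vals": None},
}

# With predicates [("r1", "id", "unique", None), ("r2", "email", "not_null", None)]:
#   r1 -> "pass_meta"  (n_distinct == -1 with null_frac == 0: Postgres reports every value distinct)
#   r2 -> "unknown"    (null_frac > 0: nulls exist, but the exact outcome is left to normal execution)

Note that these decisions are only as good as the planner statistics, so they assume ANALYZE has been run on the table; columns without pg_stats rows always fall back to "unknown".
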
kontra/preplan/sqlserver.py
@@ -0,0 +1,191 @@
+ # src/kontra/preplan/sqlserver.py
+ """
+ SQL Server preplan - use metadata for rule decisions.
+
+ Uses SQL Server system views:
+   - sys.dm_db_partition_stats for row count estimates
+   - sys.columns for nullability constraints
+   - sys.indexes + sys.index_columns for uniqueness constraints
+
+ Note: Unlike PostgreSQL's pg_stats, SQL Server doesn't expose null_frac directly.
+ We use constraint metadata instead (NOT NULL, UNIQUE indexes).
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Tuple
+
+ from kontra.connectors.handle import DatasetHandle
+ from kontra.connectors.sqlserver import SqlServerConnectionParams, get_connection
+
+ from .types import PrePlan, Decision
+
+
+ # Predicate format: (rule_id, column, op, value)
+ Predicate = Tuple[str, str, str, Any]
+
+
+ def fetch_sqlserver_metadata(
+     params: SqlServerConnectionParams,
+ ) -> Dict[str, Dict[str, Any]]:
+     """
+     Fetch SQL Server metadata for preplan decisions.
+
+     Returns:
+         {
+             "__table__": {"row_estimate": int, "page_count": int},
+             "column_name": {
+                 "is_nullable": bool,            # From column definition
+                 "is_identity": bool,            # Identity column
+                 "has_unique_constraint": bool,  # Unique index or constraint
+             },
+             ...
+         }
+     """
+     with get_connection(params) as conn:
+         cursor = conn.cursor()
+
+         # Table-level stats from sys.dm_db_partition_stats
+         cursor.execute(
+             """
+             SELECT SUM(row_count) AS row_estimate,
+                    SUM(used_page_count) AS page_count
+             FROM sys.dm_db_partition_stats ps
+             JOIN sys.objects o ON ps.object_id = o.object_id
+             JOIN sys.schemas s ON o.schema_id = s.schema_id
+             WHERE s.name = %s AND o.name = %s AND ps.index_id IN (0, 1)
+             """,
+             (params.schema, params.table),
+         )
+         row = cursor.fetchone()
+         table_stats = {
+             "row_estimate": int(row[0]) if row and row[0] else 0,
+             "page_count": int(row[1]) if row and row[1] else 0,
+         }
+
+         # Column-level metadata
+         cursor.execute(
+             """
+             SELECT
+                 c.name AS column_name,
+                 c.is_nullable,
+                 c.is_identity
+             FROM sys.columns c
+             JOIN sys.objects o ON c.object_id = o.object_id
+             JOIN sys.schemas s ON o.schema_id = s.schema_id
+             WHERE s.name = %s AND o.name = %s
+             """,
+             (params.schema, params.table),
+         )
+
+         result: Dict[str, Dict[str, Any]] = {"__table__": table_stats}
+         for col_row in cursor.fetchall():
+             col_name, is_nullable, is_identity = col_row
+             result[col_name] = {
+                 "is_nullable": bool(is_nullable),
+                 "is_identity": bool(is_identity),
+                 "has_unique_constraint": False, # Will be updated below
+             }
+
+         # Check for unique constraints/indexes
+         cursor.execute(
+             """
+             SELECT c.name AS column_name
+             FROM sys.indexes i
+             JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+             JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+             JOIN sys.objects o ON i.object_id = o.object_id
+             JOIN sys.schemas s ON o.schema_id = s.schema_id
+             WHERE s.name = %s AND o.name = %s
+               AND (i.is_unique = 1 OR i.is_primary_key = 1)
+               AND ic.is_included_column = 0
+             GROUP BY c.name
+             HAVING COUNT(*) = 1 -- Single-column unique constraint
+             """,
+             (params.schema, params.table),
+         )
+
+         for row in cursor.fetchall():
+             col_name = row[0]
+             if col_name in result:
+                 result[col_name]["has_unique_constraint"] = True
+
+         return result
+
+
+ def preplan_sqlserver(
+     handle: DatasetHandle,
+     required_columns: List[str],
+     predicates: List[Predicate],
+ ) -> PrePlan:
+     """
+     Metadata-only pre-planner for SQL Server tables.
+
+     Supports decisions for:
+       - not_null: if column is NOT NULL (is_nullable = 0) -> pass_meta
+       - unique: if column has unique index/constraint -> pass_meta
+
+     Args:
+         handle: DatasetHandle with db_params
+         required_columns: Columns needed for validation
+         predicates: List of (rule_id, column, op, value) tuples
+
+     Returns:
+         PrePlan with rule decisions based on SQL Server metadata
+     """
+     if not handle.db_params:
+         raise ValueError("SQL Server handle missing db_params")
+
+     params: SqlServerConnectionParams = handle.db_params
+     metadata = fetch_sqlserver_metadata(params)
+
+     table_stats = metadata.get("__table__", {})
+     row_estimate = table_stats.get("row_estimate", 0)
+
+     rule_decisions: Dict[str, Decision] = {}
+
+     for rule_id, column, op, value in predicates:
+         col_meta = metadata.get(column)
+
+         if col_meta is None:
+             # Column not found in metadata
+             rule_decisions[rule_id] = "unknown"
+             continue
+
+         is_nullable = col_meta.get("is_nullable", True)
+         is_identity = col_meta.get("is_identity", False)
+         has_unique = col_meta.get("has_unique_constraint", False)
+
+         if op == "not_null":
+             # If column is defined as NOT NULL, it definitely has no nulls
+             if not is_nullable:
+                 rule_decisions[rule_id] = "pass_meta"
+             else:
+                 # Column allows nulls - may or may not have any
+                 rule_decisions[rule_id] = "unknown"
+
+         elif op == "unique":
+             # If column has unique constraint or is identity, it's unique
+             if has_unique or is_identity:
+                 rule_decisions[rule_id] = "pass_meta"
+             else:
+                 rule_decisions[rule_id] = "unknown"
+
+         else:
+             # Other ops - would need actual data statistics
+             rule_decisions[rule_id] = "unknown"
+
+     return PrePlan(
+         manifest_columns=list(required_columns) if required_columns else [],
+         manifest_row_groups=[], # Not applicable for SQL Server
+         rule_decisions=rule_decisions,
+         stats={
+             "row_estimate": row_estimate,
+             "columns_with_metadata": len([k for k in metadata if k != "__table__"]),
+         },
+     )
+
+
+ def can_preplan_sqlserver(handle: DatasetHandle) -> bool:
+     """Check if SQL Server preplan is applicable for this handle."""
+     return handle.scheme in ("mssql", "sqlserver") and handle.db_params is not None
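
The same idea with SQL Server's constraint-driven metadata; the values below are invented, but the decision mapping follows the code above:

# Shape of fetch_sqlserver_metadata(...) output -- values invented for illustration:
metadata = {
    "__table__": {"row_estimate": 250_000, "page_count": 4096},
    "order_id": {"is_nullable": False, "is_identity": True,  "has_unique_constraint": True},
    "email":    {"is_nullable": True,  "is_identity": False, "has_unique_constraint": False},
}

# With predicates [("r1", "order_id", "unique", None), ("r2", "email", "not_null", None)]:
#   r1 -> "pass_meta"  (unique index / identity column lets metadata settle the rule)
#   r2 -> "unknown"    (the column is declared NULLable, so nulls may or may not be present)
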
kontra/preplan/types.py
@@ -0,0 +1,24 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Dict, List, Literal
+
+ # A rule can be proven by metadata to pass, proven to fail, or remain unknown.
+ Decision = Literal["pass_meta", "fail_meta", "unknown"]
+
+
+ @dataclass
+ class PrePlan:
+     """
+     Result of the metadata-only pre-planning stage.
+
+     - manifest_columns: union of columns still needed for SQL/Polars after metadata decisions.
+     - manifest_row_groups: Parquet row-group indices that *may* affect remaining rules.
+       (Single-file MVP; can evolve to a file list later.)
+     - rule_decisions: rule_id -> Decision ("pass_meta" | "fail_meta" | "unknown").
+     - stats: small numbers for observability (e.g., {"rg_total": 19, "rg_kept": 7}).
+     """
+     manifest_columns: List[str]
+     manifest_row_groups: List[int]
+     rule_decisions: Dict[str, Decision]
+     stats: Dict[str, int]
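
A sketch of how a caller might consume a PrePlan. The split_rules helper is hypothetical (the engine's real integration is not shown in this diff); only the PrePlan fields themselves come from this module:

from typing import List, Tuple

from kontra.preplan.types import PrePlan  # assumed module path

def split_rules(plan: PrePlan, all_rule_ids: List[str]) -> Tuple[List[str], List[str]]:
    """Separate rules already settled by metadata from those that still need execution."""
    settled = [r for r in all_rule_ids if plan.rule_decisions.get(r) in ("pass_meta", "fail_meta")]
    remaining = [r for r in all_rule_ids if plan.rule_decisions.get(r, "unknown") == "unknown"]
    return settled, remaining
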
kontra/probes/__init__.py
@@ -0,0 +1,20 @@
+ # src/kontra/probes/__init__.py
+ """
+ Transformation probes for Kontra.
+
+ Probes measure the structural effects of data transformations without
+ assigning meaning or judgment. They provide deterministic, structured,
+ token-efficient measurements for agents to reason about.
+
+ Available probes:
+   - compare: Measure differences between before/after transformation
+   - profile_relationship: Measure JOIN viability between datasets
+ """
+
+ from kontra.probes.compare import compare
+ from kontra.probes.relationship import profile_relationship
+
+ __all__ = [
+     "compare",
+     "profile_relationship",
+ ]