kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
kontra/engine/stats.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# src/kontra/engine/stats.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stats helpers — minimal, fast, and CLI-friendly.
|
|
6
|
+
|
|
7
|
+
Design goals
|
|
8
|
+
------------
|
|
9
|
+
- Keep helpers tiny and free of heavy allocations; these run on every validation.
|
|
10
|
+
- Avoid coupling to engine internals or reporters; return plain dicts.
|
|
11
|
+
- Backwards compatible: existing callers keep working as-is.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Iterable, Dict, Any, List, Optional
|
|
16
|
+
import time
|
|
17
|
+
import polars as pl
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ----------------------------- Timers -----------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class RunTimers:
    """Millisecond counters, one per run phase.

    Every counter defaults to 0, so an instance can be created empty and
    filled in field by field.
    """

    contract_load_ms: int = 0
    data_load_ms: int = 0
    compile_ms: int = 0
    execute_ms: int = 0
    report_ms: int = 0
    polars_ms: int = 0
    preplan_ms: int = 0
    sql_ms: int = 0

    def total_ms(self) -> int:
        """Return the sum of all eight phase counters."""
        first, *remaining = (
            self.contract_load_ms,
            self.data_load_ms,
            self.compile_ms,
            self.execute_ms,
            self.report_ms,
            self.polars_ms,
            self.preplan_ms,
            self.sql_ms,
        )
        # Left fold starting from the first counter: same addition order as
        # writing the counters out with `+`.
        return sum(remaining, first)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def now_ms() -> int:
    """Return the current `time.time()` reading as whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------- Summaries ---------------------------------------
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def basic_summary(
    df: Optional[pl.DataFrame],
    *,
    available_cols: Optional[List[str]] = None,
    nrows_override: Optional[int] = None,
) -> Dict[str, int]:
    """
    Build the two-number dataset summary ``{"nrows": int, "ncols": int}``.

    Args
    ----
    df:
        The loaded frame, or None when nothing was materialized.
    available_cols:
        Full list of schema columns, if known. When given, its length is
        reported as ``ncols`` in place of the frame's own width.
    nrows_override:
        Row count to report in place of ``df.height``. ``df.height`` is only
        read when this is None.

    Returns
    -------
    {"nrows": int, "ncols": int}
        With ``df=None`` a missing override or column list counts as 0.
    """
    if df is not None:
        row_count = df.height if nrows_override is None else nrows_override
        column_names = df.columns if available_cols is None else available_cols
        return {"nrows": int(row_count), "ncols": int(len(column_names))}

    # No frame: rely purely on the caller-supplied hints.
    return {
        "nrows": int(nrows_override or 0),
        "ncols": int(len(available_cols or [])),
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def columns_touched(rule_specs: Iterable[Dict[str, Any]]) -> List[str]:
    """
    Ordered de-duplicated list of columns referenced by rules.

    Each spec is read as ``spec["params"]["column"]``. A spec contributes a
    column only when that value is a non-empty string; the first occurrence
    fixes its position in the result.

    Specs with no ``params`` key, or with ``params`` set to None (e.g. an
    explicit null in the contract), are skipped rather than raising.
    A None ``rule_specs`` is treated as empty.
    """
    cols: List[str] = []
    seen: set[str] = set()
    for spec in rule_specs or ():
        # `.get("params", {})` would still return None for an explicit null,
        # so normalise falsy values to an empty mapping here.
        params = spec.get("params") or {}
        col = params.get("column")
        if isinstance(col, str) and col and col not in seen:
            seen.add(col)
            cols.append(col)
    return cols
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_coverage(
    *,
    total_rules: int,
    sql_results: Dict[str, Dict[str, Any]] | List[Dict[str, Any]],
    polars_results: List[Dict[str, Any]],
    validated_columns: List[str],
) -> Dict[str, Any]:
    """
    Assemble the flat coverage dict consumed by renderers.

    ``sql_results`` may be a dict (its values are used) or a flat list.
    A result counts as failed when its ``"passed"`` entry is missing or falsy.
    Falsy ``sql_results`` / ``polars_results`` / ``validated_columns`` are
    treated as empty.

    Returns
    -------
    {
      "rules_total": int,
      "rules_sql": int, "rules_failed_sql": int,
      "rules_polars": int, "rules_failed_polars": int,
      "validated_columns": [...],
    }
    """

    def _failed(entries: List[Dict[str, Any]]) -> int:
        # Missing "passed" defaults to False, i.e. counts as a failure.
        return sum(1 for entry in entries if not entry.get("passed", False))

    sql_entries = (
        list(sql_results.values())
        if isinstance(sql_results, dict)
        else list(sql_results or [])
    )
    polars_entries = polars_results or []

    coverage: Dict[str, Any] = {}
    coverage["rules_total"] = int(total_rules)
    coverage["rules_sql"] = int(len(sql_entries))
    coverage["rules_failed_sql"] = int(_failed(sql_entries))
    coverage["rules_polars"] = int(len(polars_entries))
    coverage["rules_failed_polars"] = int(_failed(polars_entries))
    coverage["validated_columns"] = list(validated_columns or [])
    return coverage
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ------------------------------ Profiling -------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _is_numeric_dtype(dtype: Any) -> bool:
    """Return True when *dtype* reports itself as a numeric Polars dtype."""
    # Prefer the dtype's own method; fall back to a module-level helper only
    # if the installed Polars exposes one. Unknown -> treated as non-numeric.
    method = getattr(dtype, "is_numeric", None)
    if callable(method):
        return bool(method())
    legacy = getattr(pl.datatypes, "is_numeric", None)
    return bool(legacy(dtype)) if callable(legacy) else False


def profile_for(df: pl.DataFrame, cols: List[str]) -> Dict[str, Dict[str, Any]]:
    """
    Lightweight column profile for touched columns only.

    All statistics are computed with one ``df.select`` call.

    Args
    ----
    df:
        Frame to profile.
    cols:
        Column names to profile. Duplicates are collapsed (first occurrence
        wins) and names that are not in ``df.columns`` are skipped.

    Returns
    -------
    ``{column: {"nulls": int, "distinct": int}}``; numeric columns also get
    ``"min"``, ``"max"`` and ``"mean"``. ``"mean"`` is None when the
    aggregate itself is null. Returns ``{}`` when there is nothing to profile.
    """
    if not cols:
        return {}

    present = set(df.columns)
    # dict.fromkeys keeps first-seen order while dropping repeats, which would
    # otherwise produce duplicate output aliases in the select below.
    wanted = [c for c in dict.fromkeys(cols) if c in present]
    if not wanted:
        return {}

    exprs: List[pl.Expr] = []
    numeric_cols: set = set()
    for c in wanted:
        # common stats for every dtype
        exprs.append(pl.col(c).is_null().sum().alias(f"__nulls__{c}"))
        exprs.append(pl.col(c).n_unique().alias(f"__distinct__{c}"))
        # numeric extras
        if _is_numeric_dtype(df.get_column(c).dtype):
            numeric_cols.add(c)
            exprs.append(pl.col(c).min().alias(f"__min__{c}"))
            exprs.append(pl.col(c).max().alias(f"__max__{c}"))
            exprs.append(pl.col(c).mean().alias(f"__mean__{c}"))

    out = df.select(exprs)
    if out.height == 0:
        return {}

    # Use named=True to get row as dict for direct column access
    row = out.row(0, named=True)
    stats: Dict[str, Dict[str, Any]] = {}
    for c in wanted:
        d: Dict[str, Any] = {
            "nulls": int(row[f"__nulls__{c}"]),
            "distinct": int(row[f"__distinct__{c}"]),
        }
        if c in numeric_cols:
            mean = row[f"__mean__{c}"]
            d["min"] = row[f"__min__{c}"]
            d["max"] = row[f"__max__{c}"]
            # float(None) would raise; keep a null mean as None.
            d["mean"] = float(mean) if mean is not None else None
        stats[c] = d
    return stats
|
kontra/engine/types.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# src/kontra/engine/types.py
|
|
2
|
+
"""
|
|
3
|
+
Type definitions for engine result dictionaries.
|
|
4
|
+
|
|
5
|
+
These TypedDicts provide IDE support and documentation for the
|
|
6
|
+
dict-based results returned by the validation engine.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from kontra.engine.types import RuleResultDict, ValidationResultDict
|
|
10
|
+
|
|
11
|
+
def process_result(result: RuleResultDict) -> None:
|
|
12
|
+
print(result["rule_id"]) # IDE knows this is str
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from typing import Any, Dict, List, Optional, TypedDict, Literal
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class _RuleResultRequired(TypedDict):
    """Keys that every rule result carries (total TypedDict)."""

    rule_id: str
    passed: bool
    failed_count: int
    message: str


class RuleResultDict(_RuleResultRequired, total=False):
    """
    Result of validating a single rule.

    The required keys are inherited from a total base class so that type
    checkers enforce them; the keys declared here are optional because of
    ``total=False``. Nothing is enforced at runtime.

    Required fields:
        rule_id: Unique identifier for the rule
        passed: Whether the rule passed validation
        failed_count: Number of violations found
        message: Human-readable result message

    Optional fields:
        severity: blocking | warning | info
        execution_source: Where rule was executed (polars | sql | metadata)
        failure_mode: Type of failure (null_values, duplicate_values, etc.)
        details: Additional details (unexpected values, suggestions, etc.)
        actions_executed: List of post-validation actions run
    """

    # Optional
    severity: str
    execution_source: str
    failure_mode: str
    details: Dict[str, Any]
    actions_executed: List[str]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class SummaryDict(TypedDict, total=False):
    """
    Validation summary for a dataset.

    Contains aggregate pass/fail counts and optional severity breakdowns.

    Declared with ``total=False``: type checkers treat every key as optional,
    so consumers should not assume any particular key is present.
    """
    passed: bool  # overall outcome flag
    total_rules: int
    rules_passed: int
    rules_failed: int
    dataset_name: str
    # Severity breakdown: one failure counter per severity level
    # (presumably the blocking | warning | info rule severities -- confirm).
    blocking_failures: int
    warning_failures: int
    info_failures: int
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ValidationResultDict(TypedDict, total=False):
    """
    Complete validation result returned by ValidationEngine.run().

    Contains summary, individual rule results, and optional stats.

    Declared with ``total=False``: type checkers treat every key as optional.
    """
    dataset: str
    summary: SummaryDict  # aggregate counts
    results: List[RuleResultDict]  # one entry per rule
    stats: Dict[str, Any]  # free-form here; NOTE(review): may follow StatsDict -- confirm
    run_meta: Dict[str, Any]  # free-form; schema not defined in this module
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class PreplanSummaryDict(TypedDict, total=False):
    """
    Preplan (metadata analysis) summary.

    Reports how many rules were resolved via metadata without data scan.

    Declared with ``total=False``: type checkers treat every key as optional.
    The ``row_groups_*`` values may additionally be None.
    """
    enabled: bool
    effective: bool
    # Rule counts by metadata outcome (pass / fail / undecided, going by the
    # field names -- confirm against the preplan producer).
    rules_pass_meta: int
    rules_fail_meta: int
    rules_unknown: int
    # Row-group counters; Optional, so None is a valid value for each.
    row_groups_kept: Optional[int]
    row_groups_total: Optional[int]
    row_groups_pruned: Optional[int]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class ProjectionDict(TypedDict, total=False):
    """
    Column projection statistics.

    Reports column pruning effectiveness.

    Declared with ``total=False``: type checkers treat every key as optional.
    """
    enabled: bool
    available_count: int
    # Free-form sub-blocks; their inner schema is not defined in this module.
    full: Dict[str, Any]
    residual: Dict[str, Any]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class PushdownDict(TypedDict, total=False):
    """
    SQL pushdown statistics.

    Reports SQL execution details and timing.

    Declared with ``total=False``: type checkers treat every key as optional.
    """
    enabled: bool
    effective: bool
    executor: str
    rules_pushed: int
    # Integer timings keyed by name (milliseconds, going by the `_ms` suffix).
    breakdown_ms: Dict[str, int]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class StatsDict(TypedDict, total=False):
    """
    Full validation statistics.

    Optional stats block attached to validation results when
    stats_mode is "summary" or "profile".

    Declared with ``total=False``: type checkers treat every key as optional.
    """
    stats_version: str
    run_meta: Dict[str, Any]  # free-form; schema not defined in this module
    dataset: Dict[str, Any]  # free-form; schema not defined in this module
    # Typed sub-blocks declared elsewhere in this module.
    preplan: PreplanSummaryDict
    pushdown: PushdownDict
    projection: ProjectionDict
    residual: Dict[str, Any]
    # Column-name lists.
    columns_touched: List[str]
    columns_validated: List[str]
    columns_loaded: List[str]
    profile: Dict[str, Any]  # NOTE(review): presumably per-column profile data -- confirm
|