kontra-0.5.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,140 @@
# src/kontra/rules/builtin/disallowed_values.py
"""
Disallowed values rule - Column must NOT contain any of the specified values.

Inverse of allowed_values: fails if value IS in the list.

Usage:
    - name: disallowed_values
      params:
        column: status
        values: ["deleted", "banned", "spam"]

Fails when:
    - The column value IS in the disallowed values list

Passes when:
    - The column value is NOT in the disallowed values list
    - The column value is NULL (NULL is not in any list)
"""
from __future__ import annotations

from typing import Any, Dict, List, Optional, Sequence, Set

import polars as pl

from kontra.rules.base import BaseRule
from kontra.rules.registry import register_rule
from kontra.rules.predicates import Predicate
from kontra.state.types import FailureMode


@register_rule("disallowed_values")
class DisallowedValuesRule(BaseRule):
    """
    Fails where column value IS in the disallowed set.

    params:
        - column: str (required) - Column to check
        - values: list (required) - Values that are NOT allowed

    NULL handling:
        - NULL values are NOT failures (NULL is not in any list)
    """

    def __init__(self, name: str, params: Dict[str, Any]):
        super().__init__(name, params)
        self._column = self._get_required_param("column", str)
        if "values" not in self.params:
            raise ValueError(
                f"Rule '{self.name}' requires parameter 'values' but it was not provided"
            )
        self._values: Sequence[Any] = self.params["values"]

    def required_columns(self) -> Set[str]:
        return {self._column}

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        # Check column exists before accessing
        col_check = self._check_columns(df, {self._column})
        if col_check is not None:
            return col_check

        # Failure = value IS in the disallowed list (not including NULL)
        # is_in returns NULL for NULL values, we want NULL -> not a failure
        mask = df[self._column].is_in(list(self._values)).fill_null(False)

        res = super()._failures(df, mask, f"{self._column} contains disallowed values")
        res["rule_id"] = self.rule_id

        if res["failed_count"] > 0:
            res["failure_mode"] = str(FailureMode.NOVEL_CATEGORY)
            res["details"] = self._explain_failure(df, mask)

        return res

    def _explain_failure(self, df: pl.DataFrame, mask: pl.Series) -> Dict[str, Any]:
        """Generate detailed failure explanation."""
        # Find which disallowed values were found and their counts
        found_values = (
            df.filter(mask)
            .group_by(self._column)
            .agg(pl.len().alias("count"))
            .sort("count", descending=True)
            .head(10)
        )

        found_list: List[Dict[str, Any]] = []
        for row in found_values.iter_rows(named=True):
            found_list.append({
                "value": row[self._column],
                "count": row["count"],
            })

        return {
            "disallowed": [str(v) for v in self._values],
            "found_values": found_list,
        }

    def compile_predicate(self) -> Optional[Predicate]:
        # Failure = value IS in the disallowed list
        expr = pl.col(self._column).is_in(self._values).fill_null(False)
        return Predicate(
            rule_id=self.rule_id,
            expr=expr,
            message=f"{self._column} contains disallowed values",
            columns={self._column},
        )

    def to_sql_spec(self) -> Optional[Dict[str, Any]]:
        """Generate SQL pushdown specification."""
        return {
            "kind": "disallowed_values",
            "rule_id": self.rule_id,
            "column": self._column,
            "values": list(self._values),
        }

    def to_sql_filter(self, dialect: str = "postgres") -> str | None:
        """Generate SQL filter for sampling failing rows."""
        col = f'"{self._column}"'

        # Build IN list (exclude None values)
        quoted_values = []
        for v in self._values:
            if v is None:
                continue
            elif isinstance(v, str):
                escaped = v.replace("'", "''")
                quoted_values.append(f"'{escaped}'")
            elif isinstance(v, bool):
                quoted_values.append("TRUE" if v else "FALSE")
            else:
                quoted_values.append(str(v))

        if not quoted_values:
            return None  # No values to check

        in_list = ", ".join(quoted_values)
        # Failure = value IS in the disallowed list (not null)
        return f"{col} IN ({in_list})"
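The rule above is self-contained enough to exercise directly. Below is a minimal usage sketch, not taken from the package: it assumes `DisallowedValuesRule(name, params)` can be constructed outside the engine exactly as its `__init__` takes it, and the sample DataFrame and printed expectations are invented for illustration.

```python
# Hypothetical usage sketch -- assumes direct construction with (name, params).
import polars as pl

from kontra.rules.builtin.disallowed_values import DisallowedValuesRule

df = pl.DataFrame({"status": ["active", "deleted", None, "banned"]})

rule = DisallowedValuesRule(
    "disallowed_values",
    {"column": "status", "values": ["deleted", "banned", "spam"]},
)

res = rule.validate(df)
# Per the mask logic above: "deleted" and "banned" fail, the NULL row passes.
print(res["failed_count"], res.get("failure_mode"))

# SQL filter used when sampling failing rows:
print(rule.to_sql_filter())  # "status" IN ('deleted', 'banned', 'spam')
```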
@@ -0,0 +1,203 @@
from __future__ import annotations
from typing import Dict, Any, Optional, Set, Tuple

import polars as pl

from kontra.rules.base import BaseRule
from kontra.rules.registry import register_rule
from kontra.state.types import FailureMode


@register_rule("dtype")
class DtypeRule(BaseRule):
    """
    Dtype — schema-level type check for a single column.

    Params
    ------
    - column: str   # required
    - type: str     # required
      Accepts either:
        * exact physical types: int8/int16/int32/int64, uint8/uint16/uint32/uint64,
          float32/float64 (or float/double as aliases),
          boolean/bool, utf8/string/str/text, date, datetime, time
        * logical families: int/integer, float, numeric, string/str

    - mode: "strict"  # optional (default). Future: may support relaxed modes.

    Semantics
    ---------
    - Exact types require an exact match (e.g., "int16" passes only if the column is Int16).
    - Family types accept any member of the family (e.g., "int" accepts Int8/16/32/64).
    - Strings: "utf8", "string", "str", "text" are treated as the same family (Utf8 or String).
    - We do NOT cast — we only validate. (Casting hints may come via planner/materializers later.)

    Results
    -------
    - On mismatch or invalid config, `failed_count == nrows` (schema-level violation).
    - Message is deterministic: "<col> expected <expected>, found <ActualDtype>".
    """

    # Valid type names (for error message)
    _VALID_TYPES = [
        # Exact types
        "int8", "int16", "int32", "int64",
        "uint8", "uint16", "uint32", "uint64",
        "float32", "float64", "float", "double",
        "bool", "boolean",
        "date", "datetime", "time",
        "utf8", "string", "str", "text",
        # Family types
        "int", "integer", "numeric",
    ]

    # ---- Aliases / Maps -----------------------------------------------------

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from kontra.errors import RuleParameterError

        expected_type = self.params.get("type")
        if expected_type is not None:
            label, allowed = self._normalize_expected(str(expected_type))
            if allowed is None:
                raise RuleParameterError(
                    "dtype", "type",
                    f"unknown type '{expected_type}'. Valid types: {', '.join(sorted(self._VALID_TYPES))}"
                )

    _STRING_ALIASES = {"utf8", "string", "str", "text"}

    # Exact physical types (single-member sets treated as "exact")
    _EXACT_MAP = {
        # signed ints
        "int8": {pl.Int8}, "int16": {pl.Int16}, "int32": {pl.Int32}, "int64": {pl.Int64},
        # unsigned ints
        "uint8": {pl.UInt8}, "uint16": {pl.UInt16}, "uint32": {pl.UInt32}, "uint64": {pl.UInt64},
        # floats
        "float32": {pl.Float32}, "float64": {pl.Float64},
        "float": {pl.Float64}, "double": {pl.Float64},  # common aliases treated as exact Float64
        # booleans
        "bool": {pl.Boolean}, "boolean": {pl.Boolean},
        # temporal
        "date": {pl.Date}, "datetime": {pl.Datetime}, "time": {pl.Time},
    }

    # Logical families (multi-member sets)
    _FAMILY_MAP = {
        "int": {pl.Int8, pl.Int16, pl.Int32, pl.Int64},
        "integer": {pl.Int8, pl.Int16, pl.Int32, pl.Int64},
        "float": {pl.Float32, pl.Float64},
        "numeric": {pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.Float32, pl.Float64},
        "string": {pl.Utf8, getattr(pl, "String", pl.Utf8)},  # tolerate both Utf8 and String
        "str": {pl.Utf8, getattr(pl, "String", pl.Utf8)},
        "text": {pl.Utf8, getattr(pl, "String", pl.Utf8)},
        "utf8": {pl.Utf8, getattr(pl, "String", pl.Utf8)},
    }

    # ---- Normalization ------------------------------------------------------

    @staticmethod
    def _dtype_label(dt: pl.DataType) -> str:
        """Stable, user-friendly label for actual dtype in messages."""
        # Polars dtypes stringify nicely (e.g., "Int64", "Utf8").
        # Keep that behavior, but ensure Utf8/String variants read cleanly.
        if dt == pl.Utf8:
            return "Utf8"
        # Some Polars versions may have pl.String; prefer "Utf8" in messages for consistency.
        if getattr(pl, "String", None) and dt == getattr(pl, "String"):
            return "Utf8"
        return str(dt)

    def _normalize_expected(self, typ: str) -> Tuple[str, Optional[set]]:
        """
        Returns (label, allowed_set).
        - label: string echoed in error messages ("int16", "int", "date", ...)
        - allowed_set: a set of acceptable Polars dtypes (None if unknown)
        """
        t = (typ or "").strip().lower()
        if not t:
            return "<unspecified>", None

        # tolerate hyphen variants like "utf-8"
        t_no_dash = t.replace("-", "")

        # Family first (covers "string", "str", "utf8", etc.)
        if t in self._FAMILY_MAP:
            return t, self._FAMILY_MAP[t]
        if t_no_dash in self._FAMILY_MAP:
            return t_no_dash, self._FAMILY_MAP[t_no_dash]

        # Exact physical types (single-member sets)
        if t in self._EXACT_MAP:
            return t, self._EXACT_MAP[t]

        return t, None

    # ---- Rule contract ------------------------------------------------------

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        column = self.params.get("column")
        expected_type = self.params.get("type")
        mode = (self.params.get("mode") or "strict").lower()

        if mode != "strict":
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": int(df.height),
                "message": f"Unsupported dtype mode '{mode}'; only 'strict' is implemented.",
            }

        if not isinstance(column, str) or not column:
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": int(df.height),
                "message": "Missing required 'column' parameter for dtype rule",
            }

        if column not in df.columns:
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": int(df.height),
                "message": f"Column '{column}' not found for dtype check",
            }

        label, allowed = self._normalize_expected(str(expected_type) if expected_type is not None else "")
        if allowed is None:
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": int(df.height),
                "message": f"Invalid expected dtype '{expected_type}'",
            }

        actual = df[column].dtype
        # Use equality comparison instead of set membership because parametric
        # types like Datetime(time_unit='us') have different hashes than pl.Datetime
        # but are equal via __eq__
        passed = any(actual == a for a in allowed)

        result: Dict[str, Any] = {
            "rule_id": self.rule_id,
            "passed": bool(passed),
            "failed_count": 0 if passed else int(df.height),
            "message": "Passed" if passed else f"{column} expected {label}, found {self._dtype_label(actual)}",
        }

        if not passed:
            result["failure_mode"] = str(FailureMode.SCHEMA_DRIFT)
            result["details"] = {
                "expected_type": label,
                "actual_type": self._dtype_label(actual),
                "column": column,
            }

        return result

    def required_columns(self) -> Set[str]:
        # dtype check inspects the column’s dtype; ensure it is loaded (for projection).
        col = self.params.get("column")
        return {col} if isinstance(col, str) else set()
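Judging by the file listing and the `@register_rule("dtype")` decorator, this hunk appears to be kontra/rules/builtin/dtype.py. A minimal sketch of the exact-vs-family semantics described in its docstring, with an invented Float32 column (direct construction assumed, as above):

```python
# Hypothetical usage sketch -- illustrates exact vs. family type matching.
import polars as pl

from kontra.rules.builtin.dtype import DtypeRule

df = pl.DataFrame({"amount": pl.Series("amount", [1.5, 2.0], dtype=pl.Float32)})

# Family check: "float" accepts Float32 or Float64.
family = DtypeRule("dtype", {"column": "amount", "type": "float"})
print(family.validate(df)["passed"])    # True

# Exact check: "float64" requires Float64, so a Float32 column fails.
exact = DtypeRule("dtype", {"column": "amount", "type": "float64"})
print(exact.validate(df)["message"])    # amount expected float64, found Float32
```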
@@ -0,0 +1,129 @@
# src/kontra/rules/builtin/ends_with.py
"""
Ends with rule - Column must end with the specified suffix.

Uses LIKE pattern matching for maximum efficiency (faster than regex).

Usage:
    - name: ends_with
      params:
        column: filename
        suffix: ".csv"

Fails when:
    - Value does NOT end with the suffix
    - Value is NULL (can't check NULL)
"""
from __future__ import annotations

from typing import Any, Dict, List, Optional, Set

import polars as pl

from kontra.rules.base import BaseRule
from kontra.rules.registry import register_rule
from kontra.rules.predicates import Predicate
from kontra.state.types import FailureMode


def _escape_like_pattern(value: str, escape_char: str = "\\") -> str:
    """Escape LIKE special characters: %, _, and the escape char."""
    for c in (escape_char, "%", "_"):
        value = value.replace(c, escape_char + c)
    return value


@register_rule("ends_with")
class EndsWithRule(BaseRule):
    """
    Fails where column value does NOT end with the suffix.

    params:
        - column: str (required) - Column to check
        - suffix: str (required) - Suffix that must be present

    NULL handling:
        - NULL values are failures (can't check NULL)
    """

    def __init__(self, name: str, params: Dict[str, Any]):
        super().__init__(name, params)
        self._column = self._get_required_param("column", str)
        self._suffix = self._get_required_param("suffix", str)

        if not self._suffix:
            raise ValueError("Rule 'ends_with' suffix cannot be empty")

    def required_columns(self) -> Set[str]:
        return {self._column}

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        # Check column exists before accessing
        col_check = self._check_columns(df, {self._column})
        if col_check is not None:
            return col_check

        # Use Polars str.ends_with for efficiency
        ends_result = df[self._column].cast(pl.Utf8).str.ends_with(self._suffix)

        # Failure = does NOT end with OR is NULL
        mask = (~ends_result).fill_null(True)

        msg = f"{self._column} does not end with '{self._suffix}'"
        res = super()._failures(df, mask, msg)
        res["rule_id"] = self.rule_id

        if res["failed_count"] > 0:
            res["failure_mode"] = str(FailureMode.PATTERN_MISMATCH)
            res["details"] = self._explain_failure(df, mask)

        return res

    def _explain_failure(self, df: pl.DataFrame, mask: pl.Series) -> Dict[str, Any]:
        """Generate detailed failure explanation."""
        details: Dict[str, Any] = {
            "column": self._column,
            "expected_suffix": self._suffix,
        }

        # Sample failing values
        failed_df = df.filter(mask).head(5)
        samples: List[Any] = []
        for val in failed_df[self._column]:
            samples.append(val)

        if samples:
            details["sample_failures"] = samples

        return details

    def compile_predicate(self) -> Optional[Predicate]:
        ends_expr = pl.col(self._column).cast(pl.Utf8).str.ends_with(self._suffix)
        expr = (~ends_expr).fill_null(True)

        return Predicate(
            rule_id=self.rule_id,
            expr=expr,
            message=f"{self._column} does not end with '{self._suffix}'",
            columns={self._column},
        )

    def to_sql_spec(self) -> Optional[Dict[str, Any]]:
        """Generate SQL pushdown specification."""
        return {
            "kind": "ends_with",
            "rule_id": self.rule_id,
            "column": self._column,
            "suffix": self._suffix,
        }

    def to_sql_filter(self, dialect: str = "postgres") -> str | None:
        """Generate SQL filter for sampling failing rows."""
        col = f'"{self._column}"'

        # Escape LIKE special characters
        escaped = _escape_like_pattern(self._suffix)
        pattern = f"%{escaped}"

        # Failure = does NOT end with OR is NULL
        return f"{col} IS NULL OR {col} NOT LIKE '{pattern}' ESCAPE '\\'"
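And a matching sketch for the ends_with rule, showing both the Polars path and the LIKE-based sampling filter (again assuming direct construction; the data is invented):

```python
# Hypothetical usage sketch -- note that NULLs count as failures for this rule.
import polars as pl

from kontra.rules.builtin.ends_with import EndsWithRule

df = pl.DataFrame({"filename": ["report.csv", "image.png", None]})

rule = EndsWithRule("ends_with", {"column": "filename", "suffix": ".csv"})

res = rule.validate(df)
print(res["failed_count"])   # 2 -- "image.png" does not match, and NULL is a failure

# LIKE filter used when sampling failing rows in SQL:
print(rule.to_sql_filter())
# "filename" IS NULL OR "filename" NOT LIKE '%.csv' ESCAPE '\'
```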