kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Freshness Rule - validates that a timestamp column has recent data.
|
|
5
|
+
|
|
6
|
+
Usage in contract:
|
|
7
|
+
- name: freshness
|
|
8
|
+
params:
|
|
9
|
+
column: updated_at
|
|
10
|
+
max_age: "24h" # or "1d", "30m", "7d", etc.
|
|
11
|
+
|
|
12
|
+
Supported time units:
|
|
13
|
+
- s, sec, second(s): seconds
|
|
14
|
+
- m, min, minute(s): minutes
|
|
15
|
+
- h, hr, hour(s): hours
|
|
16
|
+
- d, day(s): days
|
|
17
|
+
- w, week(s): weeks
|
|
18
|
+
|
|
19
|
+
The rule passes if MAX(column) >= NOW() - max_age.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from datetime import datetime, timedelta, timezone
|
|
24
|
+
from typing import Any, Dict, Optional, Set
|
|
25
|
+
|
|
26
|
+
import polars as pl
|
|
27
|
+
|
|
28
|
+
from kontra.rules.base import BaseRule
|
|
29
|
+
from kontra.rules.predicates import Predicate
|
|
30
|
+
from kontra.rules.registry import register_rule
|
|
31
|
+
from kontra.state.types import FailureMode
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_duration(duration_str: str) -> timedelta:
    """
    Parse a human-readable duration string into a timedelta.

    Examples:
        "24h"    -> 24 hours
        "1d"     -> 1 day
        "30m"    -> 30 minutes
        "7 days" -> 7 days
        "2w"     -> 2 weeks
        "1h30m"  -> 1 hour 30 minutes
        "2d12h"  -> 2 days 12 hours

    Raises:
        ValueError: if the string contains no number+unit pairs, has
            leftover unmatched characters, or uses an unknown unit.
    """
    duration_str = duration_str.strip().lower()

    # Canonical timedelta keyword for every accepted unit spelling.
    aliases = {
        's': 'seconds', 'sec': 'seconds', 'second': 'seconds', 'seconds': 'seconds',
        'm': 'minutes', 'min': 'minutes', 'minute': 'minutes', 'minutes': 'minutes',
        'h': 'hours', 'hr': 'hours', 'hour': 'hours', 'hours': 'hours',
        'd': 'days', 'day': 'days', 'days': 'days',
        'w': 'weeks', 'week': 'weeks', 'weeks': 'weeks',
    }

    # Extract every number+unit segment, e.g. "1h30m" -> [("1", "h"), ("30", "m")].
    segments = re.findall(r'(\d+(?:\.\d+)?)\s*([a-z]+)', duration_str)
    if not segments:
        raise ValueError(f"Invalid duration format: '{duration_str}'. Expected format like '24h', '1d', '30m', '1h30m'")

    # Reject inputs with leftover characters the regex did not consume
    # (e.g. "1h30" or "1h!!"): rebuild the string from the segments and
    # compare against the whitespace-stripped original.
    rebuilt = ''.join(f'{amount}{unit}' for amount, unit in segments)
    if rebuilt != re.sub(r'\s+', '', duration_str):
        raise ValueError(f"Invalid duration format: '{duration_str}'. Expected format like '24h', '1d', '30m', '1h30m'")

    # Sum the contribution of each segment.
    result = timedelta()
    for amount, unit in segments:
        if unit not in aliases:
            raise ValueError(f"Unknown time unit: '{unit}'. Supported: s, m, h, d, w (or full names)")
        result += timedelta(**{aliases[unit]: float(amount)})

    return result
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@register_rule("freshness")
|
|
84
|
+
class FreshnessRule(BaseRule):
|
|
85
|
+
"""
|
|
86
|
+
Validates that a timestamp column contains recent data.
|
|
87
|
+
|
|
88
|
+
The rule checks if the maximum value in the timestamp column
|
|
89
|
+
is within the specified max_age from the current time.
|
|
90
|
+
|
|
91
|
+
Parameters:
|
|
92
|
+
column: The timestamp column to check
|
|
93
|
+
max_age: Maximum age allowed (e.g., "24h", "1d", "30m")
|
|
94
|
+
"""
|
|
95
|
+
|
|
96
|
+
def __init__(self, name: str, params: Dict[str, Any]):
|
|
97
|
+
super().__init__(name, params)
|
|
98
|
+
self._validate_params()
|
|
99
|
+
|
|
100
|
+
def _validate_params(self) -> None:
|
|
101
|
+
if "column" not in self.params:
|
|
102
|
+
raise ValueError("freshness rule requires 'column' parameter")
|
|
103
|
+
if "max_age" not in self.params:
|
|
104
|
+
raise ValueError("freshness rule requires 'max_age' parameter")
|
|
105
|
+
# Validate max_age is parseable
|
|
106
|
+
parse_duration(str(self.params["max_age"]))
|
|
107
|
+
|
|
108
|
+
def required_columns(self) -> Set[str]:
|
|
109
|
+
return {self.params["column"]}
|
|
110
|
+
|
|
111
|
+
def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
|
|
112
|
+
column = self.params["column"]
|
|
113
|
+
max_age = parse_duration(str(self.params["max_age"]))
|
|
114
|
+
|
|
115
|
+
# Check column exists before accessing
|
|
116
|
+
col_check = self._check_columns(df, {column})
|
|
117
|
+
if col_check is not None:
|
|
118
|
+
return col_check
|
|
119
|
+
|
|
120
|
+
# Check column dtype is datetime-compatible
|
|
121
|
+
col_dtype = df[column].dtype
|
|
122
|
+
datetime_types = (pl.Datetime, pl.Date)
|
|
123
|
+
is_datetime_type = isinstance(col_dtype, datetime_types) or col_dtype in datetime_types
|
|
124
|
+
|
|
125
|
+
if not is_datetime_type:
|
|
126
|
+
# Check if it's a string that might be parseable
|
|
127
|
+
if col_dtype not in (pl.Utf8, getattr(pl, "String", pl.Utf8)):
|
|
128
|
+
return {
|
|
129
|
+
"rule_id": self.rule_id,
|
|
130
|
+
"passed": False,
|
|
131
|
+
"failed_count": df.height,
|
|
132
|
+
"message": f"Column '{column}' must be a datetime type for freshness check (found {col_dtype})",
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
# Get the maximum timestamp
|
|
136
|
+
max_ts = df[column].max()
|
|
137
|
+
|
|
138
|
+
if max_ts is None:
|
|
139
|
+
return {
|
|
140
|
+
"rule_id": self.rule_id,
|
|
141
|
+
"passed": False,
|
|
142
|
+
"failed_count": df.height,
|
|
143
|
+
"message": f"Column '{column}' has no non-null timestamps",
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
# Convert to datetime if needed
|
|
147
|
+
if isinstance(max_ts, datetime):
|
|
148
|
+
max_datetime = max_ts
|
|
149
|
+
else:
|
|
150
|
+
# Try to handle various timestamp types (including strings)
|
|
151
|
+
try:
|
|
152
|
+
max_datetime = datetime.fromisoformat(str(max_ts).replace('Z', '+00:00'))
|
|
153
|
+
except (ValueError, AttributeError):
|
|
154
|
+
return {
|
|
155
|
+
"rule_id": self.rule_id,
|
|
156
|
+
"passed": False,
|
|
157
|
+
"failed_count": df.height,
|
|
158
|
+
"message": f"Column '{column}' contains values that cannot be parsed as datetime (got: {type(max_ts).__name__})",
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
# Get current time (use UTC for consistency)
|
|
162
|
+
now = datetime.now(timezone.utc)
|
|
163
|
+
|
|
164
|
+
# Make max_datetime timezone-aware if it isn't
|
|
165
|
+
if hasattr(max_datetime, 'tzinfo') and max_datetime.tzinfo is None:
|
|
166
|
+
max_datetime = max_datetime.replace(tzinfo=timezone.utc)
|
|
167
|
+
|
|
168
|
+
threshold = now - max_age
|
|
169
|
+
|
|
170
|
+
# Check if the most recent data is fresh enough
|
|
171
|
+
is_fresh = max_datetime >= threshold
|
|
172
|
+
|
|
173
|
+
if is_fresh:
|
|
174
|
+
return {
|
|
175
|
+
"rule_id": self.rule_id,
|
|
176
|
+
"passed": True,
|
|
177
|
+
"failed_count": 0,
|
|
178
|
+
"message": "Passed",
|
|
179
|
+
}
|
|
180
|
+
else:
|
|
181
|
+
age = now - max_datetime
|
|
182
|
+
return {
|
|
183
|
+
"rule_id": self.rule_id,
|
|
184
|
+
"passed": False,
|
|
185
|
+
"failed_count": 1,
|
|
186
|
+
"message": f"Data is stale: most recent record is {_format_timedelta(age)} old (max allowed: {self.params['max_age']})",
|
|
187
|
+
"failure_mode": str(FailureMode.FRESHNESS_LAG),
|
|
188
|
+
"details": {
|
|
189
|
+
"latest_timestamp": max_datetime.isoformat(),
|
|
190
|
+
"threshold_timestamp": threshold.isoformat(),
|
|
191
|
+
"actual_age_seconds": int(age.total_seconds()),
|
|
192
|
+
"max_age_seconds": int(max_age.total_seconds()),
|
|
193
|
+
"max_age_spec": str(self.params["max_age"]),
|
|
194
|
+
},
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
def compile_predicate(self) -> Optional[Predicate]:
|
|
198
|
+
# Freshness is an aggregate check (MAX), not row-level
|
|
199
|
+
# Cannot be vectorized as a per-row predicate
|
|
200
|
+
return None
|
|
201
|
+
|
|
202
|
+
def to_sql_spec(self) -> Optional[Dict[str, Any]]:
|
|
203
|
+
"""Generate SQL spec for pushdown execution."""
|
|
204
|
+
column = self.params.get("column")
|
|
205
|
+
max_age = self.params.get("max_age")
|
|
206
|
+
|
|
207
|
+
if not (column and max_age):
|
|
208
|
+
return None
|
|
209
|
+
|
|
210
|
+
try:
|
|
211
|
+
td = parse_duration(str(max_age))
|
|
212
|
+
# Convert to total seconds for SQL
|
|
213
|
+
total_seconds = int(td.total_seconds())
|
|
214
|
+
except ValueError:
|
|
215
|
+
return None
|
|
216
|
+
|
|
217
|
+
return {
|
|
218
|
+
"kind": "freshness",
|
|
219
|
+
"rule_id": self.rule_id,
|
|
220
|
+
"column": column,
|
|
221
|
+
"max_age_seconds": total_seconds,
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _format_timedelta(td: timedelta) -> str:
|
|
226
|
+
"""Format a timedelta in human-readable form."""
|
|
227
|
+
total_seconds = int(td.total_seconds())
|
|
228
|
+
|
|
229
|
+
if total_seconds < 60:
|
|
230
|
+
return f"{total_seconds}s"
|
|
231
|
+
elif total_seconds < 3600:
|
|
232
|
+
return f"{total_seconds // 60}m"
|
|
233
|
+
elif total_seconds < 86400:
|
|
234
|
+
hours = total_seconds // 3600
|
|
235
|
+
mins = (total_seconds % 3600) // 60
|
|
236
|
+
return f"{hours}h {mins}m" if mins else f"{hours}h"
|
|
237
|
+
else:
|
|
238
|
+
days = total_seconds // 86400
|
|
239
|
+
hours = (total_seconds % 86400) // 3600
|
|
240
|
+
return f"{days}d {hours}h" if hours else f"{days}d"
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# src/kontra/rules/builtin/length.py
|
|
2
|
+
"""
|
|
3
|
+
Length rule - Column string length must be within specified bounds.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
- name: length
|
|
7
|
+
params:
|
|
8
|
+
column: username
|
|
9
|
+
min: 3
|
|
10
|
+
max: 50
|
|
11
|
+
|
|
12
|
+
Fails when:
|
|
13
|
+
- String length < min (if min specified)
|
|
14
|
+
- String length > max (if max specified)
|
|
15
|
+
- Value is NULL (can't measure length of NULL)
|
|
16
|
+
|
|
17
|
+
At least one of `min` or `max` must be specified.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
22
|
+
|
|
23
|
+
import polars as pl
|
|
24
|
+
|
|
25
|
+
from kontra.rules.base import BaseRule
|
|
26
|
+
from kontra.rules.registry import register_rule
|
|
27
|
+
from kontra.rules.predicates import Predicate
|
|
28
|
+
from kontra.state.types import FailureMode
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@register_rule("length")
|
|
32
|
+
class LengthRule(BaseRule):
|
|
33
|
+
"""
|
|
34
|
+
Fails where string length is outside [min, max] bounds.
|
|
35
|
+
|
|
36
|
+
params:
|
|
37
|
+
- column: str (required) - Column to check
|
|
38
|
+
- min: int (optional) - Minimum length (inclusive)
|
|
39
|
+
- max: int (optional) - Maximum length (inclusive)
|
|
40
|
+
|
|
41
|
+
At least one of min or max must be provided.
|
|
42
|
+
|
|
43
|
+
NULL handling:
|
|
44
|
+
- NULL values are failures (can't measure length of NULL)
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(self, name: str, params: Dict[str, Any]):
|
|
48
|
+
super().__init__(name, params)
|
|
49
|
+
self._column = self._get_required_param("column", str)
|
|
50
|
+
self._min_len: Optional[int] = params.get("min")
|
|
51
|
+
self._max_len: Optional[int] = params.get("max")
|
|
52
|
+
|
|
53
|
+
# Validate at least one bound is provided
|
|
54
|
+
if self._min_len is None and self._max_len is None:
|
|
55
|
+
raise ValueError(
|
|
56
|
+
f"Rule 'length' requires at least one of 'min' or 'max' parameters"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Validate min <= max if both provided
|
|
60
|
+
if self._min_len is not None and self._max_len is not None:
|
|
61
|
+
if self._min_len > self._max_len:
|
|
62
|
+
raise ValueError(
|
|
63
|
+
f"Rule 'length' min ({self._min_len}) must be <= max ({self._max_len})"
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
# Validate non-negative
|
|
67
|
+
if self._min_len is not None and self._min_len < 0:
|
|
68
|
+
raise ValueError(f"Rule 'length' min must be non-negative, got {self._min_len}")
|
|
69
|
+
if self._max_len is not None and self._max_len < 0:
|
|
70
|
+
raise ValueError(f"Rule 'length' max must be non-negative, got {self._max_len}")
|
|
71
|
+
|
|
72
|
+
def required_columns(self) -> Set[str]:
|
|
73
|
+
return {self._column}
|
|
74
|
+
|
|
75
|
+
def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
|
|
76
|
+
# Check column exists before accessing
|
|
77
|
+
col_check = self._check_columns(df, {self._column})
|
|
78
|
+
if col_check is not None:
|
|
79
|
+
return col_check
|
|
80
|
+
|
|
81
|
+
# Get string length (cast to string first to handle non-string columns)
|
|
82
|
+
length_col = df[self._column].cast(pl.Utf8).str.len_chars()
|
|
83
|
+
|
|
84
|
+
# Build mask: True = failure
|
|
85
|
+
mask = df[self._column].is_null() # NULL is failure
|
|
86
|
+
|
|
87
|
+
if self._min_len is not None:
|
|
88
|
+
mask = mask | (length_col < self._min_len)
|
|
89
|
+
if self._max_len is not None:
|
|
90
|
+
mask = mask | (length_col > self._max_len)
|
|
91
|
+
|
|
92
|
+
# Build message
|
|
93
|
+
if self._min_len is not None and self._max_len is not None:
|
|
94
|
+
msg = f"{self._column} length not in range [{self._min_len}, {self._max_len}]"
|
|
95
|
+
elif self._min_len is not None:
|
|
96
|
+
msg = f"{self._column} length < {self._min_len}"
|
|
97
|
+
else:
|
|
98
|
+
msg = f"{self._column} length > {self._max_len}"
|
|
99
|
+
|
|
100
|
+
res = super()._failures(df, mask, msg)
|
|
101
|
+
res["rule_id"] = self.rule_id
|
|
102
|
+
|
|
103
|
+
if res["failed_count"] > 0:
|
|
104
|
+
res["failure_mode"] = str(FailureMode.RANGE_VIOLATION)
|
|
105
|
+
res["details"] = self._explain_failure(df, length_col, mask)
|
|
106
|
+
|
|
107
|
+
return res
|
|
108
|
+
|
|
109
|
+
def _explain_failure(
|
|
110
|
+
self, df: pl.DataFrame, length_col: pl.Series, mask: pl.Series
|
|
111
|
+
) -> Dict[str, Any]:
|
|
112
|
+
"""Generate detailed failure explanation."""
|
|
113
|
+
details: Dict[str, Any] = {
|
|
114
|
+
"column": self._column,
|
|
115
|
+
}
|
|
116
|
+
if self._min_len is not None:
|
|
117
|
+
details["min_length"] = self._min_len
|
|
118
|
+
if self._max_len is not None:
|
|
119
|
+
details["max_length"] = self._max_len
|
|
120
|
+
|
|
121
|
+
# Sample failing values with their lengths
|
|
122
|
+
failed_df = df.filter(mask).with_columns(
|
|
123
|
+
length_col.filter(mask).alias("_length")
|
|
124
|
+
).head(5)
|
|
125
|
+
|
|
126
|
+
samples: List[Dict[str, Any]] = []
|
|
127
|
+
for row in failed_df.iter_rows(named=True):
|
|
128
|
+
val = row[self._column]
|
|
129
|
+
length = row.get("_length")
|
|
130
|
+
samples.append({
|
|
131
|
+
"value": val,
|
|
132
|
+
"length": length,
|
|
133
|
+
})
|
|
134
|
+
|
|
135
|
+
if samples:
|
|
136
|
+
details["sample_failures"] = samples
|
|
137
|
+
|
|
138
|
+
return details
|
|
139
|
+
|
|
140
|
+
def compile_predicate(self) -> Optional[Predicate]:
|
|
141
|
+
# Get string length
|
|
142
|
+
length_expr = pl.col(self._column).cast(pl.Utf8).str.len_chars()
|
|
143
|
+
|
|
144
|
+
# Build mask: True = failure
|
|
145
|
+
expr = pl.col(self._column).is_null()
|
|
146
|
+
|
|
147
|
+
if self._min_len is not None:
|
|
148
|
+
expr = expr | (length_expr < self._min_len)
|
|
149
|
+
if self._max_len is not None:
|
|
150
|
+
expr = expr | (length_expr > self._max_len)
|
|
151
|
+
|
|
152
|
+
# Build message
|
|
153
|
+
if self._min_len is not None and self._max_len is not None:
|
|
154
|
+
msg = f"{self._column} length not in range [{self._min_len}, {self._max_len}]"
|
|
155
|
+
elif self._min_len is not None:
|
|
156
|
+
msg = f"{self._column} length < {self._min_len}"
|
|
157
|
+
else:
|
|
158
|
+
msg = f"{self._column} length > {self._max_len}"
|
|
159
|
+
|
|
160
|
+
return Predicate(
|
|
161
|
+
rule_id=self.rule_id,
|
|
162
|
+
expr=expr,
|
|
163
|
+
message=msg,
|
|
164
|
+
columns={self._column},
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
def to_sql_spec(self) -> Optional[Dict[str, Any]]:
|
|
168
|
+
"""Generate SQL pushdown specification."""
|
|
169
|
+
return {
|
|
170
|
+
"kind": "length",
|
|
171
|
+
"rule_id": self.rule_id,
|
|
172
|
+
"column": self._column,
|
|
173
|
+
"min": self._min_len,
|
|
174
|
+
"max": self._max_len,
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
def to_sql_filter(self, dialect: str = "postgres") -> str | None:
|
|
178
|
+
"""Generate SQL filter for sampling failing rows."""
|
|
179
|
+
col = f'"{self._column}"'
|
|
180
|
+
|
|
181
|
+
# SQL Server uses LEN(), others use LENGTH()
|
|
182
|
+
if dialect in ("mssql", "sqlserver"):
|
|
183
|
+
len_func = f"LEN({col})"
|
|
184
|
+
else:
|
|
185
|
+
len_func = f"LENGTH({col})"
|
|
186
|
+
|
|
187
|
+
conditions = [f"{col} IS NULL"]
|
|
188
|
+
if self._min_len is not None:
|
|
189
|
+
conditions.append(f"{len_func} < {self._min_len}")
|
|
190
|
+
if self._max_len is not None:
|
|
191
|
+
conditions.append(f"{len_func} > {self._max_len}")
|
|
192
|
+
|
|
193
|
+
return " OR ".join(conditions)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Dict, Any, Optional
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from kontra.rules.base import BaseRule
|
|
6
|
+
from kontra.rules.registry import register_rule
|
|
7
|
+
from kontra.state.types import FailureMode
|
|
8
|
+
|
|
9
|
+
@register_rule("max_rows")
|
|
10
|
+
class MaxRowsRule(BaseRule):
|
|
11
|
+
def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
|
|
12
|
+
# Accept both 'value' and 'threshold' for backwards compatibility
|
|
13
|
+
max_count = int(self.params.get("value", self.params.get("threshold", 0)))
|
|
14
|
+
h = int(df.height)
|
|
15
|
+
passed = h <= max_count
|
|
16
|
+
|
|
17
|
+
result: Dict[str, Any] = {
|
|
18
|
+
"rule_id": self.rule_id,
|
|
19
|
+
"passed": passed,
|
|
20
|
+
"failed_count": 0 if passed else (h - max_count),
|
|
21
|
+
"message": f"Dataset has {h} rows, exceeds max {max_count}",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
if not passed:
|
|
25
|
+
result["failure_mode"] = str(FailureMode.ROW_COUNT_HIGH)
|
|
26
|
+
result["details"] = {
|
|
27
|
+
"actual_rows": h,
|
|
28
|
+
"maximum_allowed": max_count,
|
|
29
|
+
"excess": h - max_count,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
return result
|
|
33
|
+
|
|
34
|
+
def compile_predicate(self):
|
|
35
|
+
return None # dataset-level scalar check
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Dict, Any, Optional
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from kontra.rules.base import BaseRule
|
|
6
|
+
from kontra.rules.registry import register_rule
|
|
7
|
+
from kontra.state.types import FailureMode
|
|
8
|
+
|
|
9
|
+
@register_rule("min_rows")
|
|
10
|
+
class MinRowsRule(BaseRule):
|
|
11
|
+
def __init__(self, *args, **kwargs):
|
|
12
|
+
super().__init__(*args, **kwargs)
|
|
13
|
+
# Validate threshold at construction time
|
|
14
|
+
threshold = self.params.get("value", self.params.get("threshold", 0))
|
|
15
|
+
if threshold is not None and int(threshold) < 0:
|
|
16
|
+
from kontra.errors import RuleParameterError
|
|
17
|
+
raise RuleParameterError(
|
|
18
|
+
"min_rows", "threshold",
|
|
19
|
+
f"must be non-negative, got {threshold}"
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
|
|
23
|
+
# Accept both 'value' and 'threshold' for backwards compatibility
|
|
24
|
+
min_count = int(self.params.get("value", self.params.get("threshold", 0)))
|
|
25
|
+
h = int(df.height)
|
|
26
|
+
passed = h >= min_count
|
|
27
|
+
|
|
28
|
+
result: Dict[str, Any] = {
|
|
29
|
+
"rule_id": self.rule_id,
|
|
30
|
+
"passed": passed,
|
|
31
|
+
"failed_count": 0 if passed else (min_count - h),
|
|
32
|
+
"message": f"Dataset has {h} rows, requires at least {min_count}",
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
if not passed:
|
|
36
|
+
result["failure_mode"] = str(FailureMode.ROW_COUNT_LOW)
|
|
37
|
+
result["details"] = {
|
|
38
|
+
"actual_rows": h,
|
|
39
|
+
"minimum_required": min_count,
|
|
40
|
+
"shortfall": min_count - h,
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return result
|
|
44
|
+
|
|
45
|
+
def compile_predicate(self):
|
|
46
|
+
return None # dataset-level scalar check
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from kontra.rules.base import BaseRule
|
|
6
|
+
from kontra.rules.registry import register_rule
|
|
7
|
+
from kontra.rules.predicates import Predicate
|
|
8
|
+
from kontra.state.types import FailureMode
|
|
9
|
+
|
|
10
|
+
@register_rule("not_null")
|
|
11
|
+
class NotNullRule(BaseRule):
|
|
12
|
+
"""
|
|
13
|
+
Fails where column contains NULL values.
|
|
14
|
+
|
|
15
|
+
params:
|
|
16
|
+
- column: str (required) - Column to check
|
|
17
|
+
- include_nan: bool (optional, default: False) - Also treat NaN as null
|
|
18
|
+
|
|
19
|
+
Note: By default, NaN values are NOT considered null (Polars behavior).
|
|
20
|
+
Set include_nan=True to catch both NULL and NaN values.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(self, name: str, params: Dict[str, Any]):
|
|
24
|
+
super().__init__(name, params)
|
|
25
|
+
# Validate required parameter at construction time
|
|
26
|
+
self._get_required_param("column", str)
|
|
27
|
+
|
|
28
|
+
def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
|
|
29
|
+
column = self.params["column"]
|
|
30
|
+
include_nan = self.params.get("include_nan", False)
|
|
31
|
+
|
|
32
|
+
# Check column exists before accessing
|
|
33
|
+
col_check = self._check_columns(df, {column})
|
|
34
|
+
if col_check is not None:
|
|
35
|
+
return col_check
|
|
36
|
+
|
|
37
|
+
# Build mask for null (and optionally NaN) values
|
|
38
|
+
mask = df[column].is_null()
|
|
39
|
+
if include_nan:
|
|
40
|
+
# For numeric columns, also check for NaN
|
|
41
|
+
col = df[column]
|
|
42
|
+
if col.dtype.is_float():
|
|
43
|
+
mask = mask | col.is_nan()
|
|
44
|
+
|
|
45
|
+
message = f"{column} contains null values"
|
|
46
|
+
if include_nan:
|
|
47
|
+
message = f"{column} contains null or NaN values"
|
|
48
|
+
|
|
49
|
+
res = super()._failures(df, mask, message)
|
|
50
|
+
res["rule_id"] = self.rule_id
|
|
51
|
+
|
|
52
|
+
# Add failure details
|
|
53
|
+
if res["failed_count"] > 0:
|
|
54
|
+
res["failure_mode"] = str(FailureMode.NULL_VALUES)
|
|
55
|
+
res["details"] = self._explain_failure(df, column, res["failed_count"], include_nan)
|
|
56
|
+
|
|
57
|
+
return res
|
|
58
|
+
|
|
59
|
+
def _explain_failure(
|
|
60
|
+
self, df: pl.DataFrame, column: str, null_count: int, include_nan: bool = False
|
|
61
|
+
) -> Dict[str, Any]:
|
|
62
|
+
"""Generate detailed failure explanation."""
|
|
63
|
+
total_rows = df.height
|
|
64
|
+
null_rate = null_count / total_rows if total_rows > 0 else 0
|
|
65
|
+
|
|
66
|
+
details: Dict[str, Any] = {
|
|
67
|
+
"null_count": null_count,
|
|
68
|
+
"null_rate": round(null_rate, 4),
|
|
69
|
+
"total_rows": total_rows,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
if include_nan:
|
|
73
|
+
details["includes_nan"] = True
|
|
74
|
+
|
|
75
|
+
# Find sample row positions with nulls (first 5)
|
|
76
|
+
if null_count > 0 and null_count <= 1000:
|
|
77
|
+
null_positions: List[int] = []
|
|
78
|
+
col = df[column]
|
|
79
|
+
for i, val in enumerate(col):
|
|
80
|
+
if val is None:
|
|
81
|
+
null_positions.append(i)
|
|
82
|
+
if len(null_positions) >= 5:
|
|
83
|
+
break
|
|
84
|
+
if null_positions:
|
|
85
|
+
details["sample_positions"] = null_positions
|
|
86
|
+
|
|
87
|
+
return details
|
|
88
|
+
|
|
89
|
+
def compile_predicate(self) -> Optional[Predicate]:
|
|
90
|
+
column = self.params["column"]
|
|
91
|
+
include_nan = self.params.get("include_nan", False)
|
|
92
|
+
|
|
93
|
+
expr = pl.col(column).is_null()
|
|
94
|
+
message = f"{column} contains null values"
|
|
95
|
+
|
|
96
|
+
if include_nan:
|
|
97
|
+
# Note: is_nan() only works on float columns, but compile_predicate
|
|
98
|
+
# doesn't have access to the DataFrame schema. The expression will
|
|
99
|
+
# be evaluated at runtime where Polars handles type checking.
|
|
100
|
+
expr = expr | pl.col(column).is_nan()
|
|
101
|
+
message = f"{column} contains null or NaN values"
|
|
102
|
+
|
|
103
|
+
return Predicate(
|
|
104
|
+
rule_id=self.rule_id,
|
|
105
|
+
expr=expr,
|
|
106
|
+
message=message,
|
|
107
|
+
columns={column},
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
def to_sql_filter(self, dialect: str = "postgres") -> str | None:
|
|
111
|
+
column = self.params["column"]
|
|
112
|
+
include_nan = self.params.get("include_nan", False)
|
|
113
|
+
|
|
114
|
+
# Quote column name for safety
|
|
115
|
+
col = f'"{column}"'
|
|
116
|
+
|
|
117
|
+
if include_nan:
|
|
118
|
+
# NaN check: value != value is true for NaN
|
|
119
|
+
return f"{col} IS NULL OR {col} != {col}"
|
|
120
|
+
else:
|
|
121
|
+
return f"{col} IS NULL"
|