kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,240 @@
1
+ from __future__ import annotations
2
+
3
+ """
4
+ Freshness Rule - validates that a timestamp column has recent data.
5
+
6
+ Usage in contract:
7
+ - name: freshness
8
+ params:
9
+ column: updated_at
10
+ max_age: "24h" # or "1d", "30m", "7d", etc.
11
+
12
+ Supported time units:
13
+ - s, sec, second(s): seconds
14
+ - m, min, minute(s): minutes
15
+ - h, hr, hour(s): hours
16
+ - d, day(s): days
17
+ - w, week(s): weeks
18
+
19
+ The rule passes if MAX(column) >= NOW() - max_age.
20
+ """
21
+
22
+ import re
23
+ from datetime import datetime, timedelta, timezone
24
+ from typing import Any, Dict, Optional, Set
25
+
26
+ import polars as pl
27
+
28
+ from kontra.rules.base import BaseRule
29
+ from kontra.rules.predicates import Predicate
30
+ from kontra.rules.registry import register_rule
31
+ from kontra.state.types import FailureMode
32
+
33
+
34
def parse_duration(duration_str: str) -> timedelta:
    """Convert a human-readable duration spec into a ``timedelta``.

    Accepts one or more number/unit pairs, optionally space-separated,
    e.g. ``"24h"``, ``"1d"``, ``"30m"``, ``"7 days"``, ``"2w"``, or
    compound forms like ``"1h30m"`` / ``"2d12h"``. Fractional values
    such as ``"1.5h"`` are accepted.

    Raises:
        ValueError: if the string is not a sequence of number/unit pairs,
            or if any unit is not one of s/m/h/d/w (or their full names).
    """
    spec = duration_str.strip().lower()

    # Canonical timedelta keyword for every accepted unit spelling.
    aliases = {
        's': 'seconds', 'sec': 'seconds', 'second': 'seconds', 'seconds': 'seconds',
        'm': 'minutes', 'min': 'minutes', 'minute': 'minutes', 'minutes': 'minutes',
        'h': 'hours', 'hr': 'hours', 'hour': 'hours', 'hours': 'hours',
        'd': 'days', 'day': 'days', 'days': 'days',
        'w': 'weeks', 'week': 'weeks', 'weeks': 'weeks',
    }

    # Extract every number+unit pair, e.g. "1h30m" -> [("1","h"), ("30","m")].
    pairs = re.findall(r'(\d+(?:\.\d+)?)\s*([a-z]+)', spec)
    if not pairs:
        raise ValueError(f"Invalid duration format: '{spec}'. Expected format like '24h', '1d', '30m', '1h30m'")

    # Reject inputs with leftover characters: the matched pairs, re-joined,
    # must account for the entire whitespace-stripped input.
    if ''.join(num + unit for num, unit in pairs) != re.sub(r'\s+', '', spec):
        raise ValueError(f"Invalid duration format: '{spec}'. Expected format like '24h', '1d', '30m', '1h30m'")

    # Sum each pair into the running total.
    result = timedelta()
    for num, unit in pairs:
        if unit not in aliases:
            raise ValueError(f"Unknown time unit: '{unit}'. Supported: s, m, h, d, w (or full names)")
        result += timedelta(**{aliases[unit]: float(num)})

    return result
81
+
82
+
83
@register_rule("freshness")
class FreshnessRule(BaseRule):
    """
    Validates that a timestamp column contains recent data.

    The rule checks if the maximum value in the timestamp column
    is within the specified max_age from the current time.

    This is a dataset-level (aggregate) check: it compares MAX(column)
    against NOW() - max_age rather than testing individual rows.

    Parameters:
        column: The timestamp column to check
        max_age: Maximum age allowed (e.g., "24h", "1d", "30m")
    """

    def __init__(self, name: str, params: Dict[str, Any]):
        super().__init__(name, params)
        self._validate_params()

    def _validate_params(self) -> None:
        """Raise ValueError if 'column'/'max_age' are missing or max_age cannot be parsed."""
        if "column" not in self.params:
            raise ValueError("freshness rule requires 'column' parameter")
        if "max_age" not in self.params:
            raise ValueError("freshness rule requires 'max_age' parameter")
        # Validate max_age is parseable; parse_duration raises ValueError on
        # bad specs, so misconfigured contracts fail at construction time.
        parse_duration(str(self.params["max_age"]))

    def required_columns(self) -> Set[str]:
        """Return the single timestamp column this rule reads."""
        return {self.params["column"]}

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Check that the newest timestamp in the column is within max_age of now (UTC).

        Returns a result dict with rule_id/passed/failed_count/message, plus
        failure_mode and timing details when the data is stale.
        """
        column = self.params["column"]
        max_age = parse_duration(str(self.params["max_age"]))

        # Check column exists before accessing
        col_check = self._check_columns(df, {column})
        if col_check is not None:
            return col_check

        # Check column dtype is datetime-compatible.
        # isinstance() covers parameterized dtype instances (e.g. Datetime
        # with a time unit/timezone); the `in` test covers bare dtype classes.
        col_dtype = df[column].dtype
        datetime_types = (pl.Datetime, pl.Date)
        is_datetime_type = isinstance(col_dtype, datetime_types) or col_dtype in datetime_types

        if not is_datetime_type:
            # Strings are allowed through: they may hold ISO-formatted
            # timestamps that fromisoformat() below can parse.
            # (pl.String exists on newer Polars; fall back to Utf8.)
            if col_dtype not in (pl.Utf8, getattr(pl, "String", pl.Utf8)):
                return {
                    "rule_id": self.rule_id,
                    "passed": False,
                    "failed_count": df.height,
                    "message": f"Column '{column}' must be a datetime type for freshness check (found {col_dtype})",
                }

        # Get the maximum timestamp. Polars max() ignores nulls, so None
        # here means the column is empty or entirely null.
        max_ts = df[column].max()

        if max_ts is None:
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": df.height,
                "message": f"Column '{column}' has no non-null timestamps",
            }

        # Convert to datetime if needed
        if isinstance(max_ts, datetime):
            max_datetime = max_ts
        else:
            # Handle non-datetime values (ISO strings, and datetime.date from
            # pl.Date columns, whose str() form is ISO 8601). The 'Z' suffix
            # is rewritten because fromisoformat() on older Pythons rejects it.
            try:
                max_datetime = datetime.fromisoformat(str(max_ts).replace('Z', '+00:00'))
            except (ValueError, AttributeError):
                return {
                    "rule_id": self.rule_id,
                    "passed": False,
                    "failed_count": df.height,
                    "message": f"Column '{column}' contains values that cannot be parsed as datetime (got: {type(max_ts).__name__})",
                }

        # Get current time (use UTC for consistency)
        now = datetime.now(timezone.utc)

        # Make max_datetime timezone-aware if it isn't: naive timestamps are
        # treated as UTC so the comparison below never mixes naive and aware.
        if hasattr(max_datetime, 'tzinfo') and max_datetime.tzinfo is None:
            max_datetime = max_datetime.replace(tzinfo=timezone.utc)

        threshold = now - max_age

        # Check if the most recent data is fresh enough
        is_fresh = max_datetime >= threshold

        if is_fresh:
            return {
                "rule_id": self.rule_id,
                "passed": True,
                "failed_count": 0,
                "message": "Passed",
            }
        else:
            age = now - max_datetime
            # failed_count is 1 by design: staleness is a dataset-level
            # failure, not a count of stale rows.
            return {
                "rule_id": self.rule_id,
                "passed": False,
                "failed_count": 1,
                "message": f"Data is stale: most recent record is {_format_timedelta(age)} old (max allowed: {self.params['max_age']})",
                "failure_mode": str(FailureMode.FRESHNESS_LAG),
                "details": {
                    "latest_timestamp": max_datetime.isoformat(),
                    "threshold_timestamp": threshold.isoformat(),
                    "actual_age_seconds": int(age.total_seconds()),
                    "max_age_seconds": int(max_age.total_seconds()),
                    "max_age_spec": str(self.params["max_age"]),
                },
            }

    def compile_predicate(self) -> Optional[Predicate]:
        # Freshness is an aggregate check (MAX), not row-level
        # Cannot be vectorized as a per-row predicate
        return None

    def to_sql_spec(self) -> Optional[Dict[str, Any]]:
        """Generate SQL spec for pushdown execution.

        Returns None (no pushdown) when params are incomplete or max_age
        is unparseable; validate() will surface the error instead.
        """
        column = self.params.get("column")
        max_age = self.params.get("max_age")

        if not (column and max_age):
            return None

        try:
            td = parse_duration(str(max_age))
            # Convert to total seconds for SQL
            total_seconds = int(td.total_seconds())
        except ValueError:
            return None

        return {
            "kind": "freshness",
            "rule_id": self.rule_id,
            "column": column,
            "max_age_seconds": total_seconds,
        }
223
+
224
+
225
+ def _format_timedelta(td: timedelta) -> str:
226
+ """Format a timedelta in human-readable form."""
227
+ total_seconds = int(td.total_seconds())
228
+
229
+ if total_seconds < 60:
230
+ return f"{total_seconds}s"
231
+ elif total_seconds < 3600:
232
+ return f"{total_seconds // 60}m"
233
+ elif total_seconds < 86400:
234
+ hours = total_seconds // 3600
235
+ mins = (total_seconds % 3600) // 60
236
+ return f"{hours}h {mins}m" if mins else f"{hours}h"
237
+ else:
238
+ days = total_seconds // 86400
239
+ hours = (total_seconds % 86400) // 3600
240
+ return f"{days}d {hours}h" if hours else f"{days}d"
@@ -0,0 +1,193 @@
1
+ # src/kontra/rules/builtin/length.py
2
+ """
3
+ Length rule - Column string length must be within specified bounds.
4
+
5
+ Usage:
6
+ - name: length
7
+ params:
8
+ column: username
9
+ min: 3
10
+ max: 50
11
+
12
+ Fails when:
13
+ - String length < min (if min specified)
14
+ - String length > max (if max specified)
15
+ - Value is NULL (can't measure length of NULL)
16
+
17
+ At least one of `min` or `max` must be specified.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ from typing import Any, Dict, List, Optional, Set, Union
22
+
23
+ import polars as pl
24
+
25
+ from kontra.rules.base import BaseRule
26
+ from kontra.rules.registry import register_rule
27
+ from kontra.rules.predicates import Predicate
28
+ from kontra.state.types import FailureMode
29
+
30
+
31
@register_rule("length")
class LengthRule(BaseRule):
    """
    Fails where string length is outside [min, max] bounds.

    params:
      - column: str (required) - Column to check
      - min: int (optional) - Minimum length (inclusive)
      - max: int (optional) - Maximum length (inclusive)

    At least one of min or max must be provided.

    NULL handling:
      - NULL values are failures (can't measure length of NULL)
    """

    def __init__(self, name: str, params: Dict[str, Any]):
        super().__init__(name, params)
        self._column = self._get_required_param("column", str)
        self._min_len: Optional[int] = params.get("min")
        self._max_len: Optional[int] = params.get("max")

        # A length rule with no bounds would be vacuous - reject it.
        if self._min_len is None and self._max_len is None:
            raise ValueError(
                "Rule 'length' requires at least one of 'min' or 'max' parameters"
            )

        # Reject inverted bounds when both are given.
        if self._min_len is not None and self._max_len is not None and self._min_len > self._max_len:
            raise ValueError(
                f"Rule 'length' min ({self._min_len}) must be <= max ({self._max_len})"
            )

        # Lengths are never negative, so negative bounds are a config error.
        if self._min_len is not None and self._min_len < 0:
            raise ValueError(f"Rule 'length' min must be non-negative, got {self._min_len}")
        if self._max_len is not None and self._max_len < 0:
            raise ValueError(f"Rule 'length' max must be non-negative, got {self._max_len}")

    def required_columns(self) -> Set[str]:
        return {self._column}

    def _bounds_message(self) -> str:
        """Human-readable description of the violated length constraint."""
        if self._min_len is not None and self._max_len is not None:
            return f"{self._column} length not in range [{self._min_len}, {self._max_len}]"
        if self._min_len is not None:
            return f"{self._column} length < {self._min_len}"
        return f"{self._column} length > {self._max_len}"

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Flag rows whose string length violates the configured bounds (NULL fails)."""
        missing = self._check_columns(df, {self._column})
        if missing is not None:
            return missing

        # Measure length in characters; cast first so non-string columns work.
        lengths = df[self._column].cast(pl.Utf8).str.len_chars()

        # True = failing row. NULLs always fail.
        failing = df[self._column].is_null()
        if self._min_len is not None:
            failing = failing | (lengths < self._min_len)
        if self._max_len is not None:
            failing = failing | (lengths > self._max_len)

        res = super()._failures(df, failing, self._bounds_message())
        res["rule_id"] = self.rule_id

        if res["failed_count"] > 0:
            res["failure_mode"] = str(FailureMode.RANGE_VIOLATION)
            res["details"] = self._explain_failure(df, lengths, failing)

        return res

    def _explain_failure(
        self, df: pl.DataFrame, length_col: pl.Series, mask: pl.Series
    ) -> Dict[str, Any]:
        """Build the details payload: configured bounds plus up to 5 sample offenders."""
        details: Dict[str, Any] = {
            "column": self._column,
        }
        if self._min_len is not None:
            details["min_length"] = self._min_len
        if self._max_len is not None:
            details["max_length"] = self._max_len

        # Attach each failing value alongside its measured length.
        offenders = (
            df.filter(mask)
            .with_columns(length_col.filter(mask).alias("_length"))
            .head(5)
        )
        samples: List[Dict[str, Any]] = [
            {"value": row[self._column], "length": row.get("_length")}
            for row in offenders.iter_rows(named=True)
        ]
        if samples:
            details["sample_failures"] = samples

        return details

    def compile_predicate(self) -> Optional[Predicate]:
        """Vectorized form of validate(): an expression that is True on failing rows."""
        lengths = pl.col(self._column).cast(pl.Utf8).str.len_chars()

        failing = pl.col(self._column).is_null()
        if self._min_len is not None:
            failing = failing | (lengths < self._min_len)
        if self._max_len is not None:
            failing = failing | (lengths > self._max_len)

        return Predicate(
            rule_id=self.rule_id,
            expr=failing,
            message=self._bounds_message(),
            columns={self._column},
        )

    def to_sql_spec(self) -> Optional[Dict[str, Any]]:
        """Generate SQL pushdown specification."""
        return {
            "kind": "length",
            "rule_id": self.rule_id,
            "column": self._column,
            "min": self._min_len,
            "max": self._max_len,
        }

    def to_sql_filter(self, dialect: str = "postgres") -> str | None:
        """Generate SQL filter for sampling failing rows."""
        col = f'"{self._column}"'

        # SQL Server spells string length LEN(); other dialects use LENGTH().
        length_sql = f"LEN({col})" if dialect in ("mssql", "sqlserver") else f"LENGTH({col})"

        clauses = [f"{col} IS NULL"]
        if self._min_len is not None:
            clauses.append(f"{length_sql} < {self._min_len}")
        if self._max_len is not None:
            clauses.append(f"{length_sql} > {self._max_len}")

        return " OR ".join(clauses)
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, Any, Optional
3
+ import polars as pl
4
+
5
+ from kontra.rules.base import BaseRule
6
+ from kontra.rules.registry import register_rule
7
+ from kontra.state.types import FailureMode
8
+
9
@register_rule("max_rows")
class MaxRowsRule(BaseRule):
    """
    Dataset-level rule: fail when the row count exceeds a maximum.

    params (first one found wins):
      - value: int - maximum allowed number of rows
      - threshold: int - legacy alias for 'value'
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Validate threshold at construction time, mirroring MinRowsRule, so a
        # misconfigured (negative) bound fails fast instead of at validate().
        threshold = self.params.get("value", self.params.get("threshold", 0))
        if threshold is not None and int(threshold) < 0:
            from kontra.errors import RuleParameterError
            raise RuleParameterError(
                "max_rows", "threshold",
                f"must be non-negative, got {threshold}"
            )

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Compare df.height against the configured maximum and report the outcome."""
        # Accept both 'value' and 'threshold' for backwards compatibility
        max_count = int(self.params.get("value", self.params.get("threshold", 0)))
        h = int(df.height)
        passed = h <= max_count

        result: Dict[str, Any] = {
            "rule_id": self.rule_id,
            "passed": passed,
            # For dataset-level counts, failed_count is the size of the excess.
            "failed_count": 0 if passed else (h - max_count),
            "message": f"Dataset has {h} rows, exceeds max {max_count}",
        }

        if not passed:
            result["failure_mode"] = str(FailureMode.ROW_COUNT_HIGH)
            result["details"] = {
                "actual_rows": h,
                "maximum_allowed": max_count,
                "excess": h - max_count,
            }

        return result

    def compile_predicate(self):
        # Dataset-level scalar check; not expressible as a per-row predicate.
        return None
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, Any, Optional
3
+ import polars as pl
4
+
5
+ from kontra.rules.base import BaseRule
6
+ from kontra.rules.registry import register_rule
7
+ from kontra.state.types import FailureMode
8
+
9
@register_rule("min_rows")
class MinRowsRule(BaseRule):
    """
    Dataset-level rule: fail when the row count falls below a minimum.

    params (first one found wins):
      - value: int - minimum required number of rows
      - threshold: int - legacy alias for 'value'
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fail fast on nonsensical (negative) thresholds at construction time.
        configured = self.params.get("value", self.params.get("threshold", 0))
        if configured is not None and int(configured) < 0:
            from kontra.errors import RuleParameterError
            raise RuleParameterError(
                "min_rows", "threshold",
                f"must be non-negative, got {configured}"
            )

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Compare df.height against the configured minimum and report the outcome."""
        # 'value' takes precedence over the legacy 'threshold' spelling.
        required = int(self.params.get("value", self.params.get("threshold", 0)))
        actual = int(df.height)
        ok = actual >= required

        outcome: Dict[str, Any] = {
            "rule_id": self.rule_id,
            "passed": ok,
            # For dataset-level counts, failed_count is the size of the shortfall.
            "failed_count": 0 if ok else (required - actual),
            "message": f"Dataset has {actual} rows, requires at least {required}",
        }

        if not ok:
            outcome["failure_mode"] = str(FailureMode.ROW_COUNT_LOW)
            outcome["details"] = {
                "actual_rows": actual,
                "minimum_required": required,
                "shortfall": required - actual,
            }

        return outcome

    def compile_predicate(self):
        # Dataset-level scalar check; not expressible as a per-row predicate.
        return None
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, Any, List, Optional
3
+ import polars as pl
4
+
5
+ from kontra.rules.base import BaseRule
6
+ from kontra.rules.registry import register_rule
7
+ from kontra.rules.predicates import Predicate
8
+ from kontra.state.types import FailureMode
9
+
10
@register_rule("not_null")
class NotNullRule(BaseRule):
    """
    Fails where column contains NULL values.

    params:
      - column: str (required) - Column to check
      - include_nan: bool (optional, default: False) - Also treat NaN as null

    Note: By default, NaN values are NOT considered null (Polars behavior).
    Set include_nan=True to catch both NULL and NaN values.
    """

    def __init__(self, name: str, params: Dict[str, Any]):
        super().__init__(name, params)
        # Validate required parameter at construction time
        self._get_required_param("column", str)

    def validate(self, df: pl.DataFrame) -> Dict[str, Any]:
        """Flag rows where the column is null (and, optionally, NaN).

        Returns a result dict with rule_id/passed/failed_count/message plus
        failure_mode and details when any rows fail.
        """
        column = self.params["column"]
        include_nan = self.params.get("include_nan", False)

        # Check column exists before accessing
        col_check = self._check_columns(df, {column})
        if col_check is not None:
            return col_check

        # Build mask for null (and optionally NaN) values
        mask = df[column].is_null()
        if include_nan:
            # For numeric columns, also check for NaN.
            # is_nan() is only valid on float dtypes, hence the dtype guard;
            # include_nan is a no-op on non-float columns.
            col = df[column]
            if col.dtype.is_float():
                mask = mask | col.is_nan()

        message = f"{column} contains null values"
        if include_nan:
            message = f"{column} contains null or NaN values"

        res = super()._failures(df, mask, message)
        res["rule_id"] = self.rule_id

        # Add failure details
        if res["failed_count"] > 0:
            res["failure_mode"] = str(FailureMode.NULL_VALUES)
            res["details"] = self._explain_failure(df, column, res["failed_count"], include_nan)

        return res

    def _explain_failure(
        self, df: pl.DataFrame, column: str, null_count: int, include_nan: bool = False
    ) -> Dict[str, Any]:
        """Generate detailed failure explanation (counts, rate, sample positions)."""
        total_rows = df.height
        null_rate = null_count / total_rows if total_rows > 0 else 0

        details: Dict[str, Any] = {
            "null_count": null_count,
            "null_rate": round(null_rate, 4),
            "total_rows": total_rows,
        }

        if include_nan:
            details["includes_nan"] = True

        # Find sample row positions with nulls (first 5).
        # The Python-level scan is skipped above 1000 failures to bound cost.
        # NOTE(review): this scan matches `val is None` only, so NaN rows
        # counted when include_nan=True never appear in sample_positions.
        if null_count > 0 and null_count <= 1000:
            null_positions: List[int] = []
            col = df[column]
            for i, val in enumerate(col):
                if val is None:
                    null_positions.append(i)
                    if len(null_positions) >= 5:
                        break
            if null_positions:
                details["sample_positions"] = null_positions

        return details

    def compile_predicate(self) -> Optional[Predicate]:
        """Vectorized form of validate(): an expression that is True on failing rows."""
        column = self.params["column"]
        include_nan = self.params.get("include_nan", False)

        expr = pl.col(column).is_null()
        message = f"{column} contains null values"

        if include_nan:
            # Note: is_nan() only works on float columns, but compile_predicate
            # doesn't have access to the DataFrame schema. The expression will
            # be evaluated at runtime where Polars handles type checking.
            expr = expr | pl.col(column).is_nan()
            message = f"{column} contains null or NaN values"

        return Predicate(
            rule_id=self.rule_id,
            expr=expr,
            message=message,
            columns={column},
        )

    def to_sql_filter(self, dialect: str = "postgres") -> str | None:
        """SQL predicate matching the failing rows (for sampling)."""
        column = self.params["column"]
        include_nan = self.params.get("include_nan", False)

        # Quote column name for safety
        col = f'"{column}"'

        if include_nan:
            # NaN check: value != value is true for NaN
            return f"{col} IS NULL OR {col} != {col}"
        else:
            return f"{col} IS NULL"