kontra 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kontra/__init__.py +1871 -0
- kontra/api/__init__.py +22 -0
- kontra/api/compare.py +340 -0
- kontra/api/decorators.py +153 -0
- kontra/api/results.py +2121 -0
- kontra/api/rules.py +681 -0
- kontra/cli/__init__.py +0 -0
- kontra/cli/commands/__init__.py +1 -0
- kontra/cli/commands/config.py +153 -0
- kontra/cli/commands/diff.py +450 -0
- kontra/cli/commands/history.py +196 -0
- kontra/cli/commands/profile.py +289 -0
- kontra/cli/commands/validate.py +468 -0
- kontra/cli/constants.py +6 -0
- kontra/cli/main.py +48 -0
- kontra/cli/renderers.py +304 -0
- kontra/cli/utils.py +28 -0
- kontra/config/__init__.py +34 -0
- kontra/config/loader.py +127 -0
- kontra/config/models.py +49 -0
- kontra/config/settings.py +797 -0
- kontra/connectors/__init__.py +0 -0
- kontra/connectors/db_utils.py +251 -0
- kontra/connectors/detection.py +323 -0
- kontra/connectors/handle.py +368 -0
- kontra/connectors/postgres.py +127 -0
- kontra/connectors/sqlserver.py +226 -0
- kontra/engine/__init__.py +0 -0
- kontra/engine/backends/duckdb_session.py +227 -0
- kontra/engine/backends/duckdb_utils.py +18 -0
- kontra/engine/backends/polars_backend.py +47 -0
- kontra/engine/engine.py +1205 -0
- kontra/engine/executors/__init__.py +15 -0
- kontra/engine/executors/base.py +50 -0
- kontra/engine/executors/database_base.py +528 -0
- kontra/engine/executors/duckdb_sql.py +607 -0
- kontra/engine/executors/postgres_sql.py +162 -0
- kontra/engine/executors/registry.py +69 -0
- kontra/engine/executors/sqlserver_sql.py +163 -0
- kontra/engine/materializers/__init__.py +14 -0
- kontra/engine/materializers/base.py +42 -0
- kontra/engine/materializers/duckdb.py +110 -0
- kontra/engine/materializers/factory.py +22 -0
- kontra/engine/materializers/polars_connector.py +131 -0
- kontra/engine/materializers/postgres.py +157 -0
- kontra/engine/materializers/registry.py +138 -0
- kontra/engine/materializers/sqlserver.py +160 -0
- kontra/engine/result.py +15 -0
- kontra/engine/sql_utils.py +611 -0
- kontra/engine/sql_validator.py +609 -0
- kontra/engine/stats.py +194 -0
- kontra/engine/types.py +138 -0
- kontra/errors.py +533 -0
- kontra/logging.py +85 -0
- kontra/preplan/__init__.py +5 -0
- kontra/preplan/planner.py +253 -0
- kontra/preplan/postgres.py +179 -0
- kontra/preplan/sqlserver.py +191 -0
- kontra/preplan/types.py +24 -0
- kontra/probes/__init__.py +20 -0
- kontra/probes/compare.py +400 -0
- kontra/probes/relationship.py +283 -0
- kontra/reporters/__init__.py +0 -0
- kontra/reporters/json_reporter.py +190 -0
- kontra/reporters/rich_reporter.py +11 -0
- kontra/rules/__init__.py +35 -0
- kontra/rules/base.py +186 -0
- kontra/rules/builtin/__init__.py +40 -0
- kontra/rules/builtin/allowed_values.py +156 -0
- kontra/rules/builtin/compare.py +188 -0
- kontra/rules/builtin/conditional_not_null.py +213 -0
- kontra/rules/builtin/conditional_range.py +310 -0
- kontra/rules/builtin/contains.py +138 -0
- kontra/rules/builtin/custom_sql_check.py +182 -0
- kontra/rules/builtin/disallowed_values.py +140 -0
- kontra/rules/builtin/dtype.py +203 -0
- kontra/rules/builtin/ends_with.py +129 -0
- kontra/rules/builtin/freshness.py +240 -0
- kontra/rules/builtin/length.py +193 -0
- kontra/rules/builtin/max_rows.py +35 -0
- kontra/rules/builtin/min_rows.py +46 -0
- kontra/rules/builtin/not_null.py +121 -0
- kontra/rules/builtin/range.py +222 -0
- kontra/rules/builtin/regex.py +143 -0
- kontra/rules/builtin/starts_with.py +129 -0
- kontra/rules/builtin/unique.py +124 -0
- kontra/rules/condition_parser.py +203 -0
- kontra/rules/execution_plan.py +455 -0
- kontra/rules/factory.py +103 -0
- kontra/rules/predicates.py +25 -0
- kontra/rules/registry.py +24 -0
- kontra/rules/static_predicates.py +120 -0
- kontra/scout/__init__.py +9 -0
- kontra/scout/backends/__init__.py +17 -0
- kontra/scout/backends/base.py +111 -0
- kontra/scout/backends/duckdb_backend.py +359 -0
- kontra/scout/backends/postgres_backend.py +519 -0
- kontra/scout/backends/sqlserver_backend.py +577 -0
- kontra/scout/dtype_mapping.py +150 -0
- kontra/scout/patterns.py +69 -0
- kontra/scout/profiler.py +801 -0
- kontra/scout/reporters/__init__.py +39 -0
- kontra/scout/reporters/json_reporter.py +165 -0
- kontra/scout/reporters/markdown_reporter.py +152 -0
- kontra/scout/reporters/rich_reporter.py +144 -0
- kontra/scout/store.py +208 -0
- kontra/scout/suggest.py +200 -0
- kontra/scout/types.py +652 -0
- kontra/state/__init__.py +29 -0
- kontra/state/backends/__init__.py +79 -0
- kontra/state/backends/base.py +348 -0
- kontra/state/backends/local.py +480 -0
- kontra/state/backends/postgres.py +1010 -0
- kontra/state/backends/s3.py +543 -0
- kontra/state/backends/sqlserver.py +969 -0
- kontra/state/fingerprint.py +166 -0
- kontra/state/types.py +1061 -0
- kontra/version.py +1 -0
- kontra-0.5.2.dist-info/METADATA +122 -0
- kontra-0.5.2.dist-info/RECORD +124 -0
- kontra-0.5.2.dist-info/WHEEL +5 -0
- kontra-0.5.2.dist-info/entry_points.txt +2 -0
- kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
- kontra-0.5.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
# src/kontra/engine/sql_utils.py
|
|
2
|
+
"""
|
|
3
|
+
Shared SQL utilities for all database executors.
|
|
4
|
+
|
|
5
|
+
This module provides dialect-aware SQL escaping and common aggregate
|
|
6
|
+
expression builders to reduce code duplication across executors.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, List, Literal, Optional
|
|
12
|
+
|
|
13
|
+
Dialect = Literal["duckdb", "postgres", "sqlserver"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# =============================================================================
|
|
17
|
+
# Identifier and Literal Escaping
|
|
18
|
+
# =============================================================================
|
|
19
|
+
|
|
20
|
+
def esc_ident(name: str, dialect: Dialect = "duckdb") -> str:
    """
    Quote a SQL identifier (column, table, alias) for the target dialect.

    - SQL Server: wrapped in square brackets, with any "]" doubled.
    - Every other dialect (DuckDB, PostgreSQL): wrapped in double quotes,
      with any embedded double quote doubled.
    """
    if dialect != "sqlserver":
        return '"{}"'.format(name.replace('"', '""'))
    return "[{}]".format(name.replace("]", "]]"))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def lit_str(value: str, dialect: Dialect = "duckdb") -> str:
    """
    Render a Python string as a single-quoted SQL string literal.

    Embedded single quotes are doubled. ``dialect`` is accepted so the
    signature matches the other helpers; the output does not depend on it.
    """
    doubled = "''".join(value.split("'"))
    return f"'{doubled}'"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def lit_value(value: Any, dialect: Dialect = "duckdb") -> str:
    """
    Convert a Python value to a SQL literal for the given dialect.

    Args:
        value: The Python value to render.
        dialect: Target SQL dialect.

    Returns:
        - ``None``  -> ``NULL``
        - ``bool``  -> ``TRUE`` / ``FALSE``; on SQL Server ``1`` / ``0``,
          because T-SQL has no boolean literals
        - ``str``   -> quoted via ``lit_str``
        - ``int`` / ``float`` -> ``str(value)``
        - anything else -> ``str(value)`` quoted via ``lit_str``
    """
    if value is None:
        return "NULL"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        if dialect == "sqlserver":
            return "1" if value else "0"
        return "TRUE" if value else "FALSE"
    if isinstance(value, str):
        return lit_str(value, dialect)
    if isinstance(value, (int, float)):
        return str(value)
    # Unknown types (dates, decimals, ...) are stringified and quoted.
    return lit_str(str(value), dialect)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# =============================================================================
|
|
57
|
+
# Common Aggregate Expression Builders
|
|
58
|
+
# =============================================================================
|
|
59
|
+
|
|
60
|
+
def agg_not_null(col: str, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """Build an aggregate that counts NULL values in ``col``, aliased as ``rule_id``."""
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    null_flag = f"CASE WHEN {column_sql} IS NULL THEN 1 ELSE 0 END"
    return f"SUM({null_flag}) AS {alias_sql}"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def agg_unique(col: str, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate that counts duplicate values in ``col``.

    The expression is the total row count minus ``COUNT(DISTINCT col)``,
    aliased as ``rule_id``.
    """
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    duplicates = f"COUNT(*) - COUNT(DISTINCT {column_sql})"
    return f"({duplicates}) AS {alias_sql}"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def agg_min_rows(threshold: int, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate that yields the row-count deficit below ``threshold``.

    The result is 0 when COUNT(*) >= threshold, otherwise threshold - COUNT(*).
    """
    alias_sql = esc_ident(rule_id, dialect)
    minimum = int(threshold)
    shortfall = f"{minimum} - COUNT(*)"
    if dialect != "sqlserver":
        return f"GREATEST(0, {shortfall}) AS {alias_sql}"
    # The SQL Server branch spells the clamp out as CASE instead of GREATEST.
    return (
        f"CASE WHEN COUNT(*) >= {minimum} THEN 0 ELSE {shortfall} END AS {alias_sql}"
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def agg_max_rows(threshold: int, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate that yields the row-count excess above ``threshold``.

    The result is 0 when COUNT(*) <= threshold, otherwise COUNT(*) - threshold.
    """
    alias_sql = esc_ident(rule_id, dialect)
    maximum = int(threshold)
    surplus = f"COUNT(*) - {maximum}"
    if dialect != "sqlserver":
        return f"GREATEST(0, {surplus}) AS {alias_sql}"
    # The SQL Server branch spells the clamp out as CASE instead of GREATEST.
    return (
        f"CASE WHEN COUNT(*) <= {maximum} THEN 0 ELSE {surplus} END AS {alias_sql}"
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def agg_allowed_values(
    col: str, values: List[Any], rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count non-NULL values of ``col`` that are not in the allowed set.

    Args:
        col: Column to check.
        values: Allowed values. ``None`` entries are ignored: NULL column
            values are never counted as failures here, and a NULL inside a
            ``NOT IN`` list would stop the predicate from ever being true.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression. With no (non-None) allowed values, every
        non-NULL row is counted, instead of emitting the invalid ``NOT IN ()``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # NOTE(review): non-string values are emitted unquoted while the column is
    # cast to text on postgres/sqlserver - confirm that comparison type-checks
    # on those backends.
    literals = [
        lit_str(v, dialect) if isinstance(v, str) else str(v)
        for v in (values or ())
        if v is not None
    ]

    if not literals:
        # Empty allowed set: nothing is allowed, so any non-NULL value fails.
        return f"SUM(CASE WHEN {c} IS NOT NULL THEN 1 ELSE 0 END) AS {r}"

    val_list = ", ".join(literals)

    if dialect == "sqlserver":
        cast_col = f"CAST({c} AS NVARCHAR(MAX))"
    elif dialect == "postgres":
        cast_col = f"{c}::text"
    else:
        cast_col = c

    return (
        f"SUM(CASE WHEN {c} IS NOT NULL AND {cast_col} NOT IN ({val_list}) "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def agg_freshness(
    col: str, max_age_seconds: int, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Build an expression that is 0 when MAX(col) is at most ``max_age_seconds``
    older than the current time, and 1 otherwise.
    """
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    age = int(max_age_seconds)

    # SQL Server uses DATEADD/GETUTCDATE; DuckDB and PostgreSQL share the
    # NOW() - INTERVAL form.
    cutoff = (
        f"DATEADD(SECOND, -{age}, GETUTCDATE())"
        if dialect == "sqlserver"
        else f"(NOW() - INTERVAL '{age} seconds')"
    )

    return f"CASE WHEN MAX({column_sql}) >= {cutoff} THEN 0 ELSE 1 END AS {alias_sql}"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def agg_range(
    col: str,
    min_val: Optional[Any],
    max_val: Optional[Any],
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count values outside the inclusive [min_val, max_val] range.

    NULL column values are counted as failures.

    Args:
        col: Column to check.
        min_val: Lower bound, or None for no lower bound.
        max_val: Upper bound, or None for no upper bound.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Bounds go through lit_value so that non-numeric bounds (strings, dates)
    # become quoted, escaped literals instead of raw SQL text. Numeric bounds
    # render exactly as before.
    conditions = []
    if min_val is not None:
        conditions.append(f"{c} < {lit_value(min_val, dialect)}")
    if max_val is not None:
        conditions.append(f"{c} > {lit_value(max_val, dialect)}")

    # With no bounds only NULLs can fail; "0=1" keeps the OR well-formed.
    out_of_range = " OR ".join(conditions) if conditions else "0=1"

    return (
        f"SUM(CASE WHEN {c} IS NULL OR ({out_of_range}) THEN 1 ELSE 0 END) AS {r}"
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def agg_regex(
    col: str, pattern: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values that do not match ``pattern``. NULLs are counted as failures.

    The mismatch predicate is dialect-specific:
    - sqlserver: PATINDEX over the column cast to NVARCHAR(MAX), with the
      pattern wrapped in ``%`` (LIKE-style patterns, limited regex)
    - postgres: the ``~`` operator on the column cast to text
    - duckdb: ``regexp_matches()`` on the column cast to VARCHAR
    """
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    quoted = pattern.replace("'", "''")

    if dialect == "sqlserver":
        mismatch = (
            f"PATINDEX('%{quoted}%', CAST({column_sql} AS NVARCHAR(MAX))) = 0"
        )
    elif dialect == "postgres":
        mismatch = f"NOT ({column_sql}::text ~ '{quoted}')"
    else:
        mismatch = f"NOT regexp_matches(CAST({column_sql} AS VARCHAR), '{quoted}')"

    return (
        f"SUM(CASE WHEN {column_sql} IS NULL OR {mismatch} "
        f"THEN 1 ELSE 0 END) AS {alias_sql}"
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# =============================================================================
|
|
192
|
+
# EXISTS Expression Builders (for early-termination patterns)
|
|
193
|
+
# =============================================================================
|
|
194
|
+
|
|
195
|
+
def exists_not_null(
    col: str, rule_id: str, table: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Build an EXISTS probe for the not_null rule.

    On SQL Server the probe is wrapped in a CASE that yields 1 when a NULL
    exists and 0 otherwise. On the other dialects the bare EXISTS expression
    (with LIMIT 1) is returned. ``table`` is inserted as given, without
    escaping.
    """
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    null_rows = f"SELECT 1 FROM {table} WHERE {column_sql} IS NULL"

    if dialect != "sqlserver":
        return f"EXISTS ({null_rows} LIMIT 1) AS {alias_sql}"

    return (
        f"(SELECT CASE WHEN EXISTS ({null_rows}) "
        f"THEN 1 ELSE 0 END) AS {alias_sql}"
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# =============================================================================
|
|
217
|
+
# Result Parsing
|
|
218
|
+
# =============================================================================
|
|
219
|
+
|
|
220
|
+
# SQL comparison operators
|
|
221
|
+
# Translation table from rule-level comparison operators to SQL operators.
SQL_OP_MAP = dict(
    [
        # Ordering operators are spelled the same in SQL.
        (">", ">"),
        (">=", ">="),
        ("<", "<"),
        ("<=", "<="),
        # Equality and inequality use the SQL spellings.
        ("==", "="),
        ("!=", "<>"),
    ]
)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def agg_compare(
    left: str,
    right: str,
    op: str,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``left <op> right`` does not hold or either side is NULL.

    Args:
        left: Left column name
        right: Right column name
        op: Comparison operator (>, >=, <, <=, ==, !=); translated through
            SQL_OP_MAP, and used as given when it is not in the map
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    left_sql = esc_ident(left, dialect)
    right_sql = esc_ident(right, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    sql_op = SQL_OP_MAP.get(op, op)

    # A row fails when either operand is NULL or the comparison is false.
    failure = (
        f"{left_sql} IS NULL OR {right_sql} IS NULL "
        f"OR NOT ({left_sql} {sql_op} {right_sql})"
    )
    return f"SUM(CASE WHEN {failure} THEN 1 ELSE 0 END) AS {alias_sql}"
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def agg_conditional_not_null(
    column: str,
    when_column: str,
    when_op: str,
    when_value: Any,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``column`` is NULL while the ``when`` condition holds.

    Args:
        column: Column that must not be null
        when_column: Column in the condition
        when_op: Condition operator
        when_value: Condition value; None is handled with IS NULL / IS NOT NULL
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    target_sql = esc_ident(column, dialect)
    when_sql = esc_ident(when_column, dialect)
    alias_sql = esc_ident(rule_id, dialect)
    sql_op = SQL_OP_MAP.get(when_op, when_op)

    if when_value is not None:
        condition = f"{when_sql} {sql_op} {lit_value(when_value, dialect)}"
    elif when_op == "==":
        condition = f"{when_sql} IS NULL"
    elif when_op == "!=":
        condition = f"{when_sql} IS NOT NULL"
    else:
        # Any other operator against NULL is treated as never true.
        condition = "1=0"

    # A row fails when the condition holds and the target column is NULL.
    return (
        f"SUM(CASE WHEN ({condition}) AND {target_sql} IS NULL "
        f"THEN 1 ELSE 0 END) AS {alias_sql}"
    )
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def agg_conditional_range(
    column: str,
    when_column: str,
    when_op: str,
    when_value: Any,
    min_val: Any,
    max_val: Any,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``column`` is NULL or outside the range while the
    ``when`` condition holds.

    Args:
        column: Column to check range
        when_column: Column in the condition
        when_op: Condition operator
        when_value: Condition value; None is handled with IS NULL / IS NOT NULL
        min_val: Minimum allowed value (inclusive), or None for no lower bound
        max_val: Maximum allowed value (inclusive), or None for no upper bound
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    col = esc_ident(column, dialect)
    when_col = esc_ident(when_column, dialect)
    r_id = esc_ident(rule_id, dialect)
    sql_op = SQL_OP_MAP.get(when_op, when_op)

    # Handle NULL value in condition
    if when_value is None:
        if when_op == "==":
            condition = f"{when_col} IS NULL"
        elif when_op == "!=":
            condition = f"{when_col} IS NOT NULL"
        else:
            condition = "1=0"  # Other operators with NULL -> always false
    else:
        val = lit_value(when_value, dialect)
        condition = f"{when_col} {sql_op} {val}"

    # Build range violation part: NULL OR outside range.
    # Bounds go through lit_value so that non-numeric bounds (strings, dates)
    # become quoted, escaped literals instead of raw SQL text. Numeric bounds
    # render exactly as before.
    range_parts = [f"{col} IS NULL"]
    if min_val is not None:
        range_parts.append(f"{col} < {lit_value(min_val, dialect)}")
    if max_val is not None:
        range_parts.append(f"{col} > {lit_value(max_val, dialect)}")
    range_violation = " OR ".join(range_parts)

    # Count failures: condition is TRUE AND (column is NULL OR outside range)
    return (
        f"SUM(CASE WHEN ({condition}) AND ({range_violation}) THEN 1 ELSE 0 END) AS {r_id}"
    )
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# Mapping from rule kind to failure_mode
|
|
366
|
+
# Lookup table: rule kind -> failure_mode label attached to failing results.
RULE_KIND_TO_FAILURE_MODE = dict(
    # Column value checks
    not_null="null_values",
    unique="duplicate_values",
    allowed_values="novel_category",
    disallowed_values="disallowed_value",
    # Row-count checks
    min_rows="row_count_low",
    max_rows="row_count_high",
    # Range / length / freshness checks
    range="range_violation",
    length="length_violation",
    freshness="freshness_lag",
    # String pattern checks all share one failure mode
    regex="pattern_mismatch",
    contains="pattern_mismatch",
    starts_with="pattern_mismatch",
    ends_with="pattern_mismatch",
    # Schema, custom and cross-column checks
    dtype="schema_drift",
    custom_sql_check="custom_check_failed",
    compare="comparison_failed",
    conditional_not_null="conditional_null",
    conditional_range="conditional_range_violation",
)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
# =============================================================================
|
|
389
|
+
# String Validation Aggregate Expression Builders
|
|
390
|
+
# =============================================================================
|
|
391
|
+
|
|
392
|
+
def escape_like_pattern(value: str, escape_char: str = "\\") -> str:
    """
    Prefix the LIKE metacharacters in ``value`` with ``escape_char``.

    The characters handled are the escape character itself, ``%`` and ``_``.

    Args:
        value: The literal string to escape
        escape_char: The escape character to use (default: backslash)

    Returns:
        The string with each of those characters preceded by ``escape_char``
    """
    # The escape character goes first so the escapes added for % and _
    # are not themselves escaped again.
    value = value.replace(escape_char, escape_char + escape_char)
    value = value.replace("%", escape_char + "%")
    value = value.replace("_", escape_char + "_")
    return value
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def agg_disallowed_values(
    col: str, values: List[Any], rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values that ARE in the disallowed set.

    Inverse of allowed_values: fails if value IS in the list.
    NULL values are NOT failures (NULL is not in any list).

    Args:
        col: Column to check.
        values: Disallowed values. ``None`` entries are ignored.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression. With no (non-None) disallowed values the
        expression is the constant 0.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Filter None BEFORE the emptiness check: a list holding only None would
    # otherwise pass the guard and produce the invalid "IN ()".
    literals = [
        lit_str(str(v), dialect) if isinstance(v, str) else str(v)
        for v in (values or ())
        if v is not None  # NULL in disallowed list doesn't make sense
    ]

    if not literals:
        # No disallowed values means nothing can fail
        return f"0 AS {r}"

    val_list = ", ".join(literals)

    if dialect == "sqlserver":
        cast_col = f"CAST({c} AS NVARCHAR(MAX))"
    elif dialect == "postgres":
        cast_col = f"{c}::text"
    else:
        cast_col = c

    # Failure = value IS in the disallowed list (and not null)
    return (
        f"SUM(CASE WHEN {c} IS NOT NULL AND {cast_col} IN ({val_list}) "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def agg_length(
    col: str,
    min_len: Optional[int],
    max_len: Optional[int],
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count values whose string length falls outside [min_len, max_len].

    NULL values are counted as failures. A bound of None is not checked.
    """
    column_sql = esc_ident(col, dialect)
    alias_sql = esc_ident(rule_id, dialect)

    # SQL Server spells the function LEN(); the other dialects use LENGTH().
    # NOTE(review): LEN() is documented to ignore trailing blanks - confirm
    # that is acceptable for this rule.
    length_fn = "LEN" if dialect == "sqlserver" else "LENGTH"
    measured = f"{length_fn}({column_sql})"

    checks = [f"{column_sql} IS NULL"]
    if min_len is not None:
        checks.append(f"{measured} < {int(min_len)}")
    if max_len is not None:
        checks.append(f"{measured} > {int(max_len)}")

    return "SUM(CASE WHEN {} THEN 1 ELSE 0 END) AS {}".format(
        " OR ".join(checks), alias_sql
    )
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def agg_contains(
    col: str, substring: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values that do NOT contain the substring.

    Uses LIKE for efficiency (faster than regex).
    NULL values are failures.

    Args:
        col: Column to check.
        substring: Literal text that must appear in the value. LIKE
            metacharacters and single quotes in it are escaped.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the substring
    escaped = escape_like_pattern(substring)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character set.
        escaped = escaped.replace("[", "\\[")
    pattern = f"%{escaped}%"

    # lit_str doubles single quotes so the pattern cannot terminate the
    # SQL string literal early.
    like_literal = lit_str(pattern, dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {like_literal} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def agg_starts_with(
    col: str, prefix: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values that do NOT start with the prefix.

    Uses LIKE for efficiency (faster than regex).
    NULL values are failures.

    Args:
        col: Column to check.
        prefix: Literal text the value must start with. LIKE metacharacters
            and single quotes in it are escaped.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the prefix
    escaped = escape_like_pattern(prefix)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character set.
        escaped = escaped.replace("[", "\\[")
    pattern = f"{escaped}%"

    # lit_str doubles single quotes so the pattern cannot terminate the
    # SQL string literal early.
    like_literal = lit_str(pattern, dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {like_literal} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def agg_ends_with(
    col: str, suffix: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values that do NOT end with the suffix.

    Uses LIKE for efficiency (faster than regex).
    NULL values are failures.

    Args:
        col: Column to check.
        suffix: Literal text the value must end with. LIKE metacharacters
            and single quotes in it are escaped.
        rule_id: Rule identifier, used as the result alias.
        dialect: SQL dialect.

    Returns:
        SQL aggregate expression.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the suffix
    escaped = escape_like_pattern(suffix)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character set.
        escaped = escaped.replace("[", "\\[")
    pattern = f"%{escaped}"

    # lit_str doubles single quotes so the pattern cannot terminate the
    # SQL string literal early.
    like_literal = lit_str(pattern, dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {like_literal} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def results_from_row(
    columns: List[str],
    values: tuple,
    is_exists: bool = False,
    rule_kinds: Optional[dict] = None,
) -> List[dict]:
    """
    Turn a single-row SQL result into a list of Kontra result dicts.

    Args:
        columns: Column names (rule IDs); "__no_sql_rules__" is skipped
        values: Result values, positionally aligned with ``columns``
        is_exists: If True, values are truthy flags (truthy=violation) and a
                   violation is reported with failed_count 1.
                   If False, values are counts (0=pass, >0=violation count).
                   A None value counts as a pass in both modes.
        rule_kinds: Optional dict mapping rule_id -> rule_kind, used to attach
                    a failure_mode to failing results

    Returns:
        One dict per rule, in column order
    """
    kinds = rule_kinds or {}
    results = []

    for position, rule_id in enumerate(columns):
        if rule_id == "__no_sql_rules__":
            continue

        raw = values[position]

        # failure_mode is looked up only when the rule kind is known.
        kind = kinds.get(rule_id)
        failure_mode = RULE_KIND_TO_FAILURE_MODE.get(kind) if kind else None

        if raw is None:
            failed_count = 0
        elif is_exists:
            failed_count = 1 if raw else 0
        else:
            failed_count = int(raw)

        ok = failed_count == 0
        entry = {
            "rule_id": rule_id,
            "passed": ok,
            "failed_count": failed_count,
            "message": "Passed" if ok else "Failed",
            "severity": "ERROR",
            "actions_executed": [],
            "execution_source": "sql",
        }
        if failed_count > 0 and failure_mode:
            entry["failure_mode"] = failure_mode
        results.append(entry)

    return results
|