kontra 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. kontra/__init__.py +1871 -0
  2. kontra/api/__init__.py +22 -0
  3. kontra/api/compare.py +340 -0
  4. kontra/api/decorators.py +153 -0
  5. kontra/api/results.py +2121 -0
  6. kontra/api/rules.py +681 -0
  7. kontra/cli/__init__.py +0 -0
  8. kontra/cli/commands/__init__.py +1 -0
  9. kontra/cli/commands/config.py +153 -0
  10. kontra/cli/commands/diff.py +450 -0
  11. kontra/cli/commands/history.py +196 -0
  12. kontra/cli/commands/profile.py +289 -0
  13. kontra/cli/commands/validate.py +468 -0
  14. kontra/cli/constants.py +6 -0
  15. kontra/cli/main.py +48 -0
  16. kontra/cli/renderers.py +304 -0
  17. kontra/cli/utils.py +28 -0
  18. kontra/config/__init__.py +34 -0
  19. kontra/config/loader.py +127 -0
  20. kontra/config/models.py +49 -0
  21. kontra/config/settings.py +797 -0
  22. kontra/connectors/__init__.py +0 -0
  23. kontra/connectors/db_utils.py +251 -0
  24. kontra/connectors/detection.py +323 -0
  25. kontra/connectors/handle.py +368 -0
  26. kontra/connectors/postgres.py +127 -0
  27. kontra/connectors/sqlserver.py +226 -0
  28. kontra/engine/__init__.py +0 -0
  29. kontra/engine/backends/duckdb_session.py +227 -0
  30. kontra/engine/backends/duckdb_utils.py +18 -0
  31. kontra/engine/backends/polars_backend.py +47 -0
  32. kontra/engine/engine.py +1205 -0
  33. kontra/engine/executors/__init__.py +15 -0
  34. kontra/engine/executors/base.py +50 -0
  35. kontra/engine/executors/database_base.py +528 -0
  36. kontra/engine/executors/duckdb_sql.py +607 -0
  37. kontra/engine/executors/postgres_sql.py +162 -0
  38. kontra/engine/executors/registry.py +69 -0
  39. kontra/engine/executors/sqlserver_sql.py +163 -0
  40. kontra/engine/materializers/__init__.py +14 -0
  41. kontra/engine/materializers/base.py +42 -0
  42. kontra/engine/materializers/duckdb.py +110 -0
  43. kontra/engine/materializers/factory.py +22 -0
  44. kontra/engine/materializers/polars_connector.py +131 -0
  45. kontra/engine/materializers/postgres.py +157 -0
  46. kontra/engine/materializers/registry.py +138 -0
  47. kontra/engine/materializers/sqlserver.py +160 -0
  48. kontra/engine/result.py +15 -0
  49. kontra/engine/sql_utils.py +611 -0
  50. kontra/engine/sql_validator.py +609 -0
  51. kontra/engine/stats.py +194 -0
  52. kontra/engine/types.py +138 -0
  53. kontra/errors.py +533 -0
  54. kontra/logging.py +85 -0
  55. kontra/preplan/__init__.py +5 -0
  56. kontra/preplan/planner.py +253 -0
  57. kontra/preplan/postgres.py +179 -0
  58. kontra/preplan/sqlserver.py +191 -0
  59. kontra/preplan/types.py +24 -0
  60. kontra/probes/__init__.py +20 -0
  61. kontra/probes/compare.py +400 -0
  62. kontra/probes/relationship.py +283 -0
  63. kontra/reporters/__init__.py +0 -0
  64. kontra/reporters/json_reporter.py +190 -0
  65. kontra/reporters/rich_reporter.py +11 -0
  66. kontra/rules/__init__.py +35 -0
  67. kontra/rules/base.py +186 -0
  68. kontra/rules/builtin/__init__.py +40 -0
  69. kontra/rules/builtin/allowed_values.py +156 -0
  70. kontra/rules/builtin/compare.py +188 -0
  71. kontra/rules/builtin/conditional_not_null.py +213 -0
  72. kontra/rules/builtin/conditional_range.py +310 -0
  73. kontra/rules/builtin/contains.py +138 -0
  74. kontra/rules/builtin/custom_sql_check.py +182 -0
  75. kontra/rules/builtin/disallowed_values.py +140 -0
  76. kontra/rules/builtin/dtype.py +203 -0
  77. kontra/rules/builtin/ends_with.py +129 -0
  78. kontra/rules/builtin/freshness.py +240 -0
  79. kontra/rules/builtin/length.py +193 -0
  80. kontra/rules/builtin/max_rows.py +35 -0
  81. kontra/rules/builtin/min_rows.py +46 -0
  82. kontra/rules/builtin/not_null.py +121 -0
  83. kontra/rules/builtin/range.py +222 -0
  84. kontra/rules/builtin/regex.py +143 -0
  85. kontra/rules/builtin/starts_with.py +129 -0
  86. kontra/rules/builtin/unique.py +124 -0
  87. kontra/rules/condition_parser.py +203 -0
  88. kontra/rules/execution_plan.py +455 -0
  89. kontra/rules/factory.py +103 -0
  90. kontra/rules/predicates.py +25 -0
  91. kontra/rules/registry.py +24 -0
  92. kontra/rules/static_predicates.py +120 -0
  93. kontra/scout/__init__.py +9 -0
  94. kontra/scout/backends/__init__.py +17 -0
  95. kontra/scout/backends/base.py +111 -0
  96. kontra/scout/backends/duckdb_backend.py +359 -0
  97. kontra/scout/backends/postgres_backend.py +519 -0
  98. kontra/scout/backends/sqlserver_backend.py +577 -0
  99. kontra/scout/dtype_mapping.py +150 -0
  100. kontra/scout/patterns.py +69 -0
  101. kontra/scout/profiler.py +801 -0
  102. kontra/scout/reporters/__init__.py +39 -0
  103. kontra/scout/reporters/json_reporter.py +165 -0
  104. kontra/scout/reporters/markdown_reporter.py +152 -0
  105. kontra/scout/reporters/rich_reporter.py +144 -0
  106. kontra/scout/store.py +208 -0
  107. kontra/scout/suggest.py +200 -0
  108. kontra/scout/types.py +652 -0
  109. kontra/state/__init__.py +29 -0
  110. kontra/state/backends/__init__.py +79 -0
  111. kontra/state/backends/base.py +348 -0
  112. kontra/state/backends/local.py +480 -0
  113. kontra/state/backends/postgres.py +1010 -0
  114. kontra/state/backends/s3.py +543 -0
  115. kontra/state/backends/sqlserver.py +969 -0
  116. kontra/state/fingerprint.py +166 -0
  117. kontra/state/types.py +1061 -0
  118. kontra/version.py +1 -0
  119. kontra-0.5.2.dist-info/METADATA +122 -0
  120. kontra-0.5.2.dist-info/RECORD +124 -0
  121. kontra-0.5.2.dist-info/WHEEL +5 -0
  122. kontra-0.5.2.dist-info/entry_points.txt +2 -0
  123. kontra-0.5.2.dist-info/licenses/LICENSE +17 -0
  124. kontra-0.5.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,611 @@
1
+ # src/kontra/engine/sql_utils.py
2
+ """
3
+ Shared SQL utilities for all database executors.
4
+
5
+ This module provides dialect-aware SQL escaping and common aggregate
6
+ expression builders to reduce code duplication across executors.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, List, Literal, Optional
12
+
13
+ Dialect = Literal["duckdb", "postgres", "sqlserver"]
14
+
15
+
16
+ # =============================================================================
17
+ # Identifier and Literal Escaping
18
+ # =============================================================================
19
+
20
def esc_ident(name: str, dialect: Dialect = "duckdb") -> str:
    """
    Quote a SQL identifier (column, table or alias) for the target dialect.

    SQL Server wraps the name in square brackets and doubles any closing
    bracket. Every other dialect wraps it in double quotes and doubles any
    embedded double quote.

    Args:
        name: Raw identifier text.
        dialect: Target SQL dialect.

    Returns:
        The quoted identifier.
    """
    if dialect != "sqlserver":
        # DuckDB and PostgreSQL share the double-quoted form.
        return '"{}"'.format(name.replace('"', '""'))
    return "[{}]".format(name.replace("]", "]]"))
31
+
32
+
33
def lit_str(value: str, dialect: Dialect = "duckdb") -> str:
    """
    Render a Python string as a single-quoted SQL string literal.

    Embedded single quotes are doubled. ``dialect`` is accepted so the
    signature matches the other helpers; it does not affect the output.
    """
    doubled = "''".join(value.split("'"))
    return f"'{doubled}'"
38
+
39
+
40
def lit_value(value: Any, dialect: Dialect = "duckdb") -> str:
    """
    Convert a Python value to a SQL literal for the given dialect.

    - ``None`` becomes ``NULL``.
    - ``bool`` becomes ``TRUE``/``FALSE``, except on SQL Server, which has no
      boolean literals and compares BIT columns against ``1``/``0``.
    - ``int`` and ``float`` are emitted bare via ``str()``.
    - ``str`` is single-quoted with embedded quotes doubled.
    - Anything else is stringified with ``str()`` and quoted like a string.

    Args:
        value: Python value to render.
        dialect: Target SQL dialect.

    Returns:
        SQL literal text.
    """
    if value is None:
        return "NULL"
    # bool is a subclass of int, so it must be tested before the numeric case.
    if isinstance(value, bool):
        if dialect == "sqlserver":
            return "1" if value else "0"
        return "TRUE" if value else "FALSE"
    if isinstance(value, (int, float)):
        return str(value)
    text = value if isinstance(value, str) else str(value)
    return lit_str(text, dialect)
54
+
55
+
56
+ # =============================================================================
57
+ # Common Aggregate Expression Builders
58
+ # =============================================================================
59
+
60
def agg_not_null(col: str, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """Build an aggregate counting the NULL values in ``col``, aliased to ``rule_id``."""
    column, alias = (esc_ident(part, dialect) for part in (col, rule_id))
    return "SUM(CASE WHEN " + column + " IS NULL THEN 1 ELSE 0 END) AS " + alias
65
+
66
+
67
def agg_unique(col: str, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate for the duplicate count of ``col``, aliased to ``rule_id``.

    The expression is the total row count minus the number of distinct values.
    """
    # NOTE(review): COUNT(DISTINCT ...) skips NULLs in standard SQL while
    # COUNT(*) does not, so NULL rows presumably add to this count - confirm
    # that matches the intended "unique" semantics.
    column, alias = (esc_ident(part, dialect) for part in (col, rule_id))
    return "(COUNT(*) - COUNT(DISTINCT " + column + ")) AS " + alias
72
+
73
+
74
def agg_min_rows(threshold: int, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate for the row-count deficit against a minimum.

    The expression evaluates to 0 when ``COUNT(*) >= threshold`` and to
    ``threshold - COUNT(*)`` otherwise.
    """
    alias = esc_ident(rule_id, dialect)
    minimum = int(threshold)
    if dialect != "sqlserver":
        return f"GREATEST(0, {minimum} - COUNT(*)) AS {alias}"
    # GREATEST is avoided for SQL Server; a CASE expression gives the same result.
    return (
        f"CASE WHEN COUNT(*) >= {minimum} THEN 0 "
        f"ELSE {minimum} - COUNT(*) END AS {alias}"
    )
83
+
84
+
85
def agg_max_rows(threshold: int, rule_id: str, dialect: Dialect = "duckdb") -> str:
    """
    Build an aggregate for the row-count excess over a maximum.

    The expression evaluates to 0 when ``COUNT(*) <= threshold`` and to
    ``COUNT(*) - threshold`` otherwise.
    """
    alias = esc_ident(rule_id, dialect)
    maximum = int(threshold)
    if dialect != "sqlserver":
        return f"GREATEST(0, COUNT(*) - {maximum}) AS {alias}"
    # SQL Server branch spells the clamp as a CASE expression instead of GREATEST.
    return (
        f"CASE WHEN COUNT(*) <= {maximum} THEN 0 "
        f"ELSE COUNT(*) - {maximum} END AS {alias}"
    )
93
+
94
+
95
def agg_allowed_values(
    col: str, values: List[Any], rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count non-NULL values of ``col`` that are not in the allowed set.

    NULL column values are never counted by this expression, so ``None``
    entries in ``values`` are dropped rather than rendered into the IN list.
    If no usable values remain, every non-NULL column value is a violation.

    Args:
        col: Column to check.
        values: Allowed values; strings are quoted, other values are emitted
            via ``str()``.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # A bare ``None`` token is not valid SQL, and a NULL inside NOT IN would
    # make the predicate evaluate to NULL for every non-matching row.
    literals = [
        lit_str(v, dialect) if isinstance(v, str) else str(v)
        for v in values
        if v is not None
    ]

    if not literals:
        # ``NOT IN ()`` is a syntax error; with nothing allowed, every
        # non-NULL value fails.
        return f"SUM(CASE WHEN {c} IS NOT NULL THEN 1 ELSE 0 END) AS {r}"

    val_list = ", ".join(literals)

    # PostgreSQL and SQL Server compare against the column's text form.
    if dialect == "sqlserver":
        cast_col = f"CAST({c} AS NVARCHAR(MAX))"
    elif dialect == "postgres":
        cast_col = f"{c}::text"
    else:
        cast_col = c

    return (
        f"SUM(CASE WHEN {c} IS NOT NULL AND {cast_col} NOT IN ({val_list}) "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
118
+
119
+
120
def agg_freshness(
    col: str, max_age_seconds: int, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Build a freshness check on the most recent value of ``col``.

    The expression evaluates to 0 when ``MAX(col)`` is at or after
    "now minus ``max_age_seconds``", and to 1 otherwise.
    """
    column = esc_ident(col, dialect)
    alias = esc_ident(rule_id, dialect)
    age = int(max_age_seconds)

    # SQL Server has its own date arithmetic; DuckDB and PostgreSQL share the
    # NOW() - INTERVAL form.
    cutoff = (
        f"DATEADD(SECOND, -{age}, GETUTCDATE())"
        if dialect == "sqlserver"
        else f"(NOW() - INTERVAL '{age} seconds')"
    )

    return "CASE WHEN MAX(" + column + ") >= " + cutoff + " THEN 0 ELSE 1 END AS " + alias
134
+
135
+
136
def agg_range(
    col: str,
    min_val: Optional[Any],
    max_val: Optional[Any],
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count values of ``col`` outside the inclusive range ``[min_val, max_val]``.

    NULL column values count as failures. A bound of ``None`` means that side
    is unbounded. Bounds are rendered with ``lit_value`` so that string and
    date-like bounds are quoted instead of being pasted into the SQL verbatim
    (an unquoted ``2024-01-01`` would be evaluated as arithmetic).

    Args:
        col: Column to check.
        min_val: Lower bound (inclusive), or ``None``.
        max_val: Upper bound (inclusive), or ``None``.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    conditions = []
    if min_val is not None:
        conditions.append(f"{c} < {lit_value(min_val, dialect)}")
    if max_val is not None:
        conditions.append(f"{c} > {lit_value(max_val, dialect)}")

    # With no bounds only NULLs can fail; "0=1" keeps the OR well-formed.
    out_of_range = " OR ".join(conditions) if conditions else "0=1"

    return (
        f"SUM(CASE WHEN {c} IS NULL OR ({out_of_range}) THEN 1 ELSE 0 END) AS {r}"
    )
158
+
159
+
160
def agg_regex(
    col: str, pattern: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values of ``col`` that do not match ``pattern``.

    NULL column values count as failures. Single quotes in the pattern are
    doubled before it is embedded in the SQL string literal.
    """
    column = esc_ident(col, dialect)
    alias = esc_ident(rule_id, dialect)
    quoted = pattern.replace("'", "''")

    # Pick the dialect-specific "does not match" predicate.
    if dialect == "sqlserver":
        # PATINDEX takes LIKE-style patterns, so regex support is limited here.
        no_match = f"PATINDEX('%{quoted}%', CAST({column} AS NVARCHAR(MAX))) = 0"
    elif dialect == "postgres":
        # PostgreSQL matches regexes with the ~ operator.
        no_match = f"NOT ({column}::text ~ '{quoted}')"
    else:
        # DuckDB exposes regexp_matches().
        no_match = f"NOT regexp_matches(CAST({column} AS VARCHAR), '{quoted}')"

    return (
        f"SUM(CASE WHEN {column} IS NULL OR {no_match} THEN 1 ELSE 0 END) AS {alias}"
    )
189
+
190
+
191
+ # =============================================================================
192
+ # EXISTS Expression Builders (for early-termination patterns)
193
+ # =============================================================================
194
+
195
def exists_not_null(
    col: str, rule_id: str, table: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Build an EXISTS probe that reports whether ``col`` contains any NULL.

    SQL Server gets a CASE wrapper yielding 1 or 0; DuckDB and PostgreSQL get
    the bare EXISTS expression. ``table`` is inserted into the SQL as given,
    so it is presumably already quoted by the caller - verify at call sites.
    """
    column = esc_ident(col, dialect)
    alias = esc_ident(rule_id, dialect)

    if dialect != "sqlserver":
        # postgres, duckdb
        return f"EXISTS (SELECT 1 FROM {table} WHERE {column} IS NULL LIMIT 1) AS {alias}"

    probe = f"SELECT 1 FROM {table} WHERE {column} IS NULL"
    return f"(SELECT CASE WHEN EXISTS ({probe}) THEN 1 ELSE 0 END) AS {alias}"
214
+
215
+
216
+ # =============================================================================
217
+ # Comparison and Conditional Aggregate Expression Builders
218
+ # =============================================================================
219
+
220
# Rule-config comparison operators mapped to their SQL spelling. The ordering
# operators are written the same way in SQL; only equality and inequality differ.
SQL_OP_MAP = {
    **{op: op for op in (">", ">=", "<", "<=")},
    "==": "=",
    "!=": "<>",
}
229
+
230
+
231
def agg_compare(
    left: str,
    right: str,
    op: str,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``left <op> right`` does not hold.

    A row fails when either column is NULL or the comparison is false.

    Args:
        left: Left column name
        right: Right column name
        op: Comparison operator (>, >=, <, <=, ==, !=); an operator missing
            from SQL_OP_MAP is passed through to the SQL unchanged
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    left_col = esc_ident(left, dialect)
    right_col = esc_ident(right, dialect)
    alias = esc_ident(rule_id, dialect)
    operator = SQL_OP_MAP.get(op, op)

    # The failure predicate is assembled from its three alternatives.
    failure = " OR ".join(
        (
            f"{left_col} IS NULL",
            f"{right_col} IS NULL",
            f"NOT ({left_col} {operator} {right_col})",
        )
    )
    return f"SUM(CASE WHEN {failure} THEN 1 ELSE 0 END) AS {alias}"
261
+
262
+
263
def agg_conditional_not_null(
    column: str,
    when_column: str,
    when_op: str,
    when_value: Any,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``column`` is NULL while the ``when`` condition holds.

    Args:
        column: Column that must not be null
        when_column: Column in the condition
        when_op: Condition operator; an operator missing from SQL_OP_MAP is
            passed through to the SQL unchanged
        when_value: Condition value; ``None`` is translated to IS NULL /
            IS NOT NULL for ``==`` / ``!=`` and to an always-false condition
            for any other operator
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    target = esc_ident(column, dialect)
    guard_col = esc_ident(when_column, dialect)
    alias = esc_ident(rule_id, dialect)
    operator = SQL_OP_MAP.get(when_op, when_op)

    if when_value is not None:
        condition = f"{guard_col} {operator} {lit_value(when_value, dialect)}"
    elif when_op == "==":
        condition = f"{guard_col} IS NULL"
    elif when_op == "!=":
        condition = f"{guard_col} IS NOT NULL"
    else:
        # Ordering comparisons against NULL can never be true.
        condition = "1=0"

    # A row fails when the condition holds and the target column is NULL.
    return (
        "SUM(CASE WHEN (" + condition + ") AND " + target
        + " IS NULL THEN 1 ELSE 0 END) AS " + alias
    )
306
+
307
+
308
def agg_conditional_range(
    column: str,
    when_column: str,
    when_op: str,
    when_value: Any,
    min_val: Any,
    max_val: Any,
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count rows where ``column`` is outside the range while the condition holds.

    A row fails when the ``when`` condition is true and ``column`` is NULL or
    lies outside ``[min_val, max_val]``. Bounds are rendered with
    ``lit_value`` so that string and date-like bounds are quoted instead of
    being pasted into the SQL verbatim.

    Args:
        column: Column to check range
        when_column: Column in the condition
        when_op: Condition operator; an operator missing from SQL_OP_MAP is
            passed through to the SQL unchanged
        when_value: Condition value; ``None`` is translated to IS NULL /
            IS NOT NULL for ``==`` / ``!=`` and to an always-false condition
            for any other operator
        min_val: Minimum allowed value (inclusive), or ``None`` for no minimum
        max_val: Maximum allowed value (inclusive), or ``None`` for no maximum
        rule_id: Rule identifier for alias
        dialect: SQL dialect

    Returns:
        SQL aggregate expression
    """
    col = esc_ident(column, dialect)
    when_col = esc_ident(when_column, dialect)
    r_id = esc_ident(rule_id, dialect)
    sql_op = SQL_OP_MAP.get(when_op, when_op)

    # Handle NULL value in condition
    if when_value is None:
        if when_op == "==":
            condition = f"{when_col} IS NULL"
        elif when_op == "!=":
            condition = f"{when_col} IS NOT NULL"
        else:
            condition = "1=0"  # Other operators with NULL -> always false
    else:
        val = lit_value(when_value, dialect)
        condition = f"{when_col} {sql_op} {val}"

    # Build range violation part: NULL OR outside range
    range_parts = [f"{col} IS NULL"]
    if min_val is not None:
        range_parts.append(f"{col} < {lit_value(min_val, dialect)}")
    if max_val is not None:
        range_parts.append(f"{col} > {lit_value(max_val, dialect)}")
    range_violation = " OR ".join(range_parts)

    # Count failures: condition is TRUE AND (column is NULL OR outside range)
    return (
        f"SUM(CASE WHEN ({condition}) AND ({range_violation}) "
        f"THEN 1 ELSE 0 END) AS {r_id}"
    )
363
+
364
+
365
# Failure-mode label reported for each rule kind when that rule fails.
RULE_KIND_TO_FAILURE_MODE = dict(
    not_null="null_values",
    unique="duplicate_values",
    allowed_values="novel_category",
    disallowed_values="disallowed_value",
    min_rows="row_count_low",
    max_rows="row_count_high",
    range="range_violation",
    length="length_violation",
    freshness="freshness_lag",
    # All string-pattern rules share one failure mode.
    regex="pattern_mismatch",
    contains="pattern_mismatch",
    starts_with="pattern_mismatch",
    ends_with="pattern_mismatch",
    dtype="schema_drift",
    custom_sql_check="custom_check_failed",
    compare="comparison_failed",
    conditional_not_null="conditional_null",
    conditional_range="conditional_range_violation",
)
386
+
387
+
388
+ # =============================================================================
389
+ # String Validation Aggregate Expression Builders
390
+ # =============================================================================
391
+
392
def escape_like_pattern(value: str, escape_char: str = "\\") -> str:
    """
    Escape the LIKE wildcards in a literal string.

    The escape character itself, ``%`` and ``_`` are each prefixed with
    ``escape_char``.

    Args:
        value: The literal string to escape
        escape_char: The escape character to use (default: backslash)

    Returns:
        The string with those characters escaped for use in a LIKE pattern
    """
    # The escape character goes first so the prefixes added below are not
    # escaped a second time.
    escaped = value.replace(escape_char, escape_char * 2)
    escaped = escaped.replace("%", escape_char + "%")
    return escaped.replace("_", escape_char + "_")
409
+
410
+
411
def agg_disallowed_values(
    col: str, values: List[Any], rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values of ``col`` that ARE in the disallowed set.

    Inverse of allowed_values: a row fails if its value IS in the list.
    NULL column values are not failures, and ``None`` entries in ``values``
    are ignored. If no usable values remain, nothing can fail and the
    expression is the constant 0.

    Args:
        col: Column to check.
        values: Disallowed values; strings are quoted, other values are
            emitted via ``str()``.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Filter before checking for emptiness: a list holding only None would
    # otherwise render as the invalid "IN ()".
    literals = [
        lit_str(v, dialect) if isinstance(v, str) else str(v)
        for v in values
        if v is not None
    ]

    if not literals:
        # No disallowed values means nothing can fail
        return f"0 AS {r}"

    val_list = ", ".join(literals)

    # PostgreSQL and SQL Server compare against the column's text form.
    if dialect == "sqlserver":
        cast_col = f"CAST({c} AS NVARCHAR(MAX))"
    elif dialect == "postgres":
        cast_col = f"{c}::text"
    else:
        cast_col = c

    # Failure = value IS in the disallowed list (and not null)
    return (
        f"SUM(CASE WHEN {c} IS NOT NULL AND {cast_col} IN ({val_list}) "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
445
+
446
+
447
def agg_length(
    col: str,
    min_len: Optional[int],
    max_len: Optional[int],
    rule_id: str,
    dialect: Dialect = "duckdb",
) -> str:
    """
    Count values of ``col`` whose length falls outside ``[min_len, max_len]``.

    NULL column values count as failures. A bound of ``None`` leaves that
    side unchecked.
    """
    column = esc_ident(col, dialect)
    alias = esc_ident(rule_id, dialect)

    # SQL Server spells the length function LEN(); the others use LENGTH().
    length_fn = "LEN" if dialect == "sqlserver" else "LENGTH"
    measured = f"{length_fn}({column})"

    checks = [f"{column} IS NULL"]
    if min_len is not None:
        checks.append(f"{measured} < {int(min_len)}")
    if max_len is not None:
        checks.append(f"{measured} > {int(max_len)}")

    return "SUM(CASE WHEN " + " OR ".join(checks) + " THEN 1 ELSE 0 END) AS " + alias
476
+
477
+
478
def agg_contains(
    col: str, substring: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values of ``col`` that do NOT contain ``substring``.

    Uses LIKE with a backslash ESCAPE clause. NULL values are failures.
    The substring is treated literally: LIKE wildcards are escaped, and the
    finished pattern is quoted with ``lit_str`` so an embedded single quote
    cannot terminate the SQL string literal.

    Args:
        col: Column to check.
        substring: Literal text that each value must contain.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the substring
    escaped = escape_like_pattern(substring)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character class.
        escaped = escaped.replace("[", "\\[")
    pattern = lit_str(f"%{escaped}%", dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {pattern} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
507
+
508
+
509
def agg_starts_with(
    col: str, prefix: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values of ``col`` that do NOT start with ``prefix``.

    Uses LIKE with a backslash ESCAPE clause. NULL values are failures.
    The prefix is treated literally: LIKE wildcards are escaped, and the
    finished pattern is quoted with ``lit_str`` so an embedded single quote
    cannot terminate the SQL string literal.

    Args:
        col: Column to check.
        prefix: Literal text that each value must start with.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the prefix
    escaped = escape_like_pattern(prefix)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character class.
        escaped = escaped.replace("[", "\\[")
    pattern = lit_str(f"{escaped}%", dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {pattern} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
529
+
530
+
531
def agg_ends_with(
    col: str, suffix: str, rule_id: str, dialect: Dialect = "duckdb"
) -> str:
    """
    Count values of ``col`` that do NOT end with ``suffix``.

    Uses LIKE with a backslash ESCAPE clause. NULL values are failures.
    The suffix is treated literally: LIKE wildcards are escaped, and the
    finished pattern is quoted with ``lit_str`` so an embedded single quote
    cannot terminate the SQL string literal.

    Args:
        col: Column to check.
        suffix: Literal text that each value must end with.
        rule_id: Rule identifier used as the result alias.
        dialect: Target SQL dialect.

    Returns:
        SQL aggregate expression aliased to ``rule_id``.
    """
    c = esc_ident(col, dialect)
    r = esc_ident(rule_id, dialect)

    # Escape LIKE special characters in the suffix
    escaped = escape_like_pattern(suffix)
    if dialect == "sqlserver":
        # T-SQL LIKE also treats "[" as the start of a character class.
        escaped = escaped.replace("[", "\\[")
    pattern = lit_str(f"%{escaped}", dialect)

    return (
        f"SUM(CASE WHEN {c} IS NULL OR {c} NOT LIKE {pattern} ESCAPE '\\' "
        f"THEN 1 ELSE 0 END) AS {r}"
    )
551
+
552
+
553
def results_from_row(
    columns: List[str],
    values: tuple,
    is_exists: bool = False,
    rule_kinds: Optional[dict] = None,
) -> List[dict]:
    """
    Turn a single-row SQL result into a list of per-rule result dicts.

    Args:
        columns: Column names (rule IDs); the ``__no_sql_rules__`` column is
            skipped
        values: Result values, positionally aligned with ``columns``
        is_exists: If True, each value is a truthy/falsy violation flag and
            the reported failed_count is 1 or 0. If False, each value is a
            violation count (0 = pass). A ``None`` value is a pass either way.
        rule_kinds: Optional dict mapping rule_id -> rule_kind, used to look
            up a ``failure_mode`` for failing rules

    Returns:
        One dict per rule, in column order.
    """
    kinds = rule_kinds or {}
    results = []

    for position, rule_id in enumerate(columns):
        if rule_id == "__no_sql_rules__":
            continue

        raw = values[position]

        # The failure mode is only known when the caller supplied the rule kind.
        kind = kinds.get(rule_id)
        failure_mode = RULE_KIND_TO_FAILURE_MODE.get(kind) if kind else None

        if is_exists:
            violated = bool(raw) if raw is not None else False
            failed_count = 1 if violated else 0
            passed = not violated
            tag_failure = violated
        else:
            failed_count = int(raw) if raw is not None else 0
            passed = failed_count == 0
            tag_failure = failed_count > 0

        entry = {
            "rule_id": rule_id,
            "passed": passed,
            "failed_count": failed_count,
            "message": "Passed" if passed else "Failed",
            "severity": "ERROR",
            "actions_executed": [],
            "execution_source": "sql",
        }
        if tag_failure and failure_mode:
            entry["failure_mode"] = failure_mode
        results.append(entry)

    return results