focus-validator 2.1.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {focus_validator-2.1.0 → focus_validator-2.2.0}/PKG-INFO +2 -1
  2. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/focus_to_duckdb_converter.py +599 -58
  3. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/parquet_data_loader.py +45 -13
  4. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/spec_rules.py +3 -0
  5. {focus_validator-2.1.0 → focus_validator-2.2.0}/pyproject.toml +2 -1
  6. {focus_validator-2.1.0 → focus_validator-2.2.0}/LICENSE +0 -0
  7. {focus_validator-2.1.0 → focus_validator-2.2.0}/README.md +0 -0
  8. {focus_validator-2.1.0 → focus_validator-2.2.0}/build.py +0 -0
  9. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/__init__.py +0 -0
  10. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config/logging.yaml +0 -0
  11. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/__init__.py +0 -0
  12. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/common.py +0 -0
  13. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/json_loader.py +0 -0
  14. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/plan_builder.py +0 -0
  15. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/rule.py +0 -0
  16. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/rule_dependency_resolver.py +0 -0
  17. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/__init__.py +0 -0
  18. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/csv_data_loader.py +0 -0
  19. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/csv_data_loader_pandas_backup.py +0 -0
  20. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/data_loader.py +0 -0
  21. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/exceptions.py +0 -0
  22. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/main.py +0 -0
  23. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/__init__.py +0 -0
  24. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter.py +0 -0
  25. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_console.py +0 -0
  26. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_unittest.py +0 -0
  27. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_validation_graph.py +0 -0
  28. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_web.py +0 -0
  29. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/__init__.py +0 -0
  30. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/currency_codes.csv +0 -0
  31. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/model-1.2.0.1.json +0 -0
  32. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/__init__.py +0 -0
  33. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/download_currency_codes.py +0 -0
  34. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/performance_logging.py +0 -0
  35. {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: focus_validator
3
- Version: 2.1.0
3
+ Version: 2.2.0
4
4
  Summary: FOCUS spec validator.
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.12,<4.0
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.14
11
11
  Requires-Dist: ddt (>=1.7.1,<2.0.0)
12
12
  Requires-Dist: duckdb (>=1.4.1,<2.0.0)
13
13
  Requires-Dist: graphviz (>=0.21,<0.22)
14
+ Requires-Dist: jsonschema (>=4.25.1,<5.0.0)
14
15
  Requires-Dist: multimethod (>=2.0,<2.1)
15
16
  Requires-Dist: numpy (>=1.26,<2.0)
16
17
  Requires-Dist: pandas (>=2,<3)
@@ -548,7 +548,9 @@ class TypeStringCheckGenerator(DuckDBCheckGenerator):
548
548
  """
549
549
 
550
550
  # Predicate SQL (for condition mode)
551
- predicate_sql = f"{col} IS NOT NULL AND typeof({col}) = 'VARCHAR'"
551
+ predicate_sql = self._apply_condition(
552
+ f"{col} IS NOT NULL AND typeof({col}) = 'VARCHAR'"
553
+ )
552
554
 
553
555
  return SQLQuery(
554
556
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -558,6 +560,42 @@ class TypeStringCheckGenerator(DuckDBCheckGenerator):
558
560
  return "type_string"
559
561
 
560
562
 
563
+ class TypeJSONCheckGenerator(DuckDBCheckGenerator):
564
+ REQUIRED_KEYS = {"ColumnName"}
565
+
566
+ def generateSql(self) -> SQLQuery:
567
+ col = self.params.ColumnName
568
+ keyword = self._get_validation_keyword()
569
+ message = self.errorMessage or f"{col} {keyword} be of type JSON."
570
+ msg_sql = message.replace("'", "''")
571
+
572
+ condition = f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))"
573
+ condition = self._apply_condition(condition)
574
+
575
+ requirement_sql = f"""
576
+ WITH invalid AS (
577
+ SELECT 1
578
+ FROM {{table_name}}
579
+ WHERE {condition}
580
+ )
581
+ SELECT
582
+ COUNT(*) AS violations,
583
+ CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
584
+ FROM invalid
585
+ """
586
+
587
+ predicate_sql = self._apply_condition(
588
+ f"{col} IS NOT NULL AND typeof({col}) = 'JSON'"
589
+ )
590
+
591
+ return SQLQuery(
592
+ requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
593
+ )
594
+
595
+ def getCheckType(self) -> str:
596
+ return "type_json"
597
+
598
+
561
599
  class TypeDecimalCheckGenerator(DuckDBCheckGenerator):
562
600
  REQUIRED_KEYS = {"ColumnName"}
563
601
 
@@ -590,7 +628,7 @@ class TypeDecimalCheckGenerator(DuckDBCheckGenerator):
590
628
  """
591
629
 
592
630
  # Predicate SQL (for condition mode)
593
- predicate_sql = (
631
+ predicate_sql = self._apply_condition(
594
632
  f"{col} IS NOT NULL AND typeof({col}) IN ('DECIMAL', 'DOUBLE', 'FLOAT')"
595
633
  )
596
634
 
@@ -639,7 +677,7 @@ class TypeDateTimeGenerator(DuckDBCheckGenerator):
639
677
  """
640
678
 
641
679
  # Predicate SQL (for condition mode)
642
- predicate_sql = (
680
+ predicate_sql = self._apply_condition(
643
681
  f"{col} IS NOT NULL "
644
682
  f"AND (typeof({col}) IN ('TIMESTAMP', 'TIMESTAMP_NS', 'TIMESTAMP WITH TIME ZONE', 'DATE') "
645
683
  f"OR ({col}::TEXT ~ '^[0-9]{{4}}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]Z$'))"
@@ -684,7 +722,9 @@ class FormatNumericGenerator(DuckDBCheckGenerator):
684
722
  """
685
723
 
686
724
  # Predicate SQL (for condition mode)
687
- predicate_sql = f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[+-]?([0-9]*[.])?[0-9]+([eE][+-]?[0-9]+)?$')"
725
+ predicate_sql = self._apply_condition(
726
+ f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[+-]?([0-9]*[.])?[0-9]+([eE][+-]?[0-9]+)?$')"
727
+ )
688
728
 
689
729
  return SQLQuery(
690
730
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -741,7 +781,9 @@ class FormatStringGenerator(DuckDBCheckGenerator):
741
781
  """
742
782
 
743
783
  # Predicate SQL (for condition mode)
744
- predicate_sql = f"{col} IS NOT NULL AND ({col}::TEXT ~ '^[\\x00-\\x7F]*$')"
784
+ predicate_sql = self._apply_condition(
785
+ f"{col} IS NOT NULL AND ({col}::TEXT ~ '^[\\x00-\\x7F]*$')"
786
+ )
745
787
 
746
788
  return SQLQuery(
747
789
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -806,7 +848,7 @@ class FormatDateTimeGenerator(DuckDBCheckGenerator):
806
848
  """
807
849
 
808
850
  # Predicate SQL (for condition mode)
809
- predicate_sql = (
851
+ predicate_sql = self._apply_condition(
810
852
  f"{col} IS NOT NULL "
811
853
  f"AND (typeof({col}) IN ('TIMESTAMP', 'TIMESTAMP_NS', 'TIMESTAMP WITH TIME ZONE', 'DATE') "
812
854
  f"OR (typeof({col}) = 'VARCHAR' AND {col}::TEXT ~ '^[0-9]{{4}}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]Z?$' "
@@ -857,7 +899,9 @@ class FormatBillingCurrencyCodeGenerator(DuckDBCheckGenerator):
857
899
  """
858
900
 
859
901
  # Predicate SQL (for condition mode)
860
- predicate_sql = f"{col} IS NOT NULL AND TRIM({col}::TEXT) IN ('{codes_list}')"
902
+ predicate_sql = self._apply_condition(
903
+ f"{col} IS NOT NULL AND TRIM({col}::TEXT) IN ('{codes_list}')"
904
+ )
861
905
 
862
906
  return SQLQuery(
863
907
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -897,7 +941,9 @@ class FormatCurrencyGenerator(DuckDBCheckGenerator):
897
941
  """
898
942
 
899
943
  # Predicate SQL (for condition mode)
900
- predicate_sql = f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[A-Z]{{3}}$')"
944
+ predicate_sql = self._apply_condition(
945
+ f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[A-Z]{{3}}$')"
946
+ )
901
947
 
902
948
  return SQLQuery(
903
949
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -1035,7 +1081,7 @@ class FormatUnitGenerator(DuckDBCheckGenerator):
1035
1081
  """
1036
1082
 
1037
1083
  # Predicate SQL (for condition mode)
1038
- predicate_sql = (
1084
+ predicate_sql = self._apply_condition(
1039
1085
  f"{col} IS NOT NULL AND regexp_matches({col}, '{combined_pattern}')"
1040
1086
  )
1041
1087
 
@@ -1081,16 +1127,10 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
1081
1127
  message = self.errorMessage or f"{col} {keyword} be valid JSON format"
1082
1128
  msg_sql = message.replace("'", "''")
1083
1129
 
1084
- # Requirement SQL (finds violations)
1085
- # Check if column is not null and either:
1086
- # 1. Cannot be cast to JSON, or
1087
- # 2. Is not a valid JSON string when treated as text
1088
- condition = (
1089
- f"{col} IS NOT NULL "
1090
- f"AND (TRY_CAST({col} AS JSON) IS NULL "
1091
- f"OR (typeof({col}) = 'VARCHAR' AND NOT json_valid({col}::TEXT)))"
1130
+ invalid_predicate = (
1131
+ f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))"
1092
1132
  )
1093
- condition = self._apply_condition(condition)
1133
+ condition = self._apply_condition(invalid_predicate)
1094
1134
 
1095
1135
  requirement_sql = f"""
1096
1136
  WITH invalid AS (
@@ -1104,11 +1144,8 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
1104
1144
  FROM invalid
1105
1145
  """
1106
1146
 
1107
- # Predicate SQL (for condition mode)
1108
- predicate_sql = (
1109
- f"{col} IS NOT NULL "
1110
- f"AND (TRY_CAST({col} AS JSON) IS NOT NULL "
1111
- f"OR (typeof({col}) = 'VARCHAR' AND json_valid({col}::TEXT)))"
1147
+ predicate_sql = self._apply_condition(
1148
+ f"{col} IS NOT NULL AND json_valid(CAST({col} AS VARCHAR))"
1112
1149
  )
1113
1150
 
1114
1151
  return SQLQuery(
@@ -1119,6 +1156,178 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
1119
1156
  return "format_json"
1120
1157
 
1121
1158
 
1159
+ class CheckJSONSchemaGenerator(DuckDBCheckGenerator):
1160
+ REQUIRED_KEYS = {"ColumnName", "SchemaId"}
1161
+ DEFAULTS = {"Path": "$"}
1162
+
1163
+ def getCheckType(self) -> str:
1164
+ return "json_schema"
1165
+
1166
+ def generateSql(self) -> SQLQuery:
1167
+ col = self.params.ColumnName
1168
+ schema_id = self.params.SchemaId
1169
+ keyword = self._get_validation_keyword()
1170
+ self.errorMessage = (
1171
+ self.errorMessage or f"{col} {keyword} conform to JSON Schema '{schema_id}'"
1172
+ )
1173
+ return SQLQuery(requirement_sql="SELECT 0 AS violations")
1174
+
1175
+ def _extract_path_value(self, payload: Any, path: str) -> Any:
1176
+ """Extract a value from a JSON payload using a limited JSONPath subset.
1177
+
1178
+ Supported: '$', '$.key', '$.key.nested', '$.key[0]'.
1179
+ Not supported: chained indices ('$.foo[0][1]'), bracket-key access
1180
+ ('$["foo bar"]'), wildcards, or filters.
1181
+ """
1182
+ if path == "$":
1183
+ return payload
1184
+
1185
+ if not path.startswith("$."):
1186
+ raise InvalidRuleException(
1187
+ f"Unsupported JSON path '{path}' for CheckJSONSchema in rule {self.rule_id}"
1188
+ )
1189
+
1190
+ current = payload
1191
+ for segment in path[2:].split("."):
1192
+ if current is None:
1193
+ return None
1194
+
1195
+ token = segment
1196
+ while token:
1197
+ array_match = re.match(
1198
+ r"^([A-Za-z_][A-Za-z0-9_]*)(\[(\d+)\])?(.*)$", token
1199
+ )
1200
+ if not array_match:
1201
+ raise InvalidRuleException(
1202
+ f"Unsupported JSON path segment '{segment}' for CheckJSONSchema in rule {self.rule_id}"
1203
+ )
1204
+
1205
+ key_name, _, array_idx, remainder = array_match.groups()
1206
+ if not isinstance(current, dict):
1207
+ return None
1208
+ current = current.get(key_name)
1209
+
1210
+ if array_idx is not None:
1211
+ if not isinstance(current, list):
1212
+ return None
1213
+ idx = int(array_idx)
1214
+ if idx >= len(current):
1215
+ return None
1216
+ current = current[idx]
1217
+
1218
+ token = remainder or ""
1219
+
1220
+ return current
1221
+
1222
+ def generateCheck(self) -> DuckDBColumnCheck:
1223
+ chk = super().generateCheck()
1224
+
1225
+ schema_map = getattr(self.params, "schemas", None) or {}
1226
+ schema_id = self.params.SchemaId
1227
+ schema_entry = schema_map.get(schema_id)
1228
+
1229
+ if not isinstance(schema_entry, dict) or "Schema" not in schema_entry:
1230
+ raise InvalidRuleException(
1231
+ f"SchemaId '{schema_id}' referenced by rule {self.rule_id} was not found in model Schemas"
1232
+ )
1233
+
1234
+ schema = schema_entry["Schema"]
1235
+ path = getattr(self.params, "Path", "$")
1236
+ col = self.params.ColumnName
1237
+ where_clauses = [f"{col} IS NOT NULL"]
1238
+ row_condition = (self.row_condition_sql or "").strip()
1239
+ if row_condition:
1240
+ where_clauses.append(f"({row_condition})")
1241
+
1242
+ query = f"SELECT {col} FROM {{table_name}} WHERE " + " AND ".join(where_clauses)
1243
+
1244
+ def _exec_json_schema(conn):
1245
+ try:
1246
+ from jsonschema import ( # type: ignore[import-untyped]
1247
+ Draft202012Validator,
1248
+ )
1249
+ except ModuleNotFoundError as exc:
1250
+ raise RuntimeError(
1251
+ "CheckJSONSchema requires the 'jsonschema' package to be installed"
1252
+ ) from exc
1253
+
1254
+ Draft202012Validator.check_schema(schema)
1255
+ validator = Draft202012Validator(schema)
1256
+ table_name = getattr(self.params, "table_name", "focus_data")
1257
+ sql = query.replace("{table_name}", table_name)
1258
+ sql = sql.replace("{table_name}", table_name)
1259
+ try:
1260
+ rows = conn.execute(sql).fetchall()
1261
+ except (duckdb.BinderException, duckdb.CatalogException) as exc:
1262
+ msg = str(exc)
1263
+ missing = []
1264
+ patterns = [
1265
+ r'Column with name ([A-Za-z0-9_"]+) does not exist',
1266
+ r'Referenced column "([A-Za-z0-9_]+)" not found',
1267
+ r'Binder Error: .*? column ([A-Za-z0-9_"]+)',
1268
+ r'"([A-Za-z0-9_]+)" not found',
1269
+ ]
1270
+ for pattern in patterns:
1271
+ for match in re.finditer(pattern, msg):
1272
+ col_name = match.group(1).strip('"')
1273
+ if col_name and col_name not in missing:
1274
+ missing.append(col_name)
1275
+
1276
+ missing_msg = (
1277
+ f"Missing columns: {', '.join(missing)}"
1278
+ if missing
1279
+ else "Missing required column(s)"
1280
+ )
1281
+ return False, {
1282
+ "violations": 1,
1283
+ "schema_id": schema_id,
1284
+ "message": f"{self.errorMessage}. {missing_msg}",
1285
+ "failure_reason": missing_msg,
1286
+ "error_type": "missing_columns",
1287
+ }
1288
+
1289
+ failure_messages: list[str] = []
1290
+ violations = 0
1291
+ for row_num, row in enumerate(rows, start=1):
1292
+ raw_value = row[0] if isinstance(row, (tuple, list)) else row
1293
+ try:
1294
+ payload = (
1295
+ json.loads(raw_value)
1296
+ if isinstance(raw_value, str)
1297
+ else raw_value
1298
+ )
1299
+ except Exception as exc:
1300
+ violations += 1
1301
+ failure_messages.append(f"row {row_num}: invalid JSON ({exc})")
1302
+ continue
1303
+
1304
+ instance = self._extract_path_value(payload, path)
1305
+ errors = sorted(
1306
+ validator.iter_errors(instance), key=lambda err: list(err.path)
1307
+ )
1308
+ if errors:
1309
+ violations += 1
1310
+ failure_messages.append(f"row {row_num}: {errors[0].message}")
1311
+
1312
+ ok = violations == 0
1313
+ details = {
1314
+ "violations": violations,
1315
+ "schema_id": schema_id,
1316
+ "message": (
1317
+ self.errorMessage
1318
+ if ok
1319
+ else f"{self.errorMessage}. First error: {failure_messages[0]}"
1320
+ ),
1321
+ }
1322
+ if failure_messages:
1323
+ details["failure_messages"] = failure_messages[:5]
1324
+ return ok, details
1325
+
1326
+ chk.special_executor = _exec_json_schema
1327
+ chk.meta["special_executor_kind"] = "json_schema"
1328
+ return chk
1329
+
1330
+
1122
1331
  class CheckValueGenerator(DuckDBCheckGenerator):
1123
1332
  REQUIRED_KEYS = {"ColumnName", "Value"}
1124
1333
 
@@ -1157,7 +1366,8 @@ class CheckValueGenerator(DuckDBCheckGenerator):
1157
1366
  """
1158
1367
 
1159
1368
  return SQLQuery(
1160
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate
1369
+ requirement_sql=requirement_sql.strip(),
1370
+ predicate_sql=self._apply_condition(predicate),
1161
1371
  )
1162
1372
 
1163
1373
  def get_sample_sql(self) -> str:
@@ -1246,7 +1456,8 @@ class CheckNotValueGenerator(DuckDBCheckGenerator):
1246
1456
  """
1247
1457
 
1248
1458
  return SQLQuery(
1249
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate
1459
+ requirement_sql=requirement_sql.strip(),
1460
+ predicate_sql=self._apply_condition(predicate),
1250
1461
  )
1251
1462
 
1252
1463
  def get_sample_sql(self) -> str:
@@ -1286,6 +1497,118 @@ class CheckNotValueGenerator(DuckDBCheckGenerator):
1286
1497
  return sql_query.get_predicate_sql()
1287
1498
 
1288
1499
 
1500
+ class CheckRegexMatchGenerator(DuckDBCheckGenerator):
1501
+ REQUIRED_KEYS = {"ColumnName", "Pattern"}
1502
+
1503
+ def generateSql(self) -> SQLQuery:
1504
+ col = self.params.ColumnName
1505
+ pattern = self.params.Pattern
1506
+ keyword = self._get_validation_keyword()
1507
+ pattern_sql = str(pattern).replace("'", "''")
1508
+ message = self.errorMessage or f"{col} {keyword} match regex '{pattern}'."
1509
+ msg_sql = message.replace("'", "''")
1510
+
1511
+ condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
1512
+ condition = self._apply_condition(condition)
1513
+
1514
+ requirement_sql = f"""
1515
+ WITH invalid AS (
1516
+ SELECT 1
1517
+ FROM {{table_name}}
1518
+ WHERE {condition}
1519
+ )
1520
+ SELECT
1521
+ COUNT(*) AS violations,
1522
+ CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
1523
+ FROM invalid
1524
+ """
1525
+
1526
+ predicate_sql = self._apply_condition(
1527
+ f"{col} IS NOT NULL AND regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
1528
+ )
1529
+
1530
+ return SQLQuery(
1531
+ requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
1532
+ )
1533
+
1534
+ def get_sample_sql(self) -> str:
1535
+ col = self.params.ColumnName
1536
+ pattern = self.params.Pattern
1537
+ pattern_sql = str(pattern).replace("'", "''")
1538
+
1539
+ condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
1540
+ condition = self._apply_condition(condition)
1541
+
1542
+ return f"""
1543
+ SELECT {col}
1544
+ FROM {{table_name}}
1545
+ WHERE {condition}
1546
+ """
1547
+
1548
+ @property
1549
+ def sample_sql(self) -> str:
1550
+ return self.get_sample_sql()
1551
+
1552
+ def getCheckType(self) -> str:
1553
+ return "check_regex_match"
1554
+
1555
+
1556
+ class CheckStringEndsWithGenerator(DuckDBCheckGenerator):
1557
+ REQUIRED_KEYS = {"ColumnName", "Value"}
1558
+
1559
+ def generateSql(self) -> SQLQuery:
1560
+ col = self.params.ColumnName
1561
+ value = self.params.Value
1562
+ keyword = self._get_validation_keyword()
1563
+ value_sql = str(value).replace("'", "''")
1564
+ message = self.errorMessage or f"{col} {keyword} end with '{value}'."
1565
+ msg_sql = message.replace("'", "''")
1566
+
1567
+ condition = f"{col} IS NOT NULL AND NOT ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
1568
+ condition = self._apply_condition(condition)
1569
+
1570
+ requirement_sql = f"""
1571
+ WITH invalid AS (
1572
+ SELECT 1
1573
+ FROM {{table_name}}
1574
+ WHERE {condition}
1575
+ )
1576
+ SELECT
1577
+ COUNT(*) AS violations,
1578
+ CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
1579
+ FROM invalid
1580
+ """
1581
+
1582
+ predicate_sql = self._apply_condition(
1583
+ f"{col} IS NOT NULL AND ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
1584
+ )
1585
+
1586
+ return SQLQuery(
1587
+ requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
1588
+ )
1589
+
1590
+ def get_sample_sql(self) -> str:
1591
+ col = self.params.ColumnName
1592
+ value = self.params.Value
1593
+ value_sql = str(value).replace("'", "''")
1594
+
1595
+ condition = f"{col} IS NOT NULL AND NOT ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
1596
+ condition = self._apply_condition(condition)
1597
+
1598
+ return f"""
1599
+ SELECT {col}
1600
+ FROM {{table_name}}
1601
+ WHERE {condition}
1602
+ """
1603
+
1604
+ @property
1605
+ def sample_sql(self) -> str:
1606
+ return self.get_sample_sql()
1607
+
1608
+ def getCheckType(self) -> str:
1609
+ return "check_string_ends_with"
1610
+
1611
+
1289
1612
  class CheckSameValueGenerator(DuckDBCheckGenerator):
1290
1613
  REQUIRED_KEYS = {"ColumnAName", "ColumnBName"}
1291
1614
 
@@ -1317,7 +1640,7 @@ class CheckSameValueGenerator(DuckDBCheckGenerator):
1317
1640
  """
1318
1641
 
1319
1642
  # Predicate SQL (for condition mode)
1320
- predicate_sql = (
1643
+ predicate_sql = self._apply_condition(
1321
1644
  f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} = {col_b}"
1322
1645
  )
1323
1646
 
@@ -1395,7 +1718,7 @@ class CheckNotSameValueGenerator(DuckDBCheckGenerator):
1395
1718
  """
1396
1719
 
1397
1720
  # Predicate SQL (for condition mode)
1398
- predicate_sql = (
1721
+ predicate_sql = self._apply_condition(
1399
1722
  f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} <> {col_b}"
1400
1723
  )
1401
1724
 
@@ -1469,7 +1792,9 @@ class ColumnByColumnEqualsColumnValueGenerator(DuckDBCheckGenerator):
1469
1792
  """
1470
1793
 
1471
1794
  # Predicate SQL (for condition mode)
1472
- predicate_sql = f"{a} IS NOT NULL AND {b} IS NOT NULL AND {r} IS NOT NULL AND ({a} * {b}) = {r}"
1795
+ predicate_sql = self._apply_condition(
1796
+ f"{a} IS NOT NULL AND {b} IS NOT NULL AND {r} IS NOT NULL AND ({a} * {b}) = {r}"
1797
+ )
1473
1798
 
1474
1799
  return SQLQuery(
1475
1800
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -1486,21 +1811,38 @@ class ColumnByColumnEqualsColumnValueGenerator(DuckDBCheckGenerator):
1486
1811
  return sql_query.get_predicate_sql()
1487
1812
 
1488
1813
 
1489
- class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
1814
+ class _CheckScalarComparisonGenerator(DuckDBCheckGenerator):
1815
+ """Base for single-column scalar comparison checks (>=, >, <=, ...).
1816
+
1817
+ Subclasses differ only by operator and wording, so they set:
1818
+ - PASS_OPERATOR: operator a valid value satisfies (e.g. ">=")
1819
+ - VIOLATION_OPERATOR: its negation, used to find violating rows (e.g. "<")
1820
+ - MESSAGE_PHRASE: human-readable phrase (e.g. "greater than or equal to")
1821
+ - CHECK_TYPE: value returned by getCheckType()
1822
+ """
1823
+
1490
1824
  REQUIRED_KEYS = {"ColumnName", "Value"}
1825
+ PASS_OPERATOR: ClassVar[str]
1826
+ VIOLATION_OPERATOR: ClassVar[str]
1827
+ MESSAGE_PHRASE: ClassVar[str]
1828
+ CHECK_TYPE: ClassVar[str]
1829
+
1830
+ def _violation_condition(self) -> str:
1831
+ col = self.params.ColumnName
1832
+ val = self.params.Value
1833
+ return f"{col} IS NOT NULL AND {col} {self.VIOLATION_OPERATOR} {self._lit(val)}"
1491
1834
 
1492
1835
  def generateSql(self) -> SQLQuery:
1493
1836
  col = self.params.ColumnName
1494
1837
  val = self.params.Value
1495
1838
  keyword = self._get_validation_keyword()
1496
1839
  message = (
1497
- self.errorMessage or f"{col} {keyword} be greater than or equal to {val}."
1840
+ self.errorMessage or f"{col} {keyword} be {self.MESSAGE_PHRASE} {val}."
1498
1841
  )
1499
1842
  msg_sql = message.replace("'", "''")
1500
1843
 
1501
1844
  # Requirement SQL (finds violations)
1502
- condition = f"{col} IS NOT NULL AND {col} < {val}"
1503
- condition = self._apply_condition(condition)
1845
+ condition = self._apply_condition(self._violation_condition())
1504
1846
 
1505
1847
  requirement_sql = f"""
1506
1848
  WITH invalid AS (
@@ -1515,7 +1857,9 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
1515
1857
  """
1516
1858
 
1517
1859
  # Predicate SQL (for condition mode)
1518
- predicate_sql = f"{col} IS NOT NULL AND {col} >= {self._lit(val)}"
1860
+ predicate_sql = self._apply_condition(
1861
+ f"{col} IS NOT NULL AND {col} {self.PASS_OPERATOR} {self._lit(val)}"
1862
+ )
1519
1863
 
1520
1864
  return SQLQuery(
1521
1865
  requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
@@ -1524,11 +1868,7 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
1524
1868
  def get_sample_sql(self) -> str:
1525
1869
  """Return SQL to fetch sample violating rows for display"""
1526
1870
  col = self.params.ColumnName
1527
- val = self.params.Value
1528
-
1529
- # Build condition to find violating rows (values less than the required minimum)
1530
- condition = f"{col} IS NOT NULL AND {col} < {val}"
1531
- condition = self._apply_condition(condition)
1871
+ condition = self._apply_condition(self._violation_condition())
1532
1872
 
1533
1873
  return f"""
1534
1874
  SELECT {col}
@@ -1542,7 +1882,7 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
1542
1882
  return self.get_sample_sql()
1543
1883
 
1544
1884
  def getCheckType(self) -> str:
1545
- return "check_greater_equal"
1885
+ return self.CHECK_TYPE
1546
1886
 
1547
1887
  def generatePredicate(self) -> str | None:
1548
1888
  """Backward compatibility wrapper"""
@@ -1552,6 +1892,88 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
1552
1892
  return sql_query.get_predicate_sql()
1553
1893
 
1554
1894
 
1895
+ class CheckGreaterOrEqualGenerator(_CheckScalarComparisonGenerator):
1896
+ PASS_OPERATOR = ">="
1897
+ VIOLATION_OPERATOR = "<"
1898
+ MESSAGE_PHRASE = "greater than or equal to"
1899
+ CHECK_TYPE = "check_greater_equal"
1900
+
1901
+
1902
+ class CheckGreaterThanGenerator(_CheckScalarComparisonGenerator):
1903
+ PASS_OPERATOR = ">"
1904
+ VIOLATION_OPERATOR = "<="
1905
+ MESSAGE_PHRASE = "greater than"
1906
+ CHECK_TYPE = "check_greater_than"
1907
+
1908
+
1909
+ class CheckLessOrEqualGenerator(_CheckScalarComparisonGenerator):
1910
+ PASS_OPERATOR = "<="
1911
+ VIOLATION_OPERATOR = ">"
1912
+ MESSAGE_PHRASE = "less than or equal to"
1913
+ CHECK_TYPE = "check_less_or_equal"
1914
+
1915
+
1916
+ class CheckColumnComparisonGenerator(DuckDBCheckGenerator):
1917
+ REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "Comparator"}
1918
+
1919
+ _VALID_COMPARATORS: ClassVar[Set[str]] = {"=", "!=", "<>", ">", ">=", "<", "<="}
1920
+
1921
+ def generateSql(self) -> SQLQuery:
1922
+ col_a = self.params.ColumnAName
1923
+ col_b = self.params.ColumnBName
1924
+ comparator = self.params.Comparator
1925
+ keyword = self._get_validation_keyword()
1926
+
1927
+ if comparator not in self._VALID_COMPARATORS:
1928
+ raise InvalidRuleException(
1929
+ f"Unsupported comparator for {self.rule_id}: {comparator}"
1930
+ )
1931
+
1932
+ message = self.errorMessage or f"{col_a} {keyword} be {comparator} {col_b}."
1933
+ msg_sql = message.replace("'", "''")
1934
+
1935
+ pass_predicate = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b}"
1936
+ condition = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND NOT ({col_a} {comparator} {col_b})"
1937
+ condition = self._apply_condition(condition)
1938
+
1939
+ requirement_sql = f"""
1940
+ WITH invalid AS (
1941
+ SELECT 1
1942
+ FROM {{table_name}}
1943
+ WHERE {condition}
1944
+ )
1945
+ SELECT
1946
+ COUNT(*) AS violations,
1947
+ CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
1948
+ FROM invalid
1949
+ """
1950
+
1951
+ return SQLQuery(
1952
+ requirement_sql=requirement_sql.strip(),
1953
+ predicate_sql=self._apply_condition(pass_predicate),
1954
+ )
1955
+
1956
+ def get_sample_sql(self) -> str:
1957
+ col_a = self.params.ColumnAName
1958
+ col_b = self.params.ColumnBName
1959
+ comparator = self.params.Comparator
1960
+ condition = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND NOT ({col_a} {comparator} {col_b})"
1961
+ condition = self._apply_condition(condition)
1962
+
1963
+ return f"""
1964
+ SELECT {col_a}, {col_b}
1965
+ FROM {{table_name}}
1966
+ WHERE {condition}
1967
+ """
1968
+
1969
+ @property
1970
+ def sample_sql(self) -> str:
1971
+ return self.get_sample_sql()
1972
+
1973
+ def getCheckType(self) -> str:
1974
+ return "check_column_comparison"
1975
+
1976
+
1555
1977
  class CheckDistinctCountGenerator(DuckDBCheckGenerator):
1556
1978
  REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "ExpectedCount"}
1557
1979
 
@@ -1605,6 +2027,66 @@ class CheckDistinctCountGenerator(DuckDBCheckGenerator):
1605
2027
  return "distinct_count"
1606
2028
 
1607
2029
 
2030
+ class CheckNoDuplicatesGenerator(DuckDBCheckGenerator):
2031
+ REQUIRED_KEYS = {"ColumnName"}
2032
+
2033
+ def generateSql(self) -> SQLQuery:
2034
+ col = self.params.ColumnName
2035
+ keyword = self._get_validation_keyword()
2036
+ message = self.errorMessage or f"{col} {keyword} contain no duplicate values."
2037
+ msg_sql = message.replace("'", "''")
2038
+
2039
+ where_clause = f"WHERE {col} IS NOT NULL"
2040
+ if self.row_condition_sql and self.row_condition_sql.strip():
2041
+ where_clause = f"WHERE ({col} IS NOT NULL) AND ({self.row_condition_sql})"
2042
+
2043
+ requirement_sql = f"""
2044
+ WITH counts AS (
2045
+ SELECT {col} AS value, COUNT(*) AS occurrences
2046
+ FROM {{table_name}}
2047
+ {where_clause}
2048
+ GROUP BY {col}
2049
+ ),
2050
+ invalid AS (
2051
+ SELECT value, occurrences
2052
+ FROM counts
2053
+ WHERE occurrences > 1
2054
+ )
2055
+ SELECT
2056
+ COUNT(*) AS violations,
2057
+ CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
2058
+ FROM invalid
2059
+ """
2060
+
2061
+ return SQLQuery(requirement_sql=requirement_sql.strip(), predicate_sql=None)
2062
+
2063
+ def get_sample_sql(self) -> str:
2064
+ col = self.params.ColumnName
2065
+ where_clause = f"WHERE {col} IS NOT NULL"
2066
+ if self.row_condition_sql and self.row_condition_sql.strip():
2067
+ where_clause = f"WHERE ({col} IS NOT NULL) AND ({self.row_condition_sql})"
2068
+
2069
+ return f"""
2070
+ WITH dupes AS (
2071
+ SELECT {col} AS value
2072
+ FROM {{table_name}}
2073
+ {where_clause}
2074
+ GROUP BY {col}
2075
+ HAVING COUNT(*) > 1
2076
+ )
2077
+ SELECT t.{col}
2078
+ FROM {{table_name}} t
2079
+ JOIN dupes d ON t.{col} = d.value
2080
+ """
2081
+
2082
+ @property
2083
+ def sample_sql(self) -> str:
2084
+ return self.get_sample_sql()
2085
+
2086
+ def getCheckType(self) -> str:
2087
+ return "check_no_duplicates"
2088
+
2089
+
1608
2090
  class CheckModelRuleGenerator(DuckDBCheckGenerator):
1609
2091
  REQUIRED_KEYS = {"ModelRuleId"}
1610
2092
 
@@ -1825,7 +2307,8 @@ class JSONCheckPathTypeGenerator(DuckDBCheckGenerator):
1825
2307
  """
1826
2308
 
1827
2309
  return SQLQuery(
1828
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
2310
+ requirement_sql=requirement_sql.strip(),
2311
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
1829
2312
  )
1830
2313
 
1831
2314
  def getCheckType(self) -> str:
@@ -1947,7 +2430,8 @@ class JSONCheckPathKeyValueFormatGenerator(DuckDBCheckGenerator):
1947
2430
  """
1948
2431
 
1949
2432
  return SQLQuery(
1950
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
2433
+ requirement_sql=requirement_sql.strip(),
2434
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
1951
2435
  )
1952
2436
 
1953
2437
  def getCheckType(self) -> str:
@@ -2080,7 +2564,8 @@ class JSONCheckPathKeyStartsWithGenerator(DuckDBCheckGenerator):
2080
2564
  """
2081
2565
 
2082
2566
  return SQLQuery(
2083
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
2567
+ requirement_sql=requirement_sql.strip(),
2568
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2084
2569
  )
2085
2570
 
2086
2571
  def getCheckType(self) -> str:
@@ -2213,7 +2698,8 @@ class JSONCheckPathKeyExistsGenerator(DuckDBCheckGenerator):
2213
2698
  """
2214
2699
 
2215
2700
  return SQLQuery(
2216
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
2701
+ requirement_sql=requirement_sql.strip(),
2702
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2217
2703
  )
2218
2704
 
2219
2705
  def getCheckType(self) -> str:
@@ -2384,7 +2870,8 @@ class JSONCheckPathValueGenerator(DuckDBCheckGenerator):
2384
2870
  """
2385
2871
 
2386
2872
  return SQLQuery(
2387
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
2873
+ requirement_sql=requirement_sql.strip(),
2874
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2388
2875
  )
2389
2876
 
2390
2877
  def getCheckType(self) -> str:
@@ -2548,7 +3035,8 @@ class JSONCheckPathNotValueGenerator(DuckDBCheckGenerator):
2548
3035
  """
2549
3036
 
2550
3037
  return SQLQuery(
2551
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3038
+ requirement_sql=requirement_sql.strip(),
3039
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2552
3040
  )
2553
3041
 
2554
3042
  def getCheckType(self) -> str:
@@ -2843,7 +3331,8 @@ class JSONCheckPathSameValueGenerator(DuckDBCheckGenerator):
2843
3331
  """
2844
3332
 
2845
3333
  return SQLQuery(
2846
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3334
+ requirement_sql=requirement_sql.strip(),
3335
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2847
3336
  )
2848
3337
 
2849
3338
  def getCheckType(self) -> str:
@@ -2948,7 +3437,8 @@ class JSONCheckPathNumericFormatGenerator(DuckDBCheckGenerator):
2948
3437
  """
2949
3438
 
2950
3439
  return SQLQuery(
2951
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3440
+ requirement_sql=requirement_sql.strip(),
3441
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
2952
3442
  )
2953
3443
 
2954
3444
  def getCheckType(self) -> str:
@@ -3127,7 +3617,8 @@ class JSONCheckPathUnitFormatGenerator(DuckDBCheckGenerator):
3127
3617
  """
3128
3618
 
3129
3619
  return SQLQuery(
3130
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3620
+ requirement_sql=requirement_sql.strip(),
3621
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3131
3622
  )
3132
3623
 
3133
3624
  def getCheckType(self) -> str:
@@ -3225,7 +3716,8 @@ class JSONCheckPathDistinctParentGenerator(DuckDBCheckGenerator):
3225
3716
  """
3226
3717
 
3227
3718
  return SQLQuery(
3228
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3719
+ requirement_sql=requirement_sql.strip(),
3720
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3229
3721
  )
3230
3722
 
3231
3723
  def getCheckType(self) -> str:
@@ -3281,7 +3773,7 @@ class FormatJSONFormatGenerator(DuckDBCheckGenerator):
3281
3773
 
3282
3774
  return SQLQuery(
3283
3775
  requirement_sql=requirement_sql.strip(),
3284
- predicate_sql=predicate_sql.strip(),
3776
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3285
3777
  )
3286
3778
 
3287
3779
  # Path provided - validate elements at that path
@@ -3360,7 +3852,8 @@ class FormatJSONFormatGenerator(DuckDBCheckGenerator):
3360
3852
  """
3361
3853
 
3362
3854
  return SQLQuery(
3363
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3855
+ requirement_sql=requirement_sql.strip(),
3856
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3364
3857
  )
3365
3858
 
3366
3859
  def getCheckType(self) -> str:
@@ -3465,7 +3958,8 @@ class JSONFormatStringGenerator(DuckDBCheckGenerator):
3465
3958
  """
3466
3959
 
3467
3960
  return SQLQuery(
3468
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
3961
+ requirement_sql=requirement_sql.strip(),
3962
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3469
3963
  )
3470
3964
 
3471
3965
  def getCheckType(self) -> str:
@@ -3652,7 +4146,8 @@ class JSONFormatUnitGenerator(DuckDBCheckGenerator):
3652
4146
  """
3653
4147
 
3654
4148
  return SQLQuery(
3655
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
4149
+ requirement_sql=requirement_sql.strip(),
4150
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3656
4151
  )
3657
4152
 
3658
4153
  def getCheckType(self) -> str:
@@ -3764,7 +4259,8 @@ class JSONFormatNumericGenerator(DuckDBCheckGenerator):
3764
4259
  """
3765
4260
 
3766
4261
  return SQLQuery(
3767
- requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql.strip()
4262
+ requirement_sql=requirement_sql.strip(),
4263
+ predicate_sql=self._apply_condition(predicate_sql.strip()),
3768
4264
  )
3769
4265
 
3770
4266
  def getCheckType(self) -> str:
@@ -4343,6 +4839,10 @@ class FocusToDuckDBSchemaConverter:
4343
4839
  "generator": TypeStringCheckGenerator,
4344
4840
  "factory": lambda args: "ColumnName",
4345
4841
  },
4842
+ "TypeJSON": {
4843
+ "generator": TypeJSONCheckGenerator,
4844
+ "factory": lambda args: "ColumnName",
4845
+ },
4346
4846
  "TypeDecimal": {
4347
4847
  "generator": TypeDecimalCheckGenerator,
4348
4848
  "factory": lambda args: "ColumnName",
@@ -4367,6 +4867,10 @@ class FocusToDuckDBSchemaConverter:
4367
4867
  "generator": FormatBillingCurrencyCodeGenerator,
4368
4868
  "factory": lambda args: "ColumnName",
4369
4869
  },
4870
+ "FormatJSON": {
4871
+ "generator": FormatJSONGenerator,
4872
+ "factory": lambda args: "ColumnName",
4873
+ },
4370
4874
  "FormatKeyValue": {
4371
4875
  "generator": FormatJSONGenerator,
4372
4876
  "factory": lambda args: "ColumnName",
@@ -4375,10 +4879,18 @@ class FocusToDuckDBSchemaConverter:
4375
4879
  "generator": FormatCurrencyGenerator,
4376
4880
  "factory": lambda args: "ColumnName",
4377
4881
  },
4882
+ "CheckColumnComparison": {
4883
+ "generator": CheckColumnComparisonGenerator,
4884
+ "factory": lambda args: "ColumnAName",
4885
+ },
4378
4886
  "CheckNationalCurrency": {
4379
4887
  "generator": FormatBillingCurrencyCodeGenerator,
4380
4888
  "factory": lambda args: "ColumnName",
4381
4889
  },
4890
+ "CheckGreaterThanValue": {
4891
+ "generator": CheckGreaterThanGenerator,
4892
+ "factory": lambda args: "ColumnName",
4893
+ },
4382
4894
  "FormatUnit": {
4383
4895
  "generator": FormatUnitGenerator,
4384
4896
  "factory": lambda args: "ColumnName",
@@ -4387,10 +4899,26 @@ class FocusToDuckDBSchemaConverter:
4387
4899
  "generator": CheckValueGenerator,
4388
4900
  "factory": lambda args: "ColumnName",
4389
4901
  },
4902
+ "CheckLessOrEqualThanValue": {
4903
+ "generator": CheckLessOrEqualGenerator,
4904
+ "factory": lambda args: "ColumnName",
4905
+ },
4390
4906
  "CheckNotValue": {
4391
4907
  "generator": CheckNotValueGenerator,
4392
4908
  "factory": lambda args: "ColumnName",
4393
4909
  },
4910
+ "CheckNoDuplicates": {
4911
+ "generator": CheckNoDuplicatesGenerator,
4912
+ "factory": lambda args: "ColumnName",
4913
+ },
4914
+ "CheckRegexMatch": {
4915
+ "generator": CheckRegexMatchGenerator,
4916
+ "factory": lambda args: "ColumnName",
4917
+ },
4918
+ "CheckStringEndsWith": {
4919
+ "generator": CheckStringEndsWithGenerator,
4920
+ "factory": lambda args: "ColumnName",
4921
+ },
4394
4922
  "CheckSameValue": {
4395
4923
  "generator": CheckSameValueGenerator,
4396
4924
  "factory": lambda args: "ColumnAName",
@@ -4415,6 +4943,10 @@ class FocusToDuckDBSchemaConverter:
4415
4943
  "generator": CheckModelRuleGenerator,
4416
4944
  "factory": lambda args: "ModelRuleId",
4417
4945
  },
4946
+ "CheckJSONSchema": {
4947
+ "generator": CheckJSONSchemaGenerator,
4948
+ "factory": lambda args: "ColumnName",
4949
+ },
4418
4950
  "AND": {
4419
4951
  "generator": CompositeANDRuleGenerator,
4420
4952
  "factory": lambda args: "Items",
@@ -4595,6 +5127,7 @@ class FocusToDuckDBSchemaConverter:
4595
5127
  transpile_dialect: Optional[str] = None,
4596
5128
  show_violations: bool = False,
4597
5129
  rules_version: Optional[str] = None,
5130
+ schemas: Optional[Dict[str, Any]] = None,
4598
5131
  ) -> None:
4599
5132
  self.log = logging.getLogger(f"{__name__}.{self.__class__.__qualname__}")
4600
5133
  self.conn: duckdb.DuckDBPyConnection | None = None
@@ -4608,6 +5141,7 @@ class FocusToDuckDBSchemaConverter:
4608
5141
  )
4609
5142
  self.show_violations = show_violations
4610
5143
  self.rules_version = rules_version
5144
+ self.schemas = schemas or {}
4611
5145
 
4612
5146
  # Build the effective CHECK_GENERATORS mapping for this version
4613
5147
  self.CHECK_GENERATORS = self._build_check_generators_for_version(rules_version)
@@ -5304,7 +5838,7 @@ class FocusToDuckDBSchemaConverter:
5304
5838
  gen_cls = reg["generator"]
5305
5839
 
5306
5840
  # Strip reserved + 'CheckFunction' and pass as-is (no aliasing)
5307
- reserved = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set()
5841
+ reserved: set = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set()
5308
5842
  params = {
5309
5843
  k: v
5310
5844
  for k, v in requirement.items()
@@ -5374,6 +5908,8 @@ class FocusToDuckDBSchemaConverter:
5374
5908
  rule_id=rule_id,
5375
5909
  plan=self.plan,
5376
5910
  conn=self.conn,
5911
+ schemas=self.schemas,
5912
+ table_name=self.table_name,
5377
5913
  parent_results_by_idx=parent_results_by_idx or {},
5378
5914
  parent_edges=parent_edges or (),
5379
5915
  row_condition_sql=row_condition_sql,
@@ -5683,7 +6219,7 @@ class FocusToDuckDBSchemaConverter:
5683
6219
  gen_cls = reg["generator"]
5684
6220
 
5685
6221
  # Basic required-key validation (optional)
5686
- required = getattr(gen_cls, "REQUIRED_KEYS", set()) or set()
6222
+ required: set = getattr(gen_cls, "REQUIRED_KEYS", set()) or set()
5687
6223
  missing = [k for k in required if k not in spec]
5688
6224
  if missing:
5689
6225
  # For conditions, you can choose to return None or raise
@@ -5911,15 +6447,20 @@ class FocusToDuckDBSchemaConverter:
5911
6447
  # Conformance reference / special executor (no SQL)
5912
6448
  special = getattr(check, "special_executor", None)
5913
6449
  if callable(special):
6450
+ special_kind = meta.get("special_executor_kind")
5914
6451
  return {
5915
6452
  "rule_id": rid,
5916
- "type": "reference",
6453
+ "type": "special",
5917
6454
  "check_type": ctype,
5918
6455
  "generator": meta.get("generator"),
5919
6456
  "row_condition_sql": meta.get("row_condition_sql"),
5920
6457
  "referenced": getattr(check, "referenced_rule_id", None),
5921
6458
  "sql": None, # executed by reference, not SQL
5922
- "note": "mirrors referenced rule outcome (no SQL)",
6459
+ "note": (
6460
+ "mirrors referenced rule outcome (no SQL)"
6461
+ if special_kind == "reference"
6462
+ else "executed via special executor (no SQL)"
6463
+ ),
5923
6464
  "must_satisfy": must_satisfy,
5924
6465
  }
5925
6466
 
@@ -58,14 +58,23 @@ class ParquetDataLoader:
58
58
  # Try multiple datetime parsing strategies
59
59
  converted = None
60
60
 
61
+ # A strategy succeeds only if parsing introduces no nulls
62
+ # beyond those already present (so nullable columns convert),
63
+ # and the column has at least one real value (an all-null
64
+ # column carries no evidence of being a datetime, so it is
65
+ # left as a string rather than coerced).
66
+ original_null_count = series.null_count()
67
+ has_values = original_null_count < series.len()
68
+
61
69
  # Strategy 1: Try ISO format with timezone
62
70
  try:
63
71
  candidate = series.str.to_datetime(
64
72
  format="%Y-%m-%dT%H:%M:%S%z", # ISO with timezone like -05:00
65
73
  strict=False,
66
74
  )
67
- # Check if conversion was successful (all values converted)
68
- if candidate.null_count() == 0:
75
+ # Accept if parsing added no new nulls (nullable columns)
76
+ # and at least one value actually parsed
77
+ if has_values and candidate.null_count() == original_null_count:
69
78
  converted = candidate
70
79
  except Exception:
71
80
  pass
@@ -77,8 +86,12 @@ class ParquetDataLoader:
77
86
  format="%Y-%m-%dT%H:%M:%SZ", # ISO with Z timezone
78
87
  strict=False,
79
88
  )
80
- # Check if conversion was successful (all values converted)
81
- if candidate.null_count() == 0:
89
+ # Accept if parsing added no new nulls (nullable
90
+ # columns) and at least one value actually parsed
91
+ if (
92
+ has_values
93
+ and candidate.null_count() == original_null_count
94
+ ):
82
95
  converted = candidate
83
96
  except Exception:
84
97
  pass
@@ -90,8 +103,12 @@ class ParquetDataLoader:
90
103
  format="%Y-%m-%d %H:%M:%S", # Space-separated format
91
104
  strict=False,
92
105
  )
93
- # Check if conversion was successful (all values converted)
94
- if candidate.null_count() == 0:
106
+ # Accept if parsing added no new nulls (nullable
107
+ # columns) and at least one value actually parsed
108
+ if (
109
+ has_values
110
+ and candidate.null_count() == original_null_count
111
+ ):
95
112
  converted = candidate
96
113
  except Exception:
97
114
  pass
@@ -102,8 +119,12 @@ class ParquetDataLoader:
102
119
  candidate = series.str.to_datetime(
103
120
  format="%Y-%m-%d", strict=False # Simple date format
104
121
  )
105
- # Check if conversion was successful (all values converted)
106
- if candidate.null_count() == 0:
122
+ # Accept if parsing added no new nulls (nullable
123
+ # columns) and at least one value actually parsed
124
+ if (
125
+ has_values
126
+ and candidate.null_count() == original_null_count
127
+ ):
107
128
  converted = candidate
108
129
  except Exception:
109
130
  pass
@@ -147,14 +168,21 @@ class ParquetDataLoader:
147
168
  series.name, converted_values, dtype=pl.Datetime("us")
148
169
  )
149
170
 
150
- # Check if we successfully converted all values
151
- if candidate.null_count() == 0:
171
+ # Accept if parsing added no new nulls (nullable
172
+ # columns) and at least one value actually parsed
173
+ if (
174
+ has_values
175
+ and candidate.null_count() == original_null_count
176
+ ):
152
177
  converted = candidate
153
178
 
154
179
  except Exception:
155
180
  pass
156
181
 
157
- # Strategy 6: Let Polars infer format (for fallback cases)
182
+ # Strategy 6: Let Polars infer the format (fallback for any
183
+ # single format strategies 1-5 did not match). Format inference
184
+ # cannot parse timezone-qualified ISO (trailing 'Z' or offsets),
185
+ # but strategies 1-2 already cover those.
158
186
  if converted is None:
159
187
  try:
160
188
  candidate = series.str.to_datetime(
@@ -163,8 +191,12 @@ class ParquetDataLoader:
163
191
  exact=False, # Allow partial matches
164
192
  cache=True, # Cache format inference
165
193
  )
166
- # For auto-inference, allow some nulls but require most values to convert
167
- if candidate.null_count() < len(candidate):
194
+ # Accept if parsing added no new nulls (nullable columns)
195
+ # and at least one value actually parsed
196
+ if (
197
+ has_values
198
+ and candidate.null_count() == original_null_count
199
+ ):
168
200
  converted = candidate
169
201
  except Exception:
170
202
  pass
@@ -442,6 +442,7 @@ class SpecRules:
442
442
 
443
443
  self.plan = val_plan
444
444
  self.column_types = column_types
445
+ self.model_data = model_data
445
446
  self._meta = {
446
447
  "json_rule_file": self.json_rule_file,
447
448
  "focus_dataset": self.focus_dataset,
@@ -482,6 +483,7 @@ class SpecRules:
482
483
  transpile_dialect=self.transpile_dialect,
483
484
  show_violations=show_violations,
484
485
  rules_version=self.rules_version,
486
+ schemas=getattr(self, "model_data", {}).get("Schemas", {}),
485
487
  )
486
488
  # 1) Let the converter prepare schemas, UDFs, temp views, etc.
487
489
  if connection is None:
@@ -620,6 +622,7 @@ class SpecRules:
620
622
  transpile_dialect=self.transpile_dialect,
621
623
  show_violations=False, # Not relevant for explain mode
622
624
  rules_version=self.rules_version,
625
+ schemas=getattr(self, "model_data", {}).get("Schemas", {}),
623
626
  )
624
627
 
625
628
  # Create a minimal connection for explain mode (converter needs it for initialization)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "focus_validator"
3
- version = "2.1.0"
3
+ version = "2.2.0"
4
4
  description = "FOCUS spec validator."
5
5
  authors = []
6
6
  readme = "README.md"
@@ -26,6 +26,7 @@ requests = "*"
26
26
  pandera = { version = "^0.26.1" }
27
27
  multimethod = ">=2.0,<2.1"
28
28
  sqlglot = "^27.28.1"
29
+ jsonschema = "^4.25.1"
29
30
  numpy = { version = "^1.26"}
30
31
  pytz = "^2025.2"
31
32
  pandasql = "^0.7.3"
File without changes