focus-validator 2.1.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {focus_validator-2.1.0 → focus_validator-2.2.0}/PKG-INFO +2 -1
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/focus_to_duckdb_converter.py +599 -58
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/parquet_data_loader.py +45 -13
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/spec_rules.py +3 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/pyproject.toml +2 -1
- {focus_validator-2.1.0 → focus_validator-2.2.0}/LICENSE +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/README.md +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/build.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config/logging.yaml +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/common.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/json_loader.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/plan_builder.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/rule.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/rule_dependency_resolver.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/csv_data_loader.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/csv_data_loader_pandas_backup.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/data_loader.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/exceptions.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/main.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_console.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_unittest.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_validation_graph.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_web.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/currency_codes.csv +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/rules/model-1.2.0.1.json +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/__init__.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/download_currency_codes.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/performance_logging.py +0 -0
- {focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: focus_validator
|
|
3
|
-
Version: 2.1.0
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: FOCUS spec validator.
|
|
5
5
|
License-File: LICENSE
|
|
6
6
|
Requires-Python: >=3.12,<4.0
|
|
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
11
11
|
Requires-Dist: ddt (>=1.7.1,<2.0.0)
|
|
12
12
|
Requires-Dist: duckdb (>=1.4.1,<2.0.0)
|
|
13
13
|
Requires-Dist: graphviz (>=0.21,<0.22)
|
|
14
|
+
Requires-Dist: jsonschema (>=4.25.1,<5.0.0)
|
|
14
15
|
Requires-Dist: multimethod (>=2.0,<2.1)
|
|
15
16
|
Requires-Dist: numpy (>=1.26,<2.0)
|
|
16
17
|
Requires-Dist: pandas (>=2,<3)
|
|
@@ -548,7 +548,9 @@ class TypeStringCheckGenerator(DuckDBCheckGenerator):
|
|
|
548
548
|
"""
|
|
549
549
|
|
|
550
550
|
# Predicate SQL (for condition mode)
|
|
551
|
-
predicate_sql =
|
|
551
|
+
predicate_sql = self._apply_condition(
|
|
552
|
+
f"{col} IS NOT NULL AND typeof({col}) = 'VARCHAR'"
|
|
553
|
+
)
|
|
552
554
|
|
|
553
555
|
return SQLQuery(
|
|
554
556
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -558,6 +560,42 @@ class TypeStringCheckGenerator(DuckDBCheckGenerator):
|
|
|
558
560
|
return "type_string"
|
|
559
561
|
|
|
560
562
|
|
|
563
|
+
class TypeJSONCheckGenerator(DuckDBCheckGenerator):
|
|
564
|
+
REQUIRED_KEYS = {"ColumnName"}
|
|
565
|
+
|
|
566
|
+
def generateSql(self) -> SQLQuery:
|
|
567
|
+
col = self.params.ColumnName
|
|
568
|
+
keyword = self._get_validation_keyword()
|
|
569
|
+
message = self.errorMessage or f"{col} {keyword} be of type JSON."
|
|
570
|
+
msg_sql = message.replace("'", "''")
|
|
571
|
+
|
|
572
|
+
condition = f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))"
|
|
573
|
+
condition = self._apply_condition(condition)
|
|
574
|
+
|
|
575
|
+
requirement_sql = f"""
|
|
576
|
+
WITH invalid AS (
|
|
577
|
+
SELECT 1
|
|
578
|
+
FROM {{table_name}}
|
|
579
|
+
WHERE {condition}
|
|
580
|
+
)
|
|
581
|
+
SELECT
|
|
582
|
+
COUNT(*) AS violations,
|
|
583
|
+
CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
|
|
584
|
+
FROM invalid
|
|
585
|
+
"""
|
|
586
|
+
|
|
587
|
+
predicate_sql = self._apply_condition(
|
|
588
|
+
f"{col} IS NOT NULL AND typeof({col}) = 'JSON'"
|
|
589
|
+
)
|
|
590
|
+
|
|
591
|
+
return SQLQuery(
|
|
592
|
+
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
593
|
+
)
|
|
594
|
+
|
|
595
|
+
def getCheckType(self) -> str:
|
|
596
|
+
return "type_json"
|
|
597
|
+
|
|
598
|
+
|
|
561
599
|
class TypeDecimalCheckGenerator(DuckDBCheckGenerator):
|
|
562
600
|
REQUIRED_KEYS = {"ColumnName"}
|
|
563
601
|
|
|
@@ -590,7 +628,7 @@ class TypeDecimalCheckGenerator(DuckDBCheckGenerator):
|
|
|
590
628
|
"""
|
|
591
629
|
|
|
592
630
|
# Predicate SQL (for condition mode)
|
|
593
|
-
predicate_sql = (
|
|
631
|
+
predicate_sql = self._apply_condition(
|
|
594
632
|
f"{col} IS NOT NULL AND typeof({col}) IN ('DECIMAL', 'DOUBLE', 'FLOAT')"
|
|
595
633
|
)
|
|
596
634
|
|
|
@@ -639,7 +677,7 @@ class TypeDateTimeGenerator(DuckDBCheckGenerator):
|
|
|
639
677
|
"""
|
|
640
678
|
|
|
641
679
|
# Predicate SQL (for condition mode)
|
|
642
|
-
predicate_sql = (
|
|
680
|
+
predicate_sql = self._apply_condition(
|
|
643
681
|
f"{col} IS NOT NULL "
|
|
644
682
|
f"AND (typeof({col}) IN ('TIMESTAMP', 'TIMESTAMP_NS', 'TIMESTAMP WITH TIME ZONE', 'DATE') "
|
|
645
683
|
f"OR ({col}::TEXT ~ '^[0-9]{{4}}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]Z$'))"
|
|
@@ -684,7 +722,9 @@ class FormatNumericGenerator(DuckDBCheckGenerator):
|
|
|
684
722
|
"""
|
|
685
723
|
|
|
686
724
|
# Predicate SQL (for condition mode)
|
|
687
|
-
predicate_sql =
|
|
725
|
+
predicate_sql = self._apply_condition(
|
|
726
|
+
f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[+-]?([0-9]*[.])?[0-9]+([eE][+-]?[0-9]+)?$')"
|
|
727
|
+
)
|
|
688
728
|
|
|
689
729
|
return SQLQuery(
|
|
690
730
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -741,7 +781,9 @@ class FormatStringGenerator(DuckDBCheckGenerator):
|
|
|
741
781
|
"""
|
|
742
782
|
|
|
743
783
|
# Predicate SQL (for condition mode)
|
|
744
|
-
predicate_sql =
|
|
784
|
+
predicate_sql = self._apply_condition(
|
|
785
|
+
f"{col} IS NOT NULL AND ({col}::TEXT ~ '^[\\x00-\\x7F]*$')"
|
|
786
|
+
)
|
|
745
787
|
|
|
746
788
|
return SQLQuery(
|
|
747
789
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -806,7 +848,7 @@ class FormatDateTimeGenerator(DuckDBCheckGenerator):
|
|
|
806
848
|
"""
|
|
807
849
|
|
|
808
850
|
# Predicate SQL (for condition mode)
|
|
809
|
-
predicate_sql = (
|
|
851
|
+
predicate_sql = self._apply_condition(
|
|
810
852
|
f"{col} IS NOT NULL "
|
|
811
853
|
f"AND (typeof({col}) IN ('TIMESTAMP', 'TIMESTAMP_NS', 'TIMESTAMP WITH TIME ZONE', 'DATE') "
|
|
812
854
|
f"OR (typeof({col}) = 'VARCHAR' AND {col}::TEXT ~ '^[0-9]{{4}}-[0-1][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]Z?$' "
|
|
@@ -857,7 +899,9 @@ class FormatBillingCurrencyCodeGenerator(DuckDBCheckGenerator):
|
|
|
857
899
|
"""
|
|
858
900
|
|
|
859
901
|
# Predicate SQL (for condition mode)
|
|
860
|
-
predicate_sql =
|
|
902
|
+
predicate_sql = self._apply_condition(
|
|
903
|
+
f"{col} IS NOT NULL AND TRIM({col}::TEXT) IN ('{codes_list}')"
|
|
904
|
+
)
|
|
861
905
|
|
|
862
906
|
return SQLQuery(
|
|
863
907
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -897,7 +941,9 @@ class FormatCurrencyGenerator(DuckDBCheckGenerator):
|
|
|
897
941
|
"""
|
|
898
942
|
|
|
899
943
|
# Predicate SQL (for condition mode)
|
|
900
|
-
predicate_sql =
|
|
944
|
+
predicate_sql = self._apply_condition(
|
|
945
|
+
f"{col} IS NOT NULL AND (TRIM({col}::TEXT) ~ '^[A-Z]{{3}}$')"
|
|
946
|
+
)
|
|
901
947
|
|
|
902
948
|
return SQLQuery(
|
|
903
949
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -1035,7 +1081,7 @@ class FormatUnitGenerator(DuckDBCheckGenerator):
|
|
|
1035
1081
|
"""
|
|
1036
1082
|
|
|
1037
1083
|
# Predicate SQL (for condition mode)
|
|
1038
|
-
predicate_sql = (
|
|
1084
|
+
predicate_sql = self._apply_condition(
|
|
1039
1085
|
f"{col} IS NOT NULL AND regexp_matches({col}, '{combined_pattern}')"
|
|
1040
1086
|
)
|
|
1041
1087
|
|
|
@@ -1081,16 +1127,10 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
|
|
|
1081
1127
|
message = self.errorMessage or f"{col} {keyword} be valid JSON format"
|
|
1082
1128
|
msg_sql = message.replace("'", "''")
|
|
1083
1129
|
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
# 1. Cannot be cast to JSON, or
|
|
1087
|
-
# 2. Is not a valid JSON string when treated as text
|
|
1088
|
-
condition = (
|
|
1089
|
-
f"{col} IS NOT NULL "
|
|
1090
|
-
f"AND (TRY_CAST({col} AS JSON) IS NULL "
|
|
1091
|
-
f"OR (typeof({col}) = 'VARCHAR' AND NOT json_valid({col}::TEXT)))"
|
|
1130
|
+
invalid_predicate = (
|
|
1131
|
+
f"{col} IS NOT NULL AND NOT json_valid(CAST({col} AS VARCHAR))"
|
|
1092
1132
|
)
|
|
1093
|
-
condition = self._apply_condition(condition)
|
|
1133
|
+
condition = self._apply_condition(invalid_predicate)
|
|
1094
1134
|
|
|
1095
1135
|
requirement_sql = f"""
|
|
1096
1136
|
WITH invalid AS (
|
|
@@ -1104,11 +1144,8 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
|
|
|
1104
1144
|
FROM invalid
|
|
1105
1145
|
"""
|
|
1106
1146
|
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
f"{col} IS NOT NULL "
|
|
1110
|
-
f"AND (TRY_CAST({col} AS JSON) IS NOT NULL "
|
|
1111
|
-
f"OR (typeof({col}) = 'VARCHAR' AND json_valid({col}::TEXT)))"
|
|
1147
|
+
predicate_sql = self._apply_condition(
|
|
1148
|
+
f"{col} IS NOT NULL AND json_valid(CAST({col} AS VARCHAR))"
|
|
1112
1149
|
)
|
|
1113
1150
|
|
|
1114
1151
|
return SQLQuery(
|
|
@@ -1119,6 +1156,178 @@ class FormatJSONGenerator(DuckDBCheckGenerator):
|
|
|
1119
1156
|
return "format_json"
|
|
1120
1157
|
|
|
1121
1158
|
|
|
1159
|
+
class CheckJSONSchemaGenerator(DuckDBCheckGenerator):
|
|
1160
|
+
REQUIRED_KEYS = {"ColumnName", "SchemaId"}
|
|
1161
|
+
DEFAULTS = {"Path": "$"}
|
|
1162
|
+
|
|
1163
|
+
def getCheckType(self) -> str:
|
|
1164
|
+
return "json_schema"
|
|
1165
|
+
|
|
1166
|
+
def generateSql(self) -> SQLQuery:
|
|
1167
|
+
col = self.params.ColumnName
|
|
1168
|
+
schema_id = self.params.SchemaId
|
|
1169
|
+
keyword = self._get_validation_keyword()
|
|
1170
|
+
self.errorMessage = (
|
|
1171
|
+
self.errorMessage or f"{col} {keyword} conform to JSON Schema '{schema_id}'"
|
|
1172
|
+
)
|
|
1173
|
+
return SQLQuery(requirement_sql="SELECT 0 AS violations")
|
|
1174
|
+
|
|
1175
|
+
def _extract_path_value(self, payload: Any, path: str) -> Any:
|
|
1176
|
+
"""Extract a value from a JSON payload using a limited JSONPath subset.
|
|
1177
|
+
|
|
1178
|
+
Supported: '$', '$.key', '$.key.nested', '$.key[0]'.
|
|
1179
|
+
Not supported: chained indices ('$.foo[0][1]'), bracket-key access
|
|
1180
|
+
('$["foo bar"]'), wildcards, or filters.
|
|
1181
|
+
"""
|
|
1182
|
+
if path == "$":
|
|
1183
|
+
return payload
|
|
1184
|
+
|
|
1185
|
+
if not path.startswith("$."):
|
|
1186
|
+
raise InvalidRuleException(
|
|
1187
|
+
f"Unsupported JSON path '{path}' for CheckJSONSchema in rule {self.rule_id}"
|
|
1188
|
+
)
|
|
1189
|
+
|
|
1190
|
+
current = payload
|
|
1191
|
+
for segment in path[2:].split("."):
|
|
1192
|
+
if current is None:
|
|
1193
|
+
return None
|
|
1194
|
+
|
|
1195
|
+
token = segment
|
|
1196
|
+
while token:
|
|
1197
|
+
array_match = re.match(
|
|
1198
|
+
r"^([A-Za-z_][A-Za-z0-9_]*)(\[(\d+)\])?(.*)$", token
|
|
1199
|
+
)
|
|
1200
|
+
if not array_match:
|
|
1201
|
+
raise InvalidRuleException(
|
|
1202
|
+
f"Unsupported JSON path segment '{segment}' for CheckJSONSchema in rule {self.rule_id}"
|
|
1203
|
+
)
|
|
1204
|
+
|
|
1205
|
+
key_name, _, array_idx, remainder = array_match.groups()
|
|
1206
|
+
if not isinstance(current, dict):
|
|
1207
|
+
return None
|
|
1208
|
+
current = current.get(key_name)
|
|
1209
|
+
|
|
1210
|
+
if array_idx is not None:
|
|
1211
|
+
if not isinstance(current, list):
|
|
1212
|
+
return None
|
|
1213
|
+
idx = int(array_idx)
|
|
1214
|
+
if idx >= len(current):
|
|
1215
|
+
return None
|
|
1216
|
+
current = current[idx]
|
|
1217
|
+
|
|
1218
|
+
token = remainder or ""
|
|
1219
|
+
|
|
1220
|
+
return current
|
|
1221
|
+
|
|
1222
|
+
def generateCheck(self) -> DuckDBColumnCheck:
|
|
1223
|
+
chk = super().generateCheck()
|
|
1224
|
+
|
|
1225
|
+
schema_map = getattr(self.params, "schemas", None) or {}
|
|
1226
|
+
schema_id = self.params.SchemaId
|
|
1227
|
+
schema_entry = schema_map.get(schema_id)
|
|
1228
|
+
|
|
1229
|
+
if not isinstance(schema_entry, dict) or "Schema" not in schema_entry:
|
|
1230
|
+
raise InvalidRuleException(
|
|
1231
|
+
f"SchemaId '{schema_id}' referenced by rule {self.rule_id} was not found in model Schemas"
|
|
1232
|
+
)
|
|
1233
|
+
|
|
1234
|
+
schema = schema_entry["Schema"]
|
|
1235
|
+
path = getattr(self.params, "Path", "$")
|
|
1236
|
+
col = self.params.ColumnName
|
|
1237
|
+
where_clauses = [f"{col} IS NOT NULL"]
|
|
1238
|
+
row_condition = (self.row_condition_sql or "").strip()
|
|
1239
|
+
if row_condition:
|
|
1240
|
+
where_clauses.append(f"({row_condition})")
|
|
1241
|
+
|
|
1242
|
+
query = f"SELECT {col} FROM {{table_name}} WHERE " + " AND ".join(where_clauses)
|
|
1243
|
+
|
|
1244
|
+
def _exec_json_schema(conn):
|
|
1245
|
+
try:
|
|
1246
|
+
from jsonschema import ( # type: ignore[import-untyped]
|
|
1247
|
+
Draft202012Validator,
|
|
1248
|
+
)
|
|
1249
|
+
except ModuleNotFoundError as exc:
|
|
1250
|
+
raise RuntimeError(
|
|
1251
|
+
"CheckJSONSchema requires the 'jsonschema' package to be installed"
|
|
1252
|
+
) from exc
|
|
1253
|
+
|
|
1254
|
+
Draft202012Validator.check_schema(schema)
|
|
1255
|
+
validator = Draft202012Validator(schema)
|
|
1256
|
+
table_name = getattr(self.params, "table_name", "focus_data")
|
|
1257
|
+
sql = query.replace("{table_name}", table_name)
|
|
1258
|
+
sql = sql.replace("{table_name}", table_name)
|
|
1259
|
+
try:
|
|
1260
|
+
rows = conn.execute(sql).fetchall()
|
|
1261
|
+
except (duckdb.BinderException, duckdb.CatalogException) as exc:
|
|
1262
|
+
msg = str(exc)
|
|
1263
|
+
missing = []
|
|
1264
|
+
patterns = [
|
|
1265
|
+
r'Column with name ([A-Za-z0-9_"]+) does not exist',
|
|
1266
|
+
r'Referenced column "([A-Za-z0-9_]+)" not found',
|
|
1267
|
+
r'Binder Error: .*? column ([A-Za-z0-9_"]+)',
|
|
1268
|
+
r'"([A-Za-z0-9_]+)" not found',
|
|
1269
|
+
]
|
|
1270
|
+
for pattern in patterns:
|
|
1271
|
+
for match in re.finditer(pattern, msg):
|
|
1272
|
+
col_name = match.group(1).strip('"')
|
|
1273
|
+
if col_name and col_name not in missing:
|
|
1274
|
+
missing.append(col_name)
|
|
1275
|
+
|
|
1276
|
+
missing_msg = (
|
|
1277
|
+
f"Missing columns: {', '.join(missing)}"
|
|
1278
|
+
if missing
|
|
1279
|
+
else "Missing required column(s)"
|
|
1280
|
+
)
|
|
1281
|
+
return False, {
|
|
1282
|
+
"violations": 1,
|
|
1283
|
+
"schema_id": schema_id,
|
|
1284
|
+
"message": f"{self.errorMessage}. {missing_msg}",
|
|
1285
|
+
"failure_reason": missing_msg,
|
|
1286
|
+
"error_type": "missing_columns",
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
failure_messages: list[str] = []
|
|
1290
|
+
violations = 0
|
|
1291
|
+
for row_num, row in enumerate(rows, start=1):
|
|
1292
|
+
raw_value = row[0] if isinstance(row, (tuple, list)) else row
|
|
1293
|
+
try:
|
|
1294
|
+
payload = (
|
|
1295
|
+
json.loads(raw_value)
|
|
1296
|
+
if isinstance(raw_value, str)
|
|
1297
|
+
else raw_value
|
|
1298
|
+
)
|
|
1299
|
+
except Exception as exc:
|
|
1300
|
+
violations += 1
|
|
1301
|
+
failure_messages.append(f"row {row_num}: invalid JSON ({exc})")
|
|
1302
|
+
continue
|
|
1303
|
+
|
|
1304
|
+
instance = self._extract_path_value(payload, path)
|
|
1305
|
+
errors = sorted(
|
|
1306
|
+
validator.iter_errors(instance), key=lambda err: list(err.path)
|
|
1307
|
+
)
|
|
1308
|
+
if errors:
|
|
1309
|
+
violations += 1
|
|
1310
|
+
failure_messages.append(f"row {row_num}: {errors[0].message}")
|
|
1311
|
+
|
|
1312
|
+
ok = violations == 0
|
|
1313
|
+
details = {
|
|
1314
|
+
"violations": violations,
|
|
1315
|
+
"schema_id": schema_id,
|
|
1316
|
+
"message": (
|
|
1317
|
+
self.errorMessage
|
|
1318
|
+
if ok
|
|
1319
|
+
else f"{self.errorMessage}. First error: {failure_messages[0]}"
|
|
1320
|
+
),
|
|
1321
|
+
}
|
|
1322
|
+
if failure_messages:
|
|
1323
|
+
details["failure_messages"] = failure_messages[:5]
|
|
1324
|
+
return ok, details
|
|
1325
|
+
|
|
1326
|
+
chk.special_executor = _exec_json_schema
|
|
1327
|
+
chk.meta["special_executor_kind"] = "json_schema"
|
|
1328
|
+
return chk
|
|
1329
|
+
|
|
1330
|
+
|
|
1122
1331
|
class CheckValueGenerator(DuckDBCheckGenerator):
|
|
1123
1332
|
REQUIRED_KEYS = {"ColumnName", "Value"}
|
|
1124
1333
|
|
|
@@ -1157,7 +1366,8 @@ class CheckValueGenerator(DuckDBCheckGenerator):
|
|
|
1157
1366
|
"""
|
|
1158
1367
|
|
|
1159
1368
|
return SQLQuery(
|
|
1160
|
-
requirement_sql=requirement_sql.strip(),
|
|
1369
|
+
requirement_sql=requirement_sql.strip(),
|
|
1370
|
+
predicate_sql=self._apply_condition(predicate),
|
|
1161
1371
|
)
|
|
1162
1372
|
|
|
1163
1373
|
def get_sample_sql(self) -> str:
|
|
@@ -1246,7 +1456,8 @@ class CheckNotValueGenerator(DuckDBCheckGenerator):
|
|
|
1246
1456
|
"""
|
|
1247
1457
|
|
|
1248
1458
|
return SQLQuery(
|
|
1249
|
-
requirement_sql=requirement_sql.strip(),
|
|
1459
|
+
requirement_sql=requirement_sql.strip(),
|
|
1460
|
+
predicate_sql=self._apply_condition(predicate),
|
|
1250
1461
|
)
|
|
1251
1462
|
|
|
1252
1463
|
def get_sample_sql(self) -> str:
|
|
@@ -1286,6 +1497,118 @@ class CheckNotValueGenerator(DuckDBCheckGenerator):
|
|
|
1286
1497
|
return sql_query.get_predicate_sql()
|
|
1287
1498
|
|
|
1288
1499
|
|
|
1500
|
+
class CheckRegexMatchGenerator(DuckDBCheckGenerator):
|
|
1501
|
+
REQUIRED_KEYS = {"ColumnName", "Pattern"}
|
|
1502
|
+
|
|
1503
|
+
def generateSql(self) -> SQLQuery:
|
|
1504
|
+
col = self.params.ColumnName
|
|
1505
|
+
pattern = self.params.Pattern
|
|
1506
|
+
keyword = self._get_validation_keyword()
|
|
1507
|
+
pattern_sql = str(pattern).replace("'", "''")
|
|
1508
|
+
message = self.errorMessage or f"{col} {keyword} match regex '{pattern}'."
|
|
1509
|
+
msg_sql = message.replace("'", "''")
|
|
1510
|
+
|
|
1511
|
+
condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
|
|
1512
|
+
condition = self._apply_condition(condition)
|
|
1513
|
+
|
|
1514
|
+
requirement_sql = f"""
|
|
1515
|
+
WITH invalid AS (
|
|
1516
|
+
SELECT 1
|
|
1517
|
+
FROM {{table_name}}
|
|
1518
|
+
WHERE {condition}
|
|
1519
|
+
)
|
|
1520
|
+
SELECT
|
|
1521
|
+
COUNT(*) AS violations,
|
|
1522
|
+
CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
|
|
1523
|
+
FROM invalid
|
|
1524
|
+
"""
|
|
1525
|
+
|
|
1526
|
+
predicate_sql = self._apply_condition(
|
|
1527
|
+
f"{col} IS NOT NULL AND regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
|
|
1528
|
+
)
|
|
1529
|
+
|
|
1530
|
+
return SQLQuery(
|
|
1531
|
+
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
1532
|
+
)
|
|
1533
|
+
|
|
1534
|
+
def get_sample_sql(self) -> str:
|
|
1535
|
+
col = self.params.ColumnName
|
|
1536
|
+
pattern = self.params.Pattern
|
|
1537
|
+
pattern_sql = str(pattern).replace("'", "''")
|
|
1538
|
+
|
|
1539
|
+
condition = f"{col} IS NOT NULL AND NOT regexp_matches(CAST({col} AS VARCHAR), '{pattern_sql}')"
|
|
1540
|
+
condition = self._apply_condition(condition)
|
|
1541
|
+
|
|
1542
|
+
return f"""
|
|
1543
|
+
SELECT {col}
|
|
1544
|
+
FROM {{table_name}}
|
|
1545
|
+
WHERE {condition}
|
|
1546
|
+
"""
|
|
1547
|
+
|
|
1548
|
+
@property
|
|
1549
|
+
def sample_sql(self) -> str:
|
|
1550
|
+
return self.get_sample_sql()
|
|
1551
|
+
|
|
1552
|
+
def getCheckType(self) -> str:
|
|
1553
|
+
return "check_regex_match"
|
|
1554
|
+
|
|
1555
|
+
|
|
1556
|
+
class CheckStringEndsWithGenerator(DuckDBCheckGenerator):
|
|
1557
|
+
REQUIRED_KEYS = {"ColumnName", "Value"}
|
|
1558
|
+
|
|
1559
|
+
def generateSql(self) -> SQLQuery:
|
|
1560
|
+
col = self.params.ColumnName
|
|
1561
|
+
value = self.params.Value
|
|
1562
|
+
keyword = self._get_validation_keyword()
|
|
1563
|
+
value_sql = str(value).replace("'", "''")
|
|
1564
|
+
message = self.errorMessage or f"{col} {keyword} end with '{value}'."
|
|
1565
|
+
msg_sql = message.replace("'", "''")
|
|
1566
|
+
|
|
1567
|
+
condition = f"{col} IS NOT NULL AND NOT ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
|
|
1568
|
+
condition = self._apply_condition(condition)
|
|
1569
|
+
|
|
1570
|
+
requirement_sql = f"""
|
|
1571
|
+
WITH invalid AS (
|
|
1572
|
+
SELECT 1
|
|
1573
|
+
FROM {{table_name}}
|
|
1574
|
+
WHERE {condition}
|
|
1575
|
+
)
|
|
1576
|
+
SELECT
|
|
1577
|
+
COUNT(*) AS violations,
|
|
1578
|
+
CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
|
|
1579
|
+
FROM invalid
|
|
1580
|
+
"""
|
|
1581
|
+
|
|
1582
|
+
predicate_sql = self._apply_condition(
|
|
1583
|
+
f"{col} IS NOT NULL AND ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
|
|
1584
|
+
)
|
|
1585
|
+
|
|
1586
|
+
return SQLQuery(
|
|
1587
|
+
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
1588
|
+
)
|
|
1589
|
+
|
|
1590
|
+
def get_sample_sql(self) -> str:
|
|
1591
|
+
col = self.params.ColumnName
|
|
1592
|
+
value = self.params.Value
|
|
1593
|
+
value_sql = str(value).replace("'", "''")
|
|
1594
|
+
|
|
1595
|
+
condition = f"{col} IS NOT NULL AND NOT ends_with(CAST({col} AS VARCHAR), '{value_sql}')"
|
|
1596
|
+
condition = self._apply_condition(condition)
|
|
1597
|
+
|
|
1598
|
+
return f"""
|
|
1599
|
+
SELECT {col}
|
|
1600
|
+
FROM {{table_name}}
|
|
1601
|
+
WHERE {condition}
|
|
1602
|
+
"""
|
|
1603
|
+
|
|
1604
|
+
@property
|
|
1605
|
+
def sample_sql(self) -> str:
|
|
1606
|
+
return self.get_sample_sql()
|
|
1607
|
+
|
|
1608
|
+
def getCheckType(self) -> str:
|
|
1609
|
+
return "check_string_ends_with"
|
|
1610
|
+
|
|
1611
|
+
|
|
1289
1612
|
class CheckSameValueGenerator(DuckDBCheckGenerator):
|
|
1290
1613
|
REQUIRED_KEYS = {"ColumnAName", "ColumnBName"}
|
|
1291
1614
|
|
|
@@ -1317,7 +1640,7 @@ class CheckSameValueGenerator(DuckDBCheckGenerator):
|
|
|
1317
1640
|
"""
|
|
1318
1641
|
|
|
1319
1642
|
# Predicate SQL (for condition mode)
|
|
1320
|
-
predicate_sql = (
|
|
1643
|
+
predicate_sql = self._apply_condition(
|
|
1321
1644
|
f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} = {col_b}"
|
|
1322
1645
|
)
|
|
1323
1646
|
|
|
@@ -1395,7 +1718,7 @@ class CheckNotSameValueGenerator(DuckDBCheckGenerator):
|
|
|
1395
1718
|
"""
|
|
1396
1719
|
|
|
1397
1720
|
# Predicate SQL (for condition mode)
|
|
1398
|
-
predicate_sql = (
|
|
1721
|
+
predicate_sql = self._apply_condition(
|
|
1399
1722
|
f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} <> {col_b}"
|
|
1400
1723
|
)
|
|
1401
1724
|
|
|
@@ -1469,7 +1792,9 @@ class ColumnByColumnEqualsColumnValueGenerator(DuckDBCheckGenerator):
|
|
|
1469
1792
|
"""
|
|
1470
1793
|
|
|
1471
1794
|
# Predicate SQL (for condition mode)
|
|
1472
|
-
predicate_sql =
|
|
1795
|
+
predicate_sql = self._apply_condition(
|
|
1796
|
+
f"{a} IS NOT NULL AND {b} IS NOT NULL AND {r} IS NOT NULL AND ({a} * {b}) = {r}"
|
|
1797
|
+
)
|
|
1473
1798
|
|
|
1474
1799
|
return SQLQuery(
|
|
1475
1800
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -1486,21 +1811,38 @@ class ColumnByColumnEqualsColumnValueGenerator(DuckDBCheckGenerator):
|
|
|
1486
1811
|
return sql_query.get_predicate_sql()
|
|
1487
1812
|
|
|
1488
1813
|
|
|
1489
|
-
class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
|
|
1814
|
+
class _CheckScalarComparisonGenerator(DuckDBCheckGenerator):
|
|
1815
|
+
"""Base for single-column scalar comparison checks (>=, >, <=, ...).
|
|
1816
|
+
|
|
1817
|
+
Subclasses differ only by operator and wording, so they set:
|
|
1818
|
+
- PASS_OPERATOR: operator a valid value satisfies (e.g. ">=")
|
|
1819
|
+
- VIOLATION_OPERATOR: its negation, used to find violating rows (e.g. "<")
|
|
1820
|
+
- MESSAGE_PHRASE: human-readable phrase (e.g. "greater than or equal to")
|
|
1821
|
+
- CHECK_TYPE: value returned by getCheckType()
|
|
1822
|
+
"""
|
|
1823
|
+
|
|
1490
1824
|
REQUIRED_KEYS = {"ColumnName", "Value"}
|
|
1825
|
+
PASS_OPERATOR: ClassVar[str]
|
|
1826
|
+
VIOLATION_OPERATOR: ClassVar[str]
|
|
1827
|
+
MESSAGE_PHRASE: ClassVar[str]
|
|
1828
|
+
CHECK_TYPE: ClassVar[str]
|
|
1829
|
+
|
|
1830
|
+
def _violation_condition(self) -> str:
|
|
1831
|
+
col = self.params.ColumnName
|
|
1832
|
+
val = self.params.Value
|
|
1833
|
+
return f"{col} IS NOT NULL AND {col} {self.VIOLATION_OPERATOR} {self._lit(val)}"
|
|
1491
1834
|
|
|
1492
1835
|
def generateSql(self) -> SQLQuery:
|
|
1493
1836
|
col = self.params.ColumnName
|
|
1494
1837
|
val = self.params.Value
|
|
1495
1838
|
keyword = self._get_validation_keyword()
|
|
1496
1839
|
message = (
|
|
1497
|
-
self.errorMessage or f"{col} {keyword} be
|
|
1840
|
+
self.errorMessage or f"{col} {keyword} be {self.MESSAGE_PHRASE} {val}."
|
|
1498
1841
|
)
|
|
1499
1842
|
msg_sql = message.replace("'", "''")
|
|
1500
1843
|
|
|
1501
1844
|
# Requirement SQL (finds violations)
|
|
1502
|
-
condition =
|
|
1503
|
-
condition = self._apply_condition(condition)
|
|
1845
|
+
condition = self._apply_condition(self._violation_condition())
|
|
1504
1846
|
|
|
1505
1847
|
requirement_sql = f"""
|
|
1506
1848
|
WITH invalid AS (
|
|
@@ -1515,7 +1857,9 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
|
|
|
1515
1857
|
"""
|
|
1516
1858
|
|
|
1517
1859
|
# Predicate SQL (for condition mode)
|
|
1518
|
-
predicate_sql =
|
|
1860
|
+
predicate_sql = self._apply_condition(
|
|
1861
|
+
f"{col} IS NOT NULL AND {col} {self.PASS_OPERATOR} {self._lit(val)}"
|
|
1862
|
+
)
|
|
1519
1863
|
|
|
1520
1864
|
return SQLQuery(
|
|
1521
1865
|
requirement_sql=requirement_sql.strip(), predicate_sql=predicate_sql
|
|
@@ -1524,11 +1868,7 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
|
|
|
1524
1868
|
def get_sample_sql(self) -> str:
|
|
1525
1869
|
"""Return SQL to fetch sample violating rows for display"""
|
|
1526
1870
|
col = self.params.ColumnName
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
# Build condition to find violating rows (values less than the required minimum)
|
|
1530
|
-
condition = f"{col} IS NOT NULL AND {col} < {val}"
|
|
1531
|
-
condition = self._apply_condition(condition)
|
|
1871
|
+
condition = self._apply_condition(self._violation_condition())
|
|
1532
1872
|
|
|
1533
1873
|
return f"""
|
|
1534
1874
|
SELECT {col}
|
|
@@ -1542,7 +1882,7 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
|
|
|
1542
1882
|
return self.get_sample_sql()
|
|
1543
1883
|
|
|
1544
1884
|
def getCheckType(self) -> str:
|
|
1545
|
-
return "check_greater_equal"
|
|
1885
|
+
return self.CHECK_TYPE
|
|
1546
1886
|
|
|
1547
1887
|
def generatePredicate(self) -> str | None:
|
|
1548
1888
|
"""Backward compatibility wrapper"""
|
|
@@ -1552,6 +1892,88 @@ class CheckGreaterOrEqualGenerator(DuckDBCheckGenerator):
|
|
|
1552
1892
|
return sql_query.get_predicate_sql()
|
|
1553
1893
|
|
|
1554
1894
|
|
|
1895
|
+
class CheckGreaterOrEqualGenerator(_CheckScalarComparisonGenerator):
|
|
1896
|
+
PASS_OPERATOR = ">="
|
|
1897
|
+
VIOLATION_OPERATOR = "<"
|
|
1898
|
+
MESSAGE_PHRASE = "greater than or equal to"
|
|
1899
|
+
CHECK_TYPE = "check_greater_equal"
|
|
1900
|
+
|
|
1901
|
+
|
|
1902
|
+
class CheckGreaterThanGenerator(_CheckScalarComparisonGenerator):
|
|
1903
|
+
PASS_OPERATOR = ">"
|
|
1904
|
+
VIOLATION_OPERATOR = "<="
|
|
1905
|
+
MESSAGE_PHRASE = "greater than"
|
|
1906
|
+
CHECK_TYPE = "check_greater_than"
|
|
1907
|
+
|
|
1908
|
+
|
|
1909
|
+
class CheckLessOrEqualGenerator(_CheckScalarComparisonGenerator):
|
|
1910
|
+
PASS_OPERATOR = "<="
|
|
1911
|
+
VIOLATION_OPERATOR = ">"
|
|
1912
|
+
MESSAGE_PHRASE = "less than or equal to"
|
|
1913
|
+
CHECK_TYPE = "check_less_or_equal"
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
class CheckColumnComparisonGenerator(DuckDBCheckGenerator):
|
|
1917
|
+
REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "Comparator"}
|
|
1918
|
+
|
|
1919
|
+
_VALID_COMPARATORS: ClassVar[Set[str]] = {"=", "!=", "<>", ">", ">=", "<", "<="}
|
|
1920
|
+
|
|
1921
|
+
def generateSql(self) -> SQLQuery:
|
|
1922
|
+
col_a = self.params.ColumnAName
|
|
1923
|
+
col_b = self.params.ColumnBName
|
|
1924
|
+
comparator = self.params.Comparator
|
|
1925
|
+
keyword = self._get_validation_keyword()
|
|
1926
|
+
|
|
1927
|
+
if comparator not in self._VALID_COMPARATORS:
|
|
1928
|
+
raise InvalidRuleException(
|
|
1929
|
+
f"Unsupported comparator for {self.rule_id}: {comparator}"
|
|
1930
|
+
)
|
|
1931
|
+
|
|
1932
|
+
message = self.errorMessage or f"{col_a} {keyword} be {comparator} {col_b}."
|
|
1933
|
+
msg_sql = message.replace("'", "''")
|
|
1934
|
+
|
|
1935
|
+
pass_predicate = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND {col_a} {comparator} {col_b}"
|
|
1936
|
+
condition = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND NOT ({col_a} {comparator} {col_b})"
|
|
1937
|
+
condition = self._apply_condition(condition)
|
|
1938
|
+
|
|
1939
|
+
requirement_sql = f"""
|
|
1940
|
+
WITH invalid AS (
|
|
1941
|
+
SELECT 1
|
|
1942
|
+
FROM {{table_name}}
|
|
1943
|
+
WHERE {condition}
|
|
1944
|
+
)
|
|
1945
|
+
SELECT
|
|
1946
|
+
COUNT(*) AS violations,
|
|
1947
|
+
CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
|
|
1948
|
+
FROM invalid
|
|
1949
|
+
"""
|
|
1950
|
+
|
|
1951
|
+
return SQLQuery(
|
|
1952
|
+
requirement_sql=requirement_sql.strip(),
|
|
1953
|
+
predicate_sql=self._apply_condition(pass_predicate),
|
|
1954
|
+
)
|
|
1955
|
+
|
|
1956
|
+
def get_sample_sql(self) -> str:
|
|
1957
|
+
col_a = self.params.ColumnAName
|
|
1958
|
+
col_b = self.params.ColumnBName
|
|
1959
|
+
comparator = self.params.Comparator
|
|
1960
|
+
condition = f"{col_a} IS NOT NULL AND {col_b} IS NOT NULL AND NOT ({col_a} {comparator} {col_b})"
|
|
1961
|
+
condition = self._apply_condition(condition)
|
|
1962
|
+
|
|
1963
|
+
return f"""
|
|
1964
|
+
SELECT {col_a}, {col_b}
|
|
1965
|
+
FROM {{table_name}}
|
|
1966
|
+
WHERE {condition}
|
|
1967
|
+
"""
|
|
1968
|
+
|
|
1969
|
+
@property
|
|
1970
|
+
def sample_sql(self) -> str:
|
|
1971
|
+
return self.get_sample_sql()
|
|
1972
|
+
|
|
1973
|
+
def getCheckType(self) -> str:
    """Return the fixed check-type identifier of this generator."""
    check_type = "check_column_comparison"
    return check_type
|
|
1975
|
+
|
|
1976
|
+
|
|
1555
1977
|
class CheckDistinctCountGenerator(DuckDBCheckGenerator):
|
|
1556
1978
|
REQUIRED_KEYS = {"ColumnAName", "ColumnBName", "ExpectedCount"}
|
|
1557
1979
|
|
|
@@ -1605,6 +2027,66 @@ class CheckDistinctCountGenerator(DuckDBCheckGenerator):
|
|
|
1605
2027
|
return "distinct_count"
|
|
1606
2028
|
|
|
1607
2029
|
|
|
2030
|
+
class CheckNoDuplicatesGenerator(DuckDBCheckGenerator):
    """Generate DuckDB SQL asserting that a column holds no repeated values.

    NULL values are ignored. When ``self.row_condition_sql`` is non-blank,
    only rows that also satisfy it are inspected.
    """

    REQUIRED_KEYS = {"ColumnName"}

    def _scope_where_clause(self) -> str:
        """Return the WHERE clause selecting the rows this check applies to."""
        col = self.params.ColumnName
        row_condition = self.row_condition_sql
        if row_condition and row_condition.strip():
            return f"WHERE ({col} IS NOT NULL) AND ({row_condition})"
        return f"WHERE {col} IS NOT NULL"

    def generateSql(self) -> SQLQuery:
        """Build the requirement query for the no-duplicates rule.

        The query groups the in-scope rows by the column and reports, as
        ``violations``, the number of distinct values that occur more than
        once. ``error_message`` is populated only when that count is
        non-zero.

        Returns:
            An ``SQLQuery`` carrying the stripped requirement SQL (with a
            literal ``{table_name}`` placeholder) and no row-level
            predicate.
        """
        col = self.params.ColumnName
        keyword = self._get_validation_keyword()
        message = self.errorMessage or f"{col} {keyword} contain no duplicate values."
        # Double single quotes so the message is a valid SQL string literal.
        msg_sql = message.replace("'", "''")

        where_clause = self._scope_where_clause()

        requirement_sql = f"""
        WITH counts AS (
            SELECT {col} AS value, COUNT(*) AS occurrences
            FROM {{table_name}}
            {where_clause}
            GROUP BY {col}
        ),
        invalid AS (
            SELECT value, occurrences
            FROM counts
            WHERE occurrences > 1
        )
        SELECT
            COUNT(*) AS violations,
            CASE WHEN COUNT(*) > 0 THEN '{msg_sql}' END AS error_message
        FROM invalid
        """

        return SQLQuery(requirement_sql=requirement_sql.strip(), predicate_sql=None)

    def get_sample_sql(self) -> str:
        """Build the query returning the rows whose value is duplicated.

        The scope filter (NOT NULL plus the optional row condition) is
        applied once in the ``scoped`` CTE, and both the duplicate detection
        and the returned rows are drawn from it. Joining back to the
        unfiltered table would surface rows that share a duplicated value
        but lie outside the rule's row condition, which the requirement
        query never counted.

        Returns:
            SQL text that still contains the literal ``{table_name}``
            placeholder.
        """
        col = self.params.ColumnName
        where_clause = self._scope_where_clause()

        return f"""
        WITH scoped AS (
            SELECT {col}
            FROM {{table_name}}
            {where_clause}
        ),
        dupes AS (
            SELECT {col} AS value
            FROM scoped
            GROUP BY {col}
            HAVING COUNT(*) > 1
        )
        SELECT t.{col}
        FROM scoped t
        JOIN dupes d ON t.{col} = d.value
        """

    @property
    def sample_sql(self) -> str:
        """Attribute-style access to the result of ``get_sample_sql()``."""
        return self.get_sample_sql()

    def getCheckType(self) -> str:
        """Return the fixed check-type identifier of this generator."""
        return "check_no_duplicates"
|
|
2088
|
+
|
|
2089
|
+
|
|
1608
2090
|
class CheckModelRuleGenerator(DuckDBCheckGenerator):
|
|
1609
2091
|
REQUIRED_KEYS = {"ModelRuleId"}
|
|
1610
2092
|
|
|
@@ -1825,7 +2307,8 @@ class JSONCheckPathTypeGenerator(DuckDBCheckGenerator):
|
|
|
1825
2307
|
"""
|
|
1826
2308
|
|
|
1827
2309
|
return SQLQuery(
|
|
1828
|
-
requirement_sql=requirement_sql.strip(),
|
|
2310
|
+
requirement_sql=requirement_sql.strip(),
|
|
2311
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
1829
2312
|
)
|
|
1830
2313
|
|
|
1831
2314
|
def getCheckType(self) -> str:
|
|
@@ -1947,7 +2430,8 @@ class JSONCheckPathKeyValueFormatGenerator(DuckDBCheckGenerator):
|
|
|
1947
2430
|
"""
|
|
1948
2431
|
|
|
1949
2432
|
return SQLQuery(
|
|
1950
|
-
requirement_sql=requirement_sql.strip(),
|
|
2433
|
+
requirement_sql=requirement_sql.strip(),
|
|
2434
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
1951
2435
|
)
|
|
1952
2436
|
|
|
1953
2437
|
def getCheckType(self) -> str:
|
|
@@ -2080,7 +2564,8 @@ class JSONCheckPathKeyStartsWithGenerator(DuckDBCheckGenerator):
|
|
|
2080
2564
|
"""
|
|
2081
2565
|
|
|
2082
2566
|
return SQLQuery(
|
|
2083
|
-
requirement_sql=requirement_sql.strip(),
|
|
2567
|
+
requirement_sql=requirement_sql.strip(),
|
|
2568
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2084
2569
|
)
|
|
2085
2570
|
|
|
2086
2571
|
def getCheckType(self) -> str:
|
|
@@ -2213,7 +2698,8 @@ class JSONCheckPathKeyExistsGenerator(DuckDBCheckGenerator):
|
|
|
2213
2698
|
"""
|
|
2214
2699
|
|
|
2215
2700
|
return SQLQuery(
|
|
2216
|
-
requirement_sql=requirement_sql.strip(),
|
|
2701
|
+
requirement_sql=requirement_sql.strip(),
|
|
2702
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2217
2703
|
)
|
|
2218
2704
|
|
|
2219
2705
|
def getCheckType(self) -> str:
|
|
@@ -2384,7 +2870,8 @@ class JSONCheckPathValueGenerator(DuckDBCheckGenerator):
|
|
|
2384
2870
|
"""
|
|
2385
2871
|
|
|
2386
2872
|
return SQLQuery(
|
|
2387
|
-
requirement_sql=requirement_sql.strip(),
|
|
2873
|
+
requirement_sql=requirement_sql.strip(),
|
|
2874
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2388
2875
|
)
|
|
2389
2876
|
|
|
2390
2877
|
def getCheckType(self) -> str:
|
|
@@ -2548,7 +3035,8 @@ class JSONCheckPathNotValueGenerator(DuckDBCheckGenerator):
|
|
|
2548
3035
|
"""
|
|
2549
3036
|
|
|
2550
3037
|
return SQLQuery(
|
|
2551
|
-
requirement_sql=requirement_sql.strip(),
|
|
3038
|
+
requirement_sql=requirement_sql.strip(),
|
|
3039
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2552
3040
|
)
|
|
2553
3041
|
|
|
2554
3042
|
def getCheckType(self) -> str:
|
|
@@ -2843,7 +3331,8 @@ class JSONCheckPathSameValueGenerator(DuckDBCheckGenerator):
|
|
|
2843
3331
|
"""
|
|
2844
3332
|
|
|
2845
3333
|
return SQLQuery(
|
|
2846
|
-
requirement_sql=requirement_sql.strip(),
|
|
3334
|
+
requirement_sql=requirement_sql.strip(),
|
|
3335
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2847
3336
|
)
|
|
2848
3337
|
|
|
2849
3338
|
def getCheckType(self) -> str:
|
|
@@ -2948,7 +3437,8 @@ class JSONCheckPathNumericFormatGenerator(DuckDBCheckGenerator):
|
|
|
2948
3437
|
"""
|
|
2949
3438
|
|
|
2950
3439
|
return SQLQuery(
|
|
2951
|
-
requirement_sql=requirement_sql.strip(),
|
|
3440
|
+
requirement_sql=requirement_sql.strip(),
|
|
3441
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
2952
3442
|
)
|
|
2953
3443
|
|
|
2954
3444
|
def getCheckType(self) -> str:
|
|
@@ -3127,7 +3617,8 @@ class JSONCheckPathUnitFormatGenerator(DuckDBCheckGenerator):
|
|
|
3127
3617
|
"""
|
|
3128
3618
|
|
|
3129
3619
|
return SQLQuery(
|
|
3130
|
-
requirement_sql=requirement_sql.strip(),
|
|
3620
|
+
requirement_sql=requirement_sql.strip(),
|
|
3621
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3131
3622
|
)
|
|
3132
3623
|
|
|
3133
3624
|
def getCheckType(self) -> str:
|
|
@@ -3225,7 +3716,8 @@ class JSONCheckPathDistinctParentGenerator(DuckDBCheckGenerator):
|
|
|
3225
3716
|
"""
|
|
3226
3717
|
|
|
3227
3718
|
return SQLQuery(
|
|
3228
|
-
requirement_sql=requirement_sql.strip(),
|
|
3719
|
+
requirement_sql=requirement_sql.strip(),
|
|
3720
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3229
3721
|
)
|
|
3230
3722
|
|
|
3231
3723
|
def getCheckType(self) -> str:
|
|
@@ -3281,7 +3773,7 @@ class FormatJSONFormatGenerator(DuckDBCheckGenerator):
|
|
|
3281
3773
|
|
|
3282
3774
|
return SQLQuery(
|
|
3283
3775
|
requirement_sql=requirement_sql.strip(),
|
|
3284
|
-
predicate_sql=predicate_sql.strip(),
|
|
3776
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3285
3777
|
)
|
|
3286
3778
|
|
|
3287
3779
|
# Path provided - validate elements at that path
|
|
@@ -3360,7 +3852,8 @@ class FormatJSONFormatGenerator(DuckDBCheckGenerator):
|
|
|
3360
3852
|
"""
|
|
3361
3853
|
|
|
3362
3854
|
return SQLQuery(
|
|
3363
|
-
requirement_sql=requirement_sql.strip(),
|
|
3855
|
+
requirement_sql=requirement_sql.strip(),
|
|
3856
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3364
3857
|
)
|
|
3365
3858
|
|
|
3366
3859
|
def getCheckType(self) -> str:
|
|
@@ -3465,7 +3958,8 @@ class JSONFormatStringGenerator(DuckDBCheckGenerator):
|
|
|
3465
3958
|
"""
|
|
3466
3959
|
|
|
3467
3960
|
return SQLQuery(
|
|
3468
|
-
requirement_sql=requirement_sql.strip(),
|
|
3961
|
+
requirement_sql=requirement_sql.strip(),
|
|
3962
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3469
3963
|
)
|
|
3470
3964
|
|
|
3471
3965
|
def getCheckType(self) -> str:
|
|
@@ -3652,7 +4146,8 @@ class JSONFormatUnitGenerator(DuckDBCheckGenerator):
|
|
|
3652
4146
|
"""
|
|
3653
4147
|
|
|
3654
4148
|
return SQLQuery(
|
|
3655
|
-
requirement_sql=requirement_sql.strip(),
|
|
4149
|
+
requirement_sql=requirement_sql.strip(),
|
|
4150
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3656
4151
|
)
|
|
3657
4152
|
|
|
3658
4153
|
def getCheckType(self) -> str:
|
|
@@ -3764,7 +4259,8 @@ class JSONFormatNumericGenerator(DuckDBCheckGenerator):
|
|
|
3764
4259
|
"""
|
|
3765
4260
|
|
|
3766
4261
|
return SQLQuery(
|
|
3767
|
-
requirement_sql=requirement_sql.strip(),
|
|
4262
|
+
requirement_sql=requirement_sql.strip(),
|
|
4263
|
+
predicate_sql=self._apply_condition(predicate_sql.strip()),
|
|
3768
4264
|
)
|
|
3769
4265
|
|
|
3770
4266
|
def getCheckType(self) -> str:
|
|
@@ -4343,6 +4839,10 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4343
4839
|
"generator": TypeStringCheckGenerator,
|
|
4344
4840
|
"factory": lambda args: "ColumnName",
|
|
4345
4841
|
},
|
|
4842
|
+
"TypeJSON": {
|
|
4843
|
+
"generator": TypeJSONCheckGenerator,
|
|
4844
|
+
"factory": lambda args: "ColumnName",
|
|
4845
|
+
},
|
|
4346
4846
|
"TypeDecimal": {
|
|
4347
4847
|
"generator": TypeDecimalCheckGenerator,
|
|
4348
4848
|
"factory": lambda args: "ColumnName",
|
|
@@ -4367,6 +4867,10 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4367
4867
|
"generator": FormatBillingCurrencyCodeGenerator,
|
|
4368
4868
|
"factory": lambda args: "ColumnName",
|
|
4369
4869
|
},
|
|
4870
|
+
"FormatJSON": {
|
|
4871
|
+
"generator": FormatJSONGenerator,
|
|
4872
|
+
"factory": lambda args: "ColumnName",
|
|
4873
|
+
},
|
|
4370
4874
|
"FormatKeyValue": {
|
|
4371
4875
|
"generator": FormatJSONGenerator,
|
|
4372
4876
|
"factory": lambda args: "ColumnName",
|
|
@@ -4375,10 +4879,18 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4375
4879
|
"generator": FormatCurrencyGenerator,
|
|
4376
4880
|
"factory": lambda args: "ColumnName",
|
|
4377
4881
|
},
|
|
4882
|
+
"CheckColumnComparison": {
|
|
4883
|
+
"generator": CheckColumnComparisonGenerator,
|
|
4884
|
+
"factory": lambda args: "ColumnAName",
|
|
4885
|
+
},
|
|
4378
4886
|
"CheckNationalCurrency": {
|
|
4379
4887
|
"generator": FormatBillingCurrencyCodeGenerator,
|
|
4380
4888
|
"factory": lambda args: "ColumnName",
|
|
4381
4889
|
},
|
|
4890
|
+
"CheckGreaterThanValue": {
|
|
4891
|
+
"generator": CheckGreaterThanGenerator,
|
|
4892
|
+
"factory": lambda args: "ColumnName",
|
|
4893
|
+
},
|
|
4382
4894
|
"FormatUnit": {
|
|
4383
4895
|
"generator": FormatUnitGenerator,
|
|
4384
4896
|
"factory": lambda args: "ColumnName",
|
|
@@ -4387,10 +4899,26 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4387
4899
|
"generator": CheckValueGenerator,
|
|
4388
4900
|
"factory": lambda args: "ColumnName",
|
|
4389
4901
|
},
|
|
4902
|
+
"CheckLessOrEqualThanValue": {
|
|
4903
|
+
"generator": CheckLessOrEqualGenerator,
|
|
4904
|
+
"factory": lambda args: "ColumnName",
|
|
4905
|
+
},
|
|
4390
4906
|
"CheckNotValue": {
|
|
4391
4907
|
"generator": CheckNotValueGenerator,
|
|
4392
4908
|
"factory": lambda args: "ColumnName",
|
|
4393
4909
|
},
|
|
4910
|
+
"CheckNoDuplicates": {
|
|
4911
|
+
"generator": CheckNoDuplicatesGenerator,
|
|
4912
|
+
"factory": lambda args: "ColumnName",
|
|
4913
|
+
},
|
|
4914
|
+
"CheckRegexMatch": {
|
|
4915
|
+
"generator": CheckRegexMatchGenerator,
|
|
4916
|
+
"factory": lambda args: "ColumnName",
|
|
4917
|
+
},
|
|
4918
|
+
"CheckStringEndsWith": {
|
|
4919
|
+
"generator": CheckStringEndsWithGenerator,
|
|
4920
|
+
"factory": lambda args: "ColumnName",
|
|
4921
|
+
},
|
|
4394
4922
|
"CheckSameValue": {
|
|
4395
4923
|
"generator": CheckSameValueGenerator,
|
|
4396
4924
|
"factory": lambda args: "ColumnAName",
|
|
@@ -4415,6 +4943,10 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4415
4943
|
"generator": CheckModelRuleGenerator,
|
|
4416
4944
|
"factory": lambda args: "ModelRuleId",
|
|
4417
4945
|
},
|
|
4946
|
+
"CheckJSONSchema": {
|
|
4947
|
+
"generator": CheckJSONSchemaGenerator,
|
|
4948
|
+
"factory": lambda args: "ColumnName",
|
|
4949
|
+
},
|
|
4418
4950
|
"AND": {
|
|
4419
4951
|
"generator": CompositeANDRuleGenerator,
|
|
4420
4952
|
"factory": lambda args: "Items",
|
|
@@ -4595,6 +5127,7 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4595
5127
|
transpile_dialect: Optional[str] = None,
|
|
4596
5128
|
show_violations: bool = False,
|
|
4597
5129
|
rules_version: Optional[str] = None,
|
|
5130
|
+
schemas: Optional[Dict[str, Any]] = None,
|
|
4598
5131
|
) -> None:
|
|
4599
5132
|
self.log = logging.getLogger(f"{__name__}.{self.__class__.__qualname__}")
|
|
4600
5133
|
self.conn: duckdb.DuckDBPyConnection | None = None
|
|
@@ -4608,6 +5141,7 @@ class FocusToDuckDBSchemaConverter:
|
|
|
4608
5141
|
)
|
|
4609
5142
|
self.show_violations = show_violations
|
|
4610
5143
|
self.rules_version = rules_version
|
|
5144
|
+
self.schemas = schemas or {}
|
|
4611
5145
|
|
|
4612
5146
|
# Build the effective CHECK_GENERATORS mapping for this version
|
|
4613
5147
|
self.CHECK_GENERATORS = self._build_check_generators_for_version(rules_version)
|
|
@@ -5304,7 +5838,7 @@ class FocusToDuckDBSchemaConverter:
|
|
|
5304
5838
|
gen_cls = reg["generator"]
|
|
5305
5839
|
|
|
5306
5840
|
# Strip reserved + 'CheckFunction' and pass as-is (no aliasing)
|
|
5307
|
-
reserved = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set()
|
|
5841
|
+
reserved: set = getattr(DuckDBCheckGenerator, "RESERVED", set()) or set()
|
|
5308
5842
|
params = {
|
|
5309
5843
|
k: v
|
|
5310
5844
|
for k, v in requirement.items()
|
|
@@ -5374,6 +5908,8 @@ class FocusToDuckDBSchemaConverter:
|
|
|
5374
5908
|
rule_id=rule_id,
|
|
5375
5909
|
plan=self.plan,
|
|
5376
5910
|
conn=self.conn,
|
|
5911
|
+
schemas=self.schemas,
|
|
5912
|
+
table_name=self.table_name,
|
|
5377
5913
|
parent_results_by_idx=parent_results_by_idx or {},
|
|
5378
5914
|
parent_edges=parent_edges or (),
|
|
5379
5915
|
row_condition_sql=row_condition_sql,
|
|
@@ -5683,7 +6219,7 @@ class FocusToDuckDBSchemaConverter:
|
|
|
5683
6219
|
gen_cls = reg["generator"]
|
|
5684
6220
|
|
|
5685
6221
|
# Basic required-key validation (optional)
|
|
5686
|
-
required = getattr(gen_cls, "REQUIRED_KEYS", set()) or set()
|
|
6222
|
+
required: set = getattr(gen_cls, "REQUIRED_KEYS", set()) or set()
|
|
5687
6223
|
missing = [k for k in required if k not in spec]
|
|
5688
6224
|
if missing:
|
|
5689
6225
|
# For conditions, you can choose to return None or raise
|
|
@@ -5911,15 +6447,20 @@ class FocusToDuckDBSchemaConverter:
|
|
|
5911
6447
|
# Conformance reference / special executor (no SQL)
|
|
5912
6448
|
special = getattr(check, "special_executor", None)
|
|
5913
6449
|
if callable(special):
|
|
6450
|
+
special_kind = meta.get("special_executor_kind")
|
|
5914
6451
|
return {
|
|
5915
6452
|
"rule_id": rid,
|
|
5916
|
-
"type": "
|
|
6453
|
+
"type": "special",
|
|
5917
6454
|
"check_type": ctype,
|
|
5918
6455
|
"generator": meta.get("generator"),
|
|
5919
6456
|
"row_condition_sql": meta.get("row_condition_sql"),
|
|
5920
6457
|
"referenced": getattr(check, "referenced_rule_id", None),
|
|
5921
6458
|
"sql": None, # executed by reference, not SQL
|
|
5922
|
-
"note":
|
|
6459
|
+
"note": (
|
|
6460
|
+
"mirrors referenced rule outcome (no SQL)"
|
|
6461
|
+
if special_kind == "reference"
|
|
6462
|
+
else "executed via special executor (no SQL)"
|
|
6463
|
+
),
|
|
5923
6464
|
"must_satisfy": must_satisfy,
|
|
5924
6465
|
}
|
|
5925
6466
|
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/parquet_data_loader.py
RENAMED
|
@@ -58,14 +58,23 @@ class ParquetDataLoader:
|
|
|
58
58
|
# Try multiple datetime parsing strategies
|
|
59
59
|
converted = None
|
|
60
60
|
|
|
61
|
+
# A strategy succeeds only if parsing introduces no nulls
|
|
62
|
+
# beyond those already present (so nullable columns convert),
|
|
63
|
+
# and the column has at least one real value (an all-null
|
|
64
|
+
# column carries no evidence of being a datetime, so it is
|
|
65
|
+
# left as a string rather than coerced).
|
|
66
|
+
original_null_count = series.null_count()
|
|
67
|
+
has_values = original_null_count < series.len()
|
|
68
|
+
|
|
61
69
|
# Strategy 1: Try ISO format with timezone
|
|
62
70
|
try:
|
|
63
71
|
candidate = series.str.to_datetime(
|
|
64
72
|
format="%Y-%m-%dT%H:%M:%S%z", # ISO with timezone like -05:00
|
|
65
73
|
strict=False,
|
|
66
74
|
)
|
|
67
|
-
#
|
|
68
|
-
|
|
75
|
+
# Accept if parsing added no new nulls (nullable columns)
|
|
76
|
+
# and at least one value actually parsed
|
|
77
|
+
if has_values and candidate.null_count() == original_null_count:
|
|
69
78
|
converted = candidate
|
|
70
79
|
except Exception:
|
|
71
80
|
pass
|
|
@@ -77,8 +86,12 @@ class ParquetDataLoader:
|
|
|
77
86
|
format="%Y-%m-%dT%H:%M:%SZ", # ISO with Z timezone
|
|
78
87
|
strict=False,
|
|
79
88
|
)
|
|
80
|
-
#
|
|
81
|
-
|
|
89
|
+
# Accept if parsing added no new nulls (nullable
|
|
90
|
+
# columns) and at least one value actually parsed
|
|
91
|
+
if (
|
|
92
|
+
has_values
|
|
93
|
+
and candidate.null_count() == original_null_count
|
|
94
|
+
):
|
|
82
95
|
converted = candidate
|
|
83
96
|
except Exception:
|
|
84
97
|
pass
|
|
@@ -90,8 +103,12 @@ class ParquetDataLoader:
|
|
|
90
103
|
format="%Y-%m-%d %H:%M:%S", # Space-separated format
|
|
91
104
|
strict=False,
|
|
92
105
|
)
|
|
93
|
-
#
|
|
94
|
-
|
|
106
|
+
# Accept if parsing added no new nulls (nullable
|
|
107
|
+
# columns) and at least one value actually parsed
|
|
108
|
+
if (
|
|
109
|
+
has_values
|
|
110
|
+
and candidate.null_count() == original_null_count
|
|
111
|
+
):
|
|
95
112
|
converted = candidate
|
|
96
113
|
except Exception:
|
|
97
114
|
pass
|
|
@@ -102,8 +119,12 @@ class ParquetDataLoader:
|
|
|
102
119
|
candidate = series.str.to_datetime(
|
|
103
120
|
format="%Y-%m-%d", strict=False # Simple date format
|
|
104
121
|
)
|
|
105
|
-
#
|
|
106
|
-
|
|
122
|
+
# Accept if parsing added no new nulls (nullable
|
|
123
|
+
# columns) and at least one value actually parsed
|
|
124
|
+
if (
|
|
125
|
+
has_values
|
|
126
|
+
and candidate.null_count() == original_null_count
|
|
127
|
+
):
|
|
107
128
|
converted = candidate
|
|
108
129
|
except Exception:
|
|
109
130
|
pass
|
|
@@ -147,14 +168,21 @@ class ParquetDataLoader:
|
|
|
147
168
|
series.name, converted_values, dtype=pl.Datetime("us")
|
|
148
169
|
)
|
|
149
170
|
|
|
150
|
-
#
|
|
151
|
-
|
|
171
|
+
# Accept if parsing added no new nulls (nullable
|
|
172
|
+
# columns) and at least one value actually parsed
|
|
173
|
+
if (
|
|
174
|
+
has_values
|
|
175
|
+
and candidate.null_count() == original_null_count
|
|
176
|
+
):
|
|
152
177
|
converted = candidate
|
|
153
178
|
|
|
154
179
|
except Exception:
|
|
155
180
|
pass
|
|
156
181
|
|
|
157
|
-
# Strategy 6: Let Polars infer format (for
|
|
182
|
+
# Strategy 6: Let Polars infer the format (fallback for any
|
|
183
|
+
# single format strategies 1-5 did not match). Format inference
|
|
184
|
+
# cannot parse timezone-qualified ISO (trailing 'Z' or offsets),
|
|
185
|
+
# but strategies 1-2 already cover those.
|
|
158
186
|
if converted is None:
|
|
159
187
|
try:
|
|
160
188
|
candidate = series.str.to_datetime(
|
|
@@ -163,8 +191,12 @@ class ParquetDataLoader:
|
|
|
163
191
|
exact=False, # Allow partial matches
|
|
164
192
|
cache=True, # Cache format inference
|
|
165
193
|
)
|
|
166
|
-
#
|
|
167
|
-
|
|
194
|
+
# Accept if parsing added no new nulls (nullable columns)
|
|
195
|
+
# and at least one value actually parsed
|
|
196
|
+
if (
|
|
197
|
+
has_values
|
|
198
|
+
and candidate.null_count() == original_null_count
|
|
199
|
+
):
|
|
168
200
|
converted = candidate
|
|
169
201
|
except Exception:
|
|
170
202
|
pass
|
|
@@ -442,6 +442,7 @@ class SpecRules:
|
|
|
442
442
|
|
|
443
443
|
self.plan = val_plan
|
|
444
444
|
self.column_types = column_types
|
|
445
|
+
self.model_data = model_data
|
|
445
446
|
self._meta = {
|
|
446
447
|
"json_rule_file": self.json_rule_file,
|
|
447
448
|
"focus_dataset": self.focus_dataset,
|
|
@@ -482,6 +483,7 @@ class SpecRules:
|
|
|
482
483
|
transpile_dialect=self.transpile_dialect,
|
|
483
484
|
show_violations=show_violations,
|
|
484
485
|
rules_version=self.rules_version,
|
|
486
|
+
schemas=getattr(self, "model_data", {}).get("Schemas", {}),
|
|
485
487
|
)
|
|
486
488
|
# 1) Let the converter prepare schemas, UDFs, temp views, etc.
|
|
487
489
|
if connection is None:
|
|
@@ -620,6 +622,7 @@ class SpecRules:
|
|
|
620
622
|
transpile_dialect=self.transpile_dialect,
|
|
621
623
|
show_violations=False, # Not relevant for explain mode
|
|
622
624
|
rules_version=self.rules_version,
|
|
625
|
+
schemas=getattr(self, "model_data", {}).get("Schemas", {}),
|
|
623
626
|
)
|
|
624
627
|
|
|
625
628
|
# Create a minimal connection for explain mode (converter needs it for initialization)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "focus_validator"
|
|
3
|
-
version = "2.
|
|
3
|
+
version = "2.2.0"
|
|
4
4
|
description = "FOCUS spec validator."
|
|
5
5
|
authors = []
|
|
6
6
|
readme = "README.md"
|
|
@@ -26,6 +26,7 @@ requests = "*"
|
|
|
26
26
|
pandera = { version = "^0.26.1" }
|
|
27
27
|
multimethod = ">=2.0,<2.1"
|
|
28
28
|
sqlglot = "^27.28.1"
|
|
29
|
+
jsonschema = "^4.25.1"
|
|
29
30
|
numpy = { version = "^1.26"}
|
|
30
31
|
pytz = "^2025.2"
|
|
31
32
|
pandasql = "^0.7.3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/json_loader.py
RENAMED
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/config_objects/plan_builder.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/data_loaders/csv_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_console.py
RENAMED
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/outputter/outputter_unittest.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/download_currency_codes.py
RENAMED
|
File without changes
|
{focus_validator-2.1.0 → focus_validator-2.2.0}/focus_validator/utils/performance_logging.py
RENAMED
|
File without changes
|
|
File without changes
|