pointblank 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pointblank/__init__.py +0 -2
- pointblank/_constants.py +2 -28
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +1483 -1735
- pointblank/column.py +6 -2
- pointblank/datascan.py +3 -2
- pointblank/schema.py +155 -1
- pointblank/validate.py +459 -222
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/METADATA +3 -2
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/RECORD +14 -15
- pointblank/tf.py +0 -287
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/WHEEL +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
```diff
@@ -31,7 +31,6 @@ from pointblank._constants import (
     CROSS_MARK_SPAN,
     IBIS_BACKENDS,
     LOG_LEVELS_MAP,
-    METHOD_CATEGORY_MAP,
     REPORTING_LANGUAGES,
     ROW_BASED_VALIDATION_TYPES,
     RTL_LANGUAGES,
```
```diff
@@ -46,25 +45,35 @@ from pointblank._constants_translations import (
     VALIDATION_REPORT_TEXT,
 )
 from pointblank._interrogation import (
-    ColCountMatch,
-    ColExistsHasType,
-    ColSchemaMatch,
-    ColValsCompareOne,
-    ColValsCompareSet,
-    ColValsCompareTwo,
-    ColValsExpr,
-    ColValsRegex,
-    ConjointlyValidation,
     NumberOfTestUnits,
-    RowCountMatch,
-    RowsComplete,
-    RowsDistinct,
     SpeciallyValidation,
+    col_count_match,
+    col_exists,
+    col_schema_match,
+    col_vals_expr,
+    conjointly_validation,
+    interrogate_between,
+    interrogate_eq,
+    interrogate_ge,
+    interrogate_gt,
+    interrogate_isin,
+    interrogate_le,
+    interrogate_lt,
+    interrogate_ne,
+    interrogate_not_null,
+    interrogate_notin,
+    interrogate_null,
+    interrogate_outside,
+    interrogate_regex,
+    interrogate_rows_distinct,
+    row_count_match,
+    rows_complete,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
+    _column_test_prep,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
```
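The import changes above summarize the central refactor of 0.13.2: the class-based interrogation objects (instantiate, then call `.get_test_results()`) give way to plain `interrogate_*` functions that take a prepped table and return it with a boolean results column. A minimal illustrative sketch of the function-style shape, assuming a Polars table; the function name and the `pb_is_good_` results column come from this diff, while the body is a stand-in, not the library's actual implementation:

```python
import polars as pl


# Illustrative stand-in for the new function-style interrogation helpers: take a
# table, return it with a boolean `pb_is_good_` results column appended.
def interrogate_gt(tbl: pl.DataFrame, column: str, compare, na_pass: bool = False) -> pl.DataFrame:
    check = pl.col(column) > compare
    if na_pass:
        # Missing values count as passing when `na_pass=True`
        check = check | pl.col(column).is_null()
    return tbl.with_columns(check.fill_null(False).alias("pb_is_good_"))


print(interrogate_gt(pl.DataFrame({"x": [1, 5, None]}), "x", 2, na_pass=True))
```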
```diff
@@ -1584,13 +1593,22 @@ def _generate_display_table(
 
     tail_data = pd.DataFrame(columns=head_data.columns)
 
-    data = pd.concat([head_data, tail_data])
+    # Suppress the FutureWarning about DataFrame concatenation with empty entries
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=FutureWarning,
+            message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
+        )
+        data = pd.concat([head_data, tail_data])
 
     row_number_list = list(range(1, n_head + 1)) + list(
         range(n_rows - n_tail + 1, n_rows + 1)
     )
 
-    # For PySpark, update schema after conversion to
+    # For PySpark, update schema after conversion to Pandas
     if tbl_type == "pyspark":
         tbl_schema = Schema(tbl=data)
 
```
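The filter above silences only the specific pandas `FutureWarning` raised when concatenating an all-empty frame, without blanket-disabling warnings for the caller. A standalone reproduction of the same pattern:

```python
import warnings

import pandas as pd

head = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
tail = pd.DataFrame(columns=head.columns)  # empty frame: the case that warns

with warnings.catch_warnings():
    # The `message` argument is a regex matched against the start of the warning text
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    )
    data = pd.concat([head, tail])  # no FutureWarning leaks out of this block

print(data)
```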
```diff
@@ -2398,10 +2416,31 @@ def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
     return [lhs_values, rhs_values]
 
 
+def _get_column_names_safe(data: Any) -> list[str]:
+    """
+    Safely get column names from a DataFrame, optimized for LazyFrames.
+    This function avoids the Narwhals PerformanceWarning for LazyFrames.
+    """
+    try:
+        import narwhals as nw
+
+        df_nw = nw.from_native(data)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return list(df_nw.collect_schema().keys())
+        else:
+            return list(df_nw.columns)
+    except Exception:
+        # Fallback to direct column access
+        return list(data.columns)
+
+
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
     if ibis_tbl:
         return data.columns if df_lib_name_gt == "polars" else list(data.columns)
-    return list(data.columns)
+
+    # Use the optimized helper function
+    return _get_column_names_safe(data)
 
 
 def _validate_columns_subset(
```
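`_get_column_names_safe()` exists because asking a Narwhals `LazyFrame` for `.columns` triggers a `PerformanceWarning` (the schema has to be resolved either way), while `collect_schema()` is the sanctioned route. A small demonstration of the same guard, assuming `narwhals` and `polars` are installed:

```python
import narwhals as nw
import polars as pl

lf = nw.from_native(pl.LazyFrame({"x": [1, 2], "y": ["a", "b"]}))

# Same guard as the helper above: prefer collect_schema() when it is available
if hasattr(lf, "collect_schema"):
    names = list(lf.collect_schema().keys())
else:
    names = list(lf.columns)

print(names)  # ['x', 'y']
```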
```diff
@@ -2590,7 +2629,11 @@ def get_column_count(data: FrameT | Any) -> int:
         import narwhals as nw
 
         df_nw = nw.from_native(data)
-        return len(df_nw.columns)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return len(df_nw.collect_schema())
+        else:
+            return len(df_nw.columns)
     except Exception:
         # Fallback for unsupported types
         if "pandas" in str(type(data)):
```
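`get_column_count()` gets the same treatment: the column count is just the length of the collected schema, so eager and lazy inputs share one warning-free path. For example:

```python
import narwhals as nw
import pandas as pd

df_nw = nw.from_native(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))

# collect_schema() returns a mapping of column name -> dtype, so len() is the count
n_cols = len(df_nw.collect_schema()) if hasattr(df_nw, "collect_schema") else len(df_nw.columns)
print(n_cols)  # 3
```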
```diff
@@ -4702,7 +4745,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
```
```diff
@@ -4990,7 +5034,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
```
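The two hunks above (by their position in the class, most likely the `col_vals_eq()` and `col_vals_ne()` methods) swap the strict converter for the conditional one, so a plain string `value=` now survives to be compared against string columns instead of raising. A hypothetical usage sketch; the method names are pointblank's public API, and the expected counts assume the behavior described by this change:

```python
import polars as pl

import pointblank as pb

df = pl.DataFrame({"fruit": ["apple", "apple", "pear"]})

# Before this change, a non-date string here raised a ValueError; with 0.13.2 it
# is treated as an ordinary string comparison value.
validation = pb.Validate(data=df).col_vals_eq(columns="fruit", value="apple").interrogate()
print(validation.n_passed(i=1, scalar=True))  # 2 (the "pear" row fails)
```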
```diff
@@ -9738,8 +9783,8 @@ class Validate:
             threshold = validation.thresholds
             segment = validation.segments
 
+            # Get compatible data types for this assertion type
             assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
-            assertion_category = METHOD_CATEGORY_MAP[assertion_method]
             compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
 
             # Process the `brief` text for the validation step by including template variables to
```
```diff
@@ -9870,197 +9915,243 @@ class Validate:
             # Validation stage
             # ------------------------------------------------
 
-[... 23 removed lines elided in this rendering ...]
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_SET":
-                inside = True if assertion_method == "in_set" else False
-
-                results_tbl = ColValsCompareSet(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    values=value,
-                    threshold=threshold,
-                    inside=inside,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_REGEX":
-                results_tbl = ColValsRegex(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    pattern=value,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_EXPR":
-                results_tbl = ColValsExpr(
-                    data_tbl=data_tbl_step,
-                    expr=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_DISTINCT":
-                results_tbl = RowsDistinct(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_COMPLETE":
-                results_tbl = RowsComplete(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COL_EXISTS_HAS_TYPE":
-                result_bool = ColExistsHasType(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    threshold=threshold,
-                    assertion_method="exists",
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = result_bool
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "COL_SCHEMA_MATCH":
-                result_bool = ColSchemaMatch(
-                    data_tbl=data_tbl_step,
-                    schema=value["schema"],
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                    threshold=threshold,
-                ).get_test_results()
-
-                schema_validation_info = _get_schema_validation_info(
-                    data_tbl=data_tbl,
-                    schema=value["schema"],
-                    passed=result_bool,
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                )
+            # Apply error handling only to data quality validations, not programming error validations
+            if assertion_type != "specially":
+                try:
+                    # validations requiring `_column_test_prep()`
+                    if assertion_type in [
+                        "col_vals_gt",
+                        "col_vals_lt",
+                        "col_vals_eq",
+                        "col_vals_ne",
+                        "col_vals_ge",
+                        "col_vals_le",
+                        "col_vals_null",
+                        "col_vals_not_null",
+                        "col_vals_between",
+                        "col_vals_outside",
+                        "col_vals_in_set",
+                        "col_vals_not_in_set",
+                        "col_vals_regex",
+                    ]:
+                        # Process table for column validation
+                        tbl = _column_test_prep(
+                            df=data_tbl_step, column=column, allowed_types=compatible_dtypes
+                        )
 
-[... 38 removed lines elided in this rendering ...]
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "CONJOINTLY":
-                results_tbl = ConjointlyValidation(
-                    data_tbl=data_tbl_step,
-                    expressions=value["expressions"],
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "SPECIALLY":
-                results_tbl_list = SpeciallyValidation(
-                    data_tbl=data_tbl_step,
-                    expression=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                #
-                # The result from this could either be a table in the conventional form, or,
-                # a list of boolean values; handle both cases
-                #
-
-                if isinstance(results_tbl_list, list):
-                    # If the result is a list of boolean values, then we need to convert it to a
-                    # set the validation results from the list
-                    validation.all_passed = all(results_tbl_list)
-                    validation.n = len(results_tbl_list)
-                    validation.n_passed = results_tbl_list.count(True)
-                    validation.n_failed = results_tbl_list.count(False)
-
-                    results_tbl = None
+                        if assertion_method == "gt":
+                            results_tbl = interrogate_gt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "lt":
+                            results_tbl = interrogate_lt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "eq":
+                            results_tbl = interrogate_eq(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ne":
+                            results_tbl = interrogate_ne(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ge":
+                            results_tbl = interrogate_ge(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "le":
+                            results_tbl = interrogate_le(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "null":
+                            results_tbl = interrogate_null(tbl=tbl, column=column)
+                        elif assertion_method == "not_null":
+                            results_tbl = interrogate_not_null(tbl=tbl, column=column)
+
+                        elif assertion_type == "col_vals_between":
+                            results_tbl = interrogate_between(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
 
-                else:
-                    # If the result is not a list, then we assume it's a table in the conventional
-                    # form (where the column is `pb_is_good_` exists, with boolean values
-                    results_tbl = results_tbl_list
+                        elif assertion_type == "col_vals_outside":
+                            results_tbl = interrogate_outside(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
+
+                        elif assertion_type == "col_vals_in_set":
+                            results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value)
+
+                        elif assertion_type == "col_vals_not_in_set":
+                            results_tbl = interrogate_notin(
+                                tbl=tbl, column=column, set_values=value
+                            )
+
+                        elif assertion_type == "col_vals_regex":
+                            results_tbl = interrogate_regex(
+                                tbl=tbl, column=column, pattern=value, na_pass=na_pass
+                            )
+
+                    elif assertion_type == "col_vals_expr":
+                        results_tbl = col_vals_expr(
+                            data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
+                        )
+
+                    elif assertion_type == "rows_distinct":
+                        results_tbl = interrogate_rows_distinct(
+                            data_tbl=data_tbl_step, columns_subset=column
+                        )
+
+                    elif assertion_type == "rows_complete":
+                        results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
+
+                    elif assertion_type == "col_exists":
+                        result_bool = col_exists(
+                            data_tbl=data_tbl_step,
+                            column=column,
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - int(result_bool)
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_schema_match":
+                        result_bool = col_schema_match(
+                            data_tbl=data_tbl_step,
+                            schema=value["schema"],
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                            threshold=threshold,
+                        )
+
+                        schema_validation_info = _get_schema_validation_info(
+                            data_tbl=data_tbl,
+                            schema=value["schema"],
+                            passed=result_bool,
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                        )
+
+                        # Add the schema validation info to the validation object
+                        validation.val_info = schema_validation_info
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "row_count_match":
+                        result_bool = row_count_match(
+                            data_tbl=data_tbl_step,
+                            count=value["count"],
+                            inverse=value["inverse"],
+                            abs_tol_bounds=value["abs_tol_bounds"],
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_count_match":
+                        result_bool = col_count_match(
+                            data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"]
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "conjointly":
+                        results_tbl = conjointly_validation(
+                            data_tbl=data_tbl_step,
+                            expressions=value["expressions"],
+                            threshold=threshold,
+                            tbl_type=tbl_type,
+                        )
+
+                    else:
+                        raise ValueError(f"Unknown assertion type: {assertion_type}")
+
+                except Exception as e:
+                    # Only catch specific data quality comparison errors, not programming errors
+                    error_msg = str(e).lower()
+                    is_comparison_error = (
+                        "boolean value of na is ambiguous" in error_msg
+                        or "cannot compare" in error_msg
+                        or (
+                            "type" in error_msg
+                            and ("mismatch" in error_msg or "incompatible" in error_msg)
+                        )
+                        or ("dtype" in error_msg and "compare" in error_msg)
+                    )
+
+                    if is_comparison_error:
+                        # If data quality comparison fails, mark the validation as having an eval_error
+                        validation.eval_error = True
+                        end_time = datetime.datetime.now(datetime.timezone.utc)
+                        validation.proc_duration_s = (end_time - start_time).total_seconds()
+                        validation.time_processed = end_time.isoformat(timespec="milliseconds")
+                        validation.active = False
+                        continue
+                    else:
+                        # For other errors (like missing columns), let them propagate
+                        raise
+
+            else:
+                # For "specially" validations, let programming errors propagate as exceptions
+                if assertion_type == "specially":
+                    results_tbl_list = SpeciallyValidation(
+                        data_tbl=data_tbl_step,
+                        expression=value,
+                        threshold=threshold,
+                        tbl_type=tbl_type,
+                    ).get_test_results()
+
+                    #
+                    # The result from this could either be a table in the conventional form, or,
+                    # a list of boolean values; handle both cases
+                    #
+
+                    if isinstance(results_tbl_list, list):
+                        # If the result is a list of boolean values, then we need to convert it to a
+                        # set the validation results from the list
+                        validation.all_passed = all(results_tbl_list)
+                        validation.n = len(results_tbl_list)
+                        validation.n_passed = results_tbl_list.count(True)
+                        validation.n_failed = results_tbl_list.count(False)
+
+                        results_tbl = None
+
+                    else:
+                        # If the result is not a list, then we assume it's a table in the conventional
+                        # form (where the column is `pb_is_good_` exists, with boolean values
+                        results_tbl = results_tbl_list
 
             # If the results table is not `None`, then we assume there is a table with a column
             # called `pb_is_good_` that contains boolean values; we can then use this table to
```
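The new `try`/`except` distinguishes data-typed comparison failures, which mark the step with `eval_error` and deactivate it, from genuine programming errors, which still raise. The substring heuristics are easy to exercise in isolation; a minimal sketch with a hypothetical standalone predicate mirroring the except block above:

```python
def is_comparison_error(exc: Exception) -> bool:
    # Same substring heuristics as the except-block above
    msg = str(exc).lower()
    return (
        "boolean value of na is ambiguous" in msg
        or "cannot compare" in msg
        or ("type" in msg and ("mismatch" in msg or "incompatible" in msg))
        or ("dtype" in msg and "compare" in msg)
    )


print(is_comparison_error(TypeError("cannot compare string with int")))  # True
print(is_comparison_error(KeyError("no such column: 'x'")))              # False
```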
```diff
@@ -13670,6 +13761,48 @@ def _string_date_dttm_conversion(value: any) -> any:
     return value
 
 
+def _conditional_string_date_dttm_conversion(
+    value: any, allow_regular_strings: bool = False
+) -> any:
+    """
+    Conditionally convert a string to a date or datetime object if it is in the correct format. If
+    `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
+    the value is not a string, it is returned as is.
+
+    Parameters
+    ----------
+    value
+        The value to convert. It can be a string, date, or datetime object.
+    allow_regular_strings
+        If `True`, regular strings (non-date/datetime) are allowed to pass through unchanged. If
+        `False`, behaves like `_string_date_dttm_conversion()` and raises `ValueError` for regular
+        strings.
+
+    Returns
+    -------
+    any
+        The converted date or datetime object, or the original value.
+
+    Raises
+    ------
+    ValueError
+        If allow_regular_strings is False and the string cannot be converted to a date or datetime.
+    """
+
+    if isinstance(value, str):
+        if _is_string_date(value):
+            value = _convert_string_to_date(value)
+        elif _is_string_datetime(value):
+            value = _convert_string_to_datetime(value)
+        elif not allow_regular_strings:
+            raise ValueError(
+                "If `value=` is provided as a string it must be a date or datetime string."
+            )
+        # If allow_regular_strings is True, regular strings pass through unchanged
+
+    return value
+
+
 def _process_brief(
     brief: str | None,
     step: int,
```
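The helper leans on the library's existing `_is_string_date()` / `_convert_string_to_date()` family. A self-contained approximation of the same contract using stdlib `fromisoformat` (the real parsers may accept more formats than ISO strings):

```python
from datetime import date, datetime


def conditional_conversion(value, allow_regular_strings=False):
    if isinstance(value, str):
        try:
            return date.fromisoformat(value)      # "2024-01-15" -> datetime.date
        except ValueError:
            pass
        try:
            return datetime.fromisoformat(value)  # "2024-01-15 10:30:00" -> datetime.datetime
        except ValueError:
            if not allow_regular_strings:
                raise ValueError(
                    "If `value=` is provided as a string it must be a date or datetime string."
                )
    return value  # non-strings and (optionally) regular strings pass through


print(conditional_conversion("2024-01-15"))                         # 2024-01-15
print(conditional_conversion("apple", allow_regular_strings=True))  # apple
```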
```diff
@@ -14319,17 +14452,108 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
     column, segment = segments_expr
 
     if tbl_type in ["pandas", "polars", "pyspark"]:
-        # If the table is a Pandas, Polars, or PySpark DataFrame,
+        # If the table is a Pandas, Polars, or PySpark DataFrame, transform to a Narwhals table
         # and perform the filtering operation
 
         # Transform to Narwhals table if a DataFrame
        data_tbl_nw = nw.from_native(data_tbl)
 
+        # Handle Polars expressions by attempting to extract literal values
+        # This is a compatibility measure for cases where `pl.datetime()`, `pl.lit()`, etc.,
+        # are accidentally used instead of native Python types
+        if (
+            hasattr(segment, "__class__")
+            and "polars" in segment.__class__.__module__
+            and segment.__class__.__name__ == "Expr"
+        ):
+            # This is a Polars expression so we should warn about this and suggest native types
+            import warnings
+            from datetime import date, datetime
+
+            warnings.warn(
+                "Polars expressions in segments are deprecated. Please use native Python types instead. "
+                "For example, use datetime.date(2016, 1, 4) instead of pl.datetime(2016, 1, 4).",
+                DeprecationWarning,
+                stacklevel=3,
+            )
+
+            # Try to extract the literal value from various Polars expression patterns
+            segment_str = str(segment)
+            parsed_value = None
+
+            # Handle different Polars expression string formats
+            # Format 1: Direct date strings like "2016-01-04"
+            if len(segment_str) == 10 and segment_str.count("-") == 2:
+                try:
+                    parsed_value = date.fromisoformat(segment_str)
+                except ValueError:
+                    pass
+
+            # Format 2: Datetime strings with UTC timezone like
+            # "2016-01-04 00:00:01 UTC.strict_cast(...)"
+            elif " UTC" in segment_str:
+                try:
+                    # Extract just the datetime part before "UTC"
+                    datetime_part = segment_str.split(" UTC")[0]
+                    if len(datetime_part) >= 10:
+                        parsed_dt = datetime.fromisoformat(datetime_part)
+                        # Convert midnight datetimes to dates for consistency
+                        if parsed_dt.time() == datetime.min.time():
+                            parsed_value = parsed_dt.date()
+                        else:
+                            parsed_value = parsed_dt
+                except (ValueError, IndexError):
+                    pass
+
+            # Format 3: Bracketed expressions like ['2016-01-04']
+            elif segment_str.startswith("[") and segment_str.endswith("]"):
+                try:
+                    content = segment_str[2:-2]  # Remove [' and ']
+
+                    # Try parsing as date first
+                    if len(content) == 10 and content.count("-") == 2:
+                        try:
+                            parsed_value = date.fromisoformat(content)
+                        except ValueError:
+                            pass
+
+                    # Try parsing as datetime
+                    if parsed_value is None:
+                        try:
+                            parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
+                            if parsed_dt.time() == datetime.min.time():
+                                parsed_value = parsed_dt.date()
+                            else:
+                                parsed_value = parsed_dt
+                        except ValueError:
+                            pass
+
+                except (ValueError, IndexError):
+                    pass
+
+            # Handle `pl.datetime()` expressions with .alias("datetime")
+            elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
+                try:
+                    datetime_part = segment_str.split('.alias("datetime")')[0]
+                    parsed_dt = datetime.fromisoformat(datetime_part)
+
+                    if parsed_dt.time() == datetime.min.time():
+                        parsed_value = parsed_dt.date()
+                    else:
+                        parsed_value = parsed_dt
+
+                except (ValueError, AttributeError):
+                    pass
+
+            # If we successfully parsed a value, use it; otherwise leave segment as is
+            if parsed_value is not None:
+                segment = parsed_value
+
         # Filter the data table based on the column name and segment
         if segment is None:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
-        # Check if the segment is a segment group
         elif isinstance(segment, list):
+            # Check if the segment is a segment group
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_in(segment))
         else:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column) == segment)
```
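The literal-recovery heuristics operate purely on `str(segment)`, whose exact form depends on the Polars version; the date and datetime parsing itself is plain stdlib. Formats 1 and 2 in isolation, as a runnable sketch:

```python
from datetime import date, datetime


def parse_segment_literal(segment_str: str):
    # Format 1: direct date strings like "2016-01-04"
    if len(segment_str) == 10 and segment_str.count("-") == 2:
        try:
            return date.fromisoformat(segment_str)
        except ValueError:
            return None
    # Format 2: datetime strings rendered with a trailing " UTC..."
    if " UTC" in segment_str:
        try:
            dt = datetime.fromisoformat(segment_str.split(" UTC")[0])
            # Midnight datetimes collapse to dates for consistency
            return dt.date() if dt.time() == datetime.min.time() else dt
        except ValueError:
            return None
    return None


print(parse_segment_literal("2016-01-04"))               # 2016-01-04
print(parse_segment_literal("2016-01-04 00:00:01 UTC"))  # 2016-01-04 00:00:01
```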
```diff
@@ -14341,12 +14565,13 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
         # If the table is an Ibis backend table, perform the filtering operation directly
 
         # Filter the data table based on the column name and segment
+        # Use the new Ibis API methods to avoid deprecation warnings
         if segment is None:
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column].isnull())
         elif isinstance(segment, list):
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column].isin(segment))
         else:
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
     return data_tbl
 
```
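This hunk is a straight bug fix: in 0.13.0 all three Ibis branches assigned the table to itself, so segments were silently ignored on Ibis backends; 0.13.2 actually filters. The equivalent calls against a small in-memory Ibis table, assuming `ibis-framework` with its default DuckDB backend:

```python
import ibis

t = ibis.memtable({"group": ["a", "b", None, "a"], "x": [1, 2, 3, 4]})

null_rows = t.filter(t["group"].isnull())          # segment is None
listed = t.filter(t["group"].isin(["a", "b"]))     # segment is a list
single = t.filter(t["group"] == "a")               # scalar segment

print(single.execute())  # the two rows where group == "a"
```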
```diff
@@ -15113,6 +15338,8 @@ def _step_report_row_based(
         text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
     elif assertion_type == "col_vals_not_null":
         text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_expr":
+        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
     elif assertion_type == "rows_complete":
         if column is None:
             text = STEP_REPORT_TEXT["rows_complete_all"][lang]
```
```diff
@@ -15159,10 +15386,14 @@ def _step_report_row_based(
     title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
-        n=n,
-        column_position=column_position,
-    )
+    # Use success_statement_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(n=n)
+    else:
+        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
 
     preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
 
     details = (
```
```diff
@@ -15242,10 +15473,16 @@ def _step_report_row_based(
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
     failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
 
-    failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
-        failure_rate=failure_rate_metrics,
-        column_position=column_position,
-    )
+    # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format(
+            failure_rate=failure_rate_metrics
+        )
+    else:
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
+            failure_rate=failure_rate_metrics,
+            column_position=column_position,
+        )
 
     if limit < extract_length:
         extract_length_resolved = limit
```