pointblank-0.13.1-py3-none-any.whl → pointblank-0.13.3-py3-none-any.whl
- pointblank/__init__.py +0 -2
- pointblank/_constants.py +2 -28
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +1483 -1735
- pointblank/column.py +6 -2
- pointblank/datascan.py +3 -2
- pointblank/schema.py +155 -1
- pointblank/validate.py +626 -334
- pointblank/yaml.py +154 -44
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/METADATA +3 -2
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/RECORD +15 -16
- pointblank/tf.py +0 -287
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/WHEEL +0 -0
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.1.dist-info → pointblank-0.13.3.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
```diff
@@ -31,7 +31,6 @@ from pointblank._constants import (
     CROSS_MARK_SPAN,
     IBIS_BACKENDS,
     LOG_LEVELS_MAP,
-    METHOD_CATEGORY_MAP,
     REPORTING_LANGUAGES,
     ROW_BASED_VALIDATION_TYPES,
     RTL_LANGUAGES,
@@ -46,25 +45,35 @@ from pointblank._constants_translations import (
     VALIDATION_REPORT_TEXT,
 )
 from pointblank._interrogation import (
-    ColCountMatch,
-    ColExistsHasType,
-    ColSchemaMatch,
-    ColValsCompareOne,
-    ColValsCompareSet,
-    ColValsCompareTwo,
-    ColValsExpr,
-    ColValsRegex,
-    ConjointlyValidation,
     NumberOfTestUnits,
-    RowCountMatch,
-    RowsComplete,
-    RowsDistinct,
     SpeciallyValidation,
+    col_count_match,
+    col_exists,
+    col_schema_match,
+    col_vals_expr,
+    conjointly_validation,
+    interrogate_between,
+    interrogate_eq,
+    interrogate_ge,
+    interrogate_gt,
+    interrogate_isin,
+    interrogate_le,
+    interrogate_lt,
+    interrogate_ne,
+    interrogate_not_null,
+    interrogate_notin,
+    interrogate_null,
+    interrogate_outside,
+    interrogate_regex,
+    interrogate_rows_distinct,
+    row_count_match,
+    rows_complete,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
+    _column_test_prep,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
@@ -1584,13 +1593,22 @@ def _generate_display_table(
 
     tail_data = pd.DataFrame(columns=head_data.columns)
 
-    data = pd.concat([head_data, tail_data])
+    # Suppress the FutureWarning about DataFrame concatenation with empty entries
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=FutureWarning,
+            message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
+        )
+        data = pd.concat([head_data, tail_data])
 
     row_number_list = list(range(1, n_head + 1)) + list(
         range(n_rows - n_tail + 1, n_rows + 1)
     )
 
-    # For PySpark, update schema after conversion to
+    # For PySpark, update schema after conversion to Pandas
     if tbl_type == "pyspark":
         tbl_schema = Schema(tbl=data)
 
```
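The concatenation change above wraps the head/tail join in a narrowly scoped warning filter rather than silencing `FutureWarning` globally. A minimal standalone sketch of the same pattern (the frames here are illustrative, not pointblank's):

```python
import warnings

import pandas as pd

head_data = pd.DataFrame({"a": [1, 2, 3]})
tail_data = pd.DataFrame(columns=head_data.columns)  # empty frame with matching columns

# pandas 2.1+ emits a FutureWarning when concatenating with empty or all-NA
# entries; matching on the message keeps every other FutureWarning visible,
# and the filter only lives for the duration of the `with` block.
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    )
    data = pd.concat([head_data, tail_data])

print(data)  # the three head rows; the empty tail contributes nothing
```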
```diff
@@ -1988,9 +2006,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
         # Apply the appropriate conversion method
         if use_polars_conversion:
-            null_sum_converted = null_sum.to_polars()
+            null_sum_converted = null_sum.to_polars()  # pragma: no cover
         else:
-            null_sum_converted = null_sum.to_pandas()
+            null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
         missing_prop = (null_sum_converted / sector_size) * 100
         col_missing_props.append(missing_prop)
@@ -2007,9 +2025,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
         # Apply the appropriate conversion method
         if use_polars_conversion:
-            null_sum_converted = null_sum.to_polars()
+            null_sum_converted = null_sum.to_polars()  # pragma: no cover
         else:
-            null_sum_converted = null_sum.to_pandas()
+            null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
         missing_prop = (null_sum_converted / sector_size) * 100
         col_missing_props.append(missing_prop)
@@ -2022,9 +2040,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
     # Use the helper function based on the DataFrame library
     if df_lib_name_gt == "polars":
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=True)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=True
+        )  # pragma: no cover
     else:
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=False)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=False
+        )  # pragma: no cover
 
     # Pivot the `missing_vals` dictionary to create a table with the missing value proportions
     missing_vals = {
@@ -2037,9 +2059,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
     # Get a dictionary of counts of missing values in each column
     if df_lib_name_gt == "polars":
-        missing_val_counts = {col: data[col].isnull().sum().to_polars() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_polars() for col in data.columns
+        }  # pragma: no cover
     else:
-        missing_val_counts = {col: data[col].isnull().sum().to_pandas() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_pandas() for col in data.columns
+        }  # pragma: no cover
 
     if pl_pb_tbl:
         # Get the column names from the table
@@ -2398,10 +2424,31 @@ def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
     return [lhs_values, rhs_values]
 
 
+def _get_column_names_safe(data: Any) -> list[str]:
+    """
+    Safely get column names from a DataFrame, optimized for LazyFrames.
+    This function avoids the Narwhals PerformanceWarning for LazyFrames.
+    """
+    try:
+        import narwhals as nw
+
+        df_nw = nw.from_native(data)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return list(df_nw.collect_schema().keys())
+        else:
+            return list(df_nw.columns)  # pragma: no cover
+    except Exception:  # pragma: no cover
+        # Fallback to direct column access
+        return list(data.columns)  # pragma: no cover
+
+
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
     if ibis_tbl:
         return data.columns if df_lib_name_gt == "polars" else list(data.columns)
-
+
+    # Use the optimized helper function
+    return _get_column_names_safe(data)
 
 
 def _validate_columns_subset(
```
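`_get_column_names_safe()` and the `get_column_count()` change below both prefer `collect_schema()` because reading `.columns` on a Narwhals LazyFrame triggers a `PerformanceWarning` (the schema has to be resolved either way). A small sketch of the preferred call, assuming Polars and Narwhals are installed:

```python
import narwhals as nw
import polars as pl

lf = pl.LazyFrame({"x": [1, 2], "y": ["a", "b"]})
df_nw = nw.from_native(lf)

# Resolves the schema once, without touching the data and without the warning
schema = df_nw.collect_schema()
print(list(schema.keys()))  # ['x', 'y']
print(len(schema))          # 2, the column count
```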
```diff
@@ -2590,7 +2637,11 @@ def get_column_count(data: FrameT | Any) -> int:
         import narwhals as nw
 
         df_nw = nw.from_native(data)
-        return len(df_nw.columns)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return len(df_nw.collect_schema())
+        else:
+            return len(df_nw.columns)  # pragma: no cover
     except Exception:
         # Fallback for unsupported types
         if "pandas" in str(type(data)):
@@ -2763,11 +2814,11 @@ def get_row_count(data: FrameT | Any) -> int:
         # Try different ways to get row count
         if hasattr(df_nw, "shape"):
             return df_nw.shape[0]
-        elif hasattr(df_nw, "height"):
+        elif hasattr(df_nw, "height"):  # pragma: no cover
             return df_nw.height  # pragma: no cover
         else:  # pragma: no cover
             raise ValueError("Unable to determine row count from Narwhals DataFrame")
-    except Exception:
+    except Exception:  # pragma: no cover
         # Fallback for types that don't work with Narwhals
         if "pandas" in str(type(data)):  # pragma: no cover
             return data.shape[0]
@@ -4702,7 +4753,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
@@ -4990,7 +5042,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
@@ -8356,8 +8409,8 @@ class Validate:
             self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
         )
 
-        if columns_subset is not None and isinstance(columns_subset, str):
-            columns_subset = [columns_subset]
+        if columns_subset is not None and isinstance(columns_subset, str):  # pragma: no cover
+            columns_subset = [columns_subset]  # pragma: no cover
 
         # TODO: incorporate Column object
 
@@ -9738,8 +9791,8 @@ class Validate:
             threshold = validation.thresholds
             segment = validation.segments
 
+            # Get compatible data types for this assertion type
             assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
-            assertion_category = METHOD_CATEGORY_MAP[assertion_method]
             compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
 
             # Process the `brief` text for the validation step by including template variables to
@@ -9870,197 +9923,249 @@ class Validate:
             # Validation stage
             # ------------------------------------------------
 
-            …
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_SET":
-                inside = True if assertion_method == "in_set" else False
-
-                results_tbl = ColValsCompareSet(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    values=value,
-                    threshold=threshold,
-                    inside=inside,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_REGEX":
-                results_tbl = ColValsRegex(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    pattern=value,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_EXPR":
-                results_tbl = ColValsExpr(
-                    data_tbl=data_tbl_step,
-                    expr=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_DISTINCT":
-                results_tbl = RowsDistinct(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_COMPLETE":
-                results_tbl = RowsComplete(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COL_EXISTS_HAS_TYPE":
-                result_bool = ColExistsHasType(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    threshold=threshold,
-                    assertion_method="exists",
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = result_bool
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "COL_SCHEMA_MATCH":
-                result_bool = ColSchemaMatch(
-                    data_tbl=data_tbl_step,
-                    schema=value["schema"],
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                    threshold=threshold,
-                ).get_test_results()
-
-                schema_validation_info = _get_schema_validation_info(
-                    data_tbl=data_tbl,
-                    schema=value["schema"],
-                    passed=result_bool,
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                )
+            # Apply error handling only to data quality validations, not programming error validations
+            if assertion_type != "specially":
+                try:
+                    # validations requiring `_column_test_prep()`
+                    if assertion_type in [
+                        "col_vals_gt",
+                        "col_vals_lt",
+                        "col_vals_eq",
+                        "col_vals_ne",
+                        "col_vals_ge",
+                        "col_vals_le",
+                        "col_vals_null",
+                        "col_vals_not_null",
+                        "col_vals_between",
+                        "col_vals_outside",
+                        "col_vals_in_set",
+                        "col_vals_not_in_set",
+                        "col_vals_regex",
+                    ]:
+                        # Process table for column validation
+                        tbl = _column_test_prep(
+                            df=data_tbl_step, column=column, allowed_types=compatible_dtypes
+                        )
 
-            …
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "CONJOINTLY":
-                results_tbl = ConjointlyValidation(
-                    data_tbl=data_tbl_step,
-                    expressions=value["expressions"],
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "SPECIALLY":
-                results_tbl_list = SpeciallyValidation(
-                    data_tbl=data_tbl_step,
-                    expression=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                #
-                # The result from this could either be a table in the conventional form, or,
-                # a list of boolean values; handle both cases
-                #
-
-                if isinstance(results_tbl_list, list):
-                    # If the result is a list of boolean values, then we need to convert it to a
-                    # set the validation results from the list
-                    validation.all_passed = all(results_tbl_list)
-                    validation.n = len(results_tbl_list)
-                    validation.n_passed = results_tbl_list.count(True)
-                    validation.n_failed = results_tbl_list.count(False)
-
-                    results_tbl = None
+                    if assertion_method == "gt":
+                        results_tbl = interrogate_gt(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "lt":
+                        results_tbl = interrogate_lt(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "eq":
+                        results_tbl = interrogate_eq(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "ne":
+                        results_tbl = interrogate_ne(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "ge":
+                        results_tbl = interrogate_ge(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "le":
+                        results_tbl = interrogate_le(
+                            tbl=tbl, column=column, compare=value, na_pass=na_pass
+                        )
+                    elif assertion_method == "null":
+                        results_tbl = interrogate_null(tbl=tbl, column=column)
+                    elif assertion_method == "not_null":
+                        results_tbl = interrogate_not_null(tbl=tbl, column=column)
+
+                    elif assertion_type == "col_vals_between":
+                        results_tbl = interrogate_between(
+                            tbl=tbl,
+                            column=column,
+                            low=value[0],
+                            high=value[1],
+                            inclusive=inclusive,
+                            na_pass=na_pass,
+                        )
 
-                else:
-                    # If the result is not a list, then we assume it's a table in the conventional
-                    # form (where the column is `pb_is_good_` exists, with boolean values
-                    results_tbl = results_tbl_list
+                    elif assertion_type == "col_vals_outside":
+                        results_tbl = interrogate_outside(
+                            tbl=tbl,
+                            column=column,
+                            low=value[0],
+                            high=value[1],
+                            inclusive=inclusive,
+                            na_pass=na_pass,
+                        )
+
+                    elif assertion_type == "col_vals_in_set":
+                        results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value)
+
+                    elif assertion_type == "col_vals_not_in_set":
+                        results_tbl = interrogate_notin(
+                            tbl=tbl, column=column, set_values=value
+                        )
+
+                    elif assertion_type == "col_vals_regex":
+                        results_tbl = interrogate_regex(
+                            tbl=tbl, column=column, pattern=value, na_pass=na_pass
+                        )
+
+                    elif assertion_type == "col_vals_expr":
+                        results_tbl = col_vals_expr(
+                            data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
+                        )
+
+                    elif assertion_type == "rows_distinct":
+                        results_tbl = interrogate_rows_distinct(
+                            data_tbl=data_tbl_step, columns_subset=column
+                        )
+
+                    elif assertion_type == "rows_complete":
+                        results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
+
+                    elif assertion_type == "col_exists":
+                        result_bool = col_exists(
+                            data_tbl=data_tbl_step,
+                            column=column,
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - int(result_bool)
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_schema_match":
+                        result_bool = col_schema_match(
+                            data_tbl=data_tbl_step,
+                            schema=value["schema"],
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                            threshold=threshold,
+                        )
+
+                        schema_validation_info = _get_schema_validation_info(
+                            data_tbl=data_tbl,
+                            schema=value["schema"],
+                            passed=result_bool,
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                        )
+
+                        # Add the schema validation info to the validation object
+                        validation.val_info = schema_validation_info
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "row_count_match":
+                        result_bool = row_count_match(
+                            data_tbl=data_tbl_step,
+                            count=value["count"],
+                            inverse=value["inverse"],
+                            abs_tol_bounds=value["abs_tol_bounds"],
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_count_match":
+                        result_bool = col_count_match(
+                            data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"]
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "conjointly":
+                        results_tbl = conjointly_validation(
+                            data_tbl=data_tbl_step,
+                            expressions=value["expressions"],
+                            threshold=threshold,
+                            tbl_type=tbl_type,
+                        )
+
+                    else:
+                        raise ValueError(
+                            f"Unknown assertion type: {assertion_type}"
+                        )  # pragma: no cover
+
+                except Exception as e:
+                    # Only catch specific data quality comparison errors, not programming errors
+                    error_msg = str(e).lower()
+                    is_comparison_error = (
+                        "boolean value of na is ambiguous" in error_msg
+                        or "cannot compare" in error_msg
+                        or (
+                            "type" in error_msg
+                            and ("mismatch" in error_msg or "incompatible" in error_msg)
+                        )
+                        or ("dtype" in error_msg and "compare" in error_msg)
+                    )
+
+                    if is_comparison_error:  # pragma: no cover
+                        # If data quality comparison fails, mark the validation as having an eval_error
+                        validation.eval_error = True  # pragma: no cover
+                        end_time = datetime.datetime.now(datetime.timezone.utc)  # pragma: no cover
+                        validation.proc_duration_s = (
+                            end_time - start_time
+                        ).total_seconds()  # pragma: no cover
+                        validation.time_processed = end_time.isoformat(
+                            timespec="milliseconds"
+                        )  # pragma: no cover
+                        validation.active = False  # pragma: no cover
+                        continue  # pragma: no cover
+                    else:
+                        # For other errors (like missing columns), let them propagate
+                        raise
+
+            else:
+                # For "specially" validations, let programming errors propagate as exceptions
+                if assertion_type == "specially":
+                    results_tbl_list = SpeciallyValidation(
+                        data_tbl=data_tbl_step,
+                        expression=value,
+                        threshold=threshold,
+                        tbl_type=tbl_type,
+                    ).get_test_results()
+
+                    #
+                    # The result from this could either be a table in the conventional form, or,
+                    # a list of boolean values; handle both cases
+                    #
+
+                    if isinstance(results_tbl_list, list):
+                        # If the result is a list of boolean values, then we need to convert it to a
+                        # set the validation results from the list
+                        validation.all_passed = all(results_tbl_list)
+                        validation.n = len(results_tbl_list)
+                        validation.n_passed = results_tbl_list.count(True)
+                        validation.n_failed = results_tbl_list.count(False)
+
+                        results_tbl = None
+
+                    else:
+                        # If the result is not a list, then we assume it's a table in the conventional
+                        # form (where the column is `pb_is_good_` exists, with boolean values
+                        results_tbl = results_tbl_list
 
             # If the results table is not `None`, then we assume there is a table with a column
             # called `pb_is_good_` that contains boolean values; we can then use this table to
```
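The block above is the heart of the release: the per-category interrogation classes (`ColValsCompareOne`, `ColValsCompareSet`, and friends) give way to plain `interrogate_*` functions selected by assertion method, and only comparison-type failures are downgraded to a step-level `eval_error` while programming errors keep propagating. A reduced sketch of that dispatch shape; the `interrogate_*` signatures follow the diff, while everything else (the dict-based table, the error heuristics) is simplified stand-in code:

```python
def interrogate_gt(tbl, column, compare, na_pass=False):
    # One boolean per test unit, as the real functions record in a `pb_is_good_` column
    return [na_pass if v is None else v > compare for v in tbl[column]]

def interrogate_le(tbl, column, compare, na_pass=False):
    return [na_pass if v is None else v <= compare for v in tbl[column]]

DISPATCH = {"gt": interrogate_gt, "le": interrogate_le}

def run_step(tbl, column, method, value):
    try:
        return DISPATCH[method](tbl=tbl, column=column, compare=value)
    except TypeError as e:
        # Data-quality comparison failures become eval errors; anything else
        # (say, a missing column raising KeyError) still propagates to the caller.
        if "not supported" in str(e).lower() or "cannot compare" in str(e).lower():
            return None  # the real code sets validation.eval_error = True instead
        raise

tbl = {"a": [1, 5, None, 7]}
print(run_step(tbl, "a", "gt", 2))  # [False, True, False, True]
```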
```diff
@@ -10272,32 +10377,46 @@ class Validate:
                 except AttributeError:
                     # For LazyFrames without sample method, collect first then sample
                     validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                         # PySpark DataFrame has sample method
-                        validation_extract_native = validation_extract_native.sample(
-                            fraction=min(1.0, sample_n / validation_extract_native.count())
-                        ).limit(sample_n)
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        validation_extract_native = (
+                            validation_extract_native.sample(  # pragma: no cover
+                                fraction=min(
+                                    1.0, sample_n / validation_extract_native.count()
+                                )  # pragma: no cover
+                            ).limit(sample_n)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                     else:
                         # Fallback: just take first n rows after collecting
-                        validation_extract_nw = validation_extract_nw.collect().head(sample_n)
+                        validation_extract_nw = validation_extract_nw.collect().head(
+                            sample_n
+                        )  # pragma: no cover
             elif sample_frac is not None:
                 try:
                     validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac)
-                except AttributeError:
+                except AttributeError:  # pragma: no cover
                     # For LazyFrames without sample method, collect first then sample
-                    validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    validation_extract_native = (
+                        validation_extract_nw.collect().to_native()
+                    )  # pragma: no cover
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                         # PySpark DataFrame has sample method
                         validation_extract_native = validation_extract_native.sample(
                             fraction=sample_frac
-                        )
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                     else:
                         # Fallback: use fraction to calculate head size
-                        collected = validation_extract_nw.collect()
-                        sample_size = max(1, int(len(collected) * sample_frac))
-                        validation_extract_nw = collected.head(sample_size)
+                        collected = validation_extract_nw.collect()  # pragma: no cover
+                        sample_size = max(
+                            1, int(len(collected) * sample_frac)
+                        )  # pragma: no cover
+                        validation_extract_nw = collected.head(sample_size)  # pragma: no cover
 
             # Ensure a limit is set on the number of rows to extract
             try:
@@ -10307,9 +10426,9 @@ class Validate:
                 # For LazyFrames, collect to get length (or use a reasonable default)
                 try:
                     extract_length = len(validation_extract_nw.collect())
-                except Exception:
+                except Exception:  # pragma: no cover
                     # If collection fails, apply limit anyway as a safety measure
-                    extract_length = extract_limit + 1  # …
+                    extract_length = extract_limit + 1  # pragma: no cover
 
             if extract_length > extract_limit:
                 validation_extract_nw = validation_extract_nw.head(extract_limit)
@@ -11974,10 +12093,12 @@ class Validate:
         try:
             # Try without order_by first (for DataFrames)
             data_nw = data_nw.with_row_index(name=index_name)
-        except TypeError:
+        except TypeError:  # pragma: no cover
            # LazyFrames require order_by parameter - use first column for ordering
-            first_col = data_nw.columns[0]
-            data_nw = data_nw.with_row_index(name=index_name, order_by=first_col)
+            first_col = data_nw.columns[0]  # pragma: no cover
+            data_nw = data_nw.with_row_index(
+                name=index_name, order_by=first_col
+            )  # pragma: no cover
 
         # Get all validation step result tables and join together the `pb_is_good_` columns
         # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
@@ -11989,10 +12110,12 @@ class Validate:
             try:
                 # Try without order_by first (for DataFrames)
                 results_tbl = results_tbl.with_row_index(name=index_name)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                 # LazyFrames require order_by parameter - use first column for ordering
-                first_col = results_tbl.columns[0]
-                results_tbl = results_tbl.with_row_index(name=index_name, order_by=first_col)
+                first_col = results_tbl.columns[0]  # pragma: no cover
+                results_tbl = results_tbl.with_row_index(
+                    name=index_name, order_by=first_col
+                )  # pragma: no cover
 
             # Add numerical suffix to the `pb_is_good_` column to make it unique
             results_tbl = results_tbl.select([index_name, "pb_is_good_"]).rename(
@@ -12124,15 +12247,15 @@ class Validate:
         # If the table is a Polars one, determine if it's a LazyFrame
         if tbl_info == "polars":
             if _is_lazy_frame(self.data):
-                tbl_info = "polars-lazy"
+                tbl_info = "polars-lazy"  # pragma: no cover
 
         # Determine if the input table is a Narwhals DF
         if _is_narwhals_table(self.data):
             # Determine if the Narwhals table is a LazyFrame
-            if _is_lazy_frame(self.data):
-                tbl_info = "narwhals-lazy"
+            if _is_lazy_frame(self.data):  # pragma: no cover
+                tbl_info = "narwhals-lazy"  # pragma: no cover
             else:
-                tbl_info = "narwhals"
+                tbl_info = "narwhals"  # pragma: no cover
 
         # Get the thresholds object
         thresholds = self.thresholds
@@ -12297,7 +12420,7 @@ class Validate:
         if lang in RTL_LANGUAGES:
             gt_tbl = gt_tbl.tab_style(
                 style=style.css("direction: rtl;"), locations=loc.source_notes()
-            )
+            )  # pragma: no cover
 
         if incl_header:
             gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
@@ -12614,9 +12737,11 @@ class Validate:
             # Get the number of rows in the extract (safe for LazyFrames)
             try:
                 n_rows = len(extract_nw)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                 # For LazyFrames, collect() first to get length
-                n_rows = len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                n_rows = (
+                    len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                )  # pragma: no cover
 
             # If the number of rows is zero, then produce an em dash then go to the next iteration
             if n_rows == 0:
@@ -12624,7 +12749,7 @@ class Validate:
                 continue
 
             # Write the CSV text (ensure LazyFrames are collected first)
-            if hasattr(extract_nw, "collect"):
+            if hasattr(extract_nw, "collect"):  # pragma: no cover
                 extract_nw = extract_nw.collect()
             csv_text = extract_nw.write_csv()
 
@@ -13126,7 +13251,7 @@ class Validate:
             elif isinstance(column, list):
                 column_position = [list(self.data.columns).index(col) + 1 for col in column]
             else:
-                column_position = None
+                column_position = None  # pragma: no cover
         else:
             column_position = None
 
@@ -13218,7 +13343,7 @@ class Validate:
             )
 
         else:
-            step_report = None
+            step_report = None  # pragma: no cover
 
         return step_report
 
@@ -13670,6 +13795,48 @@ def _string_date_dttm_conversion(value: any) -> any:
     return value
 
 
+def _conditional_string_date_dttm_conversion(
+    value: any, allow_regular_strings: bool = False
+) -> any:
+    """
+    Conditionally convert a string to a date or datetime object if it is in the correct format. If
+    `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
+    the value is not a string, it is returned as is.
+
+    Parameters
+    ----------
+    value
+        The value to convert. It can be a string, date, or datetime object.
+    allow_regular_strings
+        If `True`, regular strings (non-date/datetime) are allowed to pass through unchanged. If
+        `False`, behaves like `_string_date_dttm_conversion()` and raises `ValueError` for regular
+        strings.
+
+    Returns
+    -------
+    any
+        The converted date or datetime object, or the original value.
+
+    Raises
+    ------
+    ValueError
+        If allow_regular_strings is False and the string cannot be converted to a date or datetime.
+    """
+
+    if isinstance(value, str):
+        if _is_string_date(value):
+            value = _convert_string_to_date(value)
+        elif _is_string_datetime(value):
+            value = _convert_string_to_datetime(value)
+        elif not allow_regular_strings:
+            raise ValueError(
+                "If `value=` is provided as a string it must be a date or datetime string."
+            )  # pragma: no cover
+        # If allow_regular_strings is True, regular strings pass through unchanged
+
+    return value
+
+
 def _process_brief(
     brief: str | None,
     step: int,
```
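The new helper replaces `_string_date_dttm_conversion()` at the `col_vals_eq()`/`col_vals_ne()` call sites with `allow_regular_strings=True`, so a plain-string comparand no longer raises. A stand-in for that contract (the real function uses pointblank's internal `_is_string_date()`/`_is_string_datetime()` checks; this sketch approximates them with `fromisoformat`):

```python
import datetime

def demo_conversion(value):
    """Mimics _conditional_string_date_dttm_conversion(value, allow_regular_strings=True)."""
    if isinstance(value, str):
        for parse in (datetime.date.fromisoformat, datetime.datetime.fromisoformat):
            try:
                return parse(value)
            except ValueError:
                continue
    return value  # non-strings and regular strings pass through unchanged

print(demo_conversion("2021-03-15"))           # 2021-03-15 (a datetime.date)
print(demo_conversion("2021-03-15 10:30:00"))  # 2021-03-15 10:30:00 (a datetime.datetime)
print(demo_conversion("toronto"))              # toronto (regular string, unchanged)
print(demo_conversion(42))                     # 42
```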
```diff
@@ -13718,12 +13885,33 @@ def _process_brief(
 
     if segment is not None:
         # The segment is always a tuple of the form ("{column}", "{value}")
+        # Handle both regular lists and Segment objects (from seg_group())
+
+        segment_column = segment[0]
+        segment_value = segment[1]
+
+        # If segment_value is a Segment object (from seg_group()), format it appropriately
+        if isinstance(segment_value, Segment):
+            # For Segment objects, format the segments as a readable string
+            segments = segment_value.segments
+            if len(segments) == 1:
+                # Single segment: join the values with commas
+                segment_value_str = ", ".join(str(v) for v in segments[0])
+            else:
+                # Multiple segments: join each segment with commas, separate segments with " | "
+                segment_value_str = " | ".join([", ".join(str(v) for v in seg) for seg in segments])
+        else:
+            # For regular lists or other types, convert to string
+            if isinstance(segment_value, list):
+                segment_value_str = ", ".join(str(v) for v in segment_value)
+            else:
+                segment_value_str = str(segment_value)
 
-        segment_fmt = f"{segment[0]} / {segment[1]}"
+        segment_fmt = f"{segment_column} / {segment_value_str}"
 
         brief = brief.replace("{segment}", segment_fmt)
-        brief = brief.replace("{segment_column}", segment[0])
-        brief = brief.replace("{segment_value}", segment[1])
+        brief = brief.replace("{segment_column}", segment_column)
+        brief = brief.replace("{segment_value}", segment_value_str)
 
     return brief
 
@@ -13757,7 +13945,7 @@ def _process_action_str(
     if col is not None:
         # If a list of columns is provided, then join the columns into a comma-separated string
         if isinstance(col, list):
-            col = ", ".join(col)
+            col = ", ".join(col)  # pragma: no cover
 
         action_str = action_str.replace("{col}", col)
         action_str = action_str.replace("{column}", col)
@@ -14154,7 +14342,7 @@ def _prep_values_text(
     length_values = len(values)
 
     if length_values == 0:
-        return ""
+        return ""  # pragma: no cover
 
     if length_values > limit:
         num_omitted = length_values - limit
@@ -14163,7 +14351,7 @@ def _prep_values_text(
     formatted_values = []
     for value in values[:limit]:
         if isinstance(value, (datetime.datetime, datetime.date)):
-            formatted_values.append(f"`{value.isoformat()}`")
+            formatted_values.append(f"`{value.isoformat()}`")  # pragma: no cover
         else:
             formatted_values.append(f"`{value}`")
 
@@ -14319,17 +14507,109 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
     column, segment = segments_expr
 
     if tbl_type in ["pandas", "polars", "pyspark"]:
-        # If the table is a Pandas, Polars, or PySpark DataFrame,
+        # If the table is a Pandas, Polars, or PySpark DataFrame, transform to a Narwhals table
         # and perform the filtering operation
 
         # Transform to Narwhals table if a DataFrame
        data_tbl_nw = nw.from_native(data_tbl)
 
+        # Handle Polars expressions by attempting to extract literal values
+        # This is a compatibility measure for cases where `pl.datetime()`, `pl.lit()`, etc.,
+        # are accidentally used instead of native Python types
+        if (
+            hasattr(segment, "__class__")
+            and "polars" in segment.__class__.__module__
+            and segment.__class__.__name__ == "Expr"
+        ):
+            # This is a Polars expression so we should warn about this and suggest native types
+            import warnings
+            from datetime import date, datetime
+
+            warnings.warn(
+                "Polars expressions in segments are deprecated. Please use native Python types instead. "
+                "For example, use datetime.date(2016, 1, 4) instead of pl.datetime(2016, 1, 4).",
+                DeprecationWarning,
+                stacklevel=3,
+            )
+
+            # Try to extract the literal value from various Polars expression patterns
+            segment_str = str(segment)
+            parsed_value = None
+
+            # Handle different Polars expression string formats
+            # Format 1: Direct date strings like "2016-01-04"
+            if len(segment_str) == 10 and segment_str.count("-") == 2:
+                try:
+                    parsed_value = date.fromisoformat(segment_str)
+                except ValueError:  # pragma: no cover
+                    pass  # pragma: no cover
+
+            # Format 2: Datetime strings with UTC timezone like
+            # "2016-01-04 00:00:01 UTC.strict_cast(...)"
+            elif " UTC" in segment_str:
+                try:
+                    # Extract just the datetime part before "UTC"
+                    datetime_part = segment_str.split(" UTC")[0]
+                    if len(datetime_part) >= 10:
+                        parsed_dt = datetime.fromisoformat(datetime_part)
+                        # Convert midnight datetimes to dates for consistency
+                        if parsed_dt.time() == datetime.min.time():
+                            parsed_value = parsed_dt.date()  # pragma: no cover
+                        else:
+                            parsed_value = parsed_dt
+                except (ValueError, IndexError):  # pragma: no cover
+                    pass  # pragma: no cover
+
+            # Format 3: Bracketed expressions like ['2016-01-04']
+            elif segment_str.startswith("[") and segment_str.endswith("]"):
+                try:  # pragma: no cover
+                    # Remove [' and ']
+                    content = segment_str[2:-2]  # pragma: no cover
+
+                    # Try parsing as date first
+                    if len(content) == 10 and content.count("-") == 2:  # pragma: no cover
+                        try:  # pragma: no cover
+                            parsed_value = date.fromisoformat(content)  # pragma: no cover
+                        except ValueError:  # pragma: no cover
+                            pass  # pragma: no cover
+
+                    # Try parsing as datetime
+                    if parsed_value is None:  # pragma: no cover
+                        try:  # pragma: no cover
+                            parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
+                            if parsed_dt.time() == datetime.min.time():
+                                parsed_value = parsed_dt.date()
+                            else:
+                                parsed_value = parsed_dt
+                        except ValueError:
+                            pass
+
+                except (ValueError, IndexError):  # pragma: no cover
+                    pass  # pragma: no cover
+
+            # Handle `pl.datetime()` expressions with .alias("datetime")
+            elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
+                try:
+                    datetime_part = segment_str.split('.alias("datetime")')[0]
+                    parsed_dt = datetime.fromisoformat(datetime_part)
+
+                    if parsed_dt.time() == datetime.min.time():
+                        parsed_value = parsed_dt.date()
+                    else:
+                        parsed_value = parsed_dt  # pragma: no cover
+
+                except (ValueError, AttributeError):  # pragma: no cover
+                    pass  # pragma: no cover
+
+            # If we successfully parsed a value, use it; otherwise leave segment as is
+            if parsed_value is not None:
+                segment = parsed_value
+
         # Filter the data table based on the column name and segment
         if segment is None:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
-        # Check if the segment is a segment group
         elif isinstance(segment, list):
+            # Check if the segment is a segment group
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_in(segment))
         else:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column) == segment)
@@ -14341,12 +14621,13 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
         # If the table is an Ibis backend table, perform the filtering operation directly
 
         # Filter the data table based on the column name and segment
+        # Use the new Ibis API methods to avoid deprecation warnings
         if segment is None:
-            data_tbl = data_tbl[data_tbl[column].isnull()]
+            data_tbl = data_tbl.filter(data_tbl[column].isnull())  # pragma: no cover
         elif isinstance(segment, list):
-            data_tbl = data_tbl[data_tbl[column].isin(segment)]
+            data_tbl = data_tbl.filter(data_tbl[column].isin(segment))  # pragma: no cover
         else:
-            data_tbl = data_tbl[data_tbl[column] == segment]
+            data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
     return data_tbl
 
```
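The segment handling above also spells out the migration it nudges users toward: native Python values instead of Polars expressions in `segments=`. A sketch of the preferred style (assuming Polars and pointblank are installed; column names and values are illustrative):

```python
import datetime

import polars as pl
import pointblank as pb

tbl = pl.DataFrame(
    {
        "date": [datetime.date(2016, 1, 4), datetime.date(2016, 1, 5)],
        "a": [1, 2],
    }
)

# Preferred: a native Python value as the segment value. A Polars expression
# such as pl.date(2016, 1, 4) would now raise a DeprecationWarning and be
# parsed back to a literal on a best-effort basis.
validation = (
    pb.Validate(tbl)
    .col_vals_gt(columns="a", value=0, segments=("date", datetime.date(2016, 1, 4)))
    .interrogate()
)
```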
```diff
@@ -14465,7 +14746,7 @@ def _get_title_text(
         "</span>"
         f'<span style="float: right;">{title}</span>'
         "</div>"
-    )
+    )  # pragma: no cover
 
     return html_str
 
@@ -14543,24 +14824,6 @@ def _transform_eval(
     return symbol_list
 
 
-def _format_numbers_with_gt(
-    values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
-) -> list[str]:
-    """Format numbers using Great Tables GT object to avoid pandas dependency."""
-    import polars as pl
-
-    # Create a single-column DataFrame with all values
-    df = pl.DataFrame({"values": values})
-
-    # Create GT object and format the column
-    gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
-
-    # Extract the formatted values using _get_column_of_values
-    formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
-
-    return formatted_values
-
-
 def _format_single_number_with_gt(
     value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
 ) -> str:
@@ -14571,12 +14834,14 @@ def _format_single_number_with_gt(
         import polars as pl
 
         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError("Neither Polars nor Pandas is available for formatting")
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
     # Create a single-row, single-column DataFrame using the specified library
     df = df_lib.DataFrame({"value": [value]})
@@ -14642,12 +14907,14 @@ def _format_single_float_with_gt(
         import polars as pl
 
         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError("Neither Polars nor Pandas is available for formatting")
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
     # Create a single-row, single-column DataFrame using the specified library
     df = df_lib.DataFrame({"value": [value]})
@@ -14679,7 +14946,7 @@ def _transform_passed_failed(
             return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
         else:
             # Fallback to the original behavior
-            return vals.fmt_number(value, decimals=2, locale=locale)[0]
+            return vals.fmt_number(value, decimals=2, locale=locale)[0]  # pragma: no cover
 
     passed_failed = [
         (
@@ -14819,7 +15086,7 @@ def _get_callable_source(fn: Callable) -> str:
             return pre_arg
         except (OSError, TypeError):  # pragma: no cover
             return fn.__name__
-    return fn
+    return fn  # pragma: no cover
 
 
 def _extract_pre_argument(source: str) -> str:
@@ -14903,12 +15170,14 @@ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None)
         import polars as pl
 
         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError("Neither Polars nor Pandas is available for formatting")
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
     # Create a single-row, single-column DataFrame using the specified library
     df = df_lib.DataFrame({"value": [value]})
@@ -14936,12 +15205,14 @@ def _format_single_float_with_gt_custom(
         import polars as pl
 
         df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError("Neither Polars nor Pandas is available for formatting")
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
     # Create a single-row, single-column DataFrame using the specified library
     df = df_lib.DataFrame({"value": [value]})
@@ -14976,7 +15247,7 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> str:
             # Fallback to the original behavior
             return fmt_number(
                 value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-            )[0]
+            )[0]  # pragma: no cover
 
     def _format_integer_safe(value: int) -> str:
         if df_lib is not None and value is not None:
@@ -15113,6 +15384,8 @@ def _step_report_row_based(
         text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
     elif assertion_type == "col_vals_not_null":
         text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_expr":
+        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
     elif assertion_type == "rows_complete":
         if column is None:
             text = STEP_REPORT_TEXT["rows_complete_all"][lang]
@@ -15159,10 +15432,17 @@ def _step_report_row_based(
     title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
-        n=n,
-        column_position=column_position,
-    )
+    # Use 'success_statement_no_column' for col_vals_expr() since it doesn't target
+    # a specific column
+    if assertion_type == "col_vals_expr":
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
+            n=n
+        )  # pragma: no cover
+    else:
+        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
     preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
 
     details = (
@@ -15242,10 +15522,16 @@ def _step_report_row_based(
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
     failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
 
-    failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
-        failure_rate=failure_rate_metrics,
-        column_position=column_position,
-    )
+    # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format(
+            failure_rate=failure_rate_metrics
+        )
+    else:
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
+            failure_rate=failure_rate_metrics,
+            column_position=column_position,
+        )
 
     if limit < extract_length:
         extract_length_resolved = limit
@@ -15864,14 +16150,14 @@ def _step_report_schema_any_order(
         if exp_columns_dict[column_name_exp_i]["colname_matched"]:
             col_exp_correct.append(CHECK_MARK_SPAN)
         else:
-            col_exp_correct.append(CROSS_MARK_SPAN)
+            col_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
         #
         # `dtype_exp` values
        #
 
         if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
         elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
             dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
@@ -15906,9 +16192,9 @@ def _step_report_schema_any_order(
        #
 
         if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
-            dtype_exp_correct.append("—")
+            dtype_exp_correct.append("—")  # pragma: no cover
         elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
+            dtype_exp_correct.append("")  # pragma: no cover
         elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
             dtype_exp_correct.append(CHECK_MARK_SPAN)
         else:
@@ -15954,13 +16240,17 @@ def _step_report_schema_any_order(
        #
 
         if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
         elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
-            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
+            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]  # pragma: no cover
 
-            if exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None:
-                pos = exp_columns_dict[column_name_exp_i]["dtype_matched_pos"]
+            if (
+                exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None
+            ):  # pragma: no cover
+                pos = exp_columns_dict[column_name_exp_i][
+                    "dtype_matched_pos"
+                ]  # pragma: no cover
 
             # Combine the dtypes together with pipes but underline the matched dtype in
             # green with an HTML span tag and style attribute
@@ -15972,13 +16262,13 @@ def _step_report_schema_any_order(
                     else dtype[i]
                 )
                 for i in range(len(dtype))
-                ]
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                ]  # pragma: no cover
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
             else:
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
         else:
             dtype = exp_columns_dict[column_name_exp_i]["dtype_input"][0]
@@ -15990,12 +16280,12 @@ def _step_report_schema_any_order(
 
         if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
             dtype_exp_correct.append("—")
-        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
-        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
-            dtype_exp_correct.append(CHECK_MARK_SPAN)
-        else:
-            dtype_exp_correct.append(CROSS_MARK_SPAN)
+        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:  # pragma: no cover
+            dtype_exp_correct.append("")  # pragma: no cover
+        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:  # pragma: no cover
+            dtype_exp_correct.append(CHECK_MARK_SPAN)  # pragma: no cover
+        else:  # pragma: no cover
+            dtype_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
         if len(columns_found) > 0:
             # Get the last index of the columns found
@@ -16011,7 +16301,9 @@ def _step_report_schema_any_order(
             ]
 
         else:
-            index_exp = [str(i) for i in range(1, len(colnames_exp_unmatched) + 1)]
+            index_exp = [
+                str(i) for i in range(1, len(colnames_exp_unmatched) + 1)
+            ]  # pragma: no cover
 
         schema_exp_unmatched = pl.DataFrame(
             {
```
|