pointblank 0.13.1__py3-none-any.whl → 0.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
pointblank/validate.py CHANGED
@@ -31,7 +31,6 @@ from pointblank._constants import (
31
31
  CROSS_MARK_SPAN,
32
32
  IBIS_BACKENDS,
33
33
  LOG_LEVELS_MAP,
34
- METHOD_CATEGORY_MAP,
35
34
  REPORTING_LANGUAGES,
36
35
  ROW_BASED_VALIDATION_TYPES,
37
36
  RTL_LANGUAGES,
@@ -46,25 +45,35 @@ from pointblank._constants_translations import (
46
45
  VALIDATION_REPORT_TEXT,
47
46
  )
48
47
  from pointblank._interrogation import (
49
- ColCountMatch,
50
- ColExistsHasType,
51
- ColSchemaMatch,
52
- ColValsCompareOne,
53
- ColValsCompareSet,
54
- ColValsCompareTwo,
55
- ColValsExpr,
56
- ColValsRegex,
57
- ConjointlyValidation,
58
48
  NumberOfTestUnits,
59
- RowCountMatch,
60
- RowsComplete,
61
- RowsDistinct,
62
49
  SpeciallyValidation,
50
+ col_count_match,
51
+ col_exists,
52
+ col_schema_match,
53
+ col_vals_expr,
54
+ conjointly_validation,
55
+ interrogate_between,
56
+ interrogate_eq,
57
+ interrogate_ge,
58
+ interrogate_gt,
59
+ interrogate_isin,
60
+ interrogate_le,
61
+ interrogate_lt,
62
+ interrogate_ne,
63
+ interrogate_not_null,
64
+ interrogate_notin,
65
+ interrogate_null,
66
+ interrogate_outside,
67
+ interrogate_regex,
68
+ interrogate_rows_distinct,
69
+ row_count_match,
70
+ rows_complete,
63
71
  )
64
72
  from pointblank._typing import SegmentSpec
65
73
  from pointblank._utils import (
66
74
  _check_any_df_lib,
67
75
  _check_invalid_fields,
76
+ _column_test_prep,
68
77
  _count_null_values_in_column,
69
78
  _count_true_values_in_column,
70
79
  _derive_bounds,
@@ -1584,13 +1593,22 @@ def _generate_display_table(
1584
1593
 
1585
1594
  tail_data = pd.DataFrame(columns=head_data.columns)
1586
1595
 
1587
- data = pd.concat([head_data, tail_data])
1596
+ # Suppress the FutureWarning about DataFrame concatenation with empty entries
1597
+ import warnings
1598
+
1599
+ with warnings.catch_warnings():
1600
+ warnings.filterwarnings(
1601
+ "ignore",
1602
+ category=FutureWarning,
1603
+ message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
1604
+ )
1605
+ data = pd.concat([head_data, tail_data])
1588
1606
 
1589
1607
  row_number_list = list(range(1, n_head + 1)) + list(
1590
1608
  range(n_rows - n_tail + 1, n_rows + 1)
1591
1609
  )
1592
1610
 
1593
- # For PySpark, update schema after conversion to pandas
1611
+ # For PySpark, update schema after conversion to Pandas
1594
1612
  if tbl_type == "pyspark":
1595
1613
  tbl_schema = Schema(tbl=data)
1596
1614
 
@@ -1988,9 +2006,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1988
2006
 
1989
2007
  # Apply the appropriate conversion method
1990
2008
  if use_polars_conversion:
1991
- null_sum_converted = null_sum.to_polars()
2009
+ null_sum_converted = null_sum.to_polars() # pragma: no cover
1992
2010
  else:
1993
- null_sum_converted = null_sum.to_pandas()
2011
+ null_sum_converted = null_sum.to_pandas() # pragma: no cover
1994
2012
 
1995
2013
  missing_prop = (null_sum_converted / sector_size) * 100
1996
2014
  col_missing_props.append(missing_prop)
@@ -2007,9 +2025,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2007
2025
 
2008
2026
  # Apply the appropriate conversion method
2009
2027
  if use_polars_conversion:
2010
- null_sum_converted = null_sum.to_polars()
2028
+ null_sum_converted = null_sum.to_polars() # pragma: no cover
2011
2029
  else:
2012
- null_sum_converted = null_sum.to_pandas()
2030
+ null_sum_converted = null_sum.to_pandas() # pragma: no cover
2013
2031
 
2014
2032
  missing_prop = (null_sum_converted / sector_size) * 100
2015
2033
  col_missing_props.append(missing_prop)
@@ -2022,9 +2040,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2022
2040
 
2023
2041
  # Use the helper function based on the DataFrame library
2024
2042
  if df_lib_name_gt == "polars":
2025
- missing_vals = _calculate_missing_proportions(use_polars_conversion=True)
2043
+ missing_vals = _calculate_missing_proportions(
2044
+ use_polars_conversion=True
2045
+ ) # pragma: no cover
2026
2046
  else:
2027
- missing_vals = _calculate_missing_proportions(use_polars_conversion=False)
2047
+ missing_vals = _calculate_missing_proportions(
2048
+ use_polars_conversion=False
2049
+ ) # pragma: no cover
2028
2050
 
2029
2051
  # Pivot the `missing_vals` dictionary to create a table with the missing value proportions
2030
2052
  missing_vals = {
@@ -2037,9 +2059,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
2037
2059
 
2038
2060
  # Get a dictionary of counts of missing values in each column
2039
2061
  if df_lib_name_gt == "polars":
2040
- missing_val_counts = {col: data[col].isnull().sum().to_polars() for col in data.columns}
2062
+ missing_val_counts = {
2063
+ col: data[col].isnull().sum().to_polars() for col in data.columns
2064
+ } # pragma: no cover
2041
2065
  else:
2042
- missing_val_counts = {col: data[col].isnull().sum().to_pandas() for col in data.columns}
2066
+ missing_val_counts = {
2067
+ col: data[col].isnull().sum().to_pandas() for col in data.columns
2068
+ } # pragma: no cover
2043
2069
 
2044
2070
  if pl_pb_tbl:
2045
2071
  # Get the column names from the table
@@ -2398,10 +2424,31 @@ def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
2398
2424
  return [lhs_values, rhs_values]
2399
2425
 
2400
2426
 
2427
+ def _get_column_names_safe(data: Any) -> list[str]:
2428
+ """
2429
+ Safely get column names from a DataFrame, optimized for LazyFrames.
2430
+ This function avoids the Narwhals PerformanceWarning for LazyFrames.
2431
+ """
2432
+ try:
2433
+ import narwhals as nw
2434
+
2435
+ df_nw = nw.from_native(data)
2436
+ # Use `collect_schema()` for LazyFrames to avoid performance warnings
2437
+ if hasattr(df_nw, "collect_schema"):
2438
+ return list(df_nw.collect_schema().keys())
2439
+ else:
2440
+ return list(df_nw.columns) # pragma: no cover
2441
+ except Exception: # pragma: no cover
2442
+ # Fallback to direct column access
2443
+ return list(data.columns) # pragma: no cover
2444
+
2445
+
2401
2446
  def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
2402
2447
  if ibis_tbl:
2403
2448
  return data.columns if df_lib_name_gt == "polars" else list(data.columns)
2404
- return list(data.columns)
2449
+
2450
+ # Use the optimized helper function
2451
+ return _get_column_names_safe(data)
2405
2452
 
2406
2453
 
2407
2454
  def _validate_columns_subset(
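The new `_get_column_names_safe()` helper prefers `collect_schema()` so column names can be read from a LazyFrame without triggering the Narwhals PerformanceWarning that `.columns` raises for lazy inputs; `_get_column_names()` now routes through it for non-Ibis tables. A small sketch of the same approach, using a Polars LazyFrame purely as an example input:

```python
import narwhals as nw
import polars as pl

lf = pl.LazyFrame({"x": [1, 2], "y": ["a", "b"]})
frame = nw.from_native(lf)

# collect_schema() resolves names without materializing the LazyFrame
if hasattr(frame, "collect_schema"):
    names = list(frame.collect_schema().keys())
else:
    names = list(frame.columns)

print(names)  # ['x', 'y']
```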
@@ -2590,7 +2637,11 @@ def get_column_count(data: FrameT | Any) -> int:
2590
2637
  import narwhals as nw
2591
2638
 
2592
2639
  df_nw = nw.from_native(data)
2593
- return len(df_nw.columns)
2640
+ # Use `collect_schema()` for LazyFrames to avoid performance warnings
2641
+ if hasattr(df_nw, "collect_schema"):
2642
+ return len(df_nw.collect_schema())
2643
+ else:
2644
+ return len(df_nw.columns) # pragma: no cover
2594
2645
  except Exception:
2595
2646
  # Fallback for unsupported types
2596
2647
  if "pandas" in str(type(data)):
@@ -2763,11 +2814,11 @@ def get_row_count(data: FrameT | Any) -> int:
2763
2814
  # Try different ways to get row count
2764
2815
  if hasattr(df_nw, "shape"):
2765
2816
  return df_nw.shape[0]
2766
- elif hasattr(df_nw, "height"):
2817
+ elif hasattr(df_nw, "height"): # pragma: no cover
2767
2818
  return df_nw.height # pragma: no cover
2768
2819
  else: # pragma: no cover
2769
2820
  raise ValueError("Unable to determine row count from Narwhals DataFrame")
2770
- except Exception:
2821
+ except Exception: # pragma: no cover
2771
2822
  # Fallback for types that don't work with Narwhals
2772
2823
  if "pandas" in str(type(data)): # pragma: no cover
2773
2824
  return data.shape[0]
@@ -4702,7 +4753,8 @@ class Validate:
4702
4753
  _check_boolean_input(param=active, param_name="active")
4703
4754
 
4704
4755
  # If value is a string-based date or datetime, convert it to the appropriate type
4705
- value = _string_date_dttm_conversion(value=value)
4756
+ # Allow regular strings to pass through for string comparisons
4757
+ value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
4706
4758
 
4707
4759
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
4708
4760
  thresholds = (
@@ -4990,7 +5042,8 @@ class Validate:
4990
5042
  _check_boolean_input(param=active, param_name="active")
4991
5043
 
4992
5044
  # If value is a string-based date or datetime, convert it to the appropriate type
4993
- value = _string_date_dttm_conversion(value=value)
5045
+ # Allow regular strings to pass through for string comparisons
5046
+ value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
4994
5047
 
4995
5048
  # Determine threshold to use (global or local) and normalize a local `thresholds=` value
4996
5049
  thresholds = (
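With `_conditional_string_date_dttm_conversion(..., allow_regular_strings=True)` in place, `col_vals_eq()` and `col_vals_ne()` accept plain strings as comparison values instead of rejecting anything that is not a date or datetime string. A hedged usage sketch (the table and column names are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"status": ["open", "open", "closed"]})

validation = (
    pb.Validate(data=tbl)
    .col_vals_eq(columns="status", value="open")  # a plain string now passes through unchanged
    .interrogate()
)
```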
@@ -8356,8 +8409,8 @@ class Validate:
8356
8409
  self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
8357
8410
  )
8358
8411
 
8359
- if columns_subset is not None and isinstance(columns_subset, str):
8360
- columns_subset = [columns_subset]
8412
+ if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover
8413
+ columns_subset = [columns_subset] # pragma: no cover
8361
8414
 
8362
8415
  # TODO: incorporate Column object
8363
8416
 
@@ -9738,8 +9791,8 @@ class Validate:
9738
9791
  threshold = validation.thresholds
9739
9792
  segment = validation.segments
9740
9793
 
9794
+ # Get compatible data types for this assertion type
9741
9795
  assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
9742
- assertion_category = METHOD_CATEGORY_MAP[assertion_method]
9743
9796
  compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
9744
9797
 
9745
9798
  # Process the `brief` text for the validation step by including template variables to
@@ -9870,197 +9923,249 @@ class Validate:
9870
9923
  # Validation stage
9871
9924
  # ------------------------------------------------
9872
9925
 
9873
- if assertion_category == "COMPARE_ONE":
9874
- results_tbl = ColValsCompareOne(
9875
- data_tbl=data_tbl_step,
9876
- column=column,
9877
- value=value,
9878
- na_pass=na_pass,
9879
- threshold=threshold,
9880
- assertion_method=assertion_method,
9881
- allowed_types=compatible_dtypes,
9882
- tbl_type=tbl_type,
9883
- ).get_test_results()
9884
-
9885
- if assertion_category == "COMPARE_TWO":
9886
- results_tbl = ColValsCompareTwo(
9887
- data_tbl=data_tbl_step,
9888
- column=column,
9889
- value1=value[0],
9890
- value2=value[1],
9891
- inclusive=inclusive,
9892
- na_pass=na_pass,
9893
- threshold=threshold,
9894
- assertion_method=assertion_method,
9895
- allowed_types=compatible_dtypes,
9896
- tbl_type=tbl_type,
9897
- ).get_test_results()
9898
-
9899
- if assertion_category == "COMPARE_SET":
9900
- inside = True if assertion_method == "in_set" else False
9901
-
9902
- results_tbl = ColValsCompareSet(
9903
- data_tbl=data_tbl_step,
9904
- column=column,
9905
- values=value,
9906
- threshold=threshold,
9907
- inside=inside,
9908
- allowed_types=compatible_dtypes,
9909
- tbl_type=tbl_type,
9910
- ).get_test_results()
9911
-
9912
- if assertion_category == "COMPARE_REGEX":
9913
- results_tbl = ColValsRegex(
9914
- data_tbl=data_tbl_step,
9915
- column=column,
9916
- pattern=value,
9917
- na_pass=na_pass,
9918
- threshold=threshold,
9919
- allowed_types=compatible_dtypes,
9920
- tbl_type=tbl_type,
9921
- ).get_test_results()
9922
-
9923
- if assertion_category == "COMPARE_EXPR":
9924
- results_tbl = ColValsExpr(
9925
- data_tbl=data_tbl_step,
9926
- expr=value,
9927
- threshold=threshold,
9928
- tbl_type=tbl_type,
9929
- ).get_test_results()
9930
-
9931
- if assertion_category == "ROWS_DISTINCT":
9932
- results_tbl = RowsDistinct(
9933
- data_tbl=data_tbl_step,
9934
- columns_subset=column,
9935
- threshold=threshold,
9936
- tbl_type=tbl_type,
9937
- ).get_test_results()
9938
-
9939
- if assertion_category == "ROWS_COMPLETE":
9940
- results_tbl = RowsComplete(
9941
- data_tbl=data_tbl_step,
9942
- columns_subset=column,
9943
- threshold=threshold,
9944
- tbl_type=tbl_type,
9945
- ).get_test_results()
9946
-
9947
- if assertion_category == "COL_EXISTS_HAS_TYPE":
9948
- result_bool = ColExistsHasType(
9949
- data_tbl=data_tbl_step,
9950
- column=column,
9951
- threshold=threshold,
9952
- assertion_method="exists",
9953
- tbl_type=tbl_type,
9954
- ).get_test_results()
9955
-
9956
- validation.all_passed = result_bool
9957
- validation.n = 1
9958
- validation.n_passed = result_bool
9959
- validation.n_failed = 1 - result_bool
9960
-
9961
- results_tbl = None
9962
-
9963
- if assertion_category == "COL_SCHEMA_MATCH":
9964
- result_bool = ColSchemaMatch(
9965
- data_tbl=data_tbl_step,
9966
- schema=value["schema"],
9967
- complete=value["complete"],
9968
- in_order=value["in_order"],
9969
- case_sensitive_colnames=value["case_sensitive_colnames"],
9970
- case_sensitive_dtypes=value["case_sensitive_dtypes"],
9971
- full_match_dtypes=value["full_match_dtypes"],
9972
- threshold=threshold,
9973
- ).get_test_results()
9974
-
9975
- schema_validation_info = _get_schema_validation_info(
9976
- data_tbl=data_tbl,
9977
- schema=value["schema"],
9978
- passed=result_bool,
9979
- complete=value["complete"],
9980
- in_order=value["in_order"],
9981
- case_sensitive_colnames=value["case_sensitive_colnames"],
9982
- case_sensitive_dtypes=value["case_sensitive_dtypes"],
9983
- full_match_dtypes=value["full_match_dtypes"],
9984
- )
9926
+ # Apply error handling only to data quality validations, not programming error validations
9927
+ if assertion_type != "specially":
9928
+ try:
9929
+ # validations requiring `_column_test_prep()`
9930
+ if assertion_type in [
9931
+ "col_vals_gt",
9932
+ "col_vals_lt",
9933
+ "col_vals_eq",
9934
+ "col_vals_ne",
9935
+ "col_vals_ge",
9936
+ "col_vals_le",
9937
+ "col_vals_null",
9938
+ "col_vals_not_null",
9939
+ "col_vals_between",
9940
+ "col_vals_outside",
9941
+ "col_vals_in_set",
9942
+ "col_vals_not_in_set",
9943
+ "col_vals_regex",
9944
+ ]:
9945
+ # Process table for column validation
9946
+ tbl = _column_test_prep(
9947
+ df=data_tbl_step, column=column, allowed_types=compatible_dtypes
9948
+ )
9985
9949
 
9986
- # Add the schema validation info to the validation object
9987
- validation.val_info = schema_validation_info
9988
-
9989
- validation.all_passed = result_bool
9990
- validation.n = 1
9991
- validation.n_passed = int(result_bool)
9992
- validation.n_failed = 1 - result_bool
9993
-
9994
- results_tbl = None
9995
-
9996
- if assertion_category == "ROW_COUNT_MATCH":
9997
- result_bool = RowCountMatch(
9998
- data_tbl=data_tbl_step,
9999
- count=value["count"],
10000
- inverse=value["inverse"],
10001
- threshold=threshold,
10002
- abs_tol_bounds=value["abs_tol_bounds"],
10003
- tbl_type=tbl_type,
10004
- ).get_test_results()
10005
-
10006
- validation.all_passed = result_bool
10007
- validation.n = 1
10008
- validation.n_passed = int(result_bool)
10009
- validation.n_failed = 1 - result_bool
10010
-
10011
- results_tbl = None
10012
-
10013
- if assertion_category == "COL_COUNT_MATCH":
10014
- result_bool = ColCountMatch(
10015
- data_tbl=data_tbl_step,
10016
- count=value["count"],
10017
- inverse=value["inverse"],
10018
- threshold=threshold,
10019
- tbl_type=tbl_type,
10020
- ).get_test_results()
10021
-
10022
- validation.all_passed = result_bool
10023
- validation.n = 1
10024
- validation.n_passed = int(result_bool)
10025
- validation.n_failed = 1 - result_bool
10026
-
10027
- results_tbl = None
10028
-
10029
- if assertion_category == "CONJOINTLY":
10030
- results_tbl = ConjointlyValidation(
10031
- data_tbl=data_tbl_step,
10032
- expressions=value["expressions"],
10033
- threshold=threshold,
10034
- tbl_type=tbl_type,
10035
- ).get_test_results()
10036
-
10037
- if assertion_category == "SPECIALLY":
10038
- results_tbl_list = SpeciallyValidation(
10039
- data_tbl=data_tbl_step,
10040
- expression=value,
10041
- threshold=threshold,
10042
- tbl_type=tbl_type,
10043
- ).get_test_results()
10044
-
10045
- #
10046
- # The result from this could either be a table in the conventional form, or,
10047
- # a list of boolean values; handle both cases
10048
- #
10049
-
10050
- if isinstance(results_tbl_list, list):
10051
- # If the result is a list of boolean values, then we need to convert it to a
10052
- # set the validation results from the list
10053
- validation.all_passed = all(results_tbl_list)
10054
- validation.n = len(results_tbl_list)
10055
- validation.n_passed = results_tbl_list.count(True)
10056
- validation.n_failed = results_tbl_list.count(False)
10057
-
10058
- results_tbl = None
9950
+ if assertion_method == "gt":
9951
+ results_tbl = interrogate_gt(
9952
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9953
+ )
9954
+ elif assertion_method == "lt":
9955
+ results_tbl = interrogate_lt(
9956
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9957
+ )
9958
+ elif assertion_method == "eq":
9959
+ results_tbl = interrogate_eq(
9960
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9961
+ )
9962
+ elif assertion_method == "ne":
9963
+ results_tbl = interrogate_ne(
9964
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9965
+ )
9966
+ elif assertion_method == "ge":
9967
+ results_tbl = interrogate_ge(
9968
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9969
+ )
9970
+ elif assertion_method == "le":
9971
+ results_tbl = interrogate_le(
9972
+ tbl=tbl, column=column, compare=value, na_pass=na_pass
9973
+ )
9974
+ elif assertion_method == "null":
9975
+ results_tbl = interrogate_null(tbl=tbl, column=column)
9976
+ elif assertion_method == "not_null":
9977
+ results_tbl = interrogate_not_null(tbl=tbl, column=column)
9978
+
9979
+ elif assertion_type == "col_vals_between":
9980
+ results_tbl = interrogate_between(
9981
+ tbl=tbl,
9982
+ column=column,
9983
+ low=value[0],
9984
+ high=value[1],
9985
+ inclusive=inclusive,
9986
+ na_pass=na_pass,
9987
+ )
10059
9988
 
10060
- else:
10061
- # If the result is not a list, then we assume it's a table in the conventional
10062
- # form (where the column is `pb_is_good_` exists, with boolean values
10063
- results_tbl = results_tbl_list
9989
+ elif assertion_type == "col_vals_outside":
9990
+ results_tbl = interrogate_outside(
9991
+ tbl=tbl,
9992
+ column=column,
9993
+ low=value[0],
9994
+ high=value[1],
9995
+ inclusive=inclusive,
9996
+ na_pass=na_pass,
9997
+ )
9998
+
9999
+ elif assertion_type == "col_vals_in_set":
10000
+ results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value)
10001
+
10002
+ elif assertion_type == "col_vals_not_in_set":
10003
+ results_tbl = interrogate_notin(
10004
+ tbl=tbl, column=column, set_values=value
10005
+ )
10006
+
10007
+ elif assertion_type == "col_vals_regex":
10008
+ results_tbl = interrogate_regex(
10009
+ tbl=tbl, column=column, pattern=value, na_pass=na_pass
10010
+ )
10011
+
10012
+ elif assertion_type == "col_vals_expr":
10013
+ results_tbl = col_vals_expr(
10014
+ data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
10015
+ )
10016
+
10017
+ elif assertion_type == "rows_distinct":
10018
+ results_tbl = interrogate_rows_distinct(
10019
+ data_tbl=data_tbl_step, columns_subset=column
10020
+ )
10021
+
10022
+ elif assertion_type == "rows_complete":
10023
+ results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
10024
+
10025
+ elif assertion_type == "col_exists":
10026
+ result_bool = col_exists(
10027
+ data_tbl=data_tbl_step,
10028
+ column=column,
10029
+ )
10030
+
10031
+ validation.all_passed = result_bool
10032
+ validation.n = 1
10033
+ validation.n_passed = int(result_bool)
10034
+ validation.n_failed = 1 - int(result_bool)
10035
+
10036
+ results_tbl = None
10037
+
10038
+ elif assertion_type == "col_schema_match":
10039
+ result_bool = col_schema_match(
10040
+ data_tbl=data_tbl_step,
10041
+ schema=value["schema"],
10042
+ complete=value["complete"],
10043
+ in_order=value["in_order"],
10044
+ case_sensitive_colnames=value["case_sensitive_colnames"],
10045
+ case_sensitive_dtypes=value["case_sensitive_dtypes"],
10046
+ full_match_dtypes=value["full_match_dtypes"],
10047
+ threshold=threshold,
10048
+ )
10049
+
10050
+ schema_validation_info = _get_schema_validation_info(
10051
+ data_tbl=data_tbl,
10052
+ schema=value["schema"],
10053
+ passed=result_bool,
10054
+ complete=value["complete"],
10055
+ in_order=value["in_order"],
10056
+ case_sensitive_colnames=value["case_sensitive_colnames"],
10057
+ case_sensitive_dtypes=value["case_sensitive_dtypes"],
10058
+ full_match_dtypes=value["full_match_dtypes"],
10059
+ )
10060
+
10061
+ # Add the schema validation info to the validation object
10062
+ validation.val_info = schema_validation_info
10063
+
10064
+ validation.all_passed = result_bool
10065
+ validation.n = 1
10066
+ validation.n_passed = int(result_bool)
10067
+ validation.n_failed = 1 - result_bool
10068
+
10069
+ results_tbl = None
10070
+
10071
+ elif assertion_type == "row_count_match":
10072
+ result_bool = row_count_match(
10073
+ data_tbl=data_tbl_step,
10074
+ count=value["count"],
10075
+ inverse=value["inverse"],
10076
+ abs_tol_bounds=value["abs_tol_bounds"],
10077
+ )
10078
+
10079
+ validation.all_passed = result_bool
10080
+ validation.n = 1
10081
+ validation.n_passed = int(result_bool)
10082
+ validation.n_failed = 1 - result_bool
10083
+
10084
+ results_tbl = None
10085
+
10086
+ elif assertion_type == "col_count_match":
10087
+ result_bool = col_count_match(
10088
+ data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"]
10089
+ )
10090
+
10091
+ validation.all_passed = result_bool
10092
+ validation.n = 1
10093
+ validation.n_passed = int(result_bool)
10094
+ validation.n_failed = 1 - result_bool
10095
+
10096
+ results_tbl = None
10097
+
10098
+ elif assertion_type == "conjointly":
10099
+ results_tbl = conjointly_validation(
10100
+ data_tbl=data_tbl_step,
10101
+ expressions=value["expressions"],
10102
+ threshold=threshold,
10103
+ tbl_type=tbl_type,
10104
+ )
10105
+
10106
+ else:
10107
+ raise ValueError(
10108
+ f"Unknown assertion type: {assertion_type}"
10109
+ ) # pragma: no cover
10110
+
10111
+ except Exception as e:
10112
+ # Only catch specific data quality comparison errors, not programming errors
10113
+ error_msg = str(e).lower()
10114
+ is_comparison_error = (
10115
+ "boolean value of na is ambiguous" in error_msg
10116
+ or "cannot compare" in error_msg
10117
+ or (
10118
+ "type" in error_msg
10119
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
10120
+ )
10121
+ or ("dtype" in error_msg and "compare" in error_msg)
10122
+ )
10123
+
10124
+ if is_comparison_error: # pragma: no cover
10125
+ # If data quality comparison fails, mark the validation as having an eval_error
10126
+ validation.eval_error = True # pragma: no cover
10127
+ end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover
10128
+ validation.proc_duration_s = (
10129
+ end_time - start_time
10130
+ ).total_seconds() # pragma: no cover
10131
+ validation.time_processed = end_time.isoformat(
10132
+ timespec="milliseconds"
10133
+ ) # pragma: no cover
10134
+ validation.active = False # pragma: no cover
10135
+ continue # pragma: no cover
10136
+ else:
10137
+ # For other errors (like missing columns), let them propagate
10138
+ raise
10139
+
10140
+ else:
10141
+ # For "specially" validations, let programming errors propagate as exceptions
10142
+ if assertion_type == "specially":
10143
+ results_tbl_list = SpeciallyValidation(
10144
+ data_tbl=data_tbl_step,
10145
+ expression=value,
10146
+ threshold=threshold,
10147
+ tbl_type=tbl_type,
10148
+ ).get_test_results()
10149
+
10150
+ #
10151
+ # The result from this could either be a table in the conventional form, or,
10152
+ # a list of boolean values; handle both cases
10153
+ #
10154
+
10155
+ if isinstance(results_tbl_list, list):
10156
+ # If the result is a list of boolean values, then we need to convert it to a
10157
+ # set the validation results from the list
10158
+ validation.all_passed = all(results_tbl_list)
10159
+ validation.n = len(results_tbl_list)
10160
+ validation.n_passed = results_tbl_list.count(True)
10161
+ validation.n_failed = results_tbl_list.count(False)
10162
+
10163
+ results_tbl = None
10164
+
10165
+ else:
10166
+ # If the result is not a list, then we assume it's a table in the conventional
10167
+ # form (where the column is `pb_is_good_` exists, with boolean values
10168
+ results_tbl = results_tbl_list
10064
10169
 
10065
10170
  # If the results table is not `None`, then we assume there is a table with a column
10066
10171
  # called `pb_is_good_` that contains boolean values; we can then use this table to
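The rewritten interrogation block dispatches each row-based step to a dedicated `interrogate_*()` function and wraps the dispatch in a try/except that only swallows data-quality comparison failures (marking the step with `eval_error`), while programming errors such as missing columns are re-raised. The classification is a substring test on the exception message; restated as a standalone predicate for illustration:

```python
def _is_comparison_error(exc: Exception) -> bool:
    # Same substring checks as the new handler; a restatement, not a pointblank API
    msg = str(exc).lower()
    return (
        "boolean value of na is ambiguous" in msg
        or "cannot compare" in msg
        or ("type" in msg and ("mismatch" in msg or "incompatible" in msg))
        or ("dtype" in msg and "compare" in msg)
    )
```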
@@ -10272,32 +10377,46 @@ class Validate:
10272
10377
  except AttributeError:
10273
10378
  # For LazyFrames without sample method, collect first then sample
10274
10379
  validation_extract_native = validation_extract_nw.collect().to_native()
10275
- if hasattr(validation_extract_native, "sample"):
10380
+ if hasattr(validation_extract_native, "sample"): # pragma: no cover
10276
10381
  # PySpark DataFrame has sample method
10277
- validation_extract_native = validation_extract_native.sample(
10278
- fraction=min(1.0, sample_n / validation_extract_native.count())
10279
- ).limit(sample_n)
10280
- validation_extract_nw = nw.from_native(validation_extract_native)
10382
+ validation_extract_native = (
10383
+ validation_extract_native.sample( # pragma: no cover
10384
+ fraction=min(
10385
+ 1.0, sample_n / validation_extract_native.count()
10386
+ ) # pragma: no cover
10387
+ ).limit(sample_n)
10388
+ ) # pragma: no cover
10389
+ validation_extract_nw = nw.from_native(
10390
+ validation_extract_native
10391
+ ) # pragma: no cover
10281
10392
  else:
10282
10393
  # Fallback: just take first n rows after collecting
10283
- validation_extract_nw = validation_extract_nw.collect().head(sample_n)
10394
+ validation_extract_nw = validation_extract_nw.collect().head(
10395
+ sample_n
10396
+ ) # pragma: no cover
10284
10397
  elif sample_frac is not None:
10285
10398
  try:
10286
10399
  validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac)
10287
- except AttributeError:
10400
+ except AttributeError: # pragma: no cover
10288
10401
  # For LazyFrames without sample method, collect first then sample
10289
- validation_extract_native = validation_extract_nw.collect().to_native()
10290
- if hasattr(validation_extract_native, "sample"):
10402
+ validation_extract_native = (
10403
+ validation_extract_nw.collect().to_native()
10404
+ ) # pragma: no cover
10405
+ if hasattr(validation_extract_native, "sample"): # pragma: no cover
10291
10406
  # PySpark DataFrame has sample method
10292
10407
  validation_extract_native = validation_extract_native.sample(
10293
10408
  fraction=sample_frac
10294
- )
10295
- validation_extract_nw = nw.from_native(validation_extract_native)
10409
+ ) # pragma: no cover
10410
+ validation_extract_nw = nw.from_native(
10411
+ validation_extract_native
10412
+ ) # pragma: no cover
10296
10413
  else:
10297
10414
  # Fallback: use fraction to calculate head size
10298
- collected = validation_extract_nw.collect()
10299
- sample_size = max(1, int(len(collected) * sample_frac))
10300
- validation_extract_nw = collected.head(sample_size)
10415
+ collected = validation_extract_nw.collect() # pragma: no cover
10416
+ sample_size = max(
10417
+ 1, int(len(collected) * sample_frac)
10418
+ ) # pragma: no cover
10419
+ validation_extract_nw = collected.head(sample_size) # pragma: no cover
10301
10420
 
10302
10421
  # Ensure a limit is set on the number of rows to extract
10303
10422
  try:
@@ -10307,9 +10426,9 @@ class Validate:
10307
10426
  # For LazyFrames, collect to get length (or use a reasonable default)
10308
10427
  try:
10309
10428
  extract_length = len(validation_extract_nw.collect())
10310
- except Exception:
10429
+ except Exception: # pragma: no cover
10311
10430
  # If collection fails, apply limit anyway as a safety measure
10312
- extract_length = extract_limit + 1 # Force limiting
10431
+ extract_length = extract_limit + 1 # pragma: no cover
10313
10432
 
10314
10433
  if extract_length > extract_limit:
10315
10434
  validation_extract_nw = validation_extract_nw.head(extract_limit)
@@ -11974,10 +12093,12 @@ class Validate:
11974
12093
  try:
11975
12094
  # Try without order_by first (for DataFrames)
11976
12095
  data_nw = data_nw.with_row_index(name=index_name)
11977
- except TypeError:
12096
+ except TypeError: # pragma: no cover
11978
12097
  # LazyFrames require order_by parameter - use first column for ordering
11979
- first_col = data_nw.columns[0]
11980
- data_nw = data_nw.with_row_index(name=index_name, order_by=first_col)
12098
+ first_col = data_nw.columns[0] # pragma: no cover
12099
+ data_nw = data_nw.with_row_index(
12100
+ name=index_name, order_by=first_col
12101
+ ) # pragma: no cover
11981
12102
 
11982
12103
  # Get all validation step result tables and join together the `pb_is_good_` columns
11983
12104
  # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
@@ -11989,10 +12110,12 @@ class Validate:
11989
12110
  try:
11990
12111
  # Try without order_by first (for DataFrames)
11991
12112
  results_tbl = results_tbl.with_row_index(name=index_name)
11992
- except TypeError:
12113
+ except TypeError: # pragma: no cover
11993
12114
  # LazyFrames require order_by parameter - use first column for ordering
11994
- first_col = results_tbl.columns[0]
11995
- results_tbl = results_tbl.with_row_index(name=index_name, order_by=first_col)
12115
+ first_col = results_tbl.columns[0] # pragma: no cover
12116
+ results_tbl = results_tbl.with_row_index(
12117
+ name=index_name, order_by=first_col
12118
+ ) # pragma: no cover
11996
12119
 
11997
12120
  # Add numerical suffix to the `pb_is_good_` column to make it unique
11998
12121
  results_tbl = results_tbl.select([index_name, "pb_is_good_"]).rename(
@@ -12124,15 +12247,15 @@ class Validate:
12124
12247
  # If the table is a Polars one, determine if it's a LazyFrame
12125
12248
  if tbl_info == "polars":
12126
12249
  if _is_lazy_frame(self.data):
12127
- tbl_info = "polars-lazy"
12250
+ tbl_info = "polars-lazy" # pragma: no cover
12128
12251
 
12129
12252
  # Determine if the input table is a Narwhals DF
12130
12253
  if _is_narwhals_table(self.data):
12131
12254
  # Determine if the Narwhals table is a LazyFrame
12132
- if _is_lazy_frame(self.data):
12133
- tbl_info = "narwhals-lazy"
12255
+ if _is_lazy_frame(self.data): # pragma: no cover
12256
+ tbl_info = "narwhals-lazy" # pragma: no cover
12134
12257
  else:
12135
- tbl_info = "narwhals"
12258
+ tbl_info = "narwhals" # pragma: no cover
12136
12259
 
12137
12260
  # Get the thresholds object
12138
12261
  thresholds = self.thresholds
@@ -12297,7 +12420,7 @@ class Validate:
12297
12420
  if lang in RTL_LANGUAGES:
12298
12421
  gt_tbl = gt_tbl.tab_style(
12299
12422
  style=style.css("direction: rtl;"), locations=loc.source_notes()
12300
- )
12423
+ ) # pragma: no cover
12301
12424
 
12302
12425
  if incl_header:
12303
12426
  gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
@@ -12614,9 +12737,11 @@ class Validate:
12614
12737
  # Get the number of rows in the extract (safe for LazyFrames)
12615
12738
  try:
12616
12739
  n_rows = len(extract_nw)
12617
- except TypeError:
12740
+ except TypeError: # pragma: no cover
12618
12741
  # For LazyFrames, collect() first to get length
12619
- n_rows = len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
12742
+ n_rows = (
12743
+ len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
12744
+ ) # pragma: no cover
12620
12745
 
12621
12746
  # If the number of rows is zero, then produce an em dash then go to the next iteration
12622
12747
  if n_rows == 0:
@@ -12624,7 +12749,7 @@ class Validate:
12624
12749
  continue
12625
12750
 
12626
12751
  # Write the CSV text (ensure LazyFrames are collected first)
12627
- if hasattr(extract_nw, "collect"):
12752
+ if hasattr(extract_nw, "collect"): # pragma: no cover
12628
12753
  extract_nw = extract_nw.collect()
12629
12754
  csv_text = extract_nw.write_csv()
12630
12755
 
@@ -13126,7 +13251,7 @@ class Validate:
13126
13251
  elif isinstance(column, list):
13127
13252
  column_position = [list(self.data.columns).index(col) + 1 for col in column]
13128
13253
  else:
13129
- column_position = None
13254
+ column_position = None # pragma: no cover
13130
13255
  else:
13131
13256
  column_position = None
13132
13257
 
@@ -13218,7 +13343,7 @@ class Validate:
13218
13343
  )
13219
13344
 
13220
13345
  else:
13221
- step_report = None
13346
+ step_report = None # pragma: no cover
13222
13347
 
13223
13348
  return step_report
13224
13349
 
@@ -13670,6 +13795,48 @@ def _string_date_dttm_conversion(value: any) -> any:
13670
13795
  return value
13671
13796
 
13672
13797
 
13798
+ def _conditional_string_date_dttm_conversion(
13799
+ value: any, allow_regular_strings: bool = False
13800
+ ) -> any:
13801
+ """
13802
+ Conditionally convert a string to a date or datetime object if it is in the correct format. If
13803
+ `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
13804
+ the value is not a string, it is returned as is.
13805
+
13806
+ Parameters
13807
+ ----------
13808
+ value
13809
+ The value to convert. It can be a string, date, or datetime object.
13810
+ allow_regular_strings
13811
+ If `True`, regular strings (non-date/datetime) are allowed to pass through unchanged. If
13812
+ `False`, behaves like `_string_date_dttm_conversion()` and raises `ValueError` for regular
13813
+ strings.
13814
+
13815
+ Returns
13816
+ -------
13817
+ any
13818
+ The converted date or datetime object, or the original value.
13819
+
13820
+ Raises
13821
+ ------
13822
+ ValueError
13823
+ If allow_regular_strings is False and the string cannot be converted to a date or datetime.
13824
+ """
13825
+
13826
+ if isinstance(value, str):
13827
+ if _is_string_date(value):
13828
+ value = _convert_string_to_date(value)
13829
+ elif _is_string_datetime(value):
13830
+ value = _convert_string_to_datetime(value)
13831
+ elif not allow_regular_strings:
13832
+ raise ValueError(
13833
+ "If `value=` is provided as a string it must be a date or datetime string."
13834
+ ) # pragma: no cover
13835
+ # If allow_regular_strings is True, regular strings pass through unchanged
13836
+
13837
+ return value
13838
+
13839
+
13673
13840
  def _process_brief(
13674
13841
  brief: str | None,
13675
13842
  step: int,
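The new `_conditional_string_date_dttm_conversion()` mirrors `_string_date_dttm_conversion()` but, when `allow_regular_strings=True`, lets non-date strings through instead of raising. A simplified approximation of that behavior using only the standard library (the real helper relies on `_is_string_date()`, `_convert_string_to_date()`, and friends):

```python
import datetime


def to_date_or_datetime(value, allow_regular_strings=False):
    """Approximate stand-in for the library helper, for illustration only."""
    if not isinstance(value, str):
        return value
    try:
        parsed = datetime.datetime.fromisoformat(value)
    except ValueError:
        if allow_regular_strings:
            return value  # plain strings pass through unchanged
        raise ValueError(
            "If `value=` is provided as a string it must be a date or datetime string."
        )
    # A bare "YYYY-MM-DD" string becomes a date; anything longer stays a datetime
    return parsed.date() if len(value) == 10 else parsed
```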
@@ -13718,12 +13885,33 @@ def _process_brief(
13718
13885
 
13719
13886
  if segment is not None:
13720
13887
  # The segment is always a tuple of the form ("{column}", "{value}")
13888
+ # Handle both regular lists and Segment objects (from seg_group())
13889
+
13890
+ segment_column = segment[0]
13891
+ segment_value = segment[1]
13892
+
13893
+ # If segment_value is a Segment object (from seg_group()), format it appropriately
13894
+ if isinstance(segment_value, Segment):
13895
+ # For Segment objects, format the segments as a readable string
13896
+ segments = segment_value.segments
13897
+ if len(segments) == 1:
13898
+ # Single segment: join the values with commas
13899
+ segment_value_str = ", ".join(str(v) for v in segments[0])
13900
+ else:
13901
+ # Multiple segments: join each segment with commas, separate segments with " | "
13902
+ segment_value_str = " | ".join([", ".join(str(v) for v in seg) for seg in segments])
13903
+ else:
13904
+ # For regular lists or other types, convert to string
13905
+ if isinstance(segment_value, list):
13906
+ segment_value_str = ", ".join(str(v) for v in segment_value)
13907
+ else:
13908
+ segment_value_str = str(segment_value)
13721
13909
 
13722
- segment_fmt = f"{segment[0]} / {segment[1]}"
13910
+ segment_fmt = f"{segment_column} / {segment_value_str}"
13723
13911
 
13724
13912
  brief = brief.replace("{segment}", segment_fmt)
13725
- brief = brief.replace("{segment_column}", segment[0])
13726
- brief = brief.replace("{segment_value}", segment[1])
13913
+ brief = brief.replace("{segment_column}", segment_column)
13914
+ brief = brief.replace("{segment_value}", segment_value_str)
13727
13915
 
13728
13916
  return brief
13729
13917
 
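For brief templating, `{segment_value}` now renders sensibly whether the segment came from a plain value, a list, or a `Segment` object produced by `seg_group()`. The formatting rule, pulled out as a standalone function for illustration (the `.segments` layout is taken from the code above; the `getattr` check stands in for the `isinstance(..., Segment)` test):

```python
def format_segment_value(segment_value) -> str:
    # Segment objects (from seg_group()) expose a list of segments, each a list of values
    segments = getattr(segment_value, "segments", None)
    if segments is not None:
        if len(segments) == 1:
            return ", ".join(str(v) for v in segments[0])
        return " | ".join(", ".join(str(v) for v in seg) for seg in segments)
    if isinstance(segment_value, list):
        return ", ".join(str(v) for v in segment_value)
    return str(segment_value)
```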
@@ -13757,7 +13945,7 @@ def _process_action_str(
13757
13945
  if col is not None:
13758
13946
  # If a list of columns is provided, then join the columns into a comma-separated string
13759
13947
  if isinstance(col, list):
13760
- col = ", ".join(col)
13948
+ col = ", ".join(col) # pragma: no cover
13761
13949
 
13762
13950
  action_str = action_str.replace("{col}", col)
13763
13951
  action_str = action_str.replace("{column}", col)
@@ -14154,7 +14342,7 @@ def _prep_values_text(
14154
14342
  length_values = len(values)
14155
14343
 
14156
14344
  if length_values == 0:
14157
- return ""
14345
+ return "" # pragma: no cover
14158
14346
 
14159
14347
  if length_values > limit:
14160
14348
  num_omitted = length_values - limit
@@ -14163,7 +14351,7 @@ def _prep_values_text(
14163
14351
  formatted_values = []
14164
14352
  for value in values[:limit]:
14165
14353
  if isinstance(value, (datetime.datetime, datetime.date)):
14166
- formatted_values.append(f"`{value.isoformat()}`")
14354
+ formatted_values.append(f"`{value.isoformat()}`") # pragma: no cover
14167
14355
  else:
14168
14356
  formatted_values.append(f"`{value}`")
14169
14357
 
@@ -14319,17 +14507,109 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
14319
14507
  column, segment = segments_expr
14320
14508
 
14321
14509
  if tbl_type in ["pandas", "polars", "pyspark"]:
14322
- # If the table is a Pandas, Polars, or PySpark DataFrame, transforming to a Narwhals table
14510
+ # If the table is a Pandas, Polars, or PySpark DataFrame, transform to a Narwhals table
14323
14511
  # and perform the filtering operation
14324
14512
 
14325
14513
  # Transform to Narwhals table if a DataFrame
14326
14514
  data_tbl_nw = nw.from_native(data_tbl)
14327
14515
 
14516
+ # Handle Polars expressions by attempting to extract literal values
14517
+ # This is a compatibility measure for cases where `pl.datetime()`, `pl.lit()`, etc.,
14518
+ # are accidentally used instead of native Python types
14519
+ if (
14520
+ hasattr(segment, "__class__")
14521
+ and "polars" in segment.__class__.__module__
14522
+ and segment.__class__.__name__ == "Expr"
14523
+ ):
14524
+ # This is a Polars expression so we should warn about this and suggest native types
14525
+ import warnings
14526
+ from datetime import date, datetime
14527
+
14528
+ warnings.warn(
14529
+ "Polars expressions in segments are deprecated. Please use native Python types instead. "
14530
+ "For example, use datetime.date(2016, 1, 4) instead of pl.datetime(2016, 1, 4).",
14531
+ DeprecationWarning,
14532
+ stacklevel=3,
14533
+ )
14534
+
14535
+ # Try to extract the literal value from various Polars expression patterns
14536
+ segment_str = str(segment)
14537
+ parsed_value = None
14538
+
14539
+ # Handle different Polars expression string formats
14540
+ # Format 1: Direct date strings like "2016-01-04"
14541
+ if len(segment_str) == 10 and segment_str.count("-") == 2:
14542
+ try:
14543
+ parsed_value = date.fromisoformat(segment_str)
14544
+ except ValueError: # pragma: no cover
14545
+ pass # pragma: no cover
14546
+
14547
+ # Format 2: Datetime strings with UTC timezone like
14548
+ # "2016-01-04 00:00:01 UTC.strict_cast(...)"
14549
+ elif " UTC" in segment_str:
14550
+ try:
14551
+ # Extract just the datetime part before "UTC"
14552
+ datetime_part = segment_str.split(" UTC")[0]
14553
+ if len(datetime_part) >= 10:
14554
+ parsed_dt = datetime.fromisoformat(datetime_part)
14555
+ # Convert midnight datetimes to dates for consistency
14556
+ if parsed_dt.time() == datetime.min.time():
14557
+ parsed_value = parsed_dt.date() # pragma: no cover
14558
+ else:
14559
+ parsed_value = parsed_dt
14560
+ except (ValueError, IndexError): # pragma: no cover
14561
+ pass # pragma: no cover
14562
+
14563
+ # Format 3: Bracketed expressions like ['2016-01-04']
14564
+ elif segment_str.startswith("[") and segment_str.endswith("]"):
14565
+ try: # pragma: no cover
14566
+ # Remove [' and ']
14567
+ content = segment_str[2:-2] # pragma: no cover
14568
+
14569
+ # Try parsing as date first
14570
+ if len(content) == 10 and content.count("-") == 2: # pragma: no cover
14571
+ try: # pragma: no cover
14572
+ parsed_value = date.fromisoformat(content) # pragma: no cover
14573
+ except ValueError: # pragma: no cover
14574
+ pass # pragma: no cover
14575
+
14576
+ # Try parsing as datetime
14577
+ if parsed_value is None: # pragma: no cover
14578
+ try: # pragma: no cover
14579
+ parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
14580
+ if parsed_dt.time() == datetime.min.time():
14581
+ parsed_value = parsed_dt.date()
14582
+ else:
14583
+ parsed_value = parsed_dt
14584
+ except ValueError:
14585
+ pass
14586
+
14587
+ except (ValueError, IndexError): # pragma: no cover
14588
+ pass # pragma: no cover
14589
+
14590
+ # Handle `pl.datetime()` expressions with .alias("datetime")
14591
+ elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
14592
+ try:
14593
+ datetime_part = segment_str.split('.alias("datetime")')[0]
14594
+ parsed_dt = datetime.fromisoformat(datetime_part)
14595
+
14596
+ if parsed_dt.time() == datetime.min.time():
14597
+ parsed_value = parsed_dt.date()
14598
+ else:
14599
+ parsed_value = parsed_dt # pragma: no cover
14600
+
14601
+ except (ValueError, AttributeError): # pragma: no cover
14602
+ pass # pragma: no cover
14603
+
14604
+ # If we successfully parsed a value, use it; otherwise leave segment as is
14605
+ if parsed_value is not None:
14606
+ segment = parsed_value
14607
+
14328
14608
  # Filter the data table based on the column name and segment
14329
14609
  if segment is None:
14330
14610
  data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
14331
- # Check if the segment is a segment group
14332
14611
  elif isinstance(segment, list):
14612
+ # Check if the segment is a segment group
14333
14613
  data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_in(segment))
14334
14614
  else:
14335
14615
  data_tbl_nw = data_tbl_nw.filter(nw.col(column) == segment)
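The long block above exists to rescue Polars expressions (e.g. `pl.datetime(...)`, `pl.lit(...)`) passed as segment values while warning that native Python types are the supported form. The filtering itself is a plain Narwhals comparison, sketched here with an invented example table:

```python
import datetime

import narwhals as nw
import polars as pl

tbl = pl.DataFrame(
    {
        "date": [datetime.date(2016, 1, 4), datetime.date(2016, 1, 5)],
        "a": [3, 7],
    }
)

# Pass a native value (not pl.datetime(2016, 1, 4)) as the segment
column, segment = "date", datetime.date(2016, 1, 4)

data_nw = nw.from_native(tbl)
if segment is None:
    data_nw = data_nw.filter(nw.col(column).is_null())
elif isinstance(segment, list):
    data_nw = data_nw.filter(nw.col(column).is_in(segment))
else:
    data_nw = data_nw.filter(nw.col(column) == segment)

print(data_nw.to_native())  # one-row table for 2016-01-04
```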
@@ -14341,12 +14621,13 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
14341
14621
  # If the table is an Ibis backend table, perform the filtering operation directly
14342
14622
 
14343
14623
  # Filter the data table based on the column name and segment
14624
+ # Use the new Ibis API methods to avoid deprecation warnings
14344
14625
  if segment is None:
14345
- data_tbl = data_tbl[data_tbl[column].isnull()]
14626
+ data_tbl = data_tbl.filter(data_tbl[column].isnull()) # pragma: no cover
14346
14627
  elif isinstance(segment, list):
14347
- data_tbl = data_tbl[data_tbl[column].isin(segment)]
14628
+ data_tbl = data_tbl.filter(data_tbl[column].isin(segment)) # pragma: no cover
14348
14629
  else:
14349
- data_tbl = data_tbl[data_tbl[column] == segment]
14630
+ data_tbl = data_tbl.filter(data_tbl[column] == segment)
14350
14631
 
14351
14632
  return data_tbl
14352
14633
 
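For Ibis backends, segment filtering now goes through `Table.filter()` rather than the bracket-indexing form (`tbl[tbl[col] == value]`) that newer Ibis releases deprecate. A minimal sketch with an in-memory Ibis table (invented data):

```python
import ibis

t = ibis.memtable({"group": ["a", "b", None], "x": [1, 2, 3]})

# Deprecated style: t[t["group"] == "a"]
by_value = t.filter(t["group"] == "a")
nulls_only = t.filter(t["group"].isnull())
in_set = t.filter(t["group"].isin(["a", "b"]))
```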
@@ -14465,7 +14746,7 @@ def _get_title_text(
14465
14746
  "</span>"
14466
14747
  f'<span style="float: right;">{title}</span>'
14467
14748
  "</div>"
14468
- )
14749
+ ) # pragma: no cover
14469
14750
 
14470
14751
  return html_str
14471
14752
 
@@ -14543,24 +14824,6 @@ def _transform_eval(
14543
14824
  return symbol_list
14544
14825
 
14545
14826
 
14546
- def _format_numbers_with_gt(
14547
- values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
14548
- ) -> list[str]:
14549
- """Format numbers using Great Tables GT object to avoid pandas dependency."""
14550
- import polars as pl
14551
-
14552
- # Create a single-column DataFrame with all values
14553
- df = pl.DataFrame({"values": values})
14554
-
14555
- # Create GT object and format the column
14556
- gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
14557
-
14558
- # Extract the formatted values using _get_column_of_values
14559
- formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
14560
-
14561
- return formatted_values
14562
-
14563
-
14564
14827
  def _format_single_number_with_gt(
14565
14828
  value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
14566
14829
  ) -> str:
@@ -14571,12 +14834,14 @@ def _format_single_number_with_gt(
14571
14834
  import polars as pl
14572
14835
 
14573
14836
  df_lib = pl
14574
- elif _is_lib_present("pandas"):
14575
- import pandas as pd
14837
+ elif _is_lib_present("pandas"): # pragma: no cover
14838
+ import pandas as pd # pragma: no cover
14576
14839
 
14577
- df_lib = pd
14578
- else:
14579
- raise ImportError("Neither Polars nor Pandas is available for formatting")
14840
+ df_lib = pd # pragma: no cover
14841
+ else: # pragma: no cover
14842
+ raise ImportError(
14843
+ "Neither Polars nor Pandas is available for formatting"
14844
+ ) # pragma: no cover
14580
14845
 
14581
14846
  # Create a single-row, single-column DataFrame using the specified library
14582
14847
  df = df_lib.DataFrame({"value": [value]})
@@ -14642,12 +14907,14 @@ def _format_single_float_with_gt(
14642
14907
  import polars as pl
14643
14908
 
14644
14909
  df_lib = pl
14645
- elif _is_lib_present("pandas"):
14646
- import pandas as pd
14910
+ elif _is_lib_present("pandas"): # pragma: no cover
14911
+ import pandas as pd # pragma: no cover
14647
14912
 
14648
- df_lib = pd
14649
- else:
14650
- raise ImportError("Neither Polars nor Pandas is available for formatting")
14913
+ df_lib = pd # pragma: no cover
14914
+ else: # pragma: no cover
14915
+ raise ImportError(
14916
+ "Neither Polars nor Pandas is available for formatting"
14917
+ ) # pragma: no cover
14651
14918
 
14652
14919
  # Create a single-row, single-column DataFrame using the specified library
14653
14920
  df = df_lib.DataFrame({"value": [value]})
@@ -14679,7 +14946,7 @@ def _transform_passed_failed(
14679
14946
  return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
14680
14947
  else:
14681
14948
  # Fallback to the original behavior
14682
- return vals.fmt_number(value, decimals=2, locale=locale)[0]
14949
+ return vals.fmt_number(value, decimals=2, locale=locale)[0] # pragma: no cover
14683
14950
 
14684
14951
  passed_failed = [
14685
14952
  (
@@ -14819,7 +15086,7 @@ def _get_callable_source(fn: Callable) -> str:
14819
15086
  return pre_arg
14820
15087
  except (OSError, TypeError): # pragma: no cover
14821
15088
  return fn.__name__
14822
- return fn
15089
+ return fn # pragma: no cover
14823
15090
 
14824
15091
 
14825
15092
  def _extract_pre_argument(source: str) -> str:
@@ -14903,12 +15170,14 @@ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None)
14903
15170
  import polars as pl
14904
15171
 
14905
15172
  df_lib = pl
14906
- elif _is_lib_present("pandas"):
14907
- import pandas as pd
15173
+ elif _is_lib_present("pandas"): # pragma: no cover
15174
+ import pandas as pd # pragma: no cover
14908
15175
 
14909
- df_lib = pd
14910
- else:
14911
- raise ImportError("Neither Polars nor Pandas is available for formatting")
15176
+ df_lib = pd # pragma: no cover
15177
+ else: # pragma: no cover
15178
+ raise ImportError(
15179
+ "Neither Polars nor Pandas is available for formatting"
15180
+ ) # pragma: no cover
14912
15181
 
14913
15182
  # Create a single-row, single-column DataFrame using the specified library
14914
15183
  df = df_lib.DataFrame({"value": [value]})
@@ -14936,12 +15205,14 @@ def _format_single_float_with_gt_custom(
14936
15205
  import polars as pl
14937
15206
 
14938
15207
  df_lib = pl
14939
- elif _is_lib_present("pandas"):
14940
- import pandas as pd
15208
+ elif _is_lib_present("pandas"): # pragma: no cover
15209
+ import pandas as pd # pragma: no cover
14941
15210
 
14942
- df_lib = pd
14943
- else:
14944
- raise ImportError("Neither Polars nor Pandas is available for formatting")
15211
+ df_lib = pd # pragma: no cover
15212
+ else: # pragma: no cover
15213
+ raise ImportError(
15214
+ "Neither Polars nor Pandas is available for formatting"
15215
+ ) # pragma: no cover
14945
15216
 
14946
15217
  # Create a single-row, single-column DataFrame using the specified library
14947
15218
  df = df_lib.DataFrame({"value": [value]})
@@ -14976,7 +15247,7 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) ->
14976
15247
  # Fallback to the original behavior
14977
15248
  return fmt_number(
14978
15249
  value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
14979
- )[0]
15250
+ )[0] # pragma: no cover
14980
15251
 
14981
15252
  def _format_integer_safe(value: int) -> str:
14982
15253
  if df_lib is not None and value is not None:
@@ -15113,6 +15384,8 @@ def _step_report_row_based(
15113
15384
  text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
15114
15385
  elif assertion_type == "col_vals_not_null":
15115
15386
  text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
15387
+ elif assertion_type == "col_vals_expr":
15388
+ text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
15116
15389
  elif assertion_type == "rows_complete":
15117
15390
  if column is None:
15118
15391
  text = STEP_REPORT_TEXT["rows_complete_all"][lang]
@@ -15159,10 +15432,17 @@ def _step_report_row_based(
15159
15432
  title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
15160
15433
  assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
15161
15434
 
15162
- success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
15163
- n=n,
15164
- column_position=column_position,
15165
- )
15435
+ # Use 'success_statement_no_column' for col_vals_expr() since it doesn't target
15436
+ # a specific column
15437
+ if assertion_type == "col_vals_expr":
15438
+ success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
15439
+ n=n
15440
+ ) # pragma: no cover
15441
+ else:
15442
+ success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
15443
+ n=n,
15444
+ column_position=column_position,
15445
+ )
15166
15446
  preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
15167
15447
 
15168
15448
  details = (
@@ -15242,10 +15522,16 @@ def _step_report_row_based(
15242
15522
  assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
15243
15523
  failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
15244
15524
 
15245
- failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
15246
- failure_rate=failure_rate_metrics,
15247
- column_position=column_position,
15248
- )
15525
+ # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column
15526
+ if assertion_type == "col_vals_expr":
15527
+ failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format(
15528
+ failure_rate=failure_rate_metrics
15529
+ )
15530
+ else:
15531
+ failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
15532
+ failure_rate=failure_rate_metrics,
15533
+ column_position=column_position,
15534
+ )
15249
15535
 
15250
15536
  if limit < extract_length:
15251
15537
  extract_length_resolved = limit
@@ -15864,14 +16150,14 @@ def _step_report_schema_any_order(
15864
16150
  if exp_columns_dict[column_name_exp_i]["colname_matched"]:
15865
16151
  col_exp_correct.append(CHECK_MARK_SPAN)
15866
16152
  else:
15867
- col_exp_correct.append(CROSS_MARK_SPAN)
16153
+ col_exp_correct.append(CROSS_MARK_SPAN) # pragma: no cover
15868
16154
 
15869
16155
  #
15870
16156
  # `dtype_exp` values
15871
16157
  #
15872
16158
 
15873
16159
  if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
15874
- dtype_exp.append("")
16160
+ dtype_exp.append("") # pragma: no cover
15875
16161
 
15876
16162
  elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
15877
16163
  dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
@@ -15906,9 +16192,9 @@ def _step_report_schema_any_order(
15906
16192
  #
15907
16193
 
15908
16194
  if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
15909
- dtype_exp_correct.append("&mdash;")
16195
+ dtype_exp_correct.append("&mdash;") # pragma: no cover
15910
16196
  elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
15911
- dtype_exp_correct.append("")
16197
+ dtype_exp_correct.append("") # pragma: no cover
15912
16198
  elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
15913
16199
  dtype_exp_correct.append(CHECK_MARK_SPAN)
15914
16200
  else:
@@ -15954,13 +16240,17 @@ def _step_report_schema_any_order(
15954
16240
  #
15955
16241
 
15956
16242
  if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
15957
- dtype_exp.append("")
16243
+ dtype_exp.append("") # pragma: no cover
15958
16244
 
15959
16245
  elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
15960
- dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
16246
+ dtype = exp_columns_dict[column_name_exp_i]["dtype_input"] # pragma: no cover
15961
16247
 
15962
- if exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None:
15963
- pos = exp_columns_dict[column_name_exp_i]["dtype_matched_pos"]
16248
+ if (
16249
+ exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None
16250
+ ): # pragma: no cover
16251
+ pos = exp_columns_dict[column_name_exp_i][
16252
+ "dtype_matched_pos"
16253
+ ] # pragma: no cover
15964
16254
 
15965
16255
  # Combine the dtypes together with pipes but underline the matched dtype in
15966
16256
  # green with an HTML span tag and style attribute
@@ -15972,13 +16262,13 @@ def _step_report_schema_any_order(
15972
16262
  else dtype[i]
15973
16263
  )
15974
16264
  for i in range(len(dtype))
15975
- ]
15976
- dtype = " | ".join(dtype)
15977
- dtype_exp.append(dtype)
16265
+ ] # pragma: no cover
16266
+ dtype = " | ".join(dtype) # pragma: no cover
16267
+ dtype_exp.append(dtype) # pragma: no cover
15978
16268
 
15979
16269
  else:
15980
- dtype = " | ".join(dtype)
15981
- dtype_exp.append(dtype)
16270
+ dtype = " | ".join(dtype) # pragma: no cover
16271
+ dtype_exp.append(dtype) # pragma: no cover
15982
16272
 
15983
16273
  else:
15984
16274
  dtype = exp_columns_dict[column_name_exp_i]["dtype_input"][0]
@@ -15990,12 +16280,12 @@ def _step_report_schema_any_order(
15990
16280
 
15991
16281
  if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
15992
16282
  dtype_exp_correct.append("&mdash;")
15993
- elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
15994
- dtype_exp_correct.append("")
15995
- elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
15996
- dtype_exp_correct.append(CHECK_MARK_SPAN)
15997
- else:
15998
- dtype_exp_correct.append(CROSS_MARK_SPAN)
16283
+ elif not exp_columns_dict[column_name_exp_i]["dtype_present"]: # pragma: no cover
16284
+ dtype_exp_correct.append("") # pragma: no cover
16285
+ elif exp_columns_dict[column_name_exp_i]["dtype_matched"]: # pragma: no cover
16286
+ dtype_exp_correct.append(CHECK_MARK_SPAN) # pragma: no cover
16287
+ else: # pragma: no cover
16288
+ dtype_exp_correct.append(CROSS_MARK_SPAN) # pragma: no cover
15999
16289
 
16000
16290
  if len(columns_found) > 0:
16001
16291
  # Get the last index of the columns found
@@ -16011,7 +16301,9 @@ def _step_report_schema_any_order(
16011
16301
  ]
16012
16302
 
16013
16303
  else:
16014
- index_exp = [str(i) for i in range(1, len(colnames_exp_unmatched) + 1)]
16304
+ index_exp = [
16305
+ str(i) for i in range(1, len(colnames_exp_unmatched) + 1)
16306
+ ] # pragma: no cover
16015
16307
 
16016
16308
  schema_exp_unmatched = pl.DataFrame(
16017
16309
  {