pointblank 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/validate.py CHANGED
@@ -31,7 +31,6 @@ from pointblank._constants import (
     CROSS_MARK_SPAN,
     IBIS_BACKENDS,
     LOG_LEVELS_MAP,
-    METHOD_CATEGORY_MAP,
     REPORTING_LANGUAGES,
     ROW_BASED_VALIDATION_TYPES,
     RTL_LANGUAGES,
@@ -46,25 +45,35 @@ from pointblank._constants_translations import (
     VALIDATION_REPORT_TEXT,
 )
 from pointblank._interrogation import (
-    ColCountMatch,
-    ColExistsHasType,
-    ColSchemaMatch,
-    ColValsCompareOne,
-    ColValsCompareSet,
-    ColValsCompareTwo,
-    ColValsExpr,
-    ColValsRegex,
-    ConjointlyValidation,
     NumberOfTestUnits,
-    RowCountMatch,
-    RowsComplete,
-    RowsDistinct,
     SpeciallyValidation,
+    col_count_match,
+    col_exists,
+    col_schema_match,
+    col_vals_expr,
+    conjointly_validation,
+    interrogate_between,
+    interrogate_eq,
+    interrogate_ge,
+    interrogate_gt,
+    interrogate_isin,
+    interrogate_le,
+    interrogate_lt,
+    interrogate_ne,
+    interrogate_not_null,
+    interrogate_notin,
+    interrogate_null,
+    interrogate_outside,
+    interrogate_regex,
+    interrogate_rows_distinct,
+    row_count_match,
+    rows_complete,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
+    _column_test_prep,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
@@ -1584,13 +1593,22 @@ def _generate_display_table(
 
     tail_data = pd.DataFrame(columns=head_data.columns)
 
-    data = pd.concat([head_data, tail_data])
+    # Suppress the FutureWarning about DataFrame concatenation with empty entries
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=FutureWarning,
+            message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
+        )
+        data = pd.concat([head_data, tail_data])
 
     row_number_list = list(range(1, n_head + 1)) + list(
         range(n_rows - n_tail + 1, n_rows + 1)
     )
 
-    # For PySpark, update schema after conversion to pandas
+    # For PySpark, update schema after conversion to Pandas
     if tbl_type == "pyspark":
         tbl_schema = Schema(tbl=data)
 
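Note: the fix above scopes the filter with `warnings.catch_warnings()` so the suppression does not leak into user code. A minimal standalone sketch of the same pattern; the two frames here are stand-ins for the head/tail slices built by `_generate_display_table()`:

    import warnings

    import pandas as pd

    head_data = pd.DataFrame({"a": [1, 2, 3]})
    tail_data = pd.DataFrame(columns=head_data.columns)  # empty frame: triggers the FutureWarning

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            category=FutureWarning,
            message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
        )
        data = pd.concat([head_data, tail_data])

    print(len(data))  # 3: the empty tail contributes no rows
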
@@ -2398,10 +2416,31 @@ def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
     return [lhs_values, rhs_values]
 
 
+def _get_column_names_safe(data: Any) -> list[str]:
+    """
+    Safely get column names from a DataFrame, optimized for LazyFrames.
+    This function avoids the Narwhals PerformanceWarning for LazyFrames.
+    """
+    try:
+        import narwhals as nw
+
+        df_nw = nw.from_native(data)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return list(df_nw.collect_schema().keys())
+        else:
+            return list(df_nw.columns)
+    except Exception:
+        # Fallback to direct column access
+        return list(data.columns)
+
+
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
     if ibis_tbl:
         return data.columns if df_lib_name_gt == "polars" else list(data.columns)
-    return list(data.columns)
+
+    # Use the optimized helper function
+    return _get_column_names_safe(data)
 
 
 def _validate_columns_subset(
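
Note: a short sketch of why the new helper prefers `collect_schema()`. Per the comments above, asking a LazyFrame for `.columns` through Narwhals can emit a PerformanceWarning, while `collect_schema()` resolves column names without materializing data. The Polars LazyFrame below is an assumed input, not taken from the diff:

    import narwhals as nw
    import polars as pl

    lf = pl.LazyFrame({"x": [1, 2], "y": ["a", "b"]})
    nw_lf = nw.from_native(lf)

    # Same branch as `_get_column_names_safe()`: prefer `collect_schema()` when available
    if hasattr(nw_lf, "collect_schema"):
        cols = list(nw_lf.collect_schema().keys())
    else:
        cols = list(nw_lf.columns)

    print(cols)  # ['x', 'y']
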
@@ -2590,7 +2629,11 @@ def get_column_count(data: FrameT | Any) -> int:
         import narwhals as nw
 
         df_nw = nw.from_native(data)
-        return len(df_nw.columns)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return len(df_nw.collect_schema())
+        else:
+            return len(df_nw.columns)
     except Exception:
         # Fallback for unsupported types
         if "pandas" in str(type(data)):
@@ -4702,7 +4745,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
@@ -4990,7 +5034,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
@@ -9738,8 +9783,8 @@ class Validate:
             threshold = validation.thresholds
             segment = validation.segments
 
+            # Get compatible data types for this assertion type
             assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
-            assertion_category = METHOD_CATEGORY_MAP[assertion_method]
             compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
 
             # Process the `brief` text for the validation step by including template variables to
@@ -9870,197 +9915,243 @@ class Validate:
             # Validation stage
             # ------------------------------------------------
 
-            if assertion_category == "COMPARE_ONE":
-                results_tbl = ColValsCompareOne(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    value=value,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    assertion_method=assertion_method,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_TWO":
-                results_tbl = ColValsCompareTwo(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    value1=value[0],
-                    value2=value[1],
-                    inclusive=inclusive,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    assertion_method=assertion_method,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_SET":
-                inside = True if assertion_method == "in_set" else False
-
-                results_tbl = ColValsCompareSet(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    values=value,
-                    threshold=threshold,
-                    inside=inside,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_REGEX":
-                results_tbl = ColValsRegex(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    pattern=value,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_EXPR":
-                results_tbl = ColValsExpr(
-                    data_tbl=data_tbl_step,
-                    expr=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_DISTINCT":
-                results_tbl = RowsDistinct(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_COMPLETE":
-                results_tbl = RowsComplete(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COL_EXISTS_HAS_TYPE":
-                result_bool = ColExistsHasType(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    threshold=threshold,
-                    assertion_method="exists",
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = result_bool
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "COL_SCHEMA_MATCH":
-                result_bool = ColSchemaMatch(
-                    data_tbl=data_tbl_step,
-                    schema=value["schema"],
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                    threshold=threshold,
-                ).get_test_results()
-
-                schema_validation_info = _get_schema_validation_info(
-                    data_tbl=data_tbl,
-                    schema=value["schema"],
-                    passed=result_bool,
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                )
+            # Apply error handling only to data quality validations, not programming error validations
+            if assertion_type != "specially":
+                try:
+                    # validations requiring `_column_test_prep()`
+                    if assertion_type in [
+                        "col_vals_gt",
+                        "col_vals_lt",
+                        "col_vals_eq",
+                        "col_vals_ne",
+                        "col_vals_ge",
+                        "col_vals_le",
+                        "col_vals_null",
+                        "col_vals_not_null",
+                        "col_vals_between",
+                        "col_vals_outside",
+                        "col_vals_in_set",
+                        "col_vals_not_in_set",
+                        "col_vals_regex",
+                    ]:
+                        # Process table for column validation
+                        tbl = _column_test_prep(
+                            df=data_tbl_step, column=column, allowed_types=compatible_dtypes
+                        )
 
-                # Add the schema validation info to the validation object
-                validation.val_info = schema_validation_info
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "ROW_COUNT_MATCH":
-                result_bool = RowCountMatch(
-                    data_tbl=data_tbl_step,
-                    count=value["count"],
-                    inverse=value["inverse"],
-                    threshold=threshold,
-                    abs_tol_bounds=value["abs_tol_bounds"],
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "COL_COUNT_MATCH":
-                result_bool = ColCountMatch(
-                    data_tbl=data_tbl_step,
-                    count=value["count"],
-                    inverse=value["inverse"],
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "CONJOINTLY":
-                results_tbl = ConjointlyValidation(
-                    data_tbl=data_tbl_step,
-                    expressions=value["expressions"],
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "SPECIALLY":
-                results_tbl_list = SpeciallyValidation(
-                    data_tbl=data_tbl_step,
-                    expression=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                #
-                # The result from this could either be a table in the conventional form, or,
-                # a list of boolean values; handle both cases
-                #
-
-                if isinstance(results_tbl_list, list):
-                    # If the result is a list of boolean values, then we need to convert it to a
-                    # set the validation results from the list
-                    validation.all_passed = all(results_tbl_list)
-                    validation.n = len(results_tbl_list)
-                    validation.n_passed = results_tbl_list.count(True)
-                    validation.n_failed = results_tbl_list.count(False)
-
-                    results_tbl = None
+                        if assertion_method == "gt":
+                            results_tbl = interrogate_gt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "lt":
+                            results_tbl = interrogate_lt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "eq":
+                            results_tbl = interrogate_eq(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ne":
+                            results_tbl = interrogate_ne(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ge":
+                            results_tbl = interrogate_ge(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "le":
+                            results_tbl = interrogate_le(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "null":
+                            results_tbl = interrogate_null(tbl=tbl, column=column)
+                        elif assertion_method == "not_null":
+                            results_tbl = interrogate_not_null(tbl=tbl, column=column)
+
+                        elif assertion_type == "col_vals_between":
+                            results_tbl = interrogate_between(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
 
-                else:
-                    # If the result is not a list, then we assume it's a table in the conventional
-                    # form (where the column is `pb_is_good_` exists, with boolean values
-                    results_tbl = results_tbl_list
+                        elif assertion_type == "col_vals_outside":
+                            results_tbl = interrogate_outside(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
+
+                        elif assertion_type == "col_vals_in_set":
+                            results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value)
+
+                        elif assertion_type == "col_vals_not_in_set":
+                            results_tbl = interrogate_notin(
+                                tbl=tbl, column=column, set_values=value
+                            )
+
+                        elif assertion_type == "col_vals_regex":
+                            results_tbl = interrogate_regex(
+                                tbl=tbl, column=column, pattern=value, na_pass=na_pass
+                            )
+
+                    elif assertion_type == "col_vals_expr":
+                        results_tbl = col_vals_expr(
+                            data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
+                        )
+
+                    elif assertion_type == "rows_distinct":
+                        results_tbl = interrogate_rows_distinct(
+                            data_tbl=data_tbl_step, columns_subset=column
+                        )
+
+                    elif assertion_type == "rows_complete":
+                        results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
+
+                    elif assertion_type == "col_exists":
+                        result_bool = col_exists(
+                            data_tbl=data_tbl_step,
+                            column=column,
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - int(result_bool)
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_schema_match":
+                        result_bool = col_schema_match(
+                            data_tbl=data_tbl_step,
+                            schema=value["schema"],
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                            threshold=threshold,
+                        )
+
+                        schema_validation_info = _get_schema_validation_info(
+                            data_tbl=data_tbl,
+                            schema=value["schema"],
+                            passed=result_bool,
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                        )
+
+                        # Add the schema validation info to the validation object
+                        validation.val_info = schema_validation_info
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "row_count_match":
+                        result_bool = row_count_match(
+                            data_tbl=data_tbl_step,
+                            count=value["count"],
+                            inverse=value["inverse"],
+                            abs_tol_bounds=value["abs_tol_bounds"],
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_count_match":
+                        result_bool = col_count_match(
+                            data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"]
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "conjointly":
+                        results_tbl = conjointly_validation(
+                            data_tbl=data_tbl_step,
+                            expressions=value["expressions"],
+                            threshold=threshold,
+                            tbl_type=tbl_type,
+                        )
+
+                    else:
+                        raise ValueError(f"Unknown assertion type: {assertion_type}")
+
+                except Exception as e:
+                    # Only catch specific data quality comparison errors, not programming errors
+                    error_msg = str(e).lower()
+                    is_comparison_error = (
+                        "boolean value of na is ambiguous" in error_msg
+                        or "cannot compare" in error_msg
+                        or (
+                            "type" in error_msg
+                            and ("mismatch" in error_msg or "incompatible" in error_msg)
+                        )
+                        or ("dtype" in error_msg and "compare" in error_msg)
+                    )
+
+                    if is_comparison_error:
+                        # If data quality comparison fails, mark the validation as having an eval_error
+                        validation.eval_error = True
+                        end_time = datetime.datetime.now(datetime.timezone.utc)
+                        validation.proc_duration_s = (end_time - start_time).total_seconds()
+                        validation.time_processed = end_time.isoformat(timespec="milliseconds")
+                        validation.active = False
+                        continue
+                    else:
+                        # For other errors (like missing columns), let them propagate
+                        raise
+
+            else:
+                # For "specially" validations, let programming errors propagate as exceptions
+                if assertion_type == "specially":
+                    results_tbl_list = SpeciallyValidation(
+                        data_tbl=data_tbl_step,
+                        expression=value,
+                        threshold=threshold,
+                        tbl_type=tbl_type,
+                    ).get_test_results()
+
+                    #
+                    # The result from this could either be a table in the conventional form, or,
+                    # a list of boolean values; handle both cases
+                    #
+
+                    if isinstance(results_tbl_list, list):
+                        # If the result is a list of boolean values, then we need to convert it to a
+                        # set the validation results from the list
+                        validation.all_passed = all(results_tbl_list)
+                        validation.n = len(results_tbl_list)
+                        validation.n_passed = results_tbl_list.count(True)
+                        validation.n_failed = results_tbl_list.count(False)
+
+                        results_tbl = None
+
+                    else:
+                        # If the result is not a list, then we assume it's a table in the conventional
+                        # form (where the column is `pb_is_good_` exists, with boolean values
+                        results_tbl = results_tbl_list
 
             # If the results table is not `None`, then we assume there is a table with a column
             # called `pb_is_good_` that contains boolean values; we can then use this table to
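
Note: the `except` branch above classifies failures purely by message substrings. A hedged standalone sketch of that classification (`_is_comparison_error` is a hypothetical name used here for illustration, not part of the package):

    def _is_comparison_error(exc: Exception) -> bool:
        # Mirror of the substring checks in the diff: only dtype/comparison
        # failures are downgraded to `eval_error`; everything else re-raises.
        error_msg = str(exc).lower()
        return (
            "boolean value of na is ambiguous" in error_msg
            or "cannot compare" in error_msg
            or ("type" in error_msg and ("mismatch" in error_msg or "incompatible" in error_msg))
            or ("dtype" in error_msg and "compare" in error_msg)
        )

    print(_is_comparison_error(TypeError("cannot compare string with int")))  # True
    print(_is_comparison_error(KeyError("column 'z' not found")))             # False
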
@@ -13670,6 +13761,48 @@ def _string_date_dttm_conversion(value: any) -> any:
     return value
 
 
+def _conditional_string_date_dttm_conversion(
+    value: any, allow_regular_strings: bool = False
+) -> any:
+    """
+    Conditionally convert a string to a date or datetime object if it is in the correct format. If
+    `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
+    the value is not a string, it is returned as is.
+
+    Parameters
+    ----------
+    value
+        The value to convert. It can be a string, date, or datetime object.
+    allow_regular_strings
+        If `True`, regular strings (non-date/datetime) are allowed to pass through unchanged. If
+        `False`, behaves like `_string_date_dttm_conversion()` and raises `ValueError` for regular
+        strings.
+
+    Returns
+    -------
+    any
+        The converted date or datetime object, or the original value.
+
+    Raises
+    ------
+    ValueError
+        If allow_regular_strings is False and the string cannot be converted to a date or datetime.
+    """
+
+    if isinstance(value, str):
+        if _is_string_date(value):
+            value = _convert_string_to_date(value)
+        elif _is_string_datetime(value):
+            value = _convert_string_to_datetime(value)
+        elif not allow_regular_strings:
+            raise ValueError(
+                "If `value=` is provided as a string it must be a date or datetime string."
+            )
+        # If allow_regular_strings is True, regular strings pass through unchanged
+
+    return value
+
+
 def _process_brief(
     brief: str | None,
     step: int,
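
Note: a behavior sketch for the new helper, with `date.fromisoformat()`/`datetime.fromisoformat()` standing in for the package's `_is_string_*`/`_convert_string_*` helpers (a simplification for illustration, not the actual parsing rules):

    from datetime import date, datetime

    def conditional_conversion(value, allow_regular_strings=False):
        # Hypothetical stand-in for `_conditional_string_date_dttm_conversion()`
        if isinstance(value, str):
            try:
                return date.fromisoformat(value) if len(value) == 10 else datetime.fromisoformat(value)
            except ValueError:
                if not allow_regular_strings:
                    raise ValueError(
                        "If `value=` is provided as a string it must be a date or datetime string."
                    )
        return value

    print(conditional_conversion("2016-01-04", allow_regular_strings=True))  # 2016-01-04 (a datetime.date)
    print(conditional_conversion("hello", allow_regular_strings=True))       # 'hello' passes through
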
@@ -14319,17 +14452,108 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
     column, segment = segments_expr
 
     if tbl_type in ["pandas", "polars", "pyspark"]:
-        # If the table is a Pandas, Polars, or PySpark DataFrame, transforming to a Narwhals table
+        # If the table is a Pandas, Polars, or PySpark DataFrame, transform to a Narwhals table
         # and perform the filtering operation
 
         # Transform to Narwhals table if a DataFrame
         data_tbl_nw = nw.from_native(data_tbl)
 
+        # Handle Polars expressions by attempting to extract literal values
+        # This is a compatibility measure for cases where `pl.datetime()`, `pl.lit()`, etc.,
+        # are accidentally used instead of native Python types
+        if (
+            hasattr(segment, "__class__")
+            and "polars" in segment.__class__.__module__
+            and segment.__class__.__name__ == "Expr"
+        ):
+            # This is a Polars expression so we should warn about this and suggest native types
+            import warnings
+            from datetime import date, datetime
+
+            warnings.warn(
+                "Polars expressions in segments are deprecated. Please use native Python types instead. "
+                "For example, use datetime.date(2016, 1, 4) instead of pl.datetime(2016, 1, 4).",
+                DeprecationWarning,
+                stacklevel=3,
+            )
+
+            # Try to extract the literal value from various Polars expression patterns
+            segment_str = str(segment)
+            parsed_value = None
+
+            # Handle different Polars expression string formats
+            # Format 1: Direct date strings like "2016-01-04"
+            if len(segment_str) == 10 and segment_str.count("-") == 2:
+                try:
+                    parsed_value = date.fromisoformat(segment_str)
+                except ValueError:
+                    pass
+
+            # Format 2: Datetime strings with UTC timezone like
+            # "2016-01-04 00:00:01 UTC.strict_cast(...)"
+            elif " UTC" in segment_str:
+                try:
+                    # Extract just the datetime part before "UTC"
+                    datetime_part = segment_str.split(" UTC")[0]
+                    if len(datetime_part) >= 10:
+                        parsed_dt = datetime.fromisoformat(datetime_part)
+                        # Convert midnight datetimes to dates for consistency
+                        if parsed_dt.time() == datetime.min.time():
+                            parsed_value = parsed_dt.date()
+                        else:
+                            parsed_value = parsed_dt
+                except (ValueError, IndexError):
+                    pass
+
+            # Format 3: Bracketed expressions like ['2016-01-04']
+            elif segment_str.startswith("[") and segment_str.endswith("]"):
+                try:
+                    content = segment_str[2:-2]  # Remove [' and ']
+
+                    # Try parsing as date first
+                    if len(content) == 10 and content.count("-") == 2:
+                        try:
+                            parsed_value = date.fromisoformat(content)
+                        except ValueError:
+                            pass
+
+                    # Try parsing as datetime
+                    if parsed_value is None:
+                        try:
+                            parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
+                            if parsed_dt.time() == datetime.min.time():
+                                parsed_value = parsed_dt.date()
+                            else:
+                                parsed_value = parsed_dt
+                        except ValueError:
+                            pass
+
+                except (ValueError, IndexError):
+                    pass
+
+            # Handle `pl.datetime()` expressions with .alias("datetime")
+            elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
+                try:
+                    datetime_part = segment_str.split('.alias("datetime")')[0]
+                    parsed_dt = datetime.fromisoformat(datetime_part)
+
+                    if parsed_dt.time() == datetime.min.time():
+                        parsed_value = parsed_dt.date()
+                    else:
+                        parsed_value = parsed_dt
+
+                except (ValueError, AttributeError):
+                    pass
+
+            # If we successfully parsed a value, use it; otherwise leave segment as is
+            if parsed_value is not None:
+                segment = parsed_value
+
 
         # Filter the data table based on the column name and segment
         if segment is None:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
-        # Check if the segment is a segment group
         elif isinstance(segment, list):
+            # Check if the segment is a segment group
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_in(segment))
         else:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column) == segment)
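
Note: "Format 1" above covers Polars literals whose string form reduces to a bare ISO date. A minimal sketch of that recovery step (the `segment_str` value is an assumed example of such a stringified expression):

    from datetime import date

    segment_str = "2016-01-04"  # e.g. what str() of a simple Polars date literal can reduce to
    parsed_value = None

    if len(segment_str) == 10 and segment_str.count("-") == 2:
        try:
            parsed_value = date.fromisoformat(segment_str)
        except ValueError:
            pass

    print(parsed_value)  # 2016-01-04
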
@@ -14341,12 +14565,13 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
         # If the table is an Ibis backend table, perform the filtering operation directly
 
         # Filter the data table based on the column name and segment
+        # Use the new Ibis API methods to avoid deprecation warnings
         if segment is None:
-            data_tbl = data_tbl[data_tbl[column].isnull()]
+            data_tbl = data_tbl.filter(data_tbl[column].isnull())
         elif isinstance(segment, list):
-            data_tbl = data_tbl[data_tbl[column].isin(segment)]
+            data_tbl = data_tbl.filter(data_tbl[column].isin(segment))
         else:
-            data_tbl = data_tbl[data_tbl[column] == segment]
+            data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
     return data_tbl
 
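Note: a short sketch of the Ibis API shift above, replacing boolean-mask indexing with explicit `.filter()` calls (the diff notes this avoids deprecation warnings; the `memtable` data is illustrative):

    import ibis

    t = ibis.memtable({"col": [1, 2, None, 2]})

    by_value = t.filter(t["col"] == 2)        # replaces t[t["col"] == 2]
    by_set = t.filter(t["col"].isin([1, 2]))  # replaces t[t["col"].isin([1, 2])]
    nulls = t.filter(t["col"].isnull())       # replaces t[t["col"].isnull()]

    print(by_value.count().execute())  # 2
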
@@ -15113,6 +15338,8 @@ def _step_report_row_based(
         text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
     elif assertion_type == "col_vals_not_null":
         text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_expr":
+        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
     elif assertion_type == "rows_complete":
         if column is None:
             text = STEP_REPORT_TEXT["rows_complete_all"][lang]
@@ -15159,10 +15386,14 @@ def _step_report_row_based(
     title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
-        n=n,
-        column_position=column_position,
-    )
+    # Use success_statement_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(n=n)
+    else:
+        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
     preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
 
     details = (
@@ -15242,10 +15473,16 @@ def _step_report_row_based(
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
     failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
 
-    failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
-        failure_rate=failure_rate_metrics,
-        column_position=column_position,
-    )
+    # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format(
+            failure_rate=failure_rate_metrics
+        )
+    else:
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
+            failure_rate=failure_rate_metrics,
+            column_position=column_position,
+        )
 
     if limit < extract_length:
         extract_length_resolved = limit