pointblank 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pointblank/__init__.py +0 -2
- pointblank/_constants.py +2 -28
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +1483 -1735
- pointblank/column.py +6 -2
- pointblank/datascan.py +3 -2
- pointblank/schema.py +155 -1
- pointblank/validate.py +459 -222
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/METADATA +3 -2
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/RECORD +14 -15
- pointblank/tf.py +0 -287
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/WHEEL +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
```diff
@@ -31,7 +31,6 @@ from pointblank._constants import (
     CROSS_MARK_SPAN,
     IBIS_BACKENDS,
     LOG_LEVELS_MAP,
-    METHOD_CATEGORY_MAP,
     REPORTING_LANGUAGES,
     ROW_BASED_VALIDATION_TYPES,
     RTL_LANGUAGES,
```
```diff
@@ -46,25 +45,35 @@ from pointblank._constants_translations import (
     VALIDATION_REPORT_TEXT,
 )
 from pointblank._interrogation import (
-    ColCountMatch,
-    ColExistsHasType,
-    ColSchemaMatch,
-    ColValsCompareOne,
-    ColValsCompareSet,
-    ColValsCompareTwo,
-    ColValsExpr,
-    ColValsRegex,
-    ConjointlyValidation,
     NumberOfTestUnits,
-    RowCountMatch,
-    RowsComplete,
-    RowsDistinct,
     SpeciallyValidation,
+    col_count_match,
+    col_exists,
+    col_schema_match,
+    col_vals_expr,
+    conjointly_validation,
+    interrogate_between,
+    interrogate_eq,
+    interrogate_ge,
+    interrogate_gt,
+    interrogate_isin,
+    interrogate_le,
+    interrogate_lt,
+    interrogate_ne,
+    interrogate_not_null,
+    interrogate_notin,
+    interrogate_null,
+    interrogate_outside,
+    interrogate_regex,
+    interrogate_rows_distinct,
+    row_count_match,
+    rows_complete,
 )
 from pointblank._typing import SegmentSpec
 from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
+    _column_test_prep,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
```
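The import changes above summarize the central refactor of 0.13.2: the class-based interrogation objects (instantiate, then call `.get_test_results()`) give way to plain `interrogate_*` functions that take a prepped table and return it with a boolean results column. A minimal illustrative sketch of the function-style shape, assuming a Polars table; the function name and the `pb_is_good_` results column come from this diff, while the body is a stand-in, not the library's actual implementation:

```python
import polars as pl


# Illustrative stand-in for the new function-style interrogation helpers: take a
# table, return it with a boolean `pb_is_good_` results column appended.
def interrogate_gt(tbl: pl.DataFrame, column: str, compare, na_pass: bool = False) -> pl.DataFrame:
    check = pl.col(column) > compare
    if na_pass:
        # Missing values count as passing when `na_pass=True`
        check = check | pl.col(column).is_null()
    return tbl.with_columns(check.fill_null(False).alias("pb_is_good_"))


print(interrogate_gt(pl.DataFrame({"x": [1, 5, None]}), "x", 2, na_pass=True))
```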
```diff
@@ -1584,13 +1593,22 @@ def _generate_display_table(
 
     tail_data = pd.DataFrame(columns=head_data.columns)
 
-    data = pd.concat([head_data, tail_data])
+    # Suppress the FutureWarning about DataFrame concatenation with empty entries
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=FutureWarning,
+            message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
+        )
+        data = pd.concat([head_data, tail_data])
 
     row_number_list = list(range(1, n_head + 1)) + list(
         range(n_rows - n_tail + 1, n_rows + 1)
     )
 
-    # For PySpark, update schema after conversion to
+    # For PySpark, update schema after conversion to Pandas
     if tbl_type == "pyspark":
         tbl_schema = Schema(tbl=data)
 
```
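The filter above silences only the specific pandas `FutureWarning` raised when concatenating an all-empty frame, without blanket-disabling warnings for the caller. A standalone reproduction of the same pattern:

```python
import warnings

import pandas as pd

head = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
tail = pd.DataFrame(columns=head.columns)  # empty frame: the case that warns

with warnings.catch_warnings():
    # The `message` argument is a regex matched against the start of the warning text
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    )
    data = pd.concat([head, tail])  # no FutureWarning leaks out of this block

print(data)
```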
```diff
@@ -2398,10 +2416,31 @@ def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
     return [lhs_values, rhs_values]
 
 
+def _get_column_names_safe(data: Any) -> list[str]:
+    """
+    Safely get column names from a DataFrame, optimized for LazyFrames.
+    This function avoids the Narwhals PerformanceWarning for LazyFrames.
+    """
+    try:
+        import narwhals as nw
+
+        df_nw = nw.from_native(data)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return list(df_nw.collect_schema().keys())
+        else:
+            return list(df_nw.columns)
+    except Exception:
+        # Fallback to direct column access
+        return list(data.columns)
+
+
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
     if ibis_tbl:
         return data.columns if df_lib_name_gt == "polars" else list(data.columns)
-    return list(data.columns)
+
+    # Use the optimized helper function
+    return _get_column_names_safe(data)
 
 
 def _validate_columns_subset(
```
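`_get_column_names_safe()` exists because asking a Narwhals `LazyFrame` for `.columns` triggers a `PerformanceWarning` (the schema has to be resolved either way), while `collect_schema()` is the sanctioned route. A small demonstration of the same guard, assuming `narwhals` and `polars` are installed:

```python
import narwhals as nw
import polars as pl

lf = nw.from_native(pl.LazyFrame({"x": [1, 2], "y": ["a", "b"]}))

# Same guard as the helper above: prefer collect_schema() when it is available
if hasattr(lf, "collect_schema"):
    names = list(lf.collect_schema().keys())
else:
    names = list(lf.columns)

print(names)  # ['x', 'y']
```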
```diff
@@ -2590,7 +2629,11 @@ def get_column_count(data: FrameT | Any) -> int:
         import narwhals as nw
 
         df_nw = nw.from_native(data)
-        return len(df_nw.columns)
+        # Use `collect_schema()` for LazyFrames to avoid performance warnings
+        if hasattr(df_nw, "collect_schema"):
+            return len(df_nw.collect_schema())
+        else:
+            return len(df_nw.columns)
     except Exception:
         # Fallback for unsupported types
         if "pandas" in str(type(data)):
```
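`get_column_count()` gets the same treatment: the column count is just the length of the collected schema, so eager and lazy inputs share one warning-free path. For example:

```python
import narwhals as nw
import pandas as pd

df_nw = nw.from_native(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))

# collect_schema() returns a mapping of column name -> dtype, so len() is the count
n_cols = len(df_nw.collect_schema()) if hasattr(df_nw, "collect_schema") else len(df_nw.columns)
print(n_cols)  # 3
```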
```diff
@@ -4702,7 +4745,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
```
```diff
@@ -4990,7 +5034,8 @@ class Validate:
         _check_boolean_input(param=active, param_name="active")
 
         # If value is a string-based date or datetime, convert it to the appropriate type
-        value = _string_date_dttm_conversion(value=value)
+        # Allow regular strings to pass through for string comparisons
+        value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True)
 
         # Determine threshold to use (global or local) and normalize a local `thresholds=` value
         thresholds = (
```
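The two hunks above (by their position in the class, most likely the `col_vals_eq()` and `col_vals_ne()` methods) swap the strict converter for the conditional one, so a plain string `value=` now survives to be compared against string columns instead of raising. A hypothetical usage sketch; the method names are pointblank's public API, and the expected counts assume the behavior described by this change:

```python
import polars as pl

import pointblank as pb

df = pl.DataFrame({"fruit": ["apple", "apple", "pear"]})

# Before this change, a non-date string here raised a ValueError; with 0.13.2 it
# is treated as an ordinary string comparison value.
validation = pb.Validate(data=df).col_vals_eq(columns="fruit", value="apple").interrogate()
print(validation.n_passed(i=1, scalar=True))  # 2 (the "pear" row fails)
```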
```diff
@@ -9738,8 +9783,8 @@ class Validate:
             threshold = validation.thresholds
             segment = validation.segments
 
+            # Get compatible data types for this assertion type
             assertion_method = ASSERTION_TYPE_METHOD_MAP[assertion_type]
-            assertion_category = METHOD_CATEGORY_MAP[assertion_method]
             compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, [])
 
             # Process the `brief` text for the validation step by including template variables to
```
```diff
@@ -9870,197 +9915,243 @@ class Validate:
             # Validation stage
             # ------------------------------------------------
 
-[... 23 removed lines elided in this rendering ...]
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_SET":
-                inside = True if assertion_method == "in_set" else False
-
-                results_tbl = ColValsCompareSet(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    values=value,
-                    threshold=threshold,
-                    inside=inside,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_REGEX":
-                results_tbl = ColValsRegex(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    pattern=value,
-                    na_pass=na_pass,
-                    threshold=threshold,
-                    allowed_types=compatible_dtypes,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COMPARE_EXPR":
-                results_tbl = ColValsExpr(
-                    data_tbl=data_tbl_step,
-                    expr=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_DISTINCT":
-                results_tbl = RowsDistinct(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "ROWS_COMPLETE":
-                results_tbl = RowsComplete(
-                    data_tbl=data_tbl_step,
-                    columns_subset=column,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "COL_EXISTS_HAS_TYPE":
-                result_bool = ColExistsHasType(
-                    data_tbl=data_tbl_step,
-                    column=column,
-                    threshold=threshold,
-                    assertion_method="exists",
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                validation.all_passed = result_bool
-                validation.n = 1
-                validation.n_passed = result_bool
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "COL_SCHEMA_MATCH":
-                result_bool = ColSchemaMatch(
-                    data_tbl=data_tbl_step,
-                    schema=value["schema"],
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                    threshold=threshold,
-                ).get_test_results()
-
-                schema_validation_info = _get_schema_validation_info(
-                    data_tbl=data_tbl,
-                    schema=value["schema"],
-                    passed=result_bool,
-                    complete=value["complete"],
-                    in_order=value["in_order"],
-                    case_sensitive_colnames=value["case_sensitive_colnames"],
-                    case_sensitive_dtypes=value["case_sensitive_dtypes"],
-                    full_match_dtypes=value["full_match_dtypes"],
-                )
+            # Apply error handling only to data quality validations, not programming error validations
+            if assertion_type != "specially":
+                try:
+                    # validations requiring `_column_test_prep()`
+                    if assertion_type in [
+                        "col_vals_gt",
+                        "col_vals_lt",
+                        "col_vals_eq",
+                        "col_vals_ne",
+                        "col_vals_ge",
+                        "col_vals_le",
+                        "col_vals_null",
+                        "col_vals_not_null",
+                        "col_vals_between",
+                        "col_vals_outside",
+                        "col_vals_in_set",
+                        "col_vals_not_in_set",
+                        "col_vals_regex",
+                    ]:
+                        # Process table for column validation
+                        tbl = _column_test_prep(
+                            df=data_tbl_step, column=column, allowed_types=compatible_dtypes
+                        )
 
-[... 38 removed lines elided in this rendering ...]
-                validation.n_passed = int(result_bool)
-                validation.n_failed = 1 - result_bool
-
-                results_tbl = None
-
-            if assertion_category == "CONJOINTLY":
-                results_tbl = ConjointlyValidation(
-                    data_tbl=data_tbl_step,
-                    expressions=value["expressions"],
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-            if assertion_category == "SPECIALLY":
-                results_tbl_list = SpeciallyValidation(
-                    data_tbl=data_tbl_step,
-                    expression=value,
-                    threshold=threshold,
-                    tbl_type=tbl_type,
-                ).get_test_results()
-
-                #
-                # The result from this could either be a table in the conventional form, or,
-                # a list of boolean values; handle both cases
-                #
-
-                if isinstance(results_tbl_list, list):
-                    # If the result is a list of boolean values, then we need to convert it to a
-                    # set the validation results from the list
-                    validation.all_passed = all(results_tbl_list)
-                    validation.n = len(results_tbl_list)
-                    validation.n_passed = results_tbl_list.count(True)
-                    validation.n_failed = results_tbl_list.count(False)
-
-                    results_tbl = None
+                        if assertion_method == "gt":
+                            results_tbl = interrogate_gt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "lt":
+                            results_tbl = interrogate_lt(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "eq":
+                            results_tbl = interrogate_eq(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ne":
+                            results_tbl = interrogate_ne(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "ge":
+                            results_tbl = interrogate_ge(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "le":
+                            results_tbl = interrogate_le(
+                                tbl=tbl, column=column, compare=value, na_pass=na_pass
+                            )
+                        elif assertion_method == "null":
+                            results_tbl = interrogate_null(tbl=tbl, column=column)
+                        elif assertion_method == "not_null":
+                            results_tbl = interrogate_not_null(tbl=tbl, column=column)
+
+                        elif assertion_type == "col_vals_between":
+                            results_tbl = interrogate_between(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
 
-                else:
-                    # If the result is not a list, then we assume it's a table in the conventional
-                    # form (where the column is `pb_is_good_` exists, with boolean values
-                    results_tbl = results_tbl_list
+                        elif assertion_type == "col_vals_outside":
+                            results_tbl = interrogate_outside(
+                                tbl=tbl,
+                                column=column,
+                                low=value[0],
+                                high=value[1],
+                                inclusive=inclusive,
+                                na_pass=na_pass,
+                            )
+
+                        elif assertion_type == "col_vals_in_set":
+                            results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value)
+
+                        elif assertion_type == "col_vals_not_in_set":
+                            results_tbl = interrogate_notin(
+                                tbl=tbl, column=column, set_values=value
+                            )
+
+                        elif assertion_type == "col_vals_regex":
+                            results_tbl = interrogate_regex(
+                                tbl=tbl, column=column, pattern=value, na_pass=na_pass
+                            )
+
+                    elif assertion_type == "col_vals_expr":
+                        results_tbl = col_vals_expr(
+                            data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type
+                        )
+
+                    elif assertion_type == "rows_distinct":
+                        results_tbl = interrogate_rows_distinct(
+                            data_tbl=data_tbl_step, columns_subset=column
+                        )
+
+                    elif assertion_type == "rows_complete":
+                        results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column)
+
+                    elif assertion_type == "col_exists":
+                        result_bool = col_exists(
+                            data_tbl=data_tbl_step,
+                            column=column,
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - int(result_bool)
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_schema_match":
+                        result_bool = col_schema_match(
+                            data_tbl=data_tbl_step,
+                            schema=value["schema"],
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                            threshold=threshold,
+                        )
+
+                        schema_validation_info = _get_schema_validation_info(
+                            data_tbl=data_tbl,
+                            schema=value["schema"],
+                            passed=result_bool,
+                            complete=value["complete"],
+                            in_order=value["in_order"],
+                            case_sensitive_colnames=value["case_sensitive_colnames"],
+                            case_sensitive_dtypes=value["case_sensitive_dtypes"],
+                            full_match_dtypes=value["full_match_dtypes"],
+                        )
+
+                        # Add the schema validation info to the validation object
+                        validation.val_info = schema_validation_info
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "row_count_match":
+                        result_bool = row_count_match(
+                            data_tbl=data_tbl_step,
+                            count=value["count"],
+                            inverse=value["inverse"],
+                            abs_tol_bounds=value["abs_tol_bounds"],
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "col_count_match":
+                        result_bool = col_count_match(
+                            data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"]
+                        )
+
+                        validation.all_passed = result_bool
+                        validation.n = 1
+                        validation.n_passed = int(result_bool)
+                        validation.n_failed = 1 - result_bool
+
+                        results_tbl = None
+
+                    elif assertion_type == "conjointly":
+                        results_tbl = conjointly_validation(
+                            data_tbl=data_tbl_step,
+                            expressions=value["expressions"],
+                            threshold=threshold,
+                            tbl_type=tbl_type,
+                        )
+
+                    else:
+                        raise ValueError(f"Unknown assertion type: {assertion_type}")
+
+                except Exception as e:
+                    # Only catch specific data quality comparison errors, not programming errors
+                    error_msg = str(e).lower()
+                    is_comparison_error = (
+                        "boolean value of na is ambiguous" in error_msg
+                        or "cannot compare" in error_msg
+                        or (
+                            "type" in error_msg
+                            and ("mismatch" in error_msg or "incompatible" in error_msg)
+                        )
+                        or ("dtype" in error_msg and "compare" in error_msg)
+                    )
+
+                    if is_comparison_error:
+                        # If data quality comparison fails, mark the validation as having an eval_error
+                        validation.eval_error = True
+                        end_time = datetime.datetime.now(datetime.timezone.utc)
+                        validation.proc_duration_s = (end_time - start_time).total_seconds()
+                        validation.time_processed = end_time.isoformat(timespec="milliseconds")
+                        validation.active = False
+                        continue
+                    else:
+                        # For other errors (like missing columns), let them propagate
+                        raise
+
+            else:
+                # For "specially" validations, let programming errors propagate as exceptions
+                if assertion_type == "specially":
+                    results_tbl_list = SpeciallyValidation(
+                        data_tbl=data_tbl_step,
+                        expression=value,
+                        threshold=threshold,
+                        tbl_type=tbl_type,
+                    ).get_test_results()
+
+                    #
+                    # The result from this could either be a table in the conventional form, or,
+                    # a list of boolean values; handle both cases
+                    #
+
+                    if isinstance(results_tbl_list, list):
+                        # If the result is a list of boolean values, then we need to convert it to a
+                        # set the validation results from the list
+                        validation.all_passed = all(results_tbl_list)
+                        validation.n = len(results_tbl_list)
+                        validation.n_passed = results_tbl_list.count(True)
+                        validation.n_failed = results_tbl_list.count(False)
+
+                        results_tbl = None
+
+                    else:
+                        # If the result is not a list, then we assume it's a table in the conventional
+                        # form (where the column is `pb_is_good_` exists, with boolean values
+                        results_tbl = results_tbl_list
 
             # If the results table is not `None`, then we assume there is a table with a column
             # called `pb_is_good_` that contains boolean values; we can then use this table to
```
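The new `try`/`except` distinguishes data-typed comparison failures, which mark the step with `eval_error` and deactivate it, from genuine programming errors, which still raise. The substring heuristics are easy to exercise in isolation; a minimal sketch with a hypothetical standalone predicate mirroring the except block above:

```python
def is_comparison_error(exc: Exception) -> bool:
    # Same substring heuristics as the except-block above
    msg = str(exc).lower()
    return (
        "boolean value of na is ambiguous" in msg
        or "cannot compare" in msg
        or ("type" in msg and ("mismatch" in msg or "incompatible" in msg))
        or ("dtype" in msg and "compare" in msg)
    )


print(is_comparison_error(TypeError("cannot compare string with int")))  # True
print(is_comparison_error(KeyError("no such column: 'x'")))              # False
```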
```diff
@@ -13670,6 +13761,48 @@ def _string_date_dttm_conversion(value: any) -> any:
     return value
 
 
+def _conditional_string_date_dttm_conversion(
+    value: any, allow_regular_strings: bool = False
+) -> any:
+    """
+    Conditionally convert a string to a date or datetime object if it is in the correct format. If
+    `allow_regular_strings=` is `True`, regular strings are allowed to pass through unchanged. If
+    the value is not a string, it is returned as is.
+
+    Parameters
+    ----------
+    value
+        The value to convert. It can be a string, date, or datetime object.
+    allow_regular_strings
+        If `True`, regular strings (non-date/datetime) are allowed to pass through unchanged. If
+        `False`, behaves like `_string_date_dttm_conversion()` and raises `ValueError` for regular
+        strings.
+
+    Returns
+    -------
+    any
+        The converted date or datetime object, or the original value.
+
+    Raises
+    ------
+    ValueError
+        If allow_regular_strings is False and the string cannot be converted to a date or datetime.
+    """
+
+    if isinstance(value, str):
+        if _is_string_date(value):
+            value = _convert_string_to_date(value)
+        elif _is_string_datetime(value):
+            value = _convert_string_to_datetime(value)
+        elif not allow_regular_strings:
+            raise ValueError(
+                "If `value=` is provided as a string it must be a date or datetime string."
+            )
+        # If allow_regular_strings is True, regular strings pass through unchanged
+
+    return value
+
+
 def _process_brief(
     brief: str | None,
     step: int,
```
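The helper leans on the library's existing `_is_string_date()` / `_convert_string_to_date()` family. A self-contained approximation of the same contract using stdlib `fromisoformat` (the real parsers may accept more formats than ISO strings):

```python
from datetime import date, datetime


def conditional_conversion(value, allow_regular_strings=False):
    if isinstance(value, str):
        try:
            return date.fromisoformat(value)      # "2024-01-15" -> datetime.date
        except ValueError:
            pass
        try:
            return datetime.fromisoformat(value)  # "2024-01-15 10:30:00" -> datetime.datetime
        except ValueError:
            if not allow_regular_strings:
                raise ValueError(
                    "If `value=` is provided as a string it must be a date or datetime string."
                )
    return value  # non-strings and (optionally) regular strings pass through


print(conditional_conversion("2024-01-15"))                         # 2024-01-15
print(conditional_conversion("apple", allow_regular_strings=True))  # apple
```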
```diff
@@ -14319,17 +14452,108 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
     column, segment = segments_expr
 
     if tbl_type in ["pandas", "polars", "pyspark"]:
-        # If the table is a Pandas, Polars, or PySpark DataFrame,
+        # If the table is a Pandas, Polars, or PySpark DataFrame, transform to a Narwhals table
         # and perform the filtering operation
 
         # Transform to Narwhals table if a DataFrame
        data_tbl_nw = nw.from_native(data_tbl)
 
+        # Handle Polars expressions by attempting to extract literal values
+        # This is a compatibility measure for cases where `pl.datetime()`, `pl.lit()`, etc.,
+        # are accidentally used instead of native Python types
+        if (
+            hasattr(segment, "__class__")
+            and "polars" in segment.__class__.__module__
+            and segment.__class__.__name__ == "Expr"
+        ):
+            # This is a Polars expression so we should warn about this and suggest native types
+            import warnings
+            from datetime import date, datetime
+
+            warnings.warn(
+                "Polars expressions in segments are deprecated. Please use native Python types instead. "
+                "For example, use datetime.date(2016, 1, 4) instead of pl.datetime(2016, 1, 4).",
+                DeprecationWarning,
+                stacklevel=3,
+            )
+
+            # Try to extract the literal value from various Polars expression patterns
+            segment_str = str(segment)
+            parsed_value = None
+
+            # Handle different Polars expression string formats
+            # Format 1: Direct date strings like "2016-01-04"
+            if len(segment_str) == 10 and segment_str.count("-") == 2:
+                try:
+                    parsed_value = date.fromisoformat(segment_str)
+                except ValueError:
+                    pass
+
+            # Format 2: Datetime strings with UTC timezone like
+            # "2016-01-04 00:00:01 UTC.strict_cast(...)"
+            elif " UTC" in segment_str:
+                try:
+                    # Extract just the datetime part before "UTC"
+                    datetime_part = segment_str.split(" UTC")[0]
+                    if len(datetime_part) >= 10:
+                        parsed_dt = datetime.fromisoformat(datetime_part)
+                        # Convert midnight datetimes to dates for consistency
+                        if parsed_dt.time() == datetime.min.time():
+                            parsed_value = parsed_dt.date()
+                        else:
+                            parsed_value = parsed_dt
+                except (ValueError, IndexError):
+                    pass
+
+            # Format 3: Bracketed expressions like ['2016-01-04']
+            elif segment_str.startswith("[") and segment_str.endswith("]"):
+                try:
+                    content = segment_str[2:-2]  # Remove [' and ']
+
+                    # Try parsing as date first
+                    if len(content) == 10 and content.count("-") == 2:
+                        try:
+                            parsed_value = date.fromisoformat(content)
+                        except ValueError:
+                            pass
+
+                    # Try parsing as datetime
+                    if parsed_value is None:
+                        try:
+                            parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
+                            if parsed_dt.time() == datetime.min.time():
+                                parsed_value = parsed_dt.date()
+                            else:
+                                parsed_value = parsed_dt
+                        except ValueError:
+                            pass
+
+                except (ValueError, IndexError):
+                    pass
+
+            # Handle `pl.datetime()` expressions with .alias("datetime")
+            elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
+                try:
+                    datetime_part = segment_str.split('.alias("datetime")')[0]
+                    parsed_dt = datetime.fromisoformat(datetime_part)
+
+                    if parsed_dt.time() == datetime.min.time():
+                        parsed_value = parsed_dt.date()
+                    else:
+                        parsed_value = parsed_dt
+
+                except (ValueError, AttributeError):
+                    pass
+
+            # If we successfully parsed a value, use it; otherwise leave segment as is
+            if parsed_value is not None:
+                segment = parsed_value
+
         # Filter the data table based on the column name and segment
         if segment is None:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_null())
-        # Check if the segment is a segment group
         elif isinstance(segment, list):
+            # Check if the segment is a segment group
             data_tbl_nw = data_tbl_nw.filter(nw.col(column).is_in(segment))
         else:
             data_tbl_nw = data_tbl_nw.filter(nw.col(column) == segment)
```
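The literal-recovery heuristics operate purely on `str(segment)`, whose exact form depends on the Polars version; the date and datetime parsing itself is plain stdlib. Formats 1 and 2 in isolation, as a runnable sketch:

```python
from datetime import date, datetime


def parse_segment_literal(segment_str: str):
    # Format 1: direct date strings like "2016-01-04"
    if len(segment_str) == 10 and segment_str.count("-") == 2:
        try:
            return date.fromisoformat(segment_str)
        except ValueError:
            return None
    # Format 2: datetime strings rendered with a trailing " UTC..."
    if " UTC" in segment_str:
        try:
            dt = datetime.fromisoformat(segment_str.split(" UTC")[0])
            # Midnight datetimes collapse to dates for consistency
            return dt.date() if dt.time() == datetime.min.time() else dt
        except ValueError:
            return None
    return None


print(parse_segment_literal("2016-01-04"))               # 2016-01-04
print(parse_segment_literal("2016-01-04 00:00:01 UTC"))  # 2016-01-04 00:00:01
```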
```diff
@@ -14341,12 +14565,13 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
         # If the table is an Ibis backend table, perform the filtering operation directly
 
         # Filter the data table based on the column name and segment
+        # Use the new Ibis API methods to avoid deprecation warnings
         if segment is None:
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column].isnull())
         elif isinstance(segment, list):
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column].isin(segment))
         else:
-            data_tbl = data_tbl
+            data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
     return data_tbl
 
```
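This hunk is a straight bug fix: in 0.13.0 all three Ibis branches assigned the table to itself, so segments were silently ignored on Ibis backends; 0.13.2 actually filters. The equivalent calls against a small in-memory Ibis table, assuming `ibis-framework` with its default DuckDB backend:

```python
import ibis

t = ibis.memtable({"group": ["a", "b", None, "a"], "x": [1, 2, 3, 4]})

null_rows = t.filter(t["group"].isnull())          # segment is None
listed = t.filter(t["group"].isin(["a", "b"]))     # segment is a list
single = t.filter(t["group"] == "a")               # scalar segment

print(single.execute())  # the two rows where group == "a"
```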
```diff
@@ -15113,6 +15338,8 @@ def _step_report_row_based(
         text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
     elif assertion_type == "col_vals_not_null":
         text = STEP_REPORT_TEXT["column_is_not_null"][lang].format(column=column)
+    elif assertion_type == "col_vals_expr":
+        text = STEP_REPORT_TEXT["column_expr"][lang].format(values=values)
     elif assertion_type == "rows_complete":
         if column is None:
             text = STEP_REPORT_TEXT["rows_complete_all"][lang]
```
```diff
@@ -15159,10 +15386,14 @@ def _step_report_row_based(
     title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
-        n=n,
-        column_position=column_position,
-    )
+    # Use success_statement_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(n=n)
+    else:
+        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
+            n=n,
+            column_position=column_position,
+        )
 
     preview_stmt = STEP_REPORT_TEXT["preview_statement"][lang]
 
     details = (
```
```diff
@@ -15242,10 +15473,16 @@ def _step_report_row_based(
     assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
     failure_rate_metrics = f"<strong>{n_failed}</strong> / <strong>{n}</strong>"
 
-    failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
-        failure_rate=failure_rate_metrics,
-        column_position=column_position,
-    )
+    # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column
+    if assertion_type == "col_vals_expr":
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format(
+            failure_rate=failure_rate_metrics
+        )
+    else:
+        failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format(
+            failure_rate=failure_rate_metrics,
+            column_position=column_position,
+        )
 
     if limit < extract_length:
         extract_length_resolved = limit
```