pointblank-0.13.2-py3-none-any.whl → pointblank-0.13.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +16 -1
- pointblank/_utils.py +40 -0
- pointblank/validate.py +385 -159
- pointblank/yaml.py +154 -44
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/METADATA +2 -2
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/RECORD +11 -12
- pointblank/_constants_docs.py +0 -40
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/WHEEL +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.2.dist-info → pointblank-0.13.4.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -10,6 +10,7 @@ import re
 import tempfile
 import threading
 from dataclasses import dataclass
+from enum import Enum
 from importlib.metadata import version
 from typing import TYPE_CHECKING, Any, Callable, Literal
 from zipfile import ZipFile
@@ -74,6 +75,7 @@ from pointblank._utils import (
     _check_any_df_lib,
     _check_invalid_fields,
     _column_test_prep,
+    _copy_dataframe,
     _count_null_values_in_column,
     _count_true_values_in_column,
     _derive_bounds,
@@ -2006,9 +2008,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
             # Apply the appropriate conversion method
             if use_polars_conversion:
-                null_sum_converted = null_sum.to_polars()
+                null_sum_converted = null_sum.to_polars()  # pragma: no cover
             else:
-                null_sum_converted = null_sum.to_pandas()
+                null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
             missing_prop = (null_sum_converted / sector_size) * 100
             col_missing_props.append(missing_prop)
@@ -2025,9 +2027,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
             # Apply the appropriate conversion method
             if use_polars_conversion:
-                null_sum_converted = null_sum.to_polars()
+                null_sum_converted = null_sum.to_polars()  # pragma: no cover
             else:
-                null_sum_converted = null_sum.to_pandas()
+                null_sum_converted = null_sum.to_pandas()  # pragma: no cover
 
             missing_prop = (null_sum_converted / sector_size) * 100
             col_missing_props.append(missing_prop)
@@ -2040,9 +2042,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
     # Use the helper function based on the DataFrame library
    if df_lib_name_gt == "polars":
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=True)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=True
+        )  # pragma: no cover
    else:
-        missing_vals = _calculate_missing_proportions(use_polars_conversion=False)
+        missing_vals = _calculate_missing_proportions(
+            use_polars_conversion=False
+        )  # pragma: no cover
 
    # Pivot the `missing_vals` dictionary to create a table with the missing value proportions
    missing_vals = {
@@ -2055,9 +2061,13 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
 
    # Get a dictionary of counts of missing values in each column
    if df_lib_name_gt == "polars":
-        missing_val_counts = {col: data[col].isnull().sum().to_polars() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_polars() for col in data.columns
+        }  # pragma: no cover
    else:
-        missing_val_counts = {col: data[col].isnull().sum().to_pandas() for col in data.columns}
+        missing_val_counts = {
+            col: data[col].isnull().sum().to_pandas() for col in data.columns
+        }  # pragma: no cover
 
    if pl_pb_tbl:
        # Get the column names from the table
@@ -2429,10 +2439,10 @@ def _get_column_names_safe(data: Any) -> list[str]:
        if hasattr(df_nw, "collect_schema"):
            return list(df_nw.collect_schema().keys())
        else:
-            return list(df_nw.columns)
-    except Exception:
+            return list(df_nw.columns)  # pragma: no cover
+    except Exception:  # pragma: no cover
        # Fallback to direct column access
-        return list(data.columns)
+        return list(data.columns)  # pragma: no cover
 
 
 def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
@@ -2633,7 +2643,7 @@ def get_column_count(data: FrameT | Any) -> int:
        if hasattr(df_nw, "collect_schema"):
            return len(df_nw.collect_schema())
        else:
-            return len(df_nw.columns)
+            return len(df_nw.columns)  # pragma: no cover
    except Exception:
        # Fallback for unsupported types
        if "pandas" in str(type(data)):
@@ -2642,6 +2652,48 @@ def get_column_count(data: FrameT | Any) -> int:
        raise ValueError("The input table type supplied in `data=` is not supported.")
 
 
+def _extract_enum_values(set_values: Any) -> list[Any]:
+    """
+    Extract values from Enum classes or collections containing Enum instances.
+
+    This helper function handles:
+    1. Enum classes: extracts all enum values
+    2. Collections containing Enum instances: extracts their values
+    3. Regular collections: returns as-is
+
+    Parameters
+    ----------
+    set_values
+        The input collection that may contain Enum class or Enum instances.
+
+    Returns
+    -------
+    list[Any]
+        A list of extracted values
+    """
+    from collections.abc import Collection
+
+    # Check if set_values is an Enum class (not an instance)
+    if inspect.isclass(set_values) and issubclass(set_values, Enum):
+        # Extract all values from the Enum class
+        return [enum_member.value for enum_member in set_values]
+
+    # Check if set_values is a collection
+    if isinstance(set_values, Collection) and not isinstance(set_values, (str, bytes)):
+        extracted_values = []
+        for item in set_values:
+            if isinstance(item, Enum):
+                # If item is an Enum instance, extract its value
+                extracted_values.append(item.value)
+            else:
+                # If item is not an Enum instance, keep as-is
+                extracted_values.append(item)
+        return extracted_values
+
+    # If set_values is neither an Enum class nor a collection, return as list
+    return [set_values]
+
+
 def get_row_count(data: FrameT | Any) -> int:
    """
    Get the number of rows in a table.
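The `_extract_enum_values()` helper added above is self-contained, so its behavior is easy to check in isolation. A minimal sketch, condensed from the hunk (the `Color` enum is invented for illustration):

```python
import inspect
from collections.abc import Collection
from enum import Enum


def extract_enum_values(set_values):
    # Condensed copy of the _extract_enum_values() logic shown above
    if inspect.isclass(set_values) and issubclass(set_values, Enum):
        return [m.value for m in set_values]
    if isinstance(set_values, Collection) and not isinstance(set_values, (str, bytes)):
        return [v.value if isinstance(v, Enum) else v for v in set_values]
    return [set_values]


class Color(Enum):  # invented for illustration
    RED = "red"
    GREEN = "green"
    BLUE = "blue"


# An Enum class yields every member's value, in definition order
assert extract_enum_values(Color) == ["red", "green", "blue"]

# Enum instances mixed with plain values are unwrapped in place
assert extract_enum_values([Color.RED, "yellow"]) == ["red", "yellow"]

# Strings are not treated as collections; scalars are wrapped in a list
assert extract_enum_values("red") == ["red"]
```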
@@ -2806,11 +2858,11 @@ def get_row_count(data: FrameT | Any) -> int:
        # Try different ways to get row count
        if hasattr(df_nw, "shape"):
            return df_nw.shape[0]
-        elif hasattr(df_nw, "height"):
+        elif hasattr(df_nw, "height"):  # pragma: no cover
            return df_nw.height  # pragma: no cover
        else:  # pragma: no cover
            raise ValueError("Unable to determine row count from Narwhals DataFrame")
-    except Exception:
+    except Exception:  # pragma: no cover
        # Fallback for types that don't work with Narwhals
        if "pandas" in str(type(data)):  # pragma: no cover
            return data.shape[0]
@@ -6324,7 +6376,10 @@ class Validate:
            multiple columns are supplied or resolved, there will be a separate validation step
            generated for each column.
        set
-            A
+            A collection of values to compare against. Can be a list of values, a Python Enum class,
+            or a collection containing Enum instances. When an Enum class is provided, all enum
+            values will be used. When a collection contains Enum instances, their values will be
+            extracted automatically.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
@@ -6501,12 +6556,69 @@ class Validate:
 
        The validation table reports two failing test units. The specific failing cases are for the
        column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`.
+
+        **Using Python Enums**
+
+        The `col_vals_in_set()` method also supports Python Enum classes and instances, which can
+        make validations more readable and maintainable:
+
+        ```{python}
+        from enum import Enum
+
+        class Color(Enum):
+            RED = "red"
+            GREEN = "green"
+            BLUE = "blue"
+
+        # Create a table with color data
+        tbl_colors = pl.DataFrame({
+            "product": ["shirt", "pants", "hat", "shoes"],
+            "color": ["red", "blue", "green", "yellow"]
+        })
+
+        # Validate using an Enum class (all enum values are allowed)
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=Color)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        This validation will fail for the `"yellow"` value since it's not in the `Color` enum.
+
+        You can also use specific Enum instances or mix them with regular values:
+
+        ```{python}
+        # Validate using specific Enum instances
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE])
+            .interrogate()
+        )
+
+        # Mix Enum instances with regular values
+        validation = (
+            pb.Validate(data=tbl_colors)
+            .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"])
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        In this case, the `"green"` value will cause a failing test unit since it's not part of the
+        specified set.
        """
 
        assertion_type = _get_fn_name()
 
        _check_column(column=columns)
 
+        # Extract values from Enum classes or Enum instances if present
+        set = _extract_enum_values(set)
+
        for val in set:
            if val is None:
                continue
@@ -6557,7 +6669,7 @@ class Validate:
    def col_vals_not_in_set(
        self,
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
-        set:
+        set: Collection[Any],
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
        thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -6581,7 +6693,10 @@ class Validate:
            multiple columns are supplied or resolved, there will be a separate validation step
            generated for each column.
        set
-            A
+            A collection of values to compare against. Can be a list of values, a Python Enum class,
+            or a collection containing Enum instances. When an Enum class is provided, all enum
+            values will be used. When a collection contains Enum instances, their values will be
+            extracted automatically.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
@@ -6759,11 +6874,45 @@ class Validate:
 
        The validation table reports two failing test units. The specific failing cases are for the
        column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`.
+
+        **Using Python Enums**
+
+        Like `col_vals_in_set()`, this method also supports Python Enum classes and instances:
+
+        ```{python}
+        from enum import Enum
+
+        class InvalidStatus(Enum):
+            DELETED = "deleted"
+            ARCHIVED = "archived"
+
+        # Create a table with status data
+        status_table = pl.DataFrame({
+            "product": ["widget", "gadget", "tool", "device"],
+            "status": ["active", "pending", "deleted", "active"]
+        })
+
+        # Validate that no values are in the invalid status set
+        validation = (
+            pb.Validate(data=status_table)
+            .col_vals_not_in_set(columns="status", set=InvalidStatus)
+            .interrogate()
+        )
+
+        validation
+        ```
+
+        The `"deleted"` value in the `status` column will fail since it matches one of the invalid
+        statuses in the `InvalidStatus` enum.
        """
 
        assertion_type = _get_fn_name()
 
        _check_column(column=columns)
+
+        # Extract values from Enum classes or Enum instances if present
+        set = _extract_enum_values(set)
+
        _check_set_types(set=set)
        _check_pre(pre=pre)
        # TODO: add check for segments
@@ -7297,6 +7446,7 @@ class Validate:
        columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
        pattern: str,
        na_pass: bool = False,
+        inverse: bool = False,
        pre: Callable | None = None,
        segments: SegmentSpec | None = None,
        thresholds: int | float | bool | tuple | dict | Thresholds = None,
@@ -7324,6 +7474,9 @@ class Validate:
        na_pass
            Should any encountered None, NA, or Null values be considered as passing test units? By
            default, this is `False`. Set to `True` to pass test units with missing values.
+        inverse
+            Should the validation step be inverted? If `True`, then the expectation is that column
+            values should *not* match the specified `pattern=` regex.
        pre
            An optional preprocessing function or lambda to apply to the data table during
            interrogation. This function should take a table as input and return a modified table.
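The new `inverse=` flag reads naturally in a validation plan. A hedged sketch of its intended use, based on the docstring above (the table and pattern here are invented for illustration):

```python
import polars as pl
import pointblank as pb

tbl = pl.DataFrame({"code": ["AB-100", "CD-200", "tmp-1", "EF-300"]})

# Expect that no value matches the "temporary" pattern: with inverse=True,
# rows that *do* match `^tmp-` count as failing test units
validation = (
    pb.Validate(data=tbl)
    .col_vals_regex(columns="code", pattern=r"^tmp-", inverse=True)
    .interrogate()
)

validation
```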
@@ -7510,6 +7663,7 @@ class Validate:
        # _check_segments(segments=segments)
        _check_thresholds(thresholds=thresholds)
        _check_boolean_input(param=na_pass, param_name="na_pass")
+        _check_boolean_input(param=inverse, param_name="inverse")
        _check_boolean_input(param=active, param_name="active")
 
        # Determine threshold to use (global or local) and normalize a local `thresholds=` value
@@ -7529,12 +7683,15 @@ class Validate:
        # Determine brief to use (global or local) and transform any shorthands of `brief=`
        brief = self.brief if brief is None else _transform_auto_brief(brief=brief)
 
+        # Package up the `pattern=` and boolean params into a dictionary for later interrogation
+        values = {"pattern": pattern, "inverse": inverse}
+
        # Iterate over the columns and create a validation step for each
        for column in columns:
            val_info = _ValidationInfo(
                assertion_type=assertion_type,
                column=column,
-                values=pattern,
+                values=values,
                na_pass=na_pass,
                pre=pre,
                segments=segments,
@@ -8401,8 +8558,8 @@ class Validate:
            self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds)
        )
 
-        if columns_subset is not None and isinstance(columns_subset, str):
-            columns_subset = [columns_subset]
+        if columns_subset is not None and isinstance(columns_subset, str):  # pragma: no cover
+            columns_subset = [columns_subset]  # pragma: no cover
 
        # TODO: incorporate Column object
 
@@ -9830,8 +9987,9 @@ class Validate:
                validation.active = False
                continue
 
-            # Make a copy of the table for this step
-
+            # Make a deep copy of the table for this step to ensure proper isolation
+            # This prevents modifications from one validation step affecting others
+            data_tbl_step = _copy_dataframe(data_tbl)
 
            # ------------------------------------------------
            # Preprocessing stage
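`_copy_dataframe()` is imported from `pointblank/_utils.py` (also touched in this release), so its body is not shown in this diff. A sketch of the isolation problem the per-step deep copy addresses, using `pandas.DataFrame.copy(deep=True)` as a stand-in for the helper:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

def bad_pre(tbl):
    # In-place mutation: without per-step copies, this change would leak
    # into every later validation step that reuses the same table object
    tbl["a"] = tbl["a"] * 10
    return tbl

step_tbl = df.copy(deep=True)  # stand-in for _copy_dataframe(data_tbl)
bad_pre(step_tbl)

assert df["a"].tolist() == [1, 2, 3]           # original is untouched
assert step_tbl["a"].tolist() == [10, 20, 30]  # only the step's copy changed
```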
@@ -9998,7 +10156,7 @@ class Validate:
 
                elif assertion_type == "col_vals_regex":
                    results_tbl = interrogate_regex(
-                        tbl=tbl, column=column,
+                        tbl=tbl, column=column, values=value, na_pass=na_pass
                    )
 
                elif assertion_type == "col_vals_expr":
@@ -10096,7 +10254,9 @@ class Validate:
                    )
 
                else:
-                    raise ValueError(f"Unknown assertion type: {assertion_type}")
+                    raise ValueError(
+                        f"Unknown assertion type: {assertion_type}"
+                    )  # pragma: no cover
 
            except Exception as e:
                # Only catch specific data quality comparison errors, not programming errors
@@ -10111,14 +10271,18 @@ class Validate:
                    or ("dtype" in error_msg and "compare" in error_msg)
                )
 
-                if is_comparison_error:
+                if is_comparison_error:  # pragma: no cover
                    # If data quality comparison fails, mark the validation as having an eval_error
-                    validation.eval_error = True
-                    end_time = datetime.datetime.now(datetime.timezone.utc)
-                    validation.proc_duration_s = (end_time - start_time).total_seconds()
-                    validation.time_processed = end_time.isoformat(timespec="milliseconds")
-                    validation.active = False
-                    continue
+                    validation.eval_error = True  # pragma: no cover
+                    end_time = datetime.datetime.now(datetime.timezone.utc)  # pragma: no cover
+                    validation.proc_duration_s = (
+                        end_time - start_time
+                    ).total_seconds()  # pragma: no cover
+                    validation.time_processed = end_time.isoformat(
+                        timespec="milliseconds"
+                    )  # pragma: no cover
+                    validation.active = False  # pragma: no cover
+                    continue  # pragma: no cover
                else:
                    # For other errors (like missing columns), let them propagate
                    raise
@@ -10363,32 +10527,46 @@ class Validate:
                except AttributeError:
                    # For LazyFrames without sample method, collect first then sample
                    validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                        # PySpark DataFrame has sample method
-                        validation_extract_native = validation_extract_native.sample(
-                            fraction=min(1.0, sample_n / validation_extract_native.count())
-                        ).limit(sample_n)
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        validation_extract_native = (
+                            validation_extract_native.sample(  # pragma: no cover
+                                fraction=min(
+                                    1.0, sample_n / validation_extract_native.count()
+                                )  # pragma: no cover
+                            ).limit(sample_n)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                    else:
                        # Fallback: just take first n rows after collecting
-                        validation_extract_nw = validation_extract_nw.collect().head(sample_n)
+                        validation_extract_nw = validation_extract_nw.collect().head(
+                            sample_n
+                        )  # pragma: no cover
            elif sample_frac is not None:
                try:
                    validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac)
-                except AttributeError:
+                except AttributeError:  # pragma: no cover
                    # For LazyFrames without sample method, collect first then sample
-                    validation_extract_native = validation_extract_nw.collect().to_native()
-                    if hasattr(validation_extract_native, "sample"):
+                    validation_extract_native = (
+                        validation_extract_nw.collect().to_native()
+                    )  # pragma: no cover
+                    if hasattr(validation_extract_native, "sample"):  # pragma: no cover
                        # PySpark DataFrame has sample method
                        validation_extract_native = validation_extract_native.sample(
                            fraction=sample_frac
-                        )
-                        validation_extract_nw = nw.from_native(validation_extract_native)
+                        )  # pragma: no cover
+                        validation_extract_nw = nw.from_native(
+                            validation_extract_native
+                        )  # pragma: no cover
                    else:
                        # Fallback: use fraction to calculate head size
-                        collected = validation_extract_nw.collect()
-                        sample_size = max(1, int(len(collected) * sample_frac))
-                        validation_extract_nw = collected.head(sample_size)
+                        collected = validation_extract_nw.collect()  # pragma: no cover
+                        sample_size = max(
+                            1, int(len(collected) * sample_frac)
+                        )  # pragma: no cover
+                        validation_extract_nw = collected.head(sample_size)  # pragma: no cover
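The fraction-based fallback above is easy to exercise directly with Polars. A minimal sketch of the collect-then-head pattern:

```python
import polars as pl

lf = pl.LazyFrame({"x": range(100)})
sample_frac = 0.1

# LazyFrames have no sample() in this path, so the code collects first,
# then derives a head size from the requested fraction (minimum of 1 row)
collected = lf.collect()
sample_size = max(1, int(len(collected) * sample_frac))
extract = collected.head(sample_size)

assert len(extract) == 10
```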
@@ -10398,9 +10576,9 @@ class Validate:
            # For LazyFrames, collect to get length (or use a reasonable default)
            try:
                extract_length = len(validation_extract_nw.collect())
-            except Exception:
+            except Exception:  # pragma: no cover
                # If collection fails, apply limit anyway as a safety measure
-                extract_length = extract_limit + 1  #
+                extract_length = extract_limit + 1  # pragma: no cover
 
        if extract_length > extract_limit:
            validation_extract_nw = validation_extract_nw.head(extract_limit)
@@ -12065,10 +12243,12 @@ class Validate:
        try:
            # Try without order_by first (for DataFrames)
            data_nw = data_nw.with_row_index(name=index_name)
-        except TypeError:
+        except TypeError:  # pragma: no cover
            # LazyFrames require order_by parameter - use first column for ordering
-            first_col = data_nw.columns[0]
-            data_nw = data_nw.with_row_index(name=index_name, order_by=first_col)
+            first_col = data_nw.columns[0]  # pragma: no cover
+            data_nw = data_nw.with_row_index(
+                name=index_name, order_by=first_col
+            )  # pragma: no cover
 
        # Get all validation step result tables and join together the `pb_is_good_` columns
        # ensuring that the columns are named uniquely (e.g., `pb_is_good_1`, `pb_is_good_2`, ...)
@@ -12080,10 +12260,12 @@ class Validate:
            try:
                # Try without order_by first (for DataFrames)
                results_tbl = results_tbl.with_row_index(name=index_name)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                # LazyFrames require order_by parameter - use first column for ordering
-                first_col = results_tbl.columns[0]
-                results_tbl = results_tbl.with_row_index(name=index_name, order_by=first_col)
+                first_col = results_tbl.columns[0]  # pragma: no cover
+                results_tbl = results_tbl.with_row_index(
+                    name=index_name, order_by=first_col
+                )  # pragma: no cover
 
            # Add numerical suffix to the `pb_is_good_` column to make it unique
            results_tbl = results_tbl.select([index_name, "pb_is_good_"]).rename(
@@ -12215,15 +12397,15 @@ class Validate:
        # If the table is a Polars one, determine if it's a LazyFrame
        if tbl_info == "polars":
            if _is_lazy_frame(self.data):
-                tbl_info = "polars-lazy"
+                tbl_info = "polars-lazy"  # pragma: no cover
 
        # Determine if the input table is a Narwhals DF
        if _is_narwhals_table(self.data):
            # Determine if the Narwhals table is a LazyFrame
-            if _is_lazy_frame(self.data):
-                tbl_info = "narwhals-lazy"
+            if _is_lazy_frame(self.data):  # pragma: no cover
+                tbl_info = "narwhals-lazy"  # pragma: no cover
            else:
-                tbl_info = "narwhals"
+                tbl_info = "narwhals"  # pragma: no cover
 
        # Get the thresholds object
        thresholds = self.thresholds
@@ -12388,7 +12570,7 @@ class Validate:
        if lang in RTL_LANGUAGES:
            gt_tbl = gt_tbl.tab_style(
                style=style.css("direction: rtl;"), locations=loc.source_notes()
-            )
+            )  # pragma: no cover
 
        if incl_header:
            gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle))
@@ -12537,6 +12719,11 @@ class Validate:
            elif assertion_type[i] in ["specially"]:
                values_upd.append("EXPR")
 
+            elif assertion_type[i] in ["col_vals_regex"]:
+                pattern = value["pattern"]
+
+                values_upd.append(str(pattern))
+
            # If the assertion type is not recognized, add the value as a string
            else:
                values_upd.append(str(value))
@@ -12705,9 +12892,11 @@ class Validate:
            # Get the number of rows in the extract (safe for LazyFrames)
            try:
                n_rows = len(extract_nw)
-            except TypeError:
+            except TypeError:  # pragma: no cover
                # For LazyFrames, collect() first to get length
-                n_rows = len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                n_rows = (
+                    len(extract_nw.collect()) if hasattr(extract_nw, "collect") else 0
+                )  # pragma: no cover
 
            # If the number of rows is zero, then produce an em dash then go to the next iteration
            if n_rows == 0:
@@ -12715,7 +12904,7 @@ class Validate:
                continue
 
            # Write the CSV text (ensure LazyFrames are collected first)
-            if hasattr(extract_nw, "collect"):
+            if hasattr(extract_nw, "collect"):  # pragma: no cover
                extract_nw = extract_nw.collect()
            csv_text = extract_nw.write_csv()
 
@@ -13217,7 +13406,7 @@ class Validate:
            elif isinstance(column, list):
                column_position = [list(self.data.columns).index(col) + 1 for col in column]
            else:
-                column_position = None
+                column_position = None  # pragma: no cover
        else:
            column_position = None
 
@@ -13309,7 +13498,7 @@ class Validate:
            )
 
        else:
-            step_report = None
+            step_report = None  # pragma: no cover
 
        return step_report
 
@@ -13797,7 +13986,7 @@ def _conditional_string_date_dttm_conversion(
    elif not allow_regular_strings:
        raise ValueError(
            "If `value=` is provided as a string it must be a date or datetime string."
-        )
+        )  # pragma: no cover
    # If allow_regular_strings is True, regular strings pass through unchanged
 
    return value
@@ -13851,12 +14040,33 @@ def _process_brief(
 
    if segment is not None:
        # The segment is always a tuple of the form ("{column}", "{value}")
+        # Handle both regular lists and Segment objects (from seg_group())
+
+        segment_column = segment[0]
+        segment_value = segment[1]
+
+        # If segment_value is a Segment object (from seg_group()), format it appropriately
+        if isinstance(segment_value, Segment):
+            # For Segment objects, format the segments as a readable string
+            segments = segment_value.segments
+            if len(segments) == 1:
+                # Single segment: join the values with commas
+                segment_value_str = ", ".join(str(v) for v in segments[0])
+            else:
+                # Multiple segments: join each segment with commas, separate segments with " | "
+                segment_value_str = " | ".join([", ".join(str(v) for v in seg) for seg in segments])
+        else:
+            # For regular lists or other types, convert to string
+            if isinstance(segment_value, list):
+                segment_value_str = ", ".join(str(v) for v in segment_value)
+            else:
+                segment_value_str = str(segment_value)
 
-        segment_fmt = f"{
+        segment_fmt = f"{segment_column} / {segment_value_str}"
 
        brief = brief.replace("{segment}", segment_fmt)
-        brief = brief.replace("{segment_column}",
-        brief = brief.replace("{segment_value}",
+        brief = brief.replace("{segment_column}", segment_column)
+        brief = brief.replace("{segment_value}", segment_value_str)
 
    return brief
 
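A standalone sketch of the `{segment}` formatting rules introduced above; the `Segment` dataclass here is a stand-in, since the real class is defined elsewhere in the package (only its `segments` attribute is assumed, as used in the hunk):

```python
from dataclasses import dataclass


@dataclass
class Segment:  # stand-in for pointblank's Segment (real class lives elsewhere)
    segments: list


def format_segment_value(segment_value) -> str:
    # Mirrors the joining rules in _process_brief() above
    if isinstance(segment_value, Segment):
        segs = segment_value.segments
        if len(segs) == 1:
            return ", ".join(str(v) for v in segs[0])
        return " | ".join(", ".join(str(v) for v in seg) for seg in segs)
    if isinstance(segment_value, list):
        return ", ".join(str(v) for v in segment_value)
    return str(segment_value)


assert format_segment_value("east") == "east"
assert format_segment_value(["east", "west"]) == "east, west"
assert format_segment_value(Segment(segments=[["east", "west"]])) == "east, west"
assert format_segment_value(Segment(segments=[["east"], ["west", "north"]])) == "east | west, north"
```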
@@ -13890,7 +14100,7 @@ def _process_action_str(
    if col is not None:
        # If a list of columns is provided, then join the columns into a comma-separated string
        if isinstance(col, list):
-            col = ", ".join(col)
+            col = ", ".join(col)  # pragma: no cover
 
        action_str = action_str.replace("{col}", col)
        action_str = action_str.replace("{column}", col)
@@ -14163,15 +14373,30 @@ def _create_text_null(
 
 
 def _create_text_regex(
-    lang: str, column: str | None, pattern: str, for_failure: bool = False
+    lang: str, column: str | None, pattern: str | dict, for_failure: bool = False
 ) -> str:
    type_ = _expect_failure_type(for_failure=for_failure)
 
    column_text = _prep_column_text(column=column)
 
-    return EXPECT_FAIL_TEXT[f"regex_{type_}_text"][lang].format(
+    # Handle case where pattern is a dictionary containing `pattern` and `inverse`
+    if isinstance(pattern, dict):
+        pattern_str = pattern["pattern"]
+        inverse = pattern.get("inverse", False)
+    else:
+        # For backward compatibility, assume it's just the pattern string
+        pattern_str = pattern
+        inverse = False
+
+    # Use inverse-specific translations if inverse=True
+    if inverse:
+        text_key = f"regex_inverse_{type_}_text"
+    else:
+        text_key = f"regex_{type_}_text"
+
+    return EXPECT_FAIL_TEXT[text_key][lang].format(
        column_text=column_text,
-        values_text=pattern,
+        values_text=pattern_str,
    )
 
 
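The two payload shapes `_create_text_regex()` now accepts can be summarized in a small normalization sketch mirroring the branch above:

```python
def normalize_pattern(pattern):
    # Mirrors the dict/str branch in _create_text_regex() above
    if isinstance(pattern, dict):
        return pattern["pattern"], pattern.get("inverse", False)
    # Bare strings are the legacy shape, treated as a non-inverted pattern
    return pattern, False


assert normalize_pattern(r"^\d+$") == (r"^\d+$", False)
assert normalize_pattern({"pattern": r"^\d+$", "inverse": True}) == (r"^\d+$", True)
```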
@@ -14287,7 +14512,7 @@ def _prep_values_text(
    length_values = len(values)
 
    if length_values == 0:
-        return ""
+        return ""  # pragma: no cover
 
    if length_values > limit:
        num_omitted = length_values - limit
@@ -14296,7 +14521,7 @@ def _prep_values_text(
    formatted_values = []
    for value in values[:limit]:
        if isinstance(value, (datetime.datetime, datetime.date)):
-            formatted_values.append(f"`{value.isoformat()}`")
+            formatted_values.append(f"`{value.isoformat()}`")  # pragma: no cover
        else:
            formatted_values.append(f"`{value}`")
 
@@ -14486,8 +14711,8 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
        if len(segment_str) == 10 and segment_str.count("-") == 2:
            try:
                parsed_value = date.fromisoformat(segment_str)
-            except ValueError:
-                pass
+            except ValueError:  # pragma: no cover
+                pass  # pragma: no cover
 
        # Format 2: Datetime strings with UTC timezone like
        # "2016-01-04 00:00:01 UTC.strict_cast(...)"
@@ -14499,27 +14724,28 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                parsed_dt = datetime.fromisoformat(datetime_part)
                # Convert midnight datetimes to dates for consistency
                if parsed_dt.time() == datetime.min.time():
-                    parsed_value = parsed_dt.date()
+                    parsed_value = parsed_dt.date()  # pragma: no cover
                else:
                    parsed_value = parsed_dt
-            except (ValueError, IndexError):
-                pass
+            except (ValueError, IndexError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # Format 3: Bracketed expressions like ['2016-01-04']
        elif segment_str.startswith("[") and segment_str.endswith("]"):
-            try:
-                content = segment_str[2:-2]
+            try:  # pragma: no cover
+                # Remove [' and ']
+                content = segment_str[2:-2]  # pragma: no cover
 
                # Try parsing as date first
-                if len(content) == 10 and content.count("-") == 2:
-                    try:
-                        parsed_value = date.fromisoformat(content)
-                    except ValueError:
-                        pass
+                if len(content) == 10 and content.count("-") == 2:  # pragma: no cover
+                    try:  # pragma: no cover
+                        parsed_value = date.fromisoformat(content)  # pragma: no cover
+                    except ValueError:  # pragma: no cover
+                        pass  # pragma: no cover
 
                # Try parsing as datetime
-                if parsed_value is None:
-                    try:
+                if parsed_value is None:  # pragma: no cover
+                    try:  # pragma: no cover
                        parsed_dt = datetime.fromisoformat(content.replace(" UTC", ""))
                        if parsed_dt.time() == datetime.min.time():
                            parsed_value = parsed_dt.date()
@@ -14528,8 +14754,8 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                        except ValueError:
                            pass
 
-            except (ValueError, IndexError):
-                pass
+            except (ValueError, IndexError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # Handle `pl.datetime()` expressions with .alias("datetime")
        elif "datetime" in segment_str and '.alias("datetime")' in segment_str:
@@ -14540,10 +14766,10 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
                if parsed_dt.time() == datetime.min.time():
                    parsed_value = parsed_dt.date()
                else:
-                    parsed_value = parsed_dt
+                    parsed_value = parsed_dt  # pragma: no cover
 
-            except (ValueError, AttributeError):
-                pass
+            except (ValueError, AttributeError):  # pragma: no cover
+                pass  # pragma: no cover
 
        # If we successfully parsed a value, use it; otherwise leave segment as is
        if parsed_value is not None:
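The three parsing heuristics above rely only on the standard library and can be checked directly. A minimal sketch:

```python
from datetime import date, datetime

# Format 1: a bare ISO date string
assert date.fromisoformat("2016-01-04") == date(2016, 1, 4)

# Format 2: midnight datetimes collapse to dates for consistency
dt = datetime.fromisoformat("2016-01-04 00:00:00")
value = dt.date() if dt.time() == datetime.min.time() else dt
assert value == date(2016, 1, 4)

# Format 3: a bracketed literal like "['2016-01-04']" is unwrapped first
segment_str = "['2016-01-04']"
content = segment_str[2:-2]  # remove [' and ']
assert date.fromisoformat(content) == date(2016, 1, 4)
```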
@@ -14567,9 +14793,9 @@ def _apply_segments(data_tbl: any, segments_expr: tuple[str, Any]) -> any:
    # Filter the data table based on the column name and segment
    # Use the new Ibis API methods to avoid deprecation warnings
    if segment is None:
-        data_tbl = data_tbl.filter(data_tbl[column].isnull())
+        data_tbl = data_tbl.filter(data_tbl[column].isnull())  # pragma: no cover
    elif isinstance(segment, list):
-        data_tbl = data_tbl.filter(data_tbl[column].isin(segment))
+        data_tbl = data_tbl.filter(data_tbl[column].isin(segment))  # pragma: no cover
    else:
        data_tbl = data_tbl.filter(data_tbl[column] == segment)
 
@@ -14690,7 +14916,7 @@ def _get_title_text(
        "</span>"
        f'<span style="float: right;">{title}</span>'
        "</div>"
-    )
+    )  # pragma: no cover
 
    return html_str
 
@@ -14768,24 +14994,6 @@ def _transform_eval(
    return symbol_list
 
 
-def _format_numbers_with_gt(
-    values: list[int], n_sigfig: int = 3, compact: bool = True, locale: str = "en"
-) -> list[str]:
-    """Format numbers using Great Tables GT object to avoid pandas dependency."""
-    import polars as pl
-
-    # Create a single-column DataFrame with all values
-    df = pl.DataFrame({"values": values})
-
-    # Create GT object and format the column
-    gt_obj = GT(df).fmt_number(columns="values", n_sigfig=n_sigfig, compact=compact, locale=locale)
-
-    # Extract the formatted values using _get_column_of_values
-    formatted_values = _get_column_of_values(gt_obj, column_name="values", context="html")
-
-    return formatted_values
-
-
 def _format_single_number_with_gt(
    value: int, n_sigfig: int = 3, compact: bool = True, locale: str = "en", df_lib=None
 ) -> str:
@@ -14796,12 +15004,14 @@ def _format_single_number_with_gt(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
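A hedged sketch of the Polars-first formatting approach these helpers share, using only the public great_tables API (`GT.fmt_number()` and `vals.fmt_number()`, both referenced in this file); exact formatted output may vary by great_tables version:

```python
import polars as pl
from great_tables import GT, vals

# Fallback path seen in _transform_passed_failed(): vals.fmt_number() formats
# a list of values and returns a list of strings (e.g. ["0.12"])
print(vals.fmt_number([0.123456], decimals=2, locale="en"))

# GT-based path: build a one-cell table and format its only column; the
# helpers above then pull the rendered string back out of the GT object
gt_obj = GT(pl.DataFrame({"value": [12345]})).fmt_number(
    columns="value", n_sigfig=3, compact=True, locale="en"
)
```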
@@ -14867,12 +15077,14 @@ def _format_single_float_with_gt(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -14904,7 +15116,7 @@ def _transform_passed_failed(
        return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib)
    else:
        # Fallback to the original behavior
-        return vals.fmt_number(value, decimals=2, locale=locale)[0]
+        return vals.fmt_number(value, decimals=2, locale=locale)[0]  # pragma: no cover
 
    passed_failed = [
        (
@@ -15044,7 +15256,7 @@ def _get_callable_source(fn: Callable) -> str:
            return pre_arg
        except (OSError, TypeError):  # pragma: no cover
            return fn.__name__
-    return fn
+    return fn  # pragma: no cover
 
 
 def _extract_pre_argument(source: str) -> str:
@@ -15128,12 +15340,14 @@ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None)
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -15161,12 +15375,14 @@ def _format_single_float_with_gt_custom(
        import polars as pl
 
        df_lib = pl
-    elif _is_lib_present("pandas"):
-        import pandas as pd
+    elif _is_lib_present("pandas"):  # pragma: no cover
+        import pandas as pd  # pragma: no cover
 
-        df_lib = pd
-    else:
-        raise ImportError(
+        df_lib = pd  # pragma: no cover
+    else:  # pragma: no cover
+        raise ImportError(
+            "Neither Polars nor Pandas is available for formatting"
+        )  # pragma: no cover
 
    # Create a single-row, single-column DataFrame using the specified library
    df = df_lib.DataFrame({"value": [value]})
@@ -15201,7 +15417,7 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) ->
            # Fallback to the original behavior
            return fmt_number(
                value, decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale
-            )[0]
+            )[0]  # pragma: no cover
 
    def _format_integer_safe(value: int) -> str:
        if df_lib is not None and value is not None:
@@ -15333,7 +15549,8 @@ def _step_report_row_based(
        elements = ", ".join(values)
        text = f"{column} ∉ {{{elements}}}"
    elif assertion_type == "col_vals_regex":
-
+        pattern = values["pattern"]
+        text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern)
    elif assertion_type == "col_vals_null":
        text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column)
    elif assertion_type == "col_vals_not_null":
@@ -15386,9 +15603,12 @@ def _step_report_row_based(
    title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN
    assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang]
 
-    # Use success_statement_no_column for col_vals_expr since it doesn't target
+    # Use 'success_statement_no_column' for col_vals_expr() since it doesn't target
+    # a specific column
    if assertion_type == "col_vals_expr":
-        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(n=n)
+        success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format(
+            n=n
+        )  # pragma: no cover
    else:
        success_stmt = STEP_REPORT_TEXT["success_statement"][lang].format(
            n=n,
@@ -16101,14 +16321,14 @@ def _step_report_schema_any_order(
        if exp_columns_dict[column_name_exp_i]["colname_matched"]:
            col_exp_correct.append(CHECK_MARK_SPAN)
        else:
-            col_exp_correct.append(CROSS_MARK_SPAN)
+            col_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
        #
        # `dtype_exp` values
        #
 
        if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
        elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
@@ -16143,9 +16363,9 @@ def _step_report_schema_any_order(
        #
 
        if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
-            dtype_exp_correct.append("—")
+            dtype_exp_correct.append("—")  # pragma: no cover
        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
+            dtype_exp_correct.append("")  # pragma: no cover
        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
            dtype_exp_correct.append(CHECK_MARK_SPAN)
        else:
@@ -16191,13 +16411,17 @@ def _step_report_schema_any_order(
        #
 
        if not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp.append("")
+            dtype_exp.append("")  # pragma: no cover
 
        elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1:
-            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]
+            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"]  # pragma: no cover
 
-            if
-
+            if (
+                exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None
+            ):  # pragma: no cover
+                pos = exp_columns_dict[column_name_exp_i][
+                    "dtype_matched_pos"
+                ]  # pragma: no cover
 
                # Combine the dtypes together with pipes but underline the matched dtype in
                # green with an HTML span tag and style attribute
@@ -16209,13 +16433,13 @@ def _step_report_schema_any_order(
                        else dtype[i]
                    )
                    for i in range(len(dtype))
-                ]
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                ]  # pragma: no cover
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
            else:
-                dtype = " | ".join(dtype)
-                dtype_exp.append(dtype)
+                dtype = " | ".join(dtype)  # pragma: no cover
+                dtype_exp.append(dtype)  # pragma: no cover
 
        else:
            dtype = exp_columns_dict[column_name_exp_i]["dtype_input"][0]
@@ -16227,12 +16451,12 @@ def _step_report_schema_any_order(
 
        if not exp_columns_dict[column_name_exp_i]["colname_matched"]:
            dtype_exp_correct.append("—")
-        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:
-            dtype_exp_correct.append("")
-        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:
-            dtype_exp_correct.append(CHECK_MARK_SPAN)
-        else:
-            dtype_exp_correct.append(CROSS_MARK_SPAN)
+        elif not exp_columns_dict[column_name_exp_i]["dtype_present"]:  # pragma: no cover
+            dtype_exp_correct.append("")  # pragma: no cover
+        elif exp_columns_dict[column_name_exp_i]["dtype_matched"]:  # pragma: no cover
+            dtype_exp_correct.append(CHECK_MARK_SPAN)  # pragma: no cover
+        else:  # pragma: no cover
+            dtype_exp_correct.append(CROSS_MARK_SPAN)  # pragma: no cover
 
    if len(columns_found) > 0:
        # Get the last index of the columns found
@@ -16248,7 +16472,9 @@ def _step_report_schema_any_order(
        ]
 
    else:
-        index_exp = [
+        index_exp = [
+            str(i) for i in range(1, len(colnames_exp_unmatched) + 1)
+        ]  # pragma: no cover
 
    schema_exp_unmatched = pl.DataFrame(
        {