PyPI - pointblank - Versions diffs - 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl - Mend

pointblank: 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

pointblank/__init__.py +2 -0
pointblank/_constants.py +0 -1
pointblank/_interrogation.py +244 -606
pointblank/_utils.py +65 -3
pointblank/assistant.py +9 -0
pointblank/cli.py +39 -24
pointblank/data/api-docs.txt +658 -29
pointblank/schema.py +17 -0
pointblank/segments.py +163 -0
pointblank/validate.py +344 -92
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/METADATA +59 -6
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/RECORD +16 -15
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/WHEEL +0 -0
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/entry_points.txt +0 -0
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/licenses/LICENSE +0 -0
{pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/top_level.txt +0 -0

pointblank/_interrogation.py CHANGED Viewed

@@ -15,7 +15,7 @@ from pointblank._utils import (
     _convert_to_narwhals,
     _get_tbl_type,
 )
-from pointblank.column import Column, ColumnLiteral
+from pointblank.column import Column
 from pointblank.schema import Schema
 from pointblank.thresholds import _threshold_check
@@ -23,6 +23,74 @@ if TYPE_CHECKING:
     from pointblank._typing import AbsoluteTolBounds
+def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
+    """
+    Safely modify datetime comparison values for LazyFrame compatibility.
+    This function handles the case where we can't directly slice LazyFrames
+    to get column dtypes for datetime conversion.
+    """
+    try:
+        # First try to get column dtype from schema for LazyFrames
+        column_dtype = None
+        if hasattr(data_frame, "collect_schema"):
+            schema = data_frame.collect_schema()
+            column_dtype = schema.get(column)
+        elif hasattr(data_frame, "schema"):
+            schema = data_frame.schema
+            column_dtype = schema.get(column)
+        # If we got a dtype from schema, use it
+        if column_dtype is not None:
+            # Create a mock column object for _modify_datetime_compare_val
+            class MockColumn:
+                def __init__(self, dtype):
+                    self.dtype = dtype
+            mock_column = MockColumn(column_dtype)
+            return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+        # Fallback: try collecting a small sample if possible
+        try:
+            sample = data_frame.head(1).collect()
+            if hasattr(sample, "dtypes") and column in sample.columns:
+                # For pandas-like dtypes
+                column_dtype = sample.dtypes[column] if hasattr(sample, "dtypes") else None
+                if column_dtype:
+                    class MockColumn:
+                        def __init__(self, dtype):
+                            self.dtype = dtype
+                    mock_column = MockColumn(column_dtype)
+                    return _modify_datetime_compare_val(
+                        tgt_column=mock_column, compare_val=compare_val
+                    )
+        except Exception:
+            pass
+        # Final fallback: try direct access (for eager DataFrames)
+        try:
+            if hasattr(data_frame, "dtypes") and column in data_frame.columns:
+                column_dtype = data_frame.dtypes[column]
+                class MockColumn:
+                    def __init__(self, dtype):
+                        self.dtype = dtype
+                mock_column = MockColumn(column_dtype)
+                return _modify_datetime_compare_val(tgt_column=mock_column, compare_val=compare_val)
+        except Exception:
+            pass
+    except Exception:
+        pass
+    # If all else fails, return the original compare_val
+    return compare_val
 @dataclass
 class Interrogator:
     """
@@ -89,56 +157,25 @@ class Interrogator:
     na_pass: bool = False
     tbl_type: str = "local"
-    def gt(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, ColumnLiteral):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] > self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-            else:
-                #
-                # Ibis column-to-literal comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] > ibis.literal(self.compare),
-                )
+    def __post_init__(self):
+        """
+        Post-initialization to process Ibis tables through Narwhals.
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
+        This converts Ibis tables to Narwhals-wrapped tables to unify
+        the processing pathway and reduce code branching.
+        """
+        # Import the processing function
+        from pointblank._utils import _process_ibis_through_narwhals
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
+        # Process Ibis tables through Narwhals
+        self.x, self.tbl_type = _process_ibis_through_narwhals(self.x, self.tbl_type)
-        # Local backends (Narwhals) ---------------------------------
+    def gt(self) -> FrameT | Any:
+        # All backends now use Narwhals (including former Ibis tables) ---------
         compare_expr = _get_compare_expr_nw(compare=self.compare)
-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
         return (
             self.x.with_columns(
@@ -165,55 +202,11 @@ class Interrogator:
         )
     def lt(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, Column):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] < self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-            else:
-                #
-                # Ibis column-to-literal comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] < ibis.literal(self.compare),
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         compare_expr = _get_compare_expr_nw(compare=self.compare)
-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
         return (
             self.x.with_columns(
@@ -240,49 +233,7 @@ class Interrogator:
         )
     def eq(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, Column):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] == self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-            else:
-                #
-                # Ibis column-to-literal comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] == ibis.literal(self.compare),
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         if isinstance(self.compare, Column):
             compare_expr = _get_compare_expr_nw(compare=self.compare)
@@ -329,9 +280,7 @@ class Interrogator:
         else:
             compare_expr = _get_compare_expr_nw(compare=self.compare)
-            compare_expr = _modify_datetime_compare_val(
-                tgt_column=self.x[self.column], compare_val=compare_expr
-            )
+            compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
             tbl = self.x.with_columns(
                 pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
@@ -359,47 +308,7 @@ class Interrogator:
             return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
     def ne(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, Column):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] != self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-            #
-            # Ibis column-to-literal comparison
-            #
-            tbl = self.x.mutate(
-                pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                pb_is_good_2=ibis.ifelse(
-                    self.x[self.column].notnull(),
-                    self.x[self.column] != ibis.literal(self.compare),
-                    ibis.literal(False),
-                ),
-            )
-            return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                "pb_is_good_1", "pb_is_good_2"
-            )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         # Determine if the reference and comparison columns have any null values
         ref_col_has_null_vals = _column_has_null_values(table=self.x, column=self.column)
@@ -421,9 +330,7 @@ class Interrogator:
                 ).to_native()
             else:
-                compare_expr = _modify_datetime_compare_val(
-                    tgt_column=self.x[self.column], compare_val=self.compare
-                )
+                compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
                 return self.x.with_columns(
                     pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
@@ -469,6 +376,12 @@ class Interrogator:
                         tbl = tbl.with_columns(
                             pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
                         )
+                else:
+                    # General case (non-Polars): handle na_pass=True properly
+                    if self.na_pass:
+                        tbl = tbl.with_columns(
+                            pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
+                        )
                 return (
                     tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
@@ -500,6 +413,12 @@ class Interrogator:
                         tbl = tbl.with_columns(
                             pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
                         )
+                else:
+                    # General case (non-Polars): handle na_pass=True properly
+                    if self.na_pass:
+                        tbl = tbl.with_columns(
+                            pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
+                        )
                 return (
                     tbl.with_columns(pb_is_good_=nw.col("pb_is_good_1"))
@@ -532,6 +451,16 @@ class Interrogator:
                                 .otherwise(False)
                             )
                         )
+                else:
+                    # General case (non-Polars): handle na_pass=True properly
+                    if self.na_pass:
+                        tbl = tbl.with_columns(
+                            pb_is_good_3=(
+                                nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
+                                .then(True)
+                                .otherwise(nw.col("pb_is_good_3"))
+                            )
+                        )
                 return (
                     tbl.with_columns(pb_is_good_=nw.col("pb_is_good_3"))
@@ -544,9 +473,7 @@ class Interrogator:
             if ref_col_has_null_vals:
                 # Create individual cases for Pandas and Polars
-                compare_expr = _modify_datetime_compare_val(
-                    tgt_column=self.x[self.column], compare_val=self.compare
-                )
+                compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
                 if is_pandas_dataframe(self.x.to_native()):
                     tbl = self.x.with_columns(
@@ -584,54 +511,31 @@ class Interrogator:
                     return tbl
-    def ge(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, Column):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] >= self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
+                else:
+                    # Generic case for other DataFrame types (PySpark, etc.)
+                    # Use similar logic to Polars but handle potential differences
+                    tbl = self.x.with_columns(
+                        pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
+                        pb_is_good_2=nw.lit(self.na_pass),  # Pass if any Null in val or compare
+                    )
-            #
-            # Ibis column-to-literal comparison
-            #
-            tbl = self.x.mutate(
-                pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                pb_is_good_2=self.x[self.column] >= ibis.literal(self.compare),
-            )
+                    tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
-            tbl = tbl.mutate(
-                pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-            )
+                    tbl = tbl.with_columns(
+                        pb_is_good_=(
+                            (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
+                            | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
+                        )
+                    )
-            return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                "pb_is_good_1", "pb_is_good_2"
-            )
+                    return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
-        # Local backends (Narwhals) ---------------------------------
+    def ge(self) -> FrameT | Any:
+        # All backends now use Narwhals (including former Ibis tables) ---------
         compare_expr = _get_compare_expr_nw(compare=self.compare)
-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
         tbl = (
             self.x.with_columns(
@@ -658,53 +562,11 @@ class Interrogator:
         return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
     def le(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.compare, Column):
-                #
-                # Ibis column-to-column comparison
-                #
-                tbl = self.x.mutate(
-                    pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
-                    & ibis.literal(self.na_pass),
-                    pb_is_good_2=self.x[self.column] <= self.x[self.compare.name],
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                    "pb_is_good_1", "pb_is_good_2"
-                )
-            #
-            # Ibis column-to-literal comparison
-            #
-            tbl = self.x.mutate(
-                pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                pb_is_good_2=self.x[self.column] <= ibis.literal(self.compare),
-            )
-            tbl = tbl.mutate(
-                pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-            )
-            return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                "pb_is_good_1", "pb_is_good_2"
-            )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         compare_expr = _get_compare_expr_nw(compare=self.compare)
-        compare_expr = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=compare_expr
-        )
+        compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
         return (
             self.x.with_columns(
@@ -731,113 +593,13 @@ class Interrogator:
         )
     def between(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.low, Column) or isinstance(self.high, Column):
-                #
-                # Ibis column-to-column/column or column-to-column/literal comparison
-                #
-                if isinstance(self.low, Column):
-                    low_val = self.x[self.low.name]
-                else:
-                    low_val = ibis.literal(self.low)
-                if isinstance(self.high, Column):
-                    high_val = self.x[self.high.name]
-                else:
-                    high_val = ibis.literal(self.high)
-                if isinstance(self.low, Column) and isinstance(self.high, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(
-                            self.x[self.column].isnull()
-                            | self.x[self.low.name].isnull()
-                            | self.x[self.high.name].isnull()
-                        )
-                        & ibis.literal(self.na_pass)
-                    )
-                elif isinstance(self.low, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(self.x[self.column].isnull() | self.x[self.low.name].isnull())
-                        & ibis.literal(self.na_pass)
-                    )
-                elif isinstance(self.high, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(
-                            self.x[self.column].isnull() | self.x[self.high.name].isnull()
-                        )
-                        & ibis.literal(self.na_pass)
-                    )
-                if self.inclusive[0]:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] >= low_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] > low_val)
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                if self.inclusive[1]:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] <= high_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] < high_val)
-                tbl = tbl.mutate(
-                    pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
-                )
-                return tbl.mutate(
-                    pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 & tbl.pb_is_good_3)
-                ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
-            else:
-                #
-                # Ibis column-to-literal/literal comparison
-                #
-                low_val = ibis.literal(self.low)
-                high_val = ibis.literal(self.high)
-                tbl = self.x.mutate(
-                    pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass)
-                )
-                if self.inclusive[0]:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] >= low_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] > low_val)
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-                )
-                if self.inclusive[1]:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] <= high_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] < high_val)
-                tbl = tbl.mutate(
-                    pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
-                )
-                return tbl.mutate(
-                    pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 & tbl.pb_is_good_3)
-                ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)
-        low_val = _modify_datetime_compare_val(tgt_column=self.x[self.column], compare_val=low_val)
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -900,136 +662,16 @@ class Interrogator:
         return tbl
     def outside(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            if isinstance(self.low, Column) or isinstance(self.high, Column):
-                #
-                # Ibis column-to-column/column or column-to-column/literal comparison
-                #
-                if isinstance(self.low, Column):
-                    low_val = self.x[self.low.name]
-                else:
-                    low_val = ibis.literal(self.low)
-                if isinstance(self.high, Column):
-                    high_val = self.x[self.high.name]
-                else:
-                    high_val = ibis.literal(self.high)
-                if isinstance(self.low, Column) and isinstance(self.high, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(
-                            self.x[self.column].isnull()
-                            | self.x[self.low.name].isnull()
-                            | self.x[self.high.name].isnull()
-                        )
-                        & ibis.literal(self.na_pass)
-                    )
+        # All backends now use Narwhals (including former Ibis tables) ---------
-                elif isinstance(self.low, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(self.x[self.column].isnull() | self.x[self.low.name].isnull())
-                        & ibis.literal(self.na_pass)
-                    )
-                elif isinstance(self.high, Column):
-                    tbl = self.x.mutate(
-                        pb_is_good_1=(
-                            self.x[self.column].isnull() | self.x[self.high.name].isnull()
-                        )
-                        & ibis.literal(self.na_pass)
-                    )
-                if self.inclusive[0]:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] < low_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_2=tbl[self.column] <= low_val)
-                if self.inclusive[1]:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] > high_val)
-                else:
-                    tbl = tbl.mutate(pb_is_good_3=tbl[self.column] >= high_val)
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(
-                        tbl.pb_is_good_3.isnull(),
-                        False,
-                        tbl.pb_is_good_2,
-                    )
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_3=ibis.ifelse(
-                        tbl.pb_is_good_2.isnull(),
-                        False,
-                        tbl.pb_is_good_3,
-                    )
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_2=ibis.ifelse(
-                        tbl.pb_is_good_2.isnull(),
-                        False,
-                        tbl.pb_is_good_2,
-                    )
-                )
-                tbl = tbl.mutate(
-                    pb_is_good_3=ibis.ifelse(
-                        tbl.pb_is_good_3.isnull(),
-                        False,
-                        tbl.pb_is_good_3,
-                    )
-                )
-                return tbl.mutate(
-                    pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 | tbl.pb_is_good_3)
-                ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
-            #
-            # Ibis column-to-literal/literal comparison
-            #
-            low_val = ibis.literal(self.low)
-            high_val = ibis.literal(self.high)
-            tbl = self.x.mutate(
-                pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass)
-            )
-            if self.inclusive[0]:
-                tbl = tbl.mutate(pb_is_good_2=tbl[self.column] < low_val)
-            else:
-                tbl = tbl.mutate(pb_is_good_2=tbl[self.column] <= low_val)
-            tbl = tbl.mutate(
-                pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
-            )
-            if self.inclusive[1]:
-                tbl = tbl.mutate(pb_is_good_3=tbl[self.column] > high_val)
-            else:
-                tbl = tbl.mutate(pb_is_good_3=tbl[self.column] >= high_val)
-            tbl = tbl.mutate(
-                pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
-            )
-            return tbl.mutate(
-                pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2 | tbl.pb_is_good_3
-            ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
-        # Local backends (Narwhals) ---------------------------------
+        low_val = _get_compare_expr_nw(compare=self.low)
+        high_val = _get_compare_expr_nw(compare=self.high)
         low_val = _get_compare_expr_nw(compare=self.low)
         high_val = _get_compare_expr_nw(compare=self.high)
-        low_val = _modify_datetime_compare_val(tgt_column=self.x[self.column], compare_val=low_val)
-        high_val = _modify_datetime_compare_val(
-            tgt_column=self.x[self.column], compare_val=high_val
-        )
+        low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
+        high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
         tbl = self.x.with_columns(
             pb_is_good_1=nw.col(self.column).is_null(),  # val is Null in Column
@@ -1088,17 +730,10 @@ class Interrogator:
         return tbl
     def isin(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         can_be_null: bool = None in self.set
-        if self.tbl_type in IBIS_BACKENDS:
-            base_expr = self.x[self.column].isin(self.set)
-            if can_be_null:
-                base_expr = base_expr | self.x[self.column].isnull()
-            return self.x.mutate(pb_is_good_=base_expr)
-        # Local backends (Narwhals) ---------------------------------
         base_expr: nw.Expr = nw.col(self.column).is_in(self.set)
         if can_be_null:
             base_expr = base_expr | nw.col(self.column).is_null()
@@ -1106,12 +741,7 @@ class Interrogator:
         return self.x.with_columns(pb_is_good_=base_expr).to_native()
     def notin(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            return self.x.mutate(pb_is_good_=self.x[self.column].notin(self.set))
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         return (
             self.x.with_columns(
@@ -1122,21 +752,7 @@ class Interrogator:
         )
     def regex(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            tbl = self.x.mutate(
-                pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
-                pb_is_good_2=self.x[self.column].re_search(self.pattern),
-            )
-            return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
-                "pb_is_good_1", "pb_is_good_2"
-            )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         return (
             self.x.with_columns(
@@ -1151,55 +767,21 @@ class Interrogator:
         )
     def null(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            return self.x.mutate(
-                pb_is_good_=self.x[self.column].isnull(),
-            )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         return self.x.with_columns(
             pb_is_good_=nw.col(self.column).is_null(),
         ).to_native()
     def not_null(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            return self.x.mutate(
-                pb_is_good_=~self.x[self.column].isnull(),
-            )
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         return self.x.with_columns(
             pb_is_good_=~nw.col(self.column).is_null(),
         ).to_native()
     def rows_distinct(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            import ibis
-            tbl = self.x
-            # Get the column subset to use for the test
-            if self.columns_subset is None:
-                columns_subset = tbl.columns
-            else:
-                columns_subset = self.columns_subset
-            # Create a subset of the table with only the columns of interest and count the
-            # number of times each unique row (or portion thereof) appears
-            tbl = tbl.group_by(columns_subset).mutate(pb_count_=ibis._.count())
-            # Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
-            return tbl.mutate(pb_is_good_=tbl["pb_count_"] == 1).drop("pb_count_")
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         tbl = self.x
@@ -1209,32 +791,20 @@ class Interrogator:
         else:
             columns_subset = self.columns_subset
-        # Create a subset of the table with only the columns of interest
-        subset_tbl = tbl.select(columns_subset)
+        # Create a count of duplicates using group_by approach like Ibis backend
+        # Group by the columns of interest and count occurrences
+        count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
-        # Check for duplicates in the subset table, creating a series of booleans
-        pb_is_good_series = subset_tbl.is_duplicated()
+        # Join back to original table to get count for each row
+        tbl = tbl.join(count_tbl, on=columns_subset, how="left")
-        # Add the series to the input table
-        tbl = tbl.with_columns(pb_is_good_=~pb_is_good_series)
+        # Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
+        tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
         return tbl.to_native()
     def rows_complete(self) -> FrameT | Any:
-        # Ibis backends ---------------------------------------------
-        if self.tbl_type in IBIS_BACKENDS:
-            tbl = self.x
-            # Determine the number of null values in each row (column subsets are handled in
-            # the `_check_nulls_across_columns_ibis()` function)
-            tbl = _check_nulls_across_columns_ibis(table=tbl, columns_subset=self.columns_subset)
-            # Failing rows will have the value `True` in the generated column, so we need to negate
-            # the result to get the passing rows
-            return tbl.mutate(pb_is_good_=~tbl["_any_is_null_"]).drop("_any_is_null_")
-        # Local backends (Narwhals) ---------------------------------
+        # All backends now use Narwhals (including former Ibis tables) ---------
         tbl = self.x
@@ -1299,10 +869,8 @@ class ColValsCompareOne:
             tbl = _column_test_prep(
                 df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
             )
-        # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
-        #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -1457,7 +1025,8 @@ class ColValsCompareTwo:
         # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
         #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -1550,10 +1119,8 @@ class ColValsCompareSet:
             tbl = _column_test_prep(
                 df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
             )
-        # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
-        #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -1627,10 +1194,8 @@ class ColValsRegex:
             tbl = _column_test_prep(
                 df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
             )
-        # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
-        #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -1758,11 +1323,9 @@ class ColExistsHasType:
             #  - check if the `column=` exists
             #  - check if the `column=` type is compatible with the test
             tbl = _convert_to_narwhals(df=self.data_tbl)
-        # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
-        #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
-            tbl = self.data_tbl
+        else:
+            # For remote backends (Ibis), pass the table as is since Narwhals can handle it
+            tbl = _convert_to_narwhals(df=self.data_tbl)
         if self.assertion_method == "exists":
             res = int(self.column in tbl.columns)
@@ -1810,7 +1373,8 @@ class RowsDistinct:
         # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
         #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -1862,7 +1426,8 @@ class RowsComplete:
         # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
         #       for now, just pass the table as is
-        if self.tbl_type in IBIS_BACKENDS:
+        else:
+            # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
             tbl = self.data_tbl
         # Collect results for the test units; the results are a list of booleans where
@@ -2088,6 +1653,8 @@ class ConjointlyValidation:
             return self._get_pandas_results()
         elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
             return self._get_ibis_results()
+        elif "pyspark" in self.tbl_type:
+            return self._get_pyspark_results()
         else:  # pragma: no cover
             raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")
@@ -2247,6 +1814,53 @@ class ConjointlyValidation:
         results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
         return results_tbl
+    def _get_pyspark_results(self):
+        """Process expressions for PySpark DataFrames."""
+        from pyspark.sql import functions as F
+        pyspark_columns = []
+        for expr_fn in self.expressions:
+            try:
+                # First try direct evaluation with PySpark DataFrame
+                expr_result = expr_fn(self.data_tbl)
+                # Check if it's a PySpark Column
+                if hasattr(expr_result, "_jc"):  # PySpark Column has _jc attribute
+                    pyspark_columns.append(expr_result)
+                else:
+                    raise TypeError(
+                        f"Expression returned {type(expr_result)}, expected PySpark Column"
+                    )
+            except Exception as e:
+                try:
+                    # Try as a ColumnExpression (for pb.expr_col style)
+                    col_expr = expr_fn(None)
+                    if hasattr(col_expr, "to_pyspark_expr"):
+                        # Convert to PySpark expression
+                        pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
+                        pyspark_columns.append(pyspark_expr)
+                    else:
+                        raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
+                except Exception as nested_e:
+                    print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
+        # Combine results with AND logic
+        if pyspark_columns:
+            final_result = pyspark_columns[0]
+            for col in pyspark_columns[1:]:
+                final_result = final_result & col
+            # Create results table with boolean column
+            results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
+            return results_tbl
+        # Default case
+        results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
+        return results_tbl
 class SpeciallyValidation:
     def __init__(self, data_tbl, expression, threshold, tbl_type):
@@ -2359,13 +1973,22 @@ class NumberOfTestUnits:
     column: str
     def get_test_units(self, tbl_type: str) -> int:
-        if tbl_type == "pandas" or tbl_type == "polars":
+        if (
+            tbl_type == "pandas"
+            or tbl_type == "polars"
+            or tbl_type == "pyspark"
+            or tbl_type == "local"
+        ):
             # Convert the DataFrame to a format that narwhals can work with and:
             #  - check if the column exists
             dfn = _column_test_prep(
                 df=self.df, column=self.column, allowed_types=None, check_exists=False
             )
+            # Handle LazyFrames which don't have len()
+            if hasattr(dfn, "collect"):
+                dfn = dfn.collect()
             return len(dfn)
         if tbl_type in IBIS_BACKENDS:
@@ -2383,7 +2006,22 @@ def _get_compare_expr_nw(compare: Any) -> Any:
 def _column_has_null_values(table: FrameT, column: str) -> bool:
-    null_count = (table.select(column).null_count())[column][0]
+    try:
+        # Try the standard null_count() method
+        null_count = (table.select(column).null_count())[column][0]
+    except AttributeError:
+        # For LazyFrames, collect first then get null count
+        try:
+            collected = table.select(column).collect()
+            null_count = (collected.null_count())[column][0]
+        except Exception:
+            # Fallback: check if any values are null
+            try:
+                result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
+                null_count = result["null_count"][0]
+            except Exception:
+                # Last resort: return False (assume no nulls)
+                return False
     if null_count is None or null_count == 0:
         return False
@@ -2414,7 +2052,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
     # Build the expression by combining each column's `is_null()` with OR operations
     null_expr = functools.reduce(
-        lambda acc, col: acc | table[col].is_null() if acc is not None else table[col].is_null(),
+        lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
         column_names,
         None,
     )

pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl

pointblank 0.11.6py3-none-any.whl → 0.12.1py3-none-any.whl