pointblank 0.13.1__py3-none-any.whl → 0.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import functools
4
4
  from dataclasses import dataclass
5
- from typing import TYPE_CHECKING, Any
5
+ from typing import Any
6
6
 
7
7
  import narwhals as nw
8
8
  from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
@@ -10,17 +10,11 @@ from narwhals.typing import FrameT
10
10
 
11
11
  from pointblank._constants import IBIS_BACKENDS
12
12
  from pointblank._utils import (
13
- _column_subset_test_prep,
14
13
  _column_test_prep,
15
14
  _convert_to_narwhals,
16
15
  _get_tbl_type,
17
16
  )
18
17
  from pointblank.column import Column
19
- from pointblank.schema import Schema
20
- from pointblank.thresholds import _threshold_check
21
-
22
- if TYPE_CHECKING:
23
- from pointblank._typing import AbsoluteTolBounds
24
18
 
25
19
 
26
20
  def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
@@ -91,885 +85,597 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
91
85
  return compare_val
92
86
 
93
87
 
94
- @dataclass
95
- class Interrogator:
88
+ def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: str = None) -> Any:
96
89
  """
97
- Compare values against a single value, a set of values, or a range of values.
90
+ Create an expression that safely checks for both Null and NaN values.
91
+
92
+ This function handles the case where `is_nan()` is not supported for certain data types (like
93
+ strings) or backends (like `SQLite` via Ibis) by checking the backend type and column type
94
+ first.
98
95
 
99
96
  Parameters
100
97
  ----------
101
- x
102
- The values to compare.
103
- column
104
- The column to check.
105
- columns_subset
106
- The subset of columns to use for the check.
107
- compare
108
- The value to compare against. Used in the following interrogations:
109
- - 'gt' for greater than
110
- - 'lt' for less than
111
- - 'eq' for equal to
112
- - 'ne' for not equal to
113
- - 'ge' for greater than or equal to
114
- - 'le' for less than or equal to
115
- set
116
- The set of values to compare against. Used in the following interrogations:
117
- - 'isin' for values in the set
118
- - 'notin' for values not in the set
119
- pattern
120
- The regular expression pattern to compare against. Used in the following:
121
- - 'regex' for values that match the pattern
122
- low
123
- The lower bound of the range of values to compare against. Used in the following:
124
- - 'between' for values between the range
125
- - 'outside' for values outside the range
126
- high
127
- The upper bound of the range of values to compare against. Used in the following:
128
- - 'between' for values between the range
129
- - 'outside' for values outside the range
130
- inclusive
131
- A tuple of booleans that state which bounds are inclusive. The position of the boolean
132
- corresponds to the value in the following order: (low, high). Used in the following:
133
- - 'between' for values between the range
134
- - 'outside' for values outside the range
135
- na_pass
136
- `True` to pass test units with missing values, `False` otherwise.
137
- tbl_type
138
- The type of table to use for the assertion. This is used to determine the backend for the
139
- assertion. The default is 'local' but it can also be any of the table types in the
140
- `IBIS_BACKENDS` constant.
98
+ data_frame
99
+ The data frame to get schema information from.
100
+ column_expr
101
+ The narwhals column expression to check.
102
+ column_name
103
+ The name of the column.
141
104
 
142
105
  Returns
143
106
  -------
144
- list[bool]
145
- A list of booleans where `True` indicates a passing test unit.
107
+ Any
108
+ A narwhals expression that returns `True` for Null or NaN values.
146
109
  """
110
+ # Always check for null values
111
+ null_check = column_expr.is_null()
147
112
 
148
- x: nw.DataFrame | Any
149
- column: str = None
150
- columns_subset: list[str] = None
151
- compare: float | int | list[float | int] = None
152
- set: list[float | int] = None
153
- pattern: str = None
154
- low: float | int | list[float | int] = None
155
- high: float | int | list[float | int] = None
156
- inclusive: tuple[bool, bool] = None
157
- na_pass: bool = False
158
- tbl_type: str = "local"
159
-
160
- def __post_init__(self):
161
- """
162
- Post-initialization to process Ibis tables through Narwhals.
163
-
164
- This converts Ibis tables to Narwhals-wrapped tables to unify
165
- the processing pathway and reduce code branching.
166
- """
167
- # Import the processing function
168
- from pointblank._utils import _process_ibis_through_narwhals
169
-
170
- # Process Ibis tables through Narwhals
171
- self.x, self.tbl_type = _process_ibis_through_narwhals(self.x, self.tbl_type)
172
-
173
- def gt(self) -> FrameT | Any:
174
- # All backends now use Narwhals (including former Ibis tables) ---------
175
-
176
- compare_expr = _get_compare_expr_nw(compare=self.compare)
177
-
178
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
179
-
180
- return (
181
- self.x.with_columns(
182
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
183
- pb_is_good_2=(
184
- nw.col(self.compare.name).is_null() & self.na_pass
185
- if isinstance(self.compare, Column)
186
- else nw.lit(False)
187
- ),
188
- pb_is_good_3=nw.col(self.column) > compare_expr,
189
- )
190
- .with_columns(
191
- pb_is_good_3=(
192
- nw.when(nw.col("pb_is_good_3").is_null())
193
- .then(nw.lit(False))
194
- .otherwise(nw.col("pb_is_good_3"))
195
- )
196
- )
197
- .with_columns(
198
- pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
199
- )
200
- .drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
201
- .to_native()
202
- )
203
-
204
- def lt(self) -> FrameT | Any:
205
- # All backends now use Narwhals (including former Ibis tables) ---------
206
-
207
- compare_expr = _get_compare_expr_nw(compare=self.compare)
208
-
209
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
210
-
211
- return (
212
- self.x.with_columns(
213
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
214
- pb_is_good_2=(
215
- nw.col(self.compare.name).is_null() & self.na_pass
216
- if isinstance(self.compare, Column)
217
- else nw.lit(False)
218
- ),
219
- pb_is_good_3=nw.col(self.column) < compare_expr,
220
- )
221
- .with_columns(
222
- pb_is_good_3=(
223
- nw.when(nw.col("pb_is_good_3").is_null())
224
- .then(nw.lit(False))
225
- .otherwise(nw.col("pb_is_good_3"))
226
- )
227
- )
228
- .with_columns(
229
- pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
230
- )
231
- .drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
232
- .to_native()
233
- )
234
-
235
- def eq(self) -> FrameT | Any:
236
- # All backends now use Narwhals (including former Ibis tables) ---------
237
-
238
- if isinstance(self.compare, Column):
239
- compare_expr = _get_compare_expr_nw(compare=self.compare)
240
-
241
- tbl = self.x.with_columns(
242
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
243
- pb_is_good_2=(
244
- nw.col(self.compare.name).is_null() & self.na_pass
245
- if isinstance(self.compare, Column)
246
- else nw.lit(False)
247
- ),
248
- )
249
-
250
- tbl = tbl.with_columns(
251
- pb_is_good_3=(~nw.col(self.compare.name).is_null() & ~nw.col(self.column).is_null())
252
- )
253
-
254
- if is_pandas_dataframe(tbl.to_native()):
255
- tbl = tbl.with_columns(
256
- pb_is_good_4=nw.col(self.column) - compare_expr,
257
- )
258
-
259
- tbl = tbl.with_columns(
260
- pb_is_good_=nw.col("pb_is_good_1")
261
- | nw.col("pb_is_good_2")
262
- | (nw.col("pb_is_good_4") == 0 & ~nw.col("pb_is_good_3").is_null())
263
- )
264
-
265
- else:
266
- tbl = tbl.with_columns(
267
- pb_is_good_4=nw.col(self.column) == compare_expr,
268
- )
269
-
270
- tbl = tbl.with_columns(
271
- pb_is_good_=nw.col("pb_is_good_1")
272
- | nw.col("pb_is_good_2")
273
- | (nw.col("pb_is_good_4") & ~nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2"))
274
- )
113
+ # For Ibis backends, many don't support `is_nan()` so we stick to Null checks only;
114
+ # use `narwhals.get_native_namespace()` for reliable backend detection
115
+ try:
116
+ native_namespace = nw.get_native_namespace(data_frame)
275
117
 
276
- return tbl.drop(
277
- "pb_is_good_1", "pb_is_good_2", "pb_is_good_3", "pb_is_good_4"
278
- ).to_native()
118
+ # If it's an Ibis backend, only check for null values
119
+ # The namespace is the actual module, so we check its name
120
+ if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
121
+ return null_check
122
+ except Exception:
123
+ pass
279
124
 
125
+ # For non-Ibis backends, try to use `is_nan()` if the column type supports it
126
+ try:
127
+ if hasattr(data_frame, "collect_schema"):
128
+ schema = data_frame.collect_schema()
129
+ elif hasattr(data_frame, "schema"):
130
+ schema = data_frame.schema
280
131
  else:
281
- compare_expr = _get_compare_expr_nw(compare=self.compare)
282
-
283
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
284
-
285
- tbl = self.x.with_columns(
286
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
287
- pb_is_good_2=(
288
- nw.col(self.compare.name).is_null() & self.na_pass
289
- if isinstance(self.compare, Column)
290
- else nw.lit(False)
291
- ),
292
- )
132
+ schema = None
293
133
 
294
- tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) == compare_expr)
134
+ if schema and column_name:
135
+ column_dtype = schema.get(column_name)
136
+ if column_dtype:
137
+ dtype_str = str(column_dtype).lower()
295
138
 
296
- tbl = tbl.with_columns(
297
- pb_is_good_3=(
298
- nw.when(nw.col("pb_is_good_3").is_null())
299
- .then(nw.lit(False))
300
- .otherwise(nw.col("pb_is_good_3"))
139
+ # Check if it's a numeric type that supports NaN
140
+ is_numeric = any(
141
+ num_type in dtype_str for num_type in ["float", "double", "f32", "f64"]
301
142
  )
302
- )
303
143
 
304
- tbl = tbl.with_columns(
305
- pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
306
- )
144
+ if is_numeric:
145
+ try:
146
+ # For numeric types, try to check both Null and NaN
147
+ return null_check | column_expr.is_nan()
148
+ except Exception:
149
+ # If `is_nan()` fails for any reason, fall back to Null only
150
+ pass
151
+ except Exception:
152
+ pass
307
153
 
308
- return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
154
+ # Fallback: just check Null values
155
+ return null_check
309
156
 
310
- def ne(self) -> FrameT | Any:
311
- # All backends now use Narwhals (including former Ibis tables) ---------
312
157
 
313
- # Determine if the reference and comparison columns have any null values
314
- ref_col_has_null_vals = _column_has_null_values(table=self.x, column=self.column)
158
+ class ConjointlyValidation:
159
+ def __init__(self, data_tbl, expressions, threshold, tbl_type):
160
+ self.data_tbl = data_tbl
161
+ self.expressions = expressions
162
+ self.threshold = threshold
315
163
 
316
- if isinstance(self.compare, Column):
317
- compare_name = self.compare.name if isinstance(self.compare, Column) else self.compare
318
- cmp_col_has_null_vals = _column_has_null_values(table=self.x, column=compare_name)
164
+ # Detect the table type
165
+ if tbl_type in (None, "local"):
166
+ # Detect the table type using _get_tbl_type()
167
+ self.tbl_type = _get_tbl_type(data=data_tbl)
319
168
  else:
320
- cmp_col_has_null_vals = False
169
+ self.tbl_type = tbl_type
321
170
 
322
- # If neither column has null values, we can proceed with the comparison
323
- # without too many complications
324
- if not ref_col_has_null_vals and not cmp_col_has_null_vals:
325
- if isinstance(self.compare, Column):
326
- compare_expr = _get_compare_expr_nw(compare=self.compare)
171
+ def get_test_results(self):
172
+ """Evaluate all expressions and combine them conjointly."""
327
173
 
328
- return self.x.with_columns(
329
- pb_is_good_=nw.col(self.column) != compare_expr,
330
- ).to_native()
174
+ if "polars" in self.tbl_type:
175
+ return self._get_polars_results()
176
+ elif "pandas" in self.tbl_type:
177
+ return self._get_pandas_results()
178
+ elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
179
+ return self._get_ibis_results()
180
+ elif "pyspark" in self.tbl_type:
181
+ return self._get_pyspark_results()
182
+ else: # pragma: no cover
183
+ raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")
331
184
 
332
- else:
333
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
185
+ def _get_polars_results(self):
186
+ """Process expressions for Polars DataFrames."""
187
+ import polars as pl
334
188
 
335
- return self.x.with_columns(
336
- pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
337
- ).to_native()
189
+ polars_results = [] # Changed from polars_expressions to polars_results
338
190
 
339
- # If either column has null values, we need to handle the comparison
340
- # much more carefully since we can't inadverdently compare null values
341
- # to non-null values
191
+ for expr_fn in self.expressions:
192
+ try:
193
+ # First try direct evaluation with native expressions
194
+ expr_result = expr_fn(self.data_tbl)
195
+ if isinstance(expr_result, pl.Expr):
196
+ # This is a Polars expression, we'll evaluate it later
197
+ polars_results.append(("expr", expr_result))
198
+ elif isinstance(expr_result, pl.Series):
199
+ # This is a boolean Series from lambda function
200
+ polars_results.append(("series", expr_result))
201
+ else:
202
+ raise TypeError("Not a valid Polars expression or series")
203
+ except Exception as e:
204
+ try:
205
+ # Try to get a ColumnExpression
206
+ col_expr = expr_fn(None)
207
+ if hasattr(col_expr, "to_polars_expr"):
208
+ polars_expr = col_expr.to_polars_expr()
209
+ polars_results.append(("expr", polars_expr))
210
+ else: # pragma: no cover
211
+ raise TypeError(f"Cannot convert {type(col_expr)} to Polars expression")
212
+ except Exception as e: # pragma: no cover
213
+ print(f"Error evaluating expression: {e}")
342
214
 
343
- if isinstance(self.compare, Column):
344
- compare_expr = _get_compare_expr_nw(compare=self.compare)
215
+ # Combine results with AND logic
216
+ if polars_results:
217
+ # Convert everything to Series for consistent handling
218
+ series_results = []
219
+ for result_type, result_value in polars_results:
220
+ if result_type == "series":
221
+ series_results.append(result_value)
222
+ elif result_type == "expr":
223
+ # Evaluate the expression on the DataFrame to get a Series
224
+ evaluated_series = self.data_tbl.select(result_value).to_series()
225
+ series_results.append(evaluated_series)
226
+
227
+ # Combine all boolean Series with AND logic
228
+ final_result = series_results[0]
229
+ for series in series_results[1:]:
230
+ final_result = final_result & series
345
231
 
346
- # CASE 1: the reference column has null values but the comparison column does not
347
- if ref_col_has_null_vals and not cmp_col_has_null_vals:
348
- if is_pandas_dataframe(self.x.to_native()):
349
- tbl = self.x.with_columns(
350
- pb_is_good_1=nw.col(self.column).is_null(),
351
- pb_is_good_2=nw.lit(self.column) != nw.col(self.compare.name),
352
- )
232
+ # Create results table with boolean column
233
+ results_tbl = self.data_tbl.with_columns(pb_is_good_=final_result)
234
+ return results_tbl
353
235
 
354
- else:
355
- tbl = self.x.with_columns(
356
- pb_is_good_1=nw.col(self.column).is_null(),
357
- pb_is_good_2=nw.col(self.column) != nw.col(self.compare.name),
358
- )
236
+ # Default case
237
+ results_tbl = self.data_tbl.with_columns(pb_is_good_=pl.lit(True)) # pragma: no cover
238
+ return results_tbl # pragma: no cover
359
239
 
360
- if not self.na_pass:
361
- tbl = tbl.with_columns(
362
- pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
363
- )
240
+ def _get_pandas_results(self):
241
+ """Process expressions for pandas DataFrames."""
242
+ import pandas as pd
364
243
 
365
- if is_polars_dataframe(self.x.to_native()):
366
- # There may be Null values in the pb_is_good_2 column, change those to
367
- # True if na_pass is True, False otherwise
244
+ pandas_series = []
368
245
 
369
- tbl = tbl.with_columns(
370
- pb_is_good_2=nw.when(nw.col("pb_is_good_2").is_null())
371
- .then(False)
372
- .otherwise(nw.col("pb_is_good_2")),
373
- )
246
+ for expr_fn in self.expressions:
247
+ try:
248
+ # First try direct evaluation with pandas DataFrame
249
+ expr_result = expr_fn(self.data_tbl)
374
250
 
375
- if self.na_pass:
376
- tbl = tbl.with_columns(
377
- pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
378
- )
379
- else:
380
- # General case (non-Polars): handle na_pass=True properly
381
- if self.na_pass:
382
- tbl = tbl.with_columns(
383
- pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
251
+ # Check that it's a pandas Series with bool dtype
252
+ if isinstance(expr_result, pd.Series):
253
+ if expr_result.dtype == bool or pd.api.types.is_bool_dtype(expr_result):
254
+ pandas_series.append(expr_result)
255
+ else: # pragma: no cover
256
+ raise TypeError(
257
+ f"Expression returned Series of type {expr_result.dtype}, expected bool"
384
258
  )
259
+ else: # pragma: no cover
260
+ raise TypeError(f"Expression returned {type(expr_result)}, expected pd.Series")
385
261
 
386
- return (
387
- tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
388
- .drop("pb_is_good_1", "pb_is_good_2")
389
- .to_native()
390
- )
391
-
392
- # CASE 2: the comparison column has null values but the reference column does not
393
- elif not ref_col_has_null_vals and cmp_col_has_null_vals:
394
- if is_pandas_dataframe(self.x.to_native()):
395
- tbl = self.x.with_columns(
396
- pb_is_good_1=nw.col(self.column) != nw.lit(self.compare.name),
397
- pb_is_good_2=nw.col(self.compare.name).is_null(),
398
- )
262
+ except Exception as e:
263
+ try:
264
+ # Try as a ColumnExpression (for pb.expr_col style)
265
+ col_expr = expr_fn(None)
399
266
 
400
- else:
401
- tbl = self.x.with_columns(
402
- pb_is_good_1=nw.col(self.column) != nw.col(self.compare.name),
403
- pb_is_good_2=nw.col(self.compare.name).is_null(),
404
- )
267
+ if hasattr(col_expr, "to_pandas_expr"):
268
+ # Watch for NotImplementedError here and re-raise it
269
+ try:
270
+ pandas_expr = col_expr.to_pandas_expr(self.data_tbl)
271
+ pandas_series.append(pandas_expr)
272
+ except NotImplementedError as nie: # pragma: no cover
273
+ # Re-raise NotImplementedError with the original message
274
+ raise NotImplementedError(str(nie))
275
+ else: # pragma: no cover
276
+ raise TypeError(f"Cannot convert {type(col_expr)} to pandas Series")
277
+ except NotImplementedError as nie: # pragma: no cover
278
+ # Re-raise NotImplementedError
279
+ raise NotImplementedError(str(nie))
280
+ except Exception as nested_e: # pragma: no cover
281
+ print(f"Error evaluating pandas expression: {e} -> {nested_e}")
405
282
 
406
- if not self.na_pass:
407
- tbl = tbl.with_columns(
408
- pb_is_good_1=nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2")
409
- )
283
+ # Combine results with AND logic
284
+ if pandas_series:
285
+ final_result = pandas_series[0]
286
+ for series in pandas_series[1:]:
287
+ final_result = final_result & series
410
288
 
411
- if is_polars_dataframe(self.x.to_native()):
412
- if self.na_pass:
413
- tbl = tbl.with_columns(
414
- pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
415
- )
416
- else:
417
- # General case (non-Polars): handle na_pass=True properly
418
- if self.na_pass:
419
- tbl = tbl.with_columns(
420
- pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
421
- )
289
+ # Create results table with boolean column
290
+ results_tbl = self.data_tbl.copy()
291
+ results_tbl["pb_is_good_"] = final_result
292
+ return results_tbl
422
293
 
423
- return (
424
- tbl.with_columns(pb_is_good_=nw.col("pb_is_good_1"))
425
- .drop("pb_is_good_1", "pb_is_good_2")
426
- .to_native()
427
- )
294
+ # Default case
295
+ results_tbl = self.data_tbl.copy() # pragma: no cover
296
+ results_tbl["pb_is_good_"] = pd.Series( # pragma: no cover
297
+ [True] * len(self.data_tbl), index=self.data_tbl.index
298
+ )
299
+ return results_tbl # pragma: no cover
428
300
 
429
- # CASE 3: both columns have null values and there may potentially be cases where
430
- # there could even be null/null comparisons
431
- elif ref_col_has_null_vals and cmp_col_has_null_vals:
432
- tbl = self.x.with_columns(
433
- pb_is_good_1=nw.col(self.column).is_null(),
434
- pb_is_good_2=nw.col(self.compare.name).is_null(),
435
- pb_is_good_3=nw.col(self.column) != nw.col(self.compare.name),
436
- )
301
+ def _get_ibis_results(self):
302
+ """Process expressions for Ibis tables (including DuckDB)."""
303
+ import ibis
437
304
 
438
- if not self.na_pass:
439
- tbl = tbl.with_columns(
440
- pb_is_good_3=nw.col("pb_is_good_3")
441
- & ~nw.col("pb_is_good_1")
442
- & ~nw.col("pb_is_good_2")
443
- )
305
+ ibis_expressions = []
444
306
 
445
- if is_polars_dataframe(self.x.to_native()):
446
- if self.na_pass:
447
- tbl = tbl.with_columns(
448
- pb_is_good_3=(
449
- nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
450
- .then(True)
451
- .otherwise(False)
452
- )
453
- )
454
- else:
455
- # General case (non-Polars): handle na_pass=True properly
456
- if self.na_pass:
457
- tbl = tbl.with_columns(
458
- pb_is_good_3=(
459
- nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
460
- .then(True)
461
- .otherwise(nw.col("pb_is_good_3"))
462
- )
463
- )
307
+ for expr_fn in self.expressions:
308
+ # Strategy 1: Try direct evaluation with native Ibis expressions
309
+ try:
310
+ expr_result = expr_fn(self.data_tbl)
464
311
 
465
- return (
466
- tbl.with_columns(pb_is_good_=nw.col("pb_is_good_3"))
467
- .drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
468
- .to_native()
469
- )
312
+ # Check if it's a valid Ibis expression
313
+ if hasattr(expr_result, "_ibis_expr"): # pragma: no cover
314
+ ibis_expressions.append(expr_result)
315
+ continue # Skip to next expression if this worked
316
+ except Exception: # pragma: no cover
317
+ pass # Silently continue to Strategy 2
470
318
 
471
- else:
472
- # Case where the reference column contains null values
473
- if ref_col_has_null_vals:
474
- # Create individual cases for Pandas and Polars
319
+ # Strategy 2: Try with ColumnExpression
320
+ try: # pragma: no cover
321
+ # Skip this strategy if we don't have an expr_col implementation
322
+ if not hasattr(self, "to_ibis_expr"):
323
+ continue
475
324
 
476
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
325
+ col_expr = expr_fn(None)
477
326
 
478
- if is_pandas_dataframe(self.x.to_native()):
479
- tbl = self.x.with_columns(
480
- pb_is_good_1=nw.col(self.column).is_null(),
481
- pb_is_good_2=nw.lit(self.column) != nw.lit(compare_expr),
482
- )
327
+ # Skip if we got None
328
+ if col_expr is None:
329
+ continue
483
330
 
484
- if not self.na_pass:
485
- tbl = tbl.with_columns(
486
- pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
487
- )
331
+ # Convert ColumnExpression to Ibis expression
332
+ if hasattr(col_expr, "to_ibis_expr"):
333
+ ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
334
+ ibis_expressions.append(ibis_expr)
335
+ except Exception: # pragma: no cover
336
+ # Silent failure - we already tried both strategies
337
+ pass
488
338
 
489
- return (
490
- tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
491
- .drop("pb_is_good_1", "pb_is_good_2")
492
- .to_native()
493
- )
339
+ # Combine expressions
340
+ if ibis_expressions: # pragma: no cover
341
+ try:
342
+ final_result = ibis_expressions[0]
343
+ for expr in ibis_expressions[1:]:
344
+ final_result = final_result & expr
494
345
 
495
- elif is_polars_dataframe(self.x.to_native()):
496
- tbl = self.x.with_columns(
497
- pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
498
- pb_is_good_2=nw.lit(self.na_pass), # Pass if any Null in val or compare
499
- )
346
+ # Create results table with boolean column
347
+ results_tbl = self.data_tbl.mutate(pb_is_good_=final_result)
348
+ return results_tbl
349
+ except Exception as e:
350
+ print(f"Error combining Ibis expressions: {e}")
500
351
 
501
- tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
352
+ # Default case
353
+ results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
354
+ return results_tbl
502
355
 
503
- tbl = tbl.with_columns(
504
- pb_is_good_=(
505
- (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
506
- | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
507
- )
508
- )
356
+ def _get_pyspark_results(self):
357
+ """Process expressions for PySpark DataFrames."""
358
+ from pyspark.sql import functions as F
509
359
 
510
- tbl = tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
360
+ pyspark_columns = []
511
361
 
512
- return tbl
362
+ for expr_fn in self.expressions:
363
+ try:
364
+ # First try direct evaluation with PySpark DataFrame
365
+ expr_result = expr_fn(self.data_tbl)
513
366
 
367
+ # Check if it's a PySpark Column
368
+ if hasattr(expr_result, "_jc"): # PySpark Column has _jc attribute
369
+ pyspark_columns.append(expr_result)
514
370
  else:
515
- # Generic case for other DataFrame types (PySpark, etc.)
516
- # Use similar logic to Polars but handle potential differences
517
- tbl = self.x.with_columns(
518
- pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
519
- pb_is_good_2=nw.lit(self.na_pass), # Pass if any Null in val or compare
371
+ raise TypeError(
372
+ f"Expression returned {type(expr_result)}, expected PySpark Column"
520
373
  )
521
374
 
522
- tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
523
-
524
- tbl = tbl.with_columns(
525
- pb_is_good_=(
526
- (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
527
- | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
528
- )
529
- )
375
+ except Exception as e:
376
+ try:
377
+ # Try as a ColumnExpression (for pb.expr_col style)
378
+ col_expr = expr_fn(None)
379
+
380
+ if hasattr(col_expr, "to_pyspark_expr"):
381
+ # Convert to PySpark expression
382
+ pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
383
+ pyspark_columns.append(pyspark_expr)
384
+ else:
385
+ raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
386
+ except Exception as nested_e:
387
+ print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
530
388
 
531
- return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
389
+ # Combine results with AND logic
390
+ if pyspark_columns:
391
+ final_result = pyspark_columns[0]
392
+ for col in pyspark_columns[1:]:
393
+ final_result = final_result & col
532
394
 
533
- def ge(self) -> FrameT | Any:
534
- # All backends now use Narwhals (including former Ibis tables) ---------
395
+ # Create results table with boolean column
396
+ results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
397
+ return results_tbl
535
398
 
536
- compare_expr = _get_compare_expr_nw(compare=self.compare)
399
+ # Default case
400
+ results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
401
+ return results_tbl
537
402
 
538
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
539
403
 
540
- tbl = (
541
- self.x.with_columns(
542
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
543
- pb_is_good_2=(
544
- nw.col(self.compare.name).is_null() & self.na_pass
545
- if isinstance(self.compare, Column)
546
- else nw.lit(False)
547
- ),
548
- pb_is_good_3=nw.col(self.column) >= compare_expr,
549
- )
550
- .with_columns(
551
- pb_is_good_3=(
552
- nw.when(nw.col("pb_is_good_3").is_null())
553
- .then(nw.lit(False))
554
- .otherwise(nw.col("pb_is_good_3"))
555
- )
556
- )
557
- .with_columns(
558
- pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
559
- )
560
- )
404
+ class SpeciallyValidation:
405
+ def __init__(self, data_tbl, expression, threshold, tbl_type):
406
+ self.data_tbl = data_tbl
407
+ self.expression = expression
408
+ self.threshold = threshold
561
409
 
562
- return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
410
+ # Detect the table type
411
+ if tbl_type in (None, "local"):
412
+ # Detect the table type using _get_tbl_type()
413
+ self.tbl_type = _get_tbl_type(data=data_tbl)
414
+ else:
415
+ self.tbl_type = tbl_type
563
416
 
564
- def le(self) -> FrameT | Any:
565
- # All backends now use Narwhals (including former Ibis tables) ---------
417
+ def get_test_results(self) -> any | list[bool]:
418
+ """Evaluate the expression get either a list of booleans or a results table."""
566
419
 
567
- compare_expr = _get_compare_expr_nw(compare=self.compare)
420
+ # Get the expression and inspect whether there is a `data` argument
421
+ expression = self.expression
568
422
 
569
- compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
423
+ import inspect
570
424
 
571
- return (
572
- self.x.with_columns(
573
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
574
- pb_is_good_2=(
575
- nw.col(self.compare.name).is_null() & self.na_pass
576
- if isinstance(self.compare, Column)
577
- else nw.lit(False)
578
- ),
579
- pb_is_good_3=nw.col(self.column) <= compare_expr,
580
- )
581
- .with_columns(
582
- pb_is_good_3=(
583
- nw.when(nw.col("pb_is_good_3").is_null())
584
- .then(nw.lit(False))
585
- .otherwise(nw.col("pb_is_good_3"))
586
- )
587
- )
588
- .with_columns(
589
- pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
425
+ # During execution of `specially` validation
426
+ sig = inspect.signature(expression)
427
+ params = list(sig.parameters.keys())
428
+
429
+ # Execute the function based on its signature
430
+ if len(params) == 0:
431
+ # No parameters: call without arguments
432
+ result = expression()
433
+ elif len(params) == 1:
434
+ # One parameter: pass the data table
435
+ data_tbl = self.data_tbl
436
+ result = expression(data_tbl)
437
+ else:
438
+ # More than one parameter - this doesn't match either allowed signature
439
+ raise ValueError(
440
+ f"The function provided to 'specially()' should have either no parameters or a "
441
+ f"single 'data' parameter, but it has {len(params)} parameters: {params}"
590
442
  )
591
- .drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
592
- .to_native()
593
- )
594
443
 
595
- def between(self) -> FrameT | Any:
596
- # All backends now use Narwhals (including former Ibis tables) ---------
444
+ # Determine if the object is a DataFrame by inspecting the string version of its type
445
+ if (
446
+ "pandas" in str(type(result))
447
+ or "polars" in str(type(result))
448
+ or "ibis" in str(type(result))
449
+ ):
450
+ # Get the type of the table
451
+ tbl_type = _get_tbl_type(data=result)
597
452
 
598
- low_val = _get_compare_expr_nw(compare=self.low)
599
- high_val = _get_compare_expr_nw(compare=self.high)
453
+ if "pandas" in tbl_type:
454
+ # If it's a Pandas DataFrame, check if the last column is a boolean column
455
+ last_col = result.iloc[:, -1]
600
456
 
601
- low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
602
- high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
457
+ import pandas as pd
603
458
 
604
- tbl = self.x.with_columns(
605
- pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
606
- pb_is_good_2=( # lb is Null in Column
607
- nw.col(self.low.name).is_null() if isinstance(self.low, Column) else nw.lit(False)
608
- ),
609
- pb_is_good_3=( # ub is Null in Column
610
- nw.col(self.high.name).is_null() if isinstance(self.high, Column) else nw.lit(False)
611
- ),
612
- pb_is_good_4=nw.lit(self.na_pass), # Pass if any Null in lb, val, or ub
613
- )
459
+ if last_col.dtype == bool or pd.api.types.is_bool_dtype(last_col):
460
+ # If the last column is a boolean column, rename it as `pb_is_good_`
461
+ result.rename(columns={result.columns[-1]: "pb_is_good_"}, inplace=True)
462
+ elif "polars" in tbl_type:
463
+ # If it's a Polars DataFrame, check if the last column is a boolean column
464
+ last_col_name = result.columns[-1]
465
+ last_col_dtype = result.schema[last_col_name]
614
466
 
615
- if self.inclusive[0]:
616
- tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) >= low_val)
617
- else:
618
- tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) > low_val)
467
+ import polars as pl
619
468
 
620
- if self.inclusive[1]:
621
- tbl = tbl.with_columns(pb_is_good_6=nw.col(self.column) <= high_val)
622
- else:
623
- tbl = tbl.with_columns(pb_is_good_6=nw.col(self.column) < high_val)
469
+ if last_col_dtype == pl.Boolean:
470
+ # If the last column is a boolean column, rename it as `pb_is_good_`
471
+ result = result.rename({last_col_name: "pb_is_good_"})
472
+ elif tbl_type in IBIS_BACKENDS:
473
+ # If it's an Ibis table, check if the last column is a boolean column
474
+ last_col_name = result.columns[-1]
475
+ result_schema = result.schema()
476
+ is_last_col_bool = str(result_schema[last_col_name]) == "boolean"
624
477
 
625
- tbl = tbl.with_columns(
626
- pb_is_good_5=(
627
- nw.when(nw.col("pb_is_good_5").is_null())
628
- .then(nw.lit(False))
629
- .otherwise(nw.col("pb_is_good_5"))
630
- )
631
- )
478
+ if is_last_col_bool:
479
+ # If the last column is a boolean column, rename it as `pb_is_good_`
480
+ result = result.rename(pb_is_good_=last_col_name)
632
481
 
633
- tbl = tbl.with_columns(
634
- pb_is_good_6=(
635
- nw.when(nw.col("pb_is_good_6").is_null())
636
- .then(nw.lit(False))
637
- .otherwise(nw.col("pb_is_good_6"))
638
- )
639
- )
482
+ else: # pragma: no cover
483
+ raise NotImplementedError(f"Support for {tbl_type} is not yet implemented")
640
484
 
641
- tbl = (
642
- tbl.with_columns(
643
- pb_is_good_=(
644
- (
645
- (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
646
- & nw.col("pb_is_good_4")
647
- )
648
- | (nw.col("pb_is_good_5") & nw.col("pb_is_good_6"))
649
- )
650
- )
651
- .drop(
652
- "pb_is_good_1",
653
- "pb_is_good_2",
654
- "pb_is_good_3",
655
- "pb_is_good_4",
656
- "pb_is_good_5",
657
- "pb_is_good_6",
658
- )
659
- .to_native()
660
- )
485
+ elif isinstance(result, bool):
486
+ # If it's a single boolean, return that as a list
487
+ return [result]
488
+
489
+ elif isinstance(result, list):
490
+ # If it's a list, check that it is a boolean list
491
+ if all(isinstance(x, bool) for x in result):
492
+ # If it's a list of booleans, return it as is
493
+ return result
494
+ else:
495
+ # If it's not a list of booleans, raise an error
496
+ raise TypeError("The result is not a list of booleans.")
497
+ else: # pragma: no cover
498
+ # If it's not a DataFrame or a list, raise an error
499
+ raise TypeError("The result is not a DataFrame or a list of booleans.")
661
500
 
662
- return tbl
501
+ # Return the results table or list of booleans
502
+ return result
663
503
 
664
- def outside(self) -> FrameT | Any:
665
- # All backends now use Narwhals (including former Ibis tables) ---------
666
504
 
667
- low_val = _get_compare_expr_nw(compare=self.low)
668
- high_val = _get_compare_expr_nw(compare=self.high)
505
+ @dataclass
506
+ class NumberOfTestUnits:
507
+ """
508
+ Count the number of test units in a column.
509
+ """
669
510
 
670
- low_val = _get_compare_expr_nw(compare=self.low)
671
- high_val = _get_compare_expr_nw(compare=self.high)
511
+ df: FrameT
512
+ column: str
672
513
 
673
- low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
674
- high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
514
+ def get_test_units(self, tbl_type: str) -> int:
515
+ if (
516
+ tbl_type == "pandas"
517
+ or tbl_type == "polars"
518
+ or tbl_type == "pyspark"
519
+ or tbl_type == "local"
520
+ ):
521
+ # Convert the DataFrame to a format that narwhals can work with and:
522
+ # - check if the column exists
523
+ dfn = _column_test_prep(
524
+ df=self.df, column=self.column, allowed_types=None, check_exists=False
525
+ )
675
526
 
676
- tbl = self.x.with_columns(
677
- pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
678
- pb_is_good_2=( # lb is Null in Column
679
- nw.col(self.low.name).is_null() if isinstance(self.low, Column) else nw.lit(False)
680
- ),
681
- pb_is_good_3=( # ub is Null in Column
682
- nw.col(self.high.name).is_null() if isinstance(self.high, Column) else nw.lit(False)
683
- ),
684
- pb_is_good_4=nw.lit(self.na_pass), # Pass if any Null in lb, val, or ub
685
- )
527
+ # Handle LazyFrames which don't have len()
528
+ if hasattr(dfn, "collect"):
529
+ dfn = dfn.collect()
686
530
 
687
- if self.inclusive[0]:
688
- tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) < low_val)
689
- else:
690
- tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) <= low_val)
531
+ return len(dfn)
691
532
 
692
- if self.inclusive[1]:
693
- tbl = tbl.with_columns(pb_is_good_6=nw.col(self.column) > high_val)
694
- else:
695
- tbl = tbl.with_columns(pb_is_good_6=nw.col(self.column) >= high_val)
696
-
697
- tbl = tbl.with_columns(
698
- pb_is_good_5=nw.when(nw.col("pb_is_good_5").is_null())
699
- .then(False)
700
- .otherwise(nw.col("pb_is_good_5")),
701
- pb_is_good_6=nw.when(nw.col("pb_is_good_6").is_null())
702
- .then(False)
703
- .otherwise(nw.col("pb_is_good_6")),
704
- )
533
+ if tbl_type in IBIS_BACKENDS:
534
+ # Get the count of test units and convert to a native format
535
+ # TODO: check whether pandas or polars is available
536
+ return self.df.count().to_polars()
705
537
 
706
- tbl = (
707
- tbl.with_columns(
708
- pb_is_good_=(
709
- (
710
- (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
711
- & nw.col("pb_is_good_4")
712
- )
713
- | (
714
- (nw.col("pb_is_good_5") & ~nw.col("pb_is_good_3"))
715
- | (nw.col("pb_is_good_6")) & ~nw.col("pb_is_good_2")
716
- )
717
- )
718
- )
719
- .drop(
720
- "pb_is_good_1",
721
- "pb_is_good_2",
722
- "pb_is_good_3",
723
- "pb_is_good_4",
724
- "pb_is_good_5",
725
- "pb_is_good_6",
726
- )
727
- .to_native()
728
- )
729
538
 
730
- return tbl
539
+ def _get_compare_expr_nw(compare: Any) -> Any:
540
+ if isinstance(compare, Column):
541
+ if not isinstance(compare.exprs, str):
542
+ raise ValueError("The column expression must be a string.") # pragma: no cover
543
+ return nw.col(compare.exprs)
544
+ return compare
545
+
546
+
547
+ def _column_has_null_values(table: FrameT, column: str) -> bool:
548
+ try:
549
+ # Try the standard null_count() method
550
+ null_count = (table.select(column).null_count())[column][0]
551
+ except AttributeError:
552
+ # For LazyFrames, collect first then get null count
553
+ try:
554
+ collected = table.select(column).collect()
555
+ null_count = (collected.null_count())[column][0]
556
+ except Exception:
557
+ # Fallback: check if any values are null
558
+ try:
559
+ result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
560
+ null_count = result["null_count"][0]
561
+ except Exception:
562
+ # Last resort: return False (assume no nulls)
563
+ return False
564
+
565
+ if null_count is None or null_count == 0:
566
+ return False
731
567
 
732
- def isin(self) -> FrameT | Any:
733
- # All backends now use Narwhals (including former Ibis tables) ---------
568
+ return True
734
569
 
735
- can_be_null: bool = None in self.set
736
570
 
737
- base_expr: nw.Expr = nw.col(self.column).is_in(self.set)
738
- if can_be_null:
739
- base_expr = base_expr | nw.col(self.column).is_null()
571
+ def _check_nulls_across_columns_nw(table, columns_subset):
572
+ # Get all column names from the table
573
+ column_names = columns_subset if columns_subset else table.columns
740
574
 
741
- return self.x.with_columns(pb_is_good_=base_expr).to_native()
575
+ # Build the expression by combining each column's `is_null()` with OR operations
576
+ null_expr = functools.reduce(
577
+ lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
578
+ column_names,
579
+ None,
580
+ )
742
581
 
743
- def notin(self) -> FrameT | Any:
744
- # All backends now use Narwhals (including former Ibis tables) ---------
582
+ # Add the expression as a new column to the table
583
+ result = table.with_columns(_any_is_null_=null_expr)
745
584
 
746
- return (
747
- self.x.with_columns(
748
- pb_is_good_=nw.col(self.column).is_in(self.set),
749
- )
750
- .with_columns(pb_is_good_=~nw.col("pb_is_good_"))
751
- .to_native()
752
- )
585
+ return result
753
586
 
754
- def regex(self) -> FrameT | Any:
755
- # All backends now use Narwhals (including former Ibis tables) ---------
756
587
 
757
- return (
758
- self.x.with_columns(
759
- pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
760
- pb_is_good_2=nw.when(~nw.col(self.column).is_null())
761
- .then(nw.col(self.column).str.contains(pattern=self.pattern))
762
- .otherwise(False),
763
- )
764
- .with_columns(pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
765
- .drop("pb_is_good_1", "pb_is_good_2")
766
- .to_native()
767
- )
588
+ def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
589
+ tgt_col_dtype_str = str(tgt_column.dtype).lower()
768
590
 
769
- def null(self) -> FrameT | Any:
770
- # All backends now use Narwhals (including former Ibis tables) ---------
591
+ if compare_val is isinstance(compare_val, Column): # pragma: no cover
592
+ return compare_val
771
593
 
772
- return self.x.with_columns(
773
- pb_is_good_=nw.col(self.column).is_null(),
774
- ).to_native()
594
+ # Get the type of `compare_expr` and convert, if necessary, to the type of the column
595
+ compare_type_str = str(type(compare_val)).lower()
775
596
 
776
- def not_null(self) -> FrameT | Any:
777
- # All backends now use Narwhals (including former Ibis tables) ---------
597
+ if "datetime.datetime" in compare_type_str:
598
+ compare_type = "datetime"
599
+ elif "datetime.date" in compare_type_str:
600
+ compare_type = "date"
601
+ else:
602
+ compare_type = "other"
778
603
 
779
- return self.x.with_columns(
780
- pb_is_good_=~nw.col(self.column).is_null(),
781
- ).to_native()
604
+ if "datetime" in tgt_col_dtype_str:
605
+ tgt_col_dtype = "datetime"
606
+ elif "date" in tgt_col_dtype_str or "object" in tgt_col_dtype_str:
607
+ # Object type is used for date columns in Pandas
608
+ tgt_col_dtype = "date"
609
+ else:
610
+ tgt_col_dtype = "other"
782
611
 
783
- def rows_distinct(self) -> FrameT | Any:
784
- # All backends now use Narwhals (including former Ibis tables) ---------
612
+ # Handle each combination of `compare_type` and `tgt_col_dtype`, coercing only the
613
+ # `compare_expr` to the type of the column
614
+ if compare_type == "datetime" and tgt_col_dtype == "date":
615
+ # Assume that `compare_expr` is a datetime.datetime object and strip the time part
616
+ # to get a date object
617
+ compare_expr = compare_val.date()
785
618
 
786
- tbl = self.x
619
+ elif compare_type == "date" and tgt_col_dtype == "datetime":
620
+ import datetime
787
621
 
788
- # Get the column subset to use for the test
789
- if self.columns_subset is None:
790
- columns_subset = tbl.columns
791
- else:
792
- columns_subset = self.columns_subset
622
+ # Assume that `compare_expr` is a `datetime.date` object so add in the time part
623
+ # to get a `datetime.datetime` object
624
+ compare_expr = datetime.datetime.combine(compare_val, datetime.datetime.min.time())
793
625
 
794
- # Create a count of duplicates using group_by approach like Ibis backend
795
- # Group by the columns of interest and count occurrences
796
- count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
626
+ else:
627
+ return compare_val
797
628
 
798
- # Join back to original table to get count for each row
799
- tbl = tbl.join(count_tbl, on=columns_subset, how="left")
629
+ return compare_expr
800
630
 
801
- # Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
802
- tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
803
631
 
804
- return tbl.to_native()
632
+ def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
633
+ """Check if values in a column evaluate to True for a given predicate expression."""
634
+ if tbl_type == "local":
635
+ # Check the type of expression provided
636
+ if "narwhals" in str(type(expr)) and "expr" in str(type(expr)):
637
+ expression_type = "narwhals"
638
+ elif "polars" in str(type(expr)) and "expr" in str(type(expr)):
639
+ expression_type = "polars"
640
+ else:
641
+ expression_type = "pandas"
805
642
 
806
- def rows_complete(self) -> FrameT | Any:
807
- # All backends now use Narwhals (including former Ibis tables) ---------
643
+ # Determine whether this is a Pandas or Polars table
644
+ tbl_type_detected = _get_tbl_type(data=data_tbl)
645
+ df_lib_name = "polars" if "polars" in tbl_type_detected else "pandas"
808
646
 
809
- tbl = self.x
647
+ if expression_type == "narwhals":
648
+ tbl_nw = _convert_to_narwhals(df=data_tbl)
649
+ tbl_nw = tbl_nw.with_columns(pb_is_good_=expr)
650
+ return tbl_nw.to_native()
810
651
 
811
- # Determine the number of null values in each row (column subsets are handled in
812
- # the `_check_nulls_across_columns_nw()` function)
813
- tbl = _check_nulls_across_columns_nw(table=tbl, columns_subset=self.columns_subset)
652
+ if df_lib_name == "polars" and expression_type == "polars":
653
+ return data_tbl.with_columns(pb_is_good_=expr)
814
654
 
815
- # Failing rows will have the value `True` in the generated column, so we need to negate
816
- # the result to get the passing rows
817
- tbl = tbl.with_columns(pb_is_good_=~nw.col("_any_is_null_"))
818
- tbl = tbl.drop("_any_is_null_")
655
+ if df_lib_name == "pandas" and expression_type == "pandas":
656
+ return data_tbl.assign(pb_is_good_=expr)
819
657
 
820
- # Convert the table to a native format
821
- return tbl.to_native()
658
+ # For remote backends, return original table (placeholder)
659
+ return data_tbl
822
660
 
823
661
 
824
- @dataclass
825
- class ColValsCompareOne:
662
+ def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
826
663
  """
827
- Compare values in a table column against a single value.
664
+ Check if rows in a DataFrame are complete (no null values).
828
665
 
829
- Parameters
830
- ----------
831
- data_tbl
832
- A data table.
833
- column
834
- The column to check.
835
- value
836
- A value to check against.
837
- na_pass
838
- `True` to pass test units with missing values, `False` otherwise.
839
- threshold
840
- The maximum number of failing test units to allow.
841
- assertion_method
842
- The type of assertion ('gt' for greater than, 'lt' for less than).
843
- allowed_types
844
- The allowed data types for the column.
845
- tbl_type
846
- The type of table to use for the assertion.
847
-
848
- Returns
849
- -------
850
- bool
851
- `True` when test units pass below the threshold level for failing test units, `False`
852
- otherwise.
666
+ This function replaces the RowsComplete dataclass for direct usage.
853
667
  """
668
+ tbl = _convert_to_narwhals(df=data_tbl)
854
669
 
855
- data_tbl: FrameT
856
- column: str
857
- value: float | int
858
- na_pass: bool
859
- threshold: int
860
- assertion_method: str
861
- allowed_types: list[str]
862
- tbl_type: str = "local"
863
-
864
- def __post_init__(self):
865
- if self.tbl_type == "local":
866
- # Convert the DataFrame to a format that narwhals can work with, and:
867
- # - check if the `column=` exists
868
- # - check if the `column=` type is compatible with the test
869
- tbl = _column_test_prep(
870
- df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
871
- )
872
- else:
873
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
874
- tbl = self.data_tbl
875
-
876
- # Collect results for the test units; the results are a list of booleans where
877
- # `True` indicates a passing test unit
878
- if self.assertion_method == "gt":
879
- self.test_unit_res = Interrogator(
880
- x=tbl,
881
- column=self.column,
882
- compare=self.value,
883
- na_pass=self.na_pass,
884
- tbl_type=self.tbl_type,
885
- ).gt()
886
- elif self.assertion_method == "lt":
887
- self.test_unit_res = Interrogator(
888
- x=tbl,
889
- column=self.column,
890
- compare=self.value,
891
- na_pass=self.na_pass,
892
- tbl_type=self.tbl_type,
893
- ).lt()
894
- elif self.assertion_method == "eq":
895
- self.test_unit_res = Interrogator(
896
- x=tbl,
897
- column=self.column,
898
- compare=self.value,
899
- na_pass=self.na_pass,
900
- tbl_type=self.tbl_type,
901
- ).eq()
902
- elif self.assertion_method == "ne":
903
- self.test_unit_res = Interrogator(
904
- x=tbl,
905
- column=self.column,
906
- compare=self.value,
907
- na_pass=self.na_pass,
908
- tbl_type=self.tbl_type,
909
- ).ne()
910
- elif self.assertion_method == "ge":
911
- self.test_unit_res = Interrogator(
912
- x=tbl,
913
- column=self.column,
914
- compare=self.value,
915
- na_pass=self.na_pass,
916
- tbl_type=self.tbl_type,
917
- ).ge()
918
- elif self.assertion_method == "le":
919
- self.test_unit_res = Interrogator(
920
- x=tbl,
921
- column=self.column,
922
- compare=self.value,
923
- na_pass=self.na_pass,
924
- tbl_type=self.tbl_type,
925
- ).le()
926
- elif self.assertion_method == "null":
927
- self.test_unit_res = Interrogator(
928
- x=tbl,
929
- column=self.column,
930
- compare=self.value,
931
- tbl_type=self.tbl_type,
932
- ).null()
933
- elif self.assertion_method == "not_null":
934
- self.test_unit_res = Interrogator(
935
- x=tbl,
936
- column=self.column,
937
- compare=self.value,
938
- tbl_type=self.tbl_type,
939
- ).not_null()
940
- else:
941
- raise ValueError(
942
- """Invalid comparison type. Use:
943
- - `gt` for greater than,
944
- - `lt` for less than,
945
- - `eq` for equal to,
946
- - `ne` for not equal to,
947
- - `ge` for greater than or equal to,
948
- - `le` for less than or equal to,
949
- - `null` for null values, or
950
- - `not_null` for not null values.
951
- """
952
- )
953
-
954
- def get_test_results(self):
955
- return self.test_unit_res
956
-
957
- def test(self):
958
- # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
959
- # column and then determine if the test passes overall by comparing the number of failing
960
- # test units to the threshold for failing test units
961
-
962
- results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
963
-
964
- return _threshold_check(
965
- failing_test_units=results_list.count(False), threshold=self.threshold
966
- )
670
+ return interrogate_rows_complete(
671
+ tbl=tbl,
672
+ columns_subset=columns_subset,
673
+ )
967
674
 
968
675
 
969
- @dataclass
970
- class ColValsCompareTwo:
676
+ def col_exists(data_tbl: FrameT, column: str) -> bool:
971
677
  """
972
- General routine to compare values in a column against two values.
678
+ Check if a column exists in a DataFrame.
973
679
 
974
680
  Parameters
975
681
  ----------
@@ -977,1125 +683,1167 @@ class ColValsCompareTwo:
977
683
  A data table.
978
684
  column
979
685
  The column to check.
980
- value1
981
- A value to check against.
982
- value2
983
- A value to check against.
984
- inclusive
985
- A tuple of booleans that state which bounds are inclusive. The position of the boolean
986
- corresponds to the value in the following order: (value1, value2).
987
- na_pass
988
- `True` to pass test units with missing values, `False` otherwise.
989
- threshold
990
- The maximum number of failing test units to allow.
991
- assertion_method
992
- The type of assertion ('between' for between two values and 'outside' for outside two
993
- values).
994
- allowed_types
995
- The allowed data types for the column.
996
- tbl_type
997
- The type of table to use for the assertion.
998
686
 
999
687
  Returns
1000
688
  -------
1001
689
  bool
1002
- `True` when test units pass below the threshold level for failing test units, `False`
1003
- otherwise.
690
+ `True` if the column exists, `False` otherwise.
1004
691
  """
692
+ tbl = _convert_to_narwhals(df=data_tbl)
693
+ return column in tbl.columns
694
+
695
+
696
+ def col_schema_match(
697
+ data_tbl: FrameT,
698
+ schema,
699
+ complete: bool,
700
+ in_order: bool,
701
+ case_sensitive_colnames: bool,
702
+ case_sensitive_dtypes: bool,
703
+ full_match_dtypes: bool,
704
+ threshold: int,
705
+ ) -> bool:
706
+ """
707
+ Check if DataFrame schema matches expected schema.
708
+ """
709
+ from pointblank.schema import _check_schema_match
710
+
711
+ return _check_schema_match(
712
+ data_tbl=data_tbl,
713
+ schema=schema,
714
+ complete=complete,
715
+ in_order=in_order,
716
+ case_sensitive_colnames=case_sensitive_colnames,
717
+ case_sensitive_dtypes=case_sensitive_dtypes,
718
+ full_match_dtypes=full_match_dtypes,
719
+ )
1005
720
 
1006
- data_tbl: FrameT
1007
- column: str
1008
- value1: float | int
1009
- value2: float | int
1010
- inclusive: tuple[bool, bool]
1011
- na_pass: bool
1012
- threshold: int
1013
- assertion_method: str
1014
- allowed_types: list[str]
1015
- tbl_type: str = "local"
1016
-
1017
- def __post_init__(self):
1018
- if self.tbl_type == "local":
1019
- # Convert the DataFrame to a format that narwhals can work with, and:
1020
- # - check if the `column=` exists
1021
- # - check if the `column=` type is compatible with the test
1022
- tbl = _column_test_prep(
1023
- df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
1024
- )
1025
-
1026
- # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
1027
- # for now, just pass the table as is
1028
- else:
1029
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
1030
- tbl = self.data_tbl
1031
-
1032
- # Collect results for the test units; the results are a list of booleans where
1033
- # `True` indicates a passing test unit
1034
- if self.assertion_method == "between":
1035
- self.test_unit_res = Interrogator(
1036
- x=tbl,
1037
- column=self.column,
1038
- low=self.value1,
1039
- high=self.value2,
1040
- inclusive=self.inclusive,
1041
- na_pass=self.na_pass,
1042
- tbl_type=self.tbl_type,
1043
- ).between()
1044
- elif self.assertion_method == "outside":
1045
- self.test_unit_res = Interrogator(
1046
- x=tbl,
1047
- column=self.column,
1048
- low=self.value1,
1049
- high=self.value2,
1050
- inclusive=self.inclusive,
1051
- na_pass=self.na_pass,
1052
- tbl_type=self.tbl_type,
1053
- ).outside()
1054
- else:
1055
- raise ValueError(
1056
- """Invalid assertion type. Use:
1057
- - `between` for values between two values, or
1058
- - `outside` for values outside two values."""
1059
- )
1060
-
1061
- def get_test_results(self):
1062
- return self.test_unit_res
1063
721
 
1064
- def test(self):
1065
- # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
1066
- # column and then determine if the test passes overall by comparing the number of failing
1067
- # test units to the threshold for failing test units
722
+ def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> bool:
723
+ """
724
+ Check if DataFrame row count matches expected count.
725
+ """
726
+ from pointblank.validate import get_row_count
1068
727
 
1069
- results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
728
+ row_count: int = get_row_count(data=data_tbl)
729
+ lower_abs_limit, upper_abs_limit = abs_tol_bounds
730
+ min_val: int = count - lower_abs_limit
731
+ max_val: int = count + upper_abs_limit
1070
732
 
1071
- return _threshold_check(
1072
- failing_test_units=results_list.count(False), threshold=self.threshold
1073
- )
733
+ if inverse:
734
+ return not (row_count >= min_val and row_count <= max_val)
735
+ else:
736
+ return row_count >= min_val and row_count <= max_val
1074
737
 
1075
738
 
1076
- @dataclass
1077
- class ColValsCompareSet:
739
+ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
740
+ """
741
+ Check if DataFrame column count matches expected count.
1078
742
  """
1079
- General routine to compare values in a column against a set of values.
743
+ from pointblank.validate import get_column_count
1080
744
 
1081
- Parameters
1082
- ----------
1083
- data_tbl
1084
- A data table.
1085
- column
1086
- The column to check.
1087
- values
1088
- A set of values to check against.
1089
- threshold
1090
- The maximum number of failing test units to allow.
1091
- inside
1092
- `True` to check if the values are inside the set, `False` to check if the values are
1093
- outside the set.
1094
- allowed_types
1095
- The allowed data types for the column.
1096
- tbl_type
1097
- The type of table to use for the assertion.
745
+ if not inverse:
746
+ return get_column_count(data=data_tbl) == count
747
+ else:
748
+ return get_column_count(data=data_tbl) != count
1098
749
 
1099
- Returns
1100
- -------
1101
- bool
1102
- `True` when test units pass below the threshold level for failing test units, `False`
1103
- otherwise.
750
+
751
+ def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
752
+ """
753
+ Perform conjoint validation using multiple expressions.
1104
754
  """
755
+ # Create a ConjointlyValidation instance and get the results
756
+ conjointly_instance = ConjointlyValidation(
757
+ data_tbl=data_tbl,
758
+ expressions=expressions,
759
+ threshold=threshold,
760
+ tbl_type=tbl_type,
761
+ )
1105
762
 
1106
- data_tbl: FrameT
1107
- column: str
1108
- values: list[float | int]
1109
- threshold: int
1110
- inside: bool
1111
- allowed_types: list[str]
1112
- tbl_type: str = "local"
1113
-
1114
- def __post_init__(self):
1115
- if self.tbl_type == "local":
1116
- # Convert the DataFrame to a format that narwhals can work with, and:
1117
- # - check if the `column=` exists
1118
- # - check if the `column=` type is compatible with the test
1119
- tbl = _column_test_prep(
1120
- df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
1121
- )
1122
- else:
1123
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
1124
- tbl = self.data_tbl
1125
-
1126
- # Collect results for the test units; the results are a list of booleans where
1127
- # `True` indicates a passing test unit
1128
- if self.inside:
1129
- self.test_unit_res = Interrogator(
1130
- x=tbl, column=self.column, set=self.values, tbl_type=self.tbl_type
1131
- ).isin()
1132
- else:
1133
- self.test_unit_res = Interrogator(
1134
- x=tbl, column=self.column, set=self.values, tbl_type=self.tbl_type
1135
- ).notin()
763
+ return conjointly_instance.get_test_results()
1136
764
 
1137
- def get_test_results(self):
1138
- return self.test_unit_res
1139
765
 
1140
- def test(self):
1141
- # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
1142
- # column and then determine if the test passes overall by comparing the number of failing
1143
- # test units to the threshold for failing test units
766
+ def interrogate_gt(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
767
+ """Greater than interrogation."""
768
+ return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")
1144
769
 
1145
- results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
1146
770
 
1147
- return _threshold_check(
1148
- failing_test_units=results_list.count(False), threshold=self.threshold
1149
- )
771
+ def interrogate_lt(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
772
+ """Less than interrogation."""
773
+ return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")
1150
774
 
1151
775
 
1152
- @dataclass
1153
- class ColValsRegex:
1154
- """
1155
- Check if values in a column match a regular expression pattern.
776
+ def interrogate_ge(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
777
+ """Greater than or equal interrogation."""
778
+ return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")
1156
779
 
1157
- Parameters
1158
- ----------
1159
- data_tbl
1160
- A data table.
1161
- column
1162
- The column to check.
1163
- pattern
1164
- The regular expression pattern to check against.
1165
- na_pass
1166
- `True` to pass test units with missing values, `False` otherwise.
1167
- threshold
1168
- The maximum number of failing test units to allow.
1169
- allowed_types
1170
- The allowed data types for the column.
1171
- tbl_type
1172
- The type of table to use for the assertion.
1173
780
 
1174
- Returns
1175
- -------
1176
- bool
1177
- `True` when test units pass below the threshold level for failing test units, `False`
1178
- otherwise.
1179
- """
781
+ def interrogate_le(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
782
+ """Less than or equal interrogation."""
783
+ return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")
1180
784
 
1181
- data_tbl: FrameT
1182
- column: str
1183
- pattern: str
1184
- na_pass: bool
1185
- threshold: int
1186
- allowed_types: list[str]
1187
- tbl_type: str = "local"
1188
-
1189
- def __post_init__(self):
1190
- if self.tbl_type == "local":
1191
- # Convert the DataFrame to a format that narwhals can work with, and:
1192
- # - check if the `column=` exists
1193
- # - check if the `column=` type is compatible with the test
1194
- tbl = _column_test_prep(
1195
- df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
1196
- )
1197
- else:
1198
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
1199
- tbl = self.data_tbl
1200
-
1201
- # Collect results for the test units; the results are a list of booleans where
1202
- # `True` indicates a passing test unit
1203
- self.test_unit_res = Interrogator(
1204
- x=tbl,
1205
- column=self.column,
1206
- pattern=self.pattern,
1207
- na_pass=self.na_pass,
1208
- tbl_type=self.tbl_type,
1209
- ).regex()
1210
785
 
1211
- def get_test_results(self):
1212
- return self.test_unit_res
786
+ def interrogate_eq(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
787
+ """Equal interrogation."""
1213
788
 
1214
- def test(self):
1215
- # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
1216
- # column and then determine if the test passes overall by comparing the number of failing
1217
- # test units to the threshold for failing test units
789
+ nw_tbl = nw.from_native(tbl)
1218
790
 
1219
- results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
791
+ if isinstance(compare, Column):
792
+ compare_expr = _get_compare_expr_nw(compare=compare)
793
+
794
+ result_tbl = nw_tbl.with_columns(
795
+ pb_is_good_1=nw.col(column).is_null() & na_pass,
796
+ pb_is_good_2=(
797
+ nw.col(compare.name).is_null() & na_pass
798
+ if isinstance(compare, Column)
799
+ else nw.lit(False)
800
+ ),
801
+ )
1220
802
 
1221
- return _threshold_check(
1222
- failing_test_units=results_list.count(False), threshold=self.threshold
803
+ result_tbl = result_tbl.with_columns(
804
+ pb_is_good_3=(~nw.col(compare.name).is_null() & ~nw.col(column).is_null())
1223
805
  )
1224
806
 
807
+ if is_pandas_dataframe(result_tbl.to_native()):
808
+ # For Pandas, handle potential NA comparison issues
809
+ try:
810
+ result_tbl = result_tbl.with_columns(
811
+ pb_is_good_4=nw.col(column) == compare_expr,
812
+ )
813
+ except (TypeError, ValueError) as e:
814
+ # Handle Pandas NA comparison issues
815
+ if "boolean value of NA is ambiguous" in str(e):
816
+ # Work around Pandas NA comparison issue by using Null checks first
817
+ result_tbl = result_tbl.with_columns(
818
+ pb_is_good_4_tmp=(
819
+ # Both Null: True (they're equal)
820
+ (nw.col(column).is_null() & nw.col(compare.name).is_null())
821
+ |
822
+ # Both not Null and values are equal: use string conversion
823
+ # as a fallback
824
+ (
825
+ (~nw.col(column).is_null() & ~nw.col(compare.name).is_null())
826
+ & (
827
+ nw.col(column).cast(nw.String)
828
+ == nw.col(compare.name).cast(nw.String)
829
+ )
830
+ )
831
+ )
832
+ )
833
+ result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
834
+ elif "cannot compare" in str(e).lower():
835
+ # Handle genuine type incompatibility
836
+ native_df = result_tbl.to_native()
837
+ col_dtype = str(native_df[column].dtype)
838
+ compare_dtype = str(native_df[compare.name].dtype)
1225
839
 
1226
- @dataclass
1227
- class ColValsExpr:
1228
- """
1229
- Check if values in a column evaluate to True for a given predicate expression.
840
+ raise TypeError(
841
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
842
+ f"'{compare.name}' (dtype: {compare_dtype}). "
843
+ f"Column types are incompatible for equality comparison. "
844
+ f"Ensure both columns have compatible data types (both numeric, "
845
+ f"both string, or both datetime) before comparing."
846
+ ) from e
847
+ else:
848
+ raise # Re-raise unexpected errors
1230
849
 
1231
- Parameters
1232
- ----------
1233
- data_tbl
1234
- A data table.
1235
- expr
1236
- The expression to check against.
1237
- threshold
1238
- The maximum number of failing test units to allow.
1239
- tbl_type
1240
- The type of table to use for the assertion.
850
+ result_tbl = result_tbl.with_columns(
851
+ pb_is_good_=nw.col("pb_is_good_1")
852
+ | nw.col("pb_is_good_2")
853
+ | (nw.col("pb_is_good_4") & ~nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2"))
854
+ )
1241
855
 
1242
- Returns
1243
- -------
1244
- bool
1245
- `True` when test units pass below the threshold level for failing test units, `False`
1246
- otherwise.
1247
- """
856
+ else:
857
+ # For non-Pandas backends (Polars, Ibis, etc.), handle type incompatibility
858
+ try:
859
+ result_tbl = result_tbl.with_columns(
860
+ pb_is_good_4=nw.col(column) == compare_expr,
861
+ )
862
+ except (TypeError, ValueError, Exception) as e:
863
+ # Handle type compatibility issues for all backends
864
+ error_msg = str(e).lower()
865
+ if (
866
+ "cannot compare" in error_msg
867
+ or "type" in error_msg
868
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
869
+ or "dtype" in error_msg
870
+ or "conversion" in error_msg
871
+ and "failed" in error_msg
872
+ ):
873
+ # Get column types for a descriptive error message
874
+ try:
875
+ native_df = result_tbl.to_native()
876
+ if hasattr(native_df, "dtypes"):
877
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
878
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
879
+ elif hasattr(native_df, "schema"):
880
+ col_dtype = str(native_df.schema.get(column, "unknown"))
881
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
882
+ else:
883
+ col_dtype = "unknown"
884
+ compare_dtype = "unknown"
885
+ except Exception:
886
+ col_dtype = "unknown"
887
+ compare_dtype = "unknown"
1248
888
 
1249
- data_tbl: FrameT
1250
- expr: str
1251
- threshold: int
1252
- tbl_type: str = "local"
1253
-
1254
- def __post_init__(self):
1255
- if self.tbl_type == "local":
1256
- # Check the type of expression provided
1257
- if "narwhals" in str(type(self.expr)) and "expr" in str(type(self.expr)):
1258
- expression_type = "narwhals"
1259
- elif "polars" in str(type(self.expr)) and "expr" in str(type(self.expr)):
1260
- expression_type = "polars"
1261
- else:
1262
- expression_type = "pandas"
889
+ raise TypeError(
890
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
891
+ f"'{compare.name}' (dtype: {compare_dtype}). "
892
+ f"Column types are incompatible for equality comparison. "
893
+ f"Ensure both columns have compatible data types (both numeric, "
894
+ f"both string, or both datetime) before comparing."
895
+ ) from e
896
+ else:
897
+ raise # Re-raise unexpected errors
898
+
899
+ result_tbl = result_tbl.with_columns(
900
+ pb_is_good_=nw.col("pb_is_good_1")
901
+ | nw.col("pb_is_good_2")
902
+ | (nw.col("pb_is_good_4") & ~nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2"))
903
+ )
904
+
905
+ return result_tbl.drop(
906
+ "pb_is_good_1", "pb_is_good_2", "pb_is_good_3", "pb_is_good_4"
907
+ ).to_native()
1263
908
 
1264
- # Determine whether this is a Pandas or Polars table
1265
- tbl_type = _get_tbl_type(data=self.data_tbl)
909
+ else:
910
+ compare_expr = _get_compare_expr_nw(compare=compare)
911
+ compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)
912
+
913
+ result_tbl = nw_tbl.with_columns(
914
+ pb_is_good_1=nw.col(column).is_null() & na_pass,
915
+ pb_is_good_2=(
916
+ nw.col(compare.name).is_null() & na_pass
917
+ if isinstance(compare, Column)
918
+ else nw.lit(False)
919
+ ),
920
+ )
1266
921
 
1267
- df_lib_name = "polars" if "polars" in tbl_type else "pandas"
922
+ # Handle type incompatibility for literal value comparisons
923
+ try:
924
+ result_tbl = result_tbl.with_columns(pb_is_good_3=nw.col(column) == compare_expr)
925
+ except (TypeError, ValueError, Exception) as e:
926
+ # Handle type compatibility issues for column vs literal comparisons
927
+ error_msg = str(e).lower()
928
+ if (
929
+ "cannot compare" in error_msg
930
+ or "type" in error_msg
931
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
932
+ or "dtype" in error_msg
933
+ or "conversion" in error_msg
934
+ and "failed" in error_msg
935
+ ):
936
+ # Get column type for a descriptive error message
937
+ try:
938
+ native_df = result_tbl.to_native()
939
+ if hasattr(native_df, "dtypes"):
940
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
941
+ elif hasattr(native_df, "schema"):
942
+ col_dtype = str(native_df.schema.get(column, "unknown"))
943
+ else:
944
+ col_dtype = "unknown"
945
+ except Exception:
946
+ col_dtype = "unknown"
947
+
948
+ compare_type = type(compare).__name__
949
+ compare_value = str(compare)
950
+
951
+ raise TypeError(
952
+ f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
953
+ f"literal value '{compare_value}' (type: {compare_type}). "
954
+ f"Column type and literal value type are incompatible for equality comparison. "
955
+ f"Ensure the column data type is compatible with the comparison value "
956
+ f"(e.g., numeric column with numeric value, string column with string value)."
957
+ ) from e
958
+ else:
959
+ raise # Re-raise unexpected errors
1268
960
 
1269
- if expression_type == "narwhals":
1270
- tbl_nw = _convert_to_narwhals(df=self.data_tbl)
1271
- tbl_nw = tbl_nw.with_columns(pb_is_good_=self.expr)
1272
- tbl = tbl_nw.to_native()
1273
- self.test_unit_res = tbl
961
+ result_tbl = result_tbl.with_columns(
962
+ pb_is_good_3=(
963
+ nw.when(nw.col("pb_is_good_3").is_null())
964
+ .then(nw.lit(False))
965
+ .otherwise(nw.col("pb_is_good_3"))
966
+ )
967
+ )
1274
968
 
1275
- return self
969
+ result_tbl = result_tbl.with_columns(
970
+ pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
971
+ )
1276
972
 
1277
- if df_lib_name == "polars" and expression_type == "polars":
1278
- self.test_unit_res = self.data_tbl.with_columns(pb_is_good_=self.expr)
973
+ return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
1279
974
 
1280
- if df_lib_name == "pandas" and expression_type == "pandas":
1281
- self.test_unit_res = self.data_tbl.assign(pb_is_good_=self.expr)
1282
975
 
1283
- return self
976
+ def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
977
+ """Not equal interrogation."""
1284
978
 
1285
- def get_test_results(self):
1286
- return self.test_unit_res
979
+ nw_tbl = nw.from_native(tbl)
1287
980
 
981
+ # Determine if the reference and comparison columns have any null values
982
+ ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
1288
983
 
1289
- @dataclass
1290
- class ColExistsHasType:
1291
- """
1292
- Check if a column exists in a DataFrame or has a certain data type.
984
+ if isinstance(compare, Column):
985
+ compare_name = compare.name if isinstance(compare, Column) else compare
986
+ cmp_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=compare_name)
987
+ else:
988
+ cmp_col_has_null_vals = False
1293
989
 
1294
- Parameters
1295
- ----------
1296
- data_tbl
1297
- A data table.
1298
- column
1299
- The column to check.
1300
- threshold
1301
- The maximum number of failing test units to allow.
1302
- assertion_method
1303
- The type of assertion ('exists' for column existence).
1304
- tbl_type
1305
- The type of table to use for the assertion.
990
+ # If neither column has null values, we can proceed with the comparison
991
+ # without too many complications
992
+ if not ref_col_has_null_vals and not cmp_col_has_null_vals:
993
+ if isinstance(compare, Column):
994
+ compare_expr = _get_compare_expr_nw(compare=compare)
1306
995
 
1307
- Returns
1308
- -------
1309
- bool
1310
- `True` when test units pass below the threshold level for failing test units, `False`
1311
- otherwise.
1312
- """
996
+ # Handle type incompatibility for column comparisons
997
+ try:
998
+ return nw_tbl.with_columns(
999
+ pb_is_good_=nw.col(column) != compare_expr,
1000
+ ).to_native()
1001
+ except (TypeError, ValueError, Exception) as e:
1002
+ # Handle type compatibility issues for column vs column comparisons
1003
+ error_msg = str(e).lower()
1004
+ if (
1005
+ "cannot compare" in error_msg
1006
+ or "type" in error_msg
1007
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1008
+ or "dtype" in error_msg
1009
+ or "conversion" in error_msg
1010
+ and "failed" in error_msg
1011
+ or "boolean value of na is ambiguous" in error_msg
1012
+ ):
1013
+ # Get column types for a descriptive error message
1014
+ try:
1015
+ native_df = nw_tbl.to_native()
1016
+ if hasattr(native_df, "dtypes"):
1017
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1018
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
1019
+ elif hasattr(native_df, "schema"):
1020
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1021
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
1022
+ else:
1023
+ col_dtype = "unknown"
1024
+ compare_dtype = "unknown"
1025
+ except Exception:
1026
+ col_dtype = "unknown"
1027
+ compare_dtype = "unknown"
1028
+
1029
+ raise TypeError(
1030
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1031
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1032
+ f"Column types are incompatible for inequality comparison. "
1033
+ f"Ensure both columns have compatible data types (both numeric, "
1034
+ f"both string, or both datetime) before comparing."
1035
+ ) from e
1036
+ else:
1037
+ raise # Re-raise unexpected errors
1313
1038
 
1314
- data_tbl: FrameT
1315
- column: str
1316
- threshold: int
1317
- assertion_method: str
1318
- tbl_type: str = "local"
1319
-
1320
- def __post_init__(self):
1321
- if self.tbl_type == "local":
1322
- # Convert the DataFrame to a format that narwhals can work with, and:
1323
- # - check if the `column=` exists
1324
- # - check if the `column=` type is compatible with the test
1325
- tbl = _convert_to_narwhals(df=self.data_tbl)
1326
1039
  else:
1327
- # For remote backends (Ibis), pass the table as is since Narwhals can handle it
1328
- tbl = _convert_to_narwhals(df=self.data_tbl)
1040
+ compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare)
1329
1041
 
1330
- if self.assertion_method == "exists":
1331
- res = int(self.column in tbl.columns)
1042
+ # Handle type incompatibility for literal comparisons
1043
+ try:
1044
+ return nw_tbl.with_columns(
1045
+ pb_is_good_=nw.col(column) != nw.lit(compare_expr),
1046
+ ).to_native()
1047
+ except (TypeError, ValueError, Exception) as e:
1048
+ # Handle type compatibility issues for column vs literal comparisons
1049
+ error_msg = str(e).lower()
1050
+ if (
1051
+ "cannot compare" in error_msg
1052
+ or "type" in error_msg
1053
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1054
+ or "dtype" in error_msg
1055
+ or "conversion" in error_msg
1056
+ and "failed" in error_msg
1057
+ ):
1058
+ # Get column type for a descriptive error message
1059
+ try:
1060
+ native_df = nw_tbl.to_native()
1061
+ if hasattr(native_df, "dtypes"):
1062
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1063
+ elif hasattr(native_df, "schema"):
1064
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1065
+ else:
1066
+ col_dtype = "unknown"
1067
+ except Exception:
1068
+ col_dtype = "unknown"
1069
+
1070
+ compare_type = type(compare).__name__
1071
+ compare_value = str(compare)
1332
1072
 
1333
- self.test_unit_res = res
1073
+ raise TypeError(
1074
+ f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
1075
+ f"literal value '{compare_value}' (type: {compare_type}). "
1076
+ f"Column type and literal value type are incompatible for inequality comparison. "
1077
+ f"Ensure the column data type is compatible with the comparison value "
1078
+ f"(e.g., numeric column with numeric value, string column with string value)."
1079
+ ) from e
1080
+ else:
1081
+ raise # Re-raise unexpected errors
1334
1082
 
1335
- def get_test_results(self):
1336
- return self.test_unit_res
1083
+ # If either column has Null values, we need to handle the comparison
1084
+ # much more carefully since we can't inadvertently compare Null values
1085
+ # to non-Null values
1337
1086
 
1087
+ if isinstance(compare, Column):
1088
+ compare_expr = _get_compare_expr_nw(compare=compare)
1338
1089
 
1339
- @dataclass
1340
- class RowsDistinct:
1341
- """
1342
- Check if rows in a DataFrame are distinct.
1090
+ # CASE 1: the reference column has Null values but the comparison column does not
1091
+ if ref_col_has_null_vals and not cmp_col_has_null_vals:
1092
+ if is_pandas_dataframe(nw_tbl.to_native()):
1093
+ try:
1094
+ result_tbl = nw_tbl.with_columns(
1095
+ pb_is_good_1=nw.col(column).is_null(),
1096
+ pb_is_good_2=nw.col(column) != nw.col(compare.name),
1097
+ )
1098
+ except (TypeError, ValueError) as e:
1099
+ # Handle Pandas type compatibility issues
1100
+ if (
1101
+ "boolean value of NA is ambiguous" in str(e)
1102
+ or "cannot compare" in str(e).lower()
1103
+ ):
1104
+ # Get column types for a descriptive error message
1105
+ native_df = nw_tbl.to_native()
1106
+ col_dtype = str(native_df[column].dtype)
1107
+ compare_dtype = str(native_df[compare.name].dtype)
1343
1108
 
1344
- Parameters
1345
- ----------
1346
- data_tbl
1347
- A data table.
1348
- columns_subset
1349
- A list of columns to check for distinctness.
1350
- threshold
1351
- The maximum number of failing test units to allow.
1352
- tbl_type
1353
- The type of table to use for the assertion.
1109
+ raise TypeError(
1110
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1111
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1112
+ f"Column types are incompatible for inequality comparison. "
1113
+ f"Ensure both columns have compatible data types (both numeric, "
1114
+ f"both string, or both datetime) before comparing."
1115
+ ) from e
1116
+ else:
1117
+ raise # Re-raise unexpected errors
1354
1118
 
1355
- Returns
1356
- -------
1357
- bool
1358
- `True` when test units pass below the threshold level for failing test units, `False`
1359
- otherwise.
1360
- """
1119
+ else:
1120
+ try:
1121
+ result_tbl = nw_tbl.with_columns(
1122
+ pb_is_good_1=nw.col(column).is_null(),
1123
+ pb_is_good_2=nw.col(column) != nw.col(compare.name),
1124
+ )
1125
+ except (TypeError, ValueError, Exception) as e:
1126
+ # Handle type compatibility issues for non-Pandas backends
1127
+ error_msg = str(e).lower()
1128
+ if (
1129
+ "cannot compare" in error_msg
1130
+ or "type" in error_msg
1131
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1132
+ or "dtype" in error_msg
1133
+ or "conversion" in error_msg
1134
+ and "failed" in error_msg
1135
+ ):
1136
+ # Get column types for a descriptive error message
1137
+ try:
1138
+ native_df = nw_tbl.to_native()
1139
+ if hasattr(native_df, "dtypes"):
1140
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1141
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
1142
+ elif hasattr(native_df, "schema"):
1143
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1144
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
1145
+ else:
1146
+ col_dtype = "unknown"
1147
+ compare_dtype = "unknown"
1148
+ except Exception:
1149
+ col_dtype = "unknown"
1150
+ compare_dtype = "unknown"
1361
1151
 
1362
- data_tbl: FrameT
1363
- columns_subset: list[str] | None
1364
- threshold: int
1365
- tbl_type: str = "local"
1152
+ raise TypeError(
1153
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1154
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1155
+ f"Column types are incompatible for inequality comparison. "
1156
+ f"Ensure both columns have compatible data types (both numeric, "
1157
+ f"both string, or both datetime) before comparing."
1158
+ ) from e
1159
+ else:
1160
+ raise # Re-raise unexpected errors
1366
1161
 
1367
- def __post_init__(self):
1368
- if self.tbl_type == "local":
1369
- # Convert the DataFrame to a format that narwhals can work with, and:
1370
- # - check if the `column=` exists
1371
- # - check if the `column=` type is compatible with the test
1372
- tbl = _column_subset_test_prep(df=self.data_tbl, columns_subset=self.columns_subset)
1162
+ if not na_pass:
1163
+ result_tbl = result_tbl.with_columns(
1164
+ pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
1165
+ )
1373
1166
 
1374
- # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
1375
- # for now, just pass the table as is
1376
- else:
1377
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
1378
- tbl = self.data_tbl
1167
+ if is_polars_dataframe(nw_tbl.to_native()):
1168
+ # There may be Null values in the `pb_is_good_2` column, change those to
1169
+ # True if `na_pass=` is True, False otherwise
1379
1170
 
1380
- # Collect results for the test units; the results are a list of booleans where
1381
- # `True` indicates a passing test unit
1382
- self.test_unit_res = Interrogator(
1383
- x=tbl,
1384
- columns_subset=self.columns_subset,
1385
- tbl_type=self.tbl_type,
1386
- ).rows_distinct()
1171
+ result_tbl = result_tbl.with_columns(
1172
+ pb_is_good_2=nw.when(nw.col("pb_is_good_2").is_null())
1173
+ .then(False)
1174
+ .otherwise(nw.col("pb_is_good_2")),
1175
+ )
1387
1176
 
1388
- def get_test_results(self):
1389
- return self.test_unit_res
1177
+ if na_pass:
1178
+ result_tbl = result_tbl.with_columns(
1179
+ pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1180
+ )
1181
+ else:
1182
+ # General case (non-Polars): handle na_pass=True properly
1183
+ if na_pass:
1184
+ result_tbl = result_tbl.with_columns(
1185
+ pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1186
+ )
1390
1187
 
1391
- def test(self):
1392
- # Get the number of failing test units by counting instances of `False` in the `pb_is_good_`
1393
- # column and then determine if the test passes overall by comparing the number of failing
1394
- # test units to the threshold for failing test units
1188
+ return (
1189
+ result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
1190
+ .drop("pb_is_good_1", "pb_is_good_2")
1191
+ .to_native()
1192
+ )
1395
1193
 
1396
- results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
1194
+ # CASE 2: the comparison column has Null values but the reference column does not
1195
+ elif not ref_col_has_null_vals and cmp_col_has_null_vals:
1196
+ if is_pandas_dataframe(nw_tbl.to_native()):
1197
+ try:
1198
+ result_tbl = nw_tbl.with_columns(
1199
+ pb_is_good_1=nw.col(column) != nw.lit(compare.name),
1200
+ pb_is_good_2=nw.col(compare.name).is_null(),
1201
+ )
1202
+ except (TypeError, ValueError) as e:
1203
+ # Handle Pandas type compatibility issues
1204
+ if (
1205
+ "boolean value of NA is ambiguous" in str(e)
1206
+ or "cannot compare" in str(e).lower()
1207
+ ):
1208
+ # Get column types for a descriptive error message
1209
+ native_df = nw_tbl.to_native()
1210
+ col_dtype = str(native_df[column].dtype)
1211
+ compare_dtype = str(native_df[compare.name].dtype)
1397
1212
 
1398
- return _threshold_check(
1399
- failing_test_units=results_list.count(False), threshold=self.threshold
1400
- )
1213
+ raise TypeError(
1214
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1215
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1216
+ f"Column types are incompatible for inequality comparison. "
1217
+ f"Ensure both columns have compatible data types (both numeric, "
1218
+ f"both string, or both datetime) before comparing."
1219
+ ) from e
1220
+ else:
1221
+ raise # Re-raise unexpected errors
1401
1222
 
1223
+ else:
1224
+ try:
1225
+ result_tbl = nw_tbl.with_columns(
1226
+ pb_is_good_1=nw.col(column) != nw.col(compare.name),
1227
+ pb_is_good_2=nw.col(compare.name).is_null(),
1228
+ )
1229
+ except (TypeError, ValueError, Exception) as e:
1230
+ # Handle type compatibility issues for non-Pandas backends
1231
+ error_msg = str(e).lower()
1232
+ if (
1233
+ "cannot compare" in error_msg
1234
+ or "type" in error_msg
1235
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1236
+ or "dtype" in error_msg
1237
+ or "conversion" in error_msg
1238
+ and "failed" in error_msg
1239
+ ):
1240
+ # Get column types for a descriptive error message
1241
+ try:
1242
+ native_df = nw_tbl.to_native()
1243
+ if hasattr(native_df, "dtypes"):
1244
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1245
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
1246
+ elif hasattr(native_df, "schema"):
1247
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1248
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
1249
+ else:
1250
+ col_dtype = "unknown"
1251
+ compare_dtype = "unknown"
1252
+ except Exception:
1253
+ col_dtype = "unknown"
1254
+ compare_dtype = "unknown"
1402
1255
 
1403
- @dataclass
1404
- class RowsComplete:
1405
- """
1406
- Check if rows in a DataFrame are complete.
1256
+ raise TypeError(
1257
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1258
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1259
+ f"Column types are incompatible for inequality comparison. "
1260
+ f"Ensure both columns have compatible data types (both numeric, "
1261
+ f"both string, or both datetime) before comparing."
1262
+ ) from e
1263
+ else:
1264
+ raise # Re-raise unexpected errors
1407
1265
 
1408
- Parameters
1409
- ----------
1410
- data_tbl
1411
- A data table.
1412
- columns_subset
1413
- A list of columns to check for completeness.
1414
- threshold
1415
- The maximum number of failing test units to allow.
1416
- tbl_type
1417
- The type of table to use for the assertion.
1266
+ if not na_pass:
1267
+ result_tbl = result_tbl.with_columns(
1268
+ pb_is_good_1=nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2")
1269
+ )
1418
1270
 
1419
- Returns
1420
- -------
1421
- bool
1422
- `True` when test units pass below the threshold level for failing test units, `False`
1423
- otherwise.
1424
- """
1271
+ if is_polars_dataframe(nw_tbl.to_native()):
1272
+ if na_pass:
1273
+ result_tbl = result_tbl.with_columns(
1274
+ pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1275
+ )
1276
+ else:
1277
+ # General case (non-Polars): handle `na_pass=True` properly
1278
+ if na_pass:
1279
+ result_tbl = result_tbl.with_columns(
1280
+ pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1281
+ )
1425
1282
 
1426
- data_tbl: FrameT
1427
- columns_subset: list[str] | None
1428
- threshold: int
1429
- tbl_type: str = "local"
1283
+ return (
1284
+ result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_1"))
1285
+ .drop("pb_is_good_1", "pb_is_good_2")
1286
+ .to_native()
1287
+ )
1430
1288
 
1431
- def __post_init__(self):
1432
- if self.tbl_type == "local":
1433
- # Convert the DataFrame to a format that narwhals can work with, and:
1434
- # - check if the `column=` exists
1435
- # - check if the `column=` type is compatible with the test
1436
- tbl = _column_subset_test_prep(df=self.data_tbl, columns_subset=self.columns_subset)
1289
+ # CASE 3: both columns have Null values and there may potentially be cases where
1290
+ # there could even be Null/Null comparisons
1291
+ elif ref_col_has_null_vals and cmp_col_has_null_vals:
1292
+ try:
1293
+ result_tbl = nw_tbl.with_columns(
1294
+ pb_is_good_1=nw.col(column).is_null(),
1295
+ pb_is_good_2=nw.col(compare.name).is_null(),
1296
+ pb_is_good_3=nw.col(column) != nw.col(compare.name),
1297
+ )
1298
+ except (TypeError, ValueError, Exception) as e:
1299
+ # Handle type compatibility issues for column vs column comparisons
1300
+ error_msg = str(e).lower()
1301
+ if (
1302
+ "cannot compare" in error_msg
1303
+ or "type" in error_msg
1304
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1305
+ or "dtype" in error_msg
1306
+ or "conversion" in error_msg
1307
+ and "failed" in error_msg
1308
+ or "boolean value of na is ambiguous" in error_msg
1309
+ ):
1310
+ # Get column types for a descriptive error message
1311
+ try:
1312
+ native_df = nw_tbl.to_native()
1313
+ if hasattr(native_df, "dtypes"):
1314
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1315
+ compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
1316
+ elif hasattr(native_df, "schema"):
1317
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1318
+ compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
1319
+ else:
1320
+ col_dtype = "unknown"
1321
+ compare_dtype = "unknown"
1322
+ except Exception:
1323
+ col_dtype = "unknown"
1324
+ compare_dtype = "unknown"
1437
1325
 
1438
- # TODO: For Ibis backends, check if the column exists and if the column type is compatible;
1439
- # for now, just pass the table as is
1440
- else:
1441
- # For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
1442
- tbl = self.data_tbl
1326
+ raise TypeError(
1327
+ f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
1328
+ f"'{compare.name}' (dtype: {compare_dtype}). "
1329
+ f"Column types are incompatible for inequality comparison. "
1330
+ f"Ensure both columns have compatible data types (both numeric, "
1331
+ f"both string, or both datetime) before comparing."
1332
+ ) from e
1333
+ else:
1334
+ raise # Re-raise unexpected errors
1443
1335
 
1444
- # Collect results for the test units; the results are a list of booleans where
1445
- # `True` indicates a passing test unit
1446
- self.test_unit_res = Interrogator(
1447
- x=tbl,
1448
- columns_subset=self.columns_subset,
1449
- tbl_type=self.tbl_type,
1450
- ).rows_complete()
1336
+ if not na_pass:
1337
+ result_tbl = result_tbl.with_columns(
1338
+ pb_is_good_3=nw.col("pb_is_good_3")
1339
+ & ~nw.col("pb_is_good_1")
1340
+ & ~nw.col("pb_is_good_2")
1341
+ )
1451
1342
 
1452
- def get_test_results(self):
1453
- return self.test_unit_res
1343
+ if is_polars_dataframe(nw_tbl.to_native()):
1344
+ if na_pass:
1345
+ result_tbl = result_tbl.with_columns(
1346
+ pb_is_good_3=(
1347
+ nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1348
+ .then(True)
1349
+ .otherwise(False)
1350
+ )
1351
+ )
1352
+ else:
1353
+ # General case (non-Polars): handle na_pass=True properly
1354
+ if na_pass:
1355
+ result_tbl = result_tbl.with_columns(
1356
+ pb_is_good_3=(
1357
+ nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
1358
+ .then(True)
1359
+ .otherwise(nw.col("pb_is_good_3"))
1360
+ )
1361
+ )
1454
1362
 
1363
+ return (
1364
+ result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_3"))
1365
+ .drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
1366
+ .to_native()
1367
+ )
1455
1368
 
1456
- @dataclass
1457
- class ColSchemaMatch:
1458
- """
1459
- Check if a column exists in a DataFrame or has a certain data type.
1369
+ else:
1370
+ # Case where the reference column contains null values
1371
+ if ref_col_has_null_vals:
1372
+ # Create individual cases for Pandas and Polars
1373
+ compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare)
1460
1374
 
1461
- Parameters
1462
- ----------
1463
- data_tbl
1464
- A data table.
1465
- schema
1466
- A schema to check against.
1467
- complete
1468
- `True` to check if the schema is complete, `False` otherwise.
1469
- in_order
1470
- `True` to check if the schema is in order, `False` otherwise.
1471
- case_sensitive_colnames
1472
- `True` to perform column-name matching in a case-sensitive manner, `False` otherwise.
1473
- case_sensitive_dtypes
1474
- `True` to perform data-type matching in a case-sensitive manner, `False` otherwise.
1475
- full_match_dtypes
1476
- `True` to perform a full match of data types, `False` otherwise.
1477
- threshold
1478
- The maximum number of failing test units to allow.
1479
- tbl_type
1480
- The type of table to use for the assertion.
1481
-
1482
- Returns
1483
- -------
1484
- bool
1485
- `True` when test units pass below the threshold level for failing test units, `False`
1486
- otherwise.
1487
- """
1375
+ if is_pandas_dataframe(nw_tbl.to_native()):
1376
+ try:
1377
+ result_tbl = nw_tbl.with_columns(
1378
+ pb_is_good_1=nw.col(column).is_null(),
1379
+ pb_is_good_2=nw.col(column) != nw.lit(compare_expr),
1380
+ )
1381
+ except (TypeError, ValueError) as e:
1382
+ # Handle Pandas type compatibility issues for literal comparisons
1383
+ if (
1384
+ "boolean value of NA is ambiguous" in str(e)
1385
+ or "cannot compare" in str(e).lower()
1386
+ ):
1387
+ # Get column type for a descriptive error message
1388
+ native_df = nw_tbl.to_native()
1389
+ col_dtype = str(native_df[column].dtype)
1390
+ compare_type = type(compare).__name__
1391
+ compare_value = str(compare)
1488
1392
 
1489
- data_tbl: FrameT | Any
1490
- schema: any
1491
- complete: bool
1492
- in_order: bool
1493
- case_sensitive_colnames: bool
1494
- case_sensitive_dtypes: bool
1495
- full_match_dtypes: bool
1496
- threshold: int
1497
-
1498
- def __post_init__(self):
1499
- schema_expect = self.schema
1500
- schema_actual = Schema(tbl=self.data_tbl)
1501
-
1502
- if self.complete and self.in_order:
1503
- # Check if the schema is complete and in order (most restrictive check)
1504
- # complete: True, in_order: True
1505
- res = schema_expect._compare_schema_columns_complete_in_order(
1506
- other=schema_actual,
1507
- case_sensitive_colnames=self.case_sensitive_colnames,
1508
- case_sensitive_dtypes=self.case_sensitive_dtypes,
1509
- full_match_dtypes=self.full_match_dtypes,
1510
- )
1393
+ raise TypeError(
1394
+ f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
1395
+ f"literal value '{compare_value}' (type: {compare_type}). "
1396
+ f"Column type and literal value type are incompatible for inequality comparison. "
1397
+ f"Ensure the column data type is compatible with the comparison value "
1398
+ f"(e.g., numeric column with numeric value, string column with string value)."
1399
+ ) from e
1400
+ else:
1401
+ raise # Re-raise unexpected errors
1511
1402
 
1512
- elif not self.complete and not self.in_order:
1513
- # Check if the schema is at least a subset, and, order of columns does not matter
1514
- # complete: False, in_order: False
1515
- res = schema_expect._compare_schema_columns_subset_any_order(
1516
- other=schema_actual,
1517
- case_sensitive_colnames=self.case_sensitive_colnames,
1518
- case_sensitive_dtypes=self.case_sensitive_dtypes,
1519
- full_match_dtypes=self.full_match_dtypes,
1520
- )
1403
+ if not na_pass:
1404
+ result_tbl = result_tbl.with_columns(
1405
+ pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
1406
+ )
1521
1407
 
1522
- elif self.complete:
1523
- # Check if the schema is complete, but the order of columns does not matter
1524
- # complete: True, in_order: False
1525
- res = schema_expect._compare_schema_columns_complete_any_order(
1526
- other=schema_actual,
1527
- case_sensitive_colnames=self.case_sensitive_colnames,
1528
- case_sensitive_dtypes=self.case_sensitive_dtypes,
1529
- full_match_dtypes=self.full_match_dtypes,
1530
- )
1408
+ return (
1409
+ result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
1410
+ .drop("pb_is_good_1", "pb_is_good_2")
1411
+ .to_native()
1412
+ )
1531
1413
 
1532
- else:
1533
- # Check if the schema is a subset (doesn't need to be complete) and in order
1534
- # complete: False, in_order: True
1535
- res = schema_expect._compare_schema_columns_subset_in_order(
1536
- other=schema_actual,
1537
- case_sensitive_colnames=self.case_sensitive_colnames,
1538
- case_sensitive_dtypes=self.case_sensitive_dtypes,
1539
- full_match_dtypes=self.full_match_dtypes,
1540
- )
1414
+ elif is_polars_dataframe(nw_tbl.to_native()):
1415
+ result_tbl = nw_tbl.with_columns(
1416
+ pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
1417
+ pb_is_good_2=nw.lit(na_pass), # Pass if any Null in val or compare
1418
+ )
1541
1419
 
1542
- self.test_unit_res = res
1420
+ try:
1421
+ result_tbl = result_tbl.with_columns(
1422
+ pb_is_good_3=nw.col(column) != nw.lit(compare_expr)
1423
+ )
1424
+ except (TypeError, ValueError, Exception) as e:
1425
+ # Handle type compatibility issues for literal comparisons
1426
+ error_msg = str(e).lower()
1427
+ if (
1428
+ "cannot compare" in error_msg
1429
+ or "type" in error_msg
1430
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1431
+ or "dtype" in error_msg
1432
+ or "conversion" in error_msg
1433
+ and "failed" in error_msg
1434
+ ):
1435
+ # Get column type for a descriptive error message
1436
+ try:
1437
+ native_df = nw_tbl.to_native()
1438
+ if hasattr(native_df, "dtypes"):
1439
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1440
+ elif hasattr(native_df, "schema"):
1441
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1442
+ else:
1443
+ col_dtype = "unknown"
1444
+ except Exception:
1445
+ col_dtype = "unknown"
1446
+
1447
+ compare_type = type(compare).__name__
1448
+ compare_value = str(compare)
1543
1449
 
1544
- def get_test_results(self):
1545
- return self.test_unit_res
1450
+ raise TypeError(
1451
+ f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
1452
+ f"literal value '{compare_value}' (type: {compare_type}). "
1453
+ f"Column type and literal value type are incompatible for inequality comparison. "
1454
+ f"Ensure the column data type is compatible with the comparison value "
1455
+ f"(e.g., numeric column with numeric value, string column with string value)."
1456
+ ) from e
1457
+ else:
1458
+ raise # Re-raise unexpected errors
1546
1459
 
1460
+ result_tbl = result_tbl.with_columns(
1461
+ pb_is_good_=(
1462
+ (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
1463
+ | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
1464
+ )
1465
+ )
1547
1466
 
1548
- @dataclass
1549
- class RowCountMatch:
1550
- """
1551
- Check if rows in a DataFrame either match or don't match a fixed value.
1467
+ result_tbl = result_tbl.drop(
1468
+ "pb_is_good_1", "pb_is_good_2", "pb_is_good_3"
1469
+ ).to_native()
1552
1470
 
1553
- Parameters
1554
- ----------
1555
- data_tbl
1556
- A data table.
1557
- count
1558
- The fixed row count to check against.
1559
- inverse
1560
- `True` to check if the row count does not match the fixed value, `False` otherwise.
1561
- threshold
1562
- The maximum number of failing test units to allow.
1563
- tbl_type
1564
- The type of table to use for the assertion.
1471
+ return result_tbl
1565
1472
 
1566
- Returns
1567
- -------
1568
- bool
1569
- `True` when test units pass below the threshold level for failing test units, `False`
1570
- otherwise.
1571
- """
1473
+ else:
1474
+ # Generic case for other DataFrame types (PySpark, etc.)
1475
+ # Use similar logic to Polars but handle potential differences
1476
+ result_tbl = nw_tbl.with_columns(
1477
+ pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
1478
+ pb_is_good_2=nw.lit(na_pass), # Pass if any Null in val or compare
1479
+ )
1572
1480
 
1573
- data_tbl: FrameT
1574
- count: int
1575
- inverse: bool
1576
- threshold: int
1577
- abs_tol_bounds: AbsoluteTolBounds
1578
- tbl_type: str = "local"
1481
+ try:
1482
+ result_tbl = result_tbl.with_columns(
1483
+ pb_is_good_3=nw.col(column) != nw.lit(compare_expr)
1484
+ )
1485
+ except (TypeError, ValueError, Exception) as e:
1486
+ # Handle type compatibility issues for literal comparisons
1487
+ error_msg = str(e).lower()
1488
+ if (
1489
+ "cannot compare" in error_msg
1490
+ or "type" in error_msg
1491
+ and ("mismatch" in error_msg or "incompatible" in error_msg)
1492
+ or "dtype" in error_msg
1493
+ or "conversion" in error_msg
1494
+ and "failed" in error_msg
1495
+ ):
1496
+ # Get column type for a descriptive error message
1497
+ try:
1498
+ native_df = nw_tbl.to_native()
1499
+ if hasattr(native_df, "dtypes"):
1500
+ col_dtype = str(native_df.dtypes.get(column, "unknown"))
1501
+ elif hasattr(native_df, "schema"):
1502
+ col_dtype = str(native_df.schema.get(column, "unknown"))
1503
+ else:
1504
+ col_dtype = "unknown"
1505
+ except Exception:
1506
+ col_dtype = "unknown"
1507
+
1508
+ compare_type = type(compare).__name__
1509
+ compare_value = str(compare)
1579
1510
 
1580
- def __post_init__(self):
1581
- from pointblank.validate import get_row_count
1511
+ raise TypeError(
1512
+ f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
1513
+ f"literal value '{compare_value}' (type: {compare_type}). "
1514
+ f"Column type and literal value type are incompatible for inequality comparison. "
1515
+ f"Ensure the column data type is compatible with the comparison value "
1516
+ f"(e.g., numeric column with numeric value, string column with string value)."
1517
+ ) from e
1518
+ else:
1519
+ raise # Re-raise unexpected errors
1582
1520
 
1583
- row_count: int = get_row_count(data=self.data_tbl)
1521
+ result_tbl = result_tbl.with_columns(
1522
+ pb_is_good_=(
1523
+ (nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
1524
+ | (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
1525
+ )
1526
+ )
1584
1527
 
1585
- lower_abs_limit, upper_abs_limit = self.abs_tol_bounds
1586
- min_val: int = self.count - lower_abs_limit
1587
- max_val: int = self.count + upper_abs_limit
1528
+ return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
1588
1529
 
1589
- if self.inverse:
1590
- res: bool = not (row_count >= min_val and row_count <= max_val)
1591
- else:
1592
- res: bool = row_count >= min_val and row_count <= max_val
1593
1530
 
1594
- self.test_unit_res = res
1531
+ def interrogate_between(
1532
+ tbl: FrameT, column: str, low: any, high: any, inclusive: tuple, na_pass: bool
1533
+ ) -> FrameT:
1534
+ """Between interrogation."""
1595
1535
 
1596
- def get_test_results(self):
1597
- return self.test_unit_res
1536
+ low_val = _get_compare_expr_nw(compare=low)
1537
+ high_val = _get_compare_expr_nw(compare=high)
1598
1538
 
1539
+ nw_tbl = nw.from_native(tbl)
1540
+ low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
1541
+ high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
1599
1542
 
1600
- @dataclass
1601
- class ColCountMatch:
1602
- """
1603
- Check if columns in a DataFrame either match or don't match a fixed value.
1543
+ result_tbl = nw_tbl.with_columns(
1544
+ pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
1545
+ pb_is_good_2=( # lb is Null in Column
1546
+ nw.col(low.name).is_null() if isinstance(low, Column) else nw.lit(False)
1547
+ ),
1548
+ pb_is_good_3=( # ub is Null in Column
1549
+ nw.col(high.name).is_null() if isinstance(high, Column) else nw.lit(False)
1550
+ ),
1551
+ pb_is_good_4=nw.lit(na_pass), # Pass if any Null in lb, val, or ub
1552
+ )
1604
1553
 
1605
- Parameters
1606
- ----------
1607
- data_tbl
1608
- A data table.
1609
- count
1610
- The fixed column count to check against.
1611
- inverse
1612
- `True` to check if the column count does not match the fixed value, `False` otherwise.
1613
- threshold
1614
- The maximum number of failing test units to allow.
1615
- tbl_type
1616
- The type of table to use for the assertion.
1554
+ if inclusive[0]:
1555
+ result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) >= low_val)
1556
+ else:
1557
+ result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) > low_val)
1617
1558
 
1618
- Returns
1619
- -------
1620
- bool
1621
- `True` when test units pass below the threshold level for failing test units, `False`
1622
- otherwise.
1623
- """
1559
+ if inclusive[1]:
1560
+ result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) <= high_val)
1561
+ else:
1562
+ result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) < high_val)
1624
1563
 
1625
- data_tbl: FrameT
1626
- count: int
1627
- inverse: bool
1628
- threshold: int
1629
- tbl_type: str = "local"
1564
+ result_tbl = result_tbl.with_columns(
1565
+ pb_is_good_5=(
1566
+ nw.when(nw.col("pb_is_good_5").is_null())
1567
+ .then(nw.lit(False))
1568
+ .otherwise(nw.col("pb_is_good_5"))
1569
+ )
1570
+ )
1630
1571
 
1631
- def __post_init__(self):
1632
- from pointblank.validate import get_column_count
1572
+ result_tbl = result_tbl.with_columns(
1573
+ pb_is_good_6=(
1574
+ nw.when(nw.col("pb_is_good_6").is_null())
1575
+ .then(nw.lit(False))
1576
+ .otherwise(nw.col("pb_is_good_6"))
1577
+ )
1578
+ )
1633
1579
 
1634
- if not self.inverse:
1635
- res = get_column_count(data=self.data_tbl) == self.count
1636
- else:
1637
- res = get_column_count(data=self.data_tbl) != self.count
1580
+ result_tbl = result_tbl.with_columns(
1581
+ pb_is_good_=(
1582
+ (
1583
+ (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
1584
+ & nw.col("pb_is_good_4")
1585
+ )
1586
+ | (nw.col("pb_is_good_5") & nw.col("pb_is_good_6"))
1587
+ )
1588
+ ).drop(
1589
+ "pb_is_good_1",
1590
+ "pb_is_good_2",
1591
+ "pb_is_good_3",
1592
+ "pb_is_good_4",
1593
+ "pb_is_good_5",
1594
+ "pb_is_good_6",
1595
+ )
1638
1596
 
1639
- self.test_unit_res = res
1597
+ return result_tbl.to_native()
1640
1598
 
1641
- def get_test_results(self):
1642
- return self.test_unit_res
1643
1599
 
1600
+ def interrogate_outside(
1601
+ tbl: FrameT, column: str, low: any, high: any, inclusive: tuple, na_pass: bool
1602
+ ) -> FrameT:
1603
+ """Outside range interrogation."""
1644
1604
 
1645
- class ConjointlyValidation:
1646
- def __init__(self, data_tbl, expressions, threshold, tbl_type):
1647
- self.data_tbl = data_tbl
1648
- self.expressions = expressions
1649
- self.threshold = threshold
1605
+ low_val = _get_compare_expr_nw(compare=low)
1606
+ high_val = _get_compare_expr_nw(compare=high)
1650
1607
 
1651
- # Detect the table type
1652
- if tbl_type in (None, "local"):
1653
- # Detect the table type using _get_tbl_type()
1654
- self.tbl_type = _get_tbl_type(data=data_tbl)
1655
- else:
1656
- self.tbl_type = tbl_type
1608
+ nw_tbl = nw.from_native(tbl)
1609
+ low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
1610
+ high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)
1657
1611
 
1658
- def get_test_results(self):
1659
- """Evaluate all expressions and combine them conjointly."""
1612
+ result_tbl = nw_tbl.with_columns(
1613
+ pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
1614
+ pb_is_good_2=( # lb is Null in Column
1615
+ nw.col(low.name).is_null() if isinstance(low, Column) else nw.lit(False)
1616
+ ),
1617
+ pb_is_good_3=( # ub is Null in Column
1618
+ nw.col(high.name).is_null() if isinstance(high, Column) else nw.lit(False)
1619
+ ),
1620
+ pb_is_good_4=nw.lit(na_pass), # Pass if any Null in lb, val, or ub
1621
+ )
1660
1622
 
1661
- if "polars" in self.tbl_type:
1662
- return self._get_polars_results()
1663
- elif "pandas" in self.tbl_type:
1664
- return self._get_pandas_results()
1665
- elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
1666
- return self._get_ibis_results()
1667
- elif "pyspark" in self.tbl_type:
1668
- return self._get_pyspark_results()
1669
- else: # pragma: no cover
1670
- raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")
1623
+ # Note: Logic is inverted for "outside" - when inclusive[0] is True,
1624
+ # we want values < low_val (not <= low_val) to be "outside"
1625
+ if inclusive[0]:
1626
+ result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
1627
+ else:
1628
+ result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) <= low_val)
1671
1629
 
1672
- def _get_polars_results(self):
1673
- """Process expressions for Polars DataFrames."""
1674
- import polars as pl
1630
+ if inclusive[1]:
1631
+ result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) > high_val)
1632
+ else:
1633
+ result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) >= high_val)
1634
+
1635
+ result_tbl = result_tbl.with_columns(
1636
+ pb_is_good_5=nw.when(nw.col("pb_is_good_5").is_null())
1637
+ .then(False)
1638
+ .otherwise(nw.col("pb_is_good_5")),
1639
+ pb_is_good_6=nw.when(nw.col("pb_is_good_6").is_null())
1640
+ .then(False)
1641
+ .otherwise(nw.col("pb_is_good_6")),
1642
+ )
1675
1643
 
1676
- polars_expressions = []
1644
+ result_tbl = result_tbl.with_columns(
1645
+ pb_is_good_=(
1646
+ (
1647
+ (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
1648
+ & nw.col("pb_is_good_4")
1649
+ )
1650
+ | (
1651
+ (nw.col("pb_is_good_5") & ~nw.col("pb_is_good_3"))
1652
+ | (nw.col("pb_is_good_6")) & ~nw.col("pb_is_good_2")
1653
+ )
1654
+ )
1655
+ ).drop(
1656
+ "pb_is_good_1",
1657
+ "pb_is_good_2",
1658
+ "pb_is_good_3",
1659
+ "pb_is_good_4",
1660
+ "pb_is_good_5",
1661
+ "pb_is_good_6",
1662
+ )
1677
1663
 
1678
- for expr_fn in self.expressions:
1679
- try:
1680
- # First try direct evaluation with native Polars expressions
1681
- expr_result = expr_fn(self.data_tbl)
1682
- if isinstance(expr_result, pl.Expr):
1683
- polars_expressions.append(expr_result)
1684
- else:
1685
- raise TypeError("Not a valid Polars expression")
1686
- except Exception as e:
1687
- try:
1688
- # Try to get a ColumnExpression
1689
- col_expr = expr_fn(None)
1690
- if hasattr(col_expr, "to_polars_expr"):
1691
- polars_expr = col_expr.to_polars_expr()
1692
- polars_expressions.append(polars_expr)
1693
- else: # pragma: no cover
1694
- raise TypeError(f"Cannot convert {type(col_expr)} to Polars expression")
1695
- except Exception as e: # pragma: no cover
1696
- print(f"Error evaluating expression: {e}")
1664
+ return result_tbl.to_native()
1697
1665
 
1698
- # Combine results with AND logic
1699
- if polars_expressions:
1700
- final_result = polars_expressions[0]
1701
- for expr in polars_expressions[1:]:
1702
- final_result = final_result & expr
1703
1666
 
1704
- # Create results table with boolean column
1705
- results_tbl = self.data_tbl.with_columns(pb_is_good_=final_result)
1706
- return results_tbl
1667
+ def interrogate_isin(tbl: FrameT, column: str, set_values: any) -> FrameT:
1668
+ """In set interrogation."""
1707
1669
 
1708
- # Default case
1709
- results_tbl = self.data_tbl.with_columns(pb_is_good_=pl.lit(True)) # pragma: no cover
1710
- return results_tbl # pragma: no cover
1670
+ nw_tbl = nw.from_native(tbl)
1711
1671
 
1712
- def _get_pandas_results(self):
1713
- """Process expressions for pandas DataFrames."""
1714
- import pandas as pd
1672
+ can_be_null: bool = None in set_values
1673
+ base_expr: nw.Expr = nw.col(column).is_in(set_values)
1674
+ if can_be_null:
1675
+ base_expr = base_expr | nw.col(column).is_null()
1715
1676
 
1716
- pandas_series = []
1677
+ result_tbl = nw_tbl.with_columns(pb_is_good_=base_expr)
1678
+ return result_tbl.to_native()
1717
1679
 
1718
- for expr_fn in self.expressions:
1719
- try:
1720
- # First try direct evaluation with pandas DataFrame
1721
- expr_result = expr_fn(self.data_tbl)
1722
1680
 
1723
- # Check that it's a pandas Series with bool dtype
1724
- if isinstance(expr_result, pd.Series):
1725
- if expr_result.dtype == bool or pd.api.types.is_bool_dtype(expr_result):
1726
- pandas_series.append(expr_result)
1727
- else: # pragma: no cover
1728
- raise TypeError(
1729
- f"Expression returned Series of type {expr_result.dtype}, expected bool"
1730
- )
1731
- else: # pragma: no cover
1732
- raise TypeError(f"Expression returned {type(expr_result)}, expected pd.Series")
1681
+ def interrogate_notin(tbl: FrameT, column: str, set_values: any) -> FrameT:
1682
+ """Not in set interrogation."""
1733
1683
 
1734
- except Exception as e:
1735
- try:
1736
- # Try as a ColumnExpression (for pb.expr_col style)
1737
- col_expr = expr_fn(None)
1684
+ nw_tbl = nw.from_native(tbl)
1685
+ result_tbl = nw_tbl.with_columns(
1686
+ pb_is_good_=nw.col(column).is_in(set_values),
1687
+ ).with_columns(pb_is_good_=~nw.col("pb_is_good_"))
1688
+ return result_tbl.to_native()
1738
1689
 
1739
- if hasattr(col_expr, "to_pandas_expr"):
1740
- # Watch for NotImplementedError here and re-raise it
1741
- try:
1742
- pandas_expr = col_expr.to_pandas_expr(self.data_tbl)
1743
- pandas_series.append(pandas_expr)
1744
- except NotImplementedError as nie: # pragma: no cover
1745
- # Re-raise NotImplementedError with the original message
1746
- raise NotImplementedError(str(nie))
1747
- else: # pragma: no cover
1748
- raise TypeError(f"Cannot convert {type(col_expr)} to pandas Series")
1749
- except NotImplementedError as nie: # pragma: no cover
1750
- # Re-raise NotImplementedError
1751
- raise NotImplementedError(str(nie))
1752
- except Exception as nested_e: # pragma: no cover
1753
- print(f"Error evaluating pandas expression: {e} -> {nested_e}")
1754
1690
 
1755
- # Combine results with AND logic
1756
- if pandas_series:
1757
- final_result = pandas_series[0]
1758
- for series in pandas_series[1:]:
1759
- final_result = final_result & series
1691
+ def interrogate_regex(tbl: FrameT, column: str, pattern: str, na_pass: bool) -> FrameT:
1692
+ """Regex interrogation."""
1760
1693
 
1761
- # Create results table with boolean column
1762
- results_tbl = self.data_tbl.copy()
1763
- results_tbl["pb_is_good_"] = final_result
1764
- return results_tbl
1694
+ nw_tbl = nw.from_native(tbl)
1695
+ result_tbl = nw_tbl.with_columns(
1696
+ pb_is_good_1=nw.col(column).is_null() & na_pass,
1697
+ pb_is_good_2=nw.col(column).str.contains(pattern, literal=False).fill_null(False),
1698
+ )
1765
1699
 
1766
- # Default case
1767
- results_tbl = self.data_tbl.copy() # pragma: no cover
1768
- results_tbl["pb_is_good_"] = pd.Series( # pragma: no cover
1769
- [True] * len(self.data_tbl), index=self.data_tbl.index
1770
- )
1771
- return results_tbl # pragma: no cover
1700
+ result_tbl = result_tbl.with_columns(
1701
+ pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
1702
+ ).drop("pb_is_good_1", "pb_is_good_2")
1772
1703
 
1773
- def _get_ibis_results(self):
1774
- """Process expressions for Ibis tables (including DuckDB)."""
1775
- import ibis
1704
+ return result_tbl.to_native()
1776
1705
 
1777
- ibis_expressions = []
1778
1706
 
1779
- for expr_fn in self.expressions:
1780
- # Strategy 1: Try direct evaluation with native Ibis expressions
1781
- try:
1782
- expr_result = expr_fn(self.data_tbl)
1707
+ def interrogate_null(tbl: FrameT, column: str) -> FrameT:
1708
+ """Null interrogation."""
1783
1709
 
1784
- # Check if it's a valid Ibis expression
1785
- if hasattr(expr_result, "_ibis_expr"): # pragma: no cover
1786
- ibis_expressions.append(expr_result)
1787
- continue # Skip to next expression if this worked
1788
- except Exception: # pragma: no cover
1789
- pass # Silently continue to Strategy 2
1710
+ nw_tbl = nw.from_native(tbl)
1711
+ result_tbl = nw_tbl.with_columns(pb_is_good_=nw.col(column).is_null())
1712
+ return result_tbl.to_native()
1790
1713
 
1791
- # Strategy 2: Try with ColumnExpression
1792
- try: # pragma: no cover
1793
- # Skip this strategy if we don't have an expr_col implementation
1794
- if not hasattr(self, "to_ibis_expr"):
1795
- continue
1796
1714
 
1797
- col_expr = expr_fn(None)
1715
+ def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
1716
+ """Not null interrogation."""
1798
1717
 
1799
- # Skip if we got None
1800
- if col_expr is None:
1801
- continue
1718
+ nw_tbl = nw.from_native(tbl)
1719
+ result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_null())
1720
+ return result_tbl.to_native()
1802
1721
 
1803
- # Convert ColumnExpression to Ibis expression
1804
- if hasattr(col_expr, "to_ibis_expr"):
1805
- ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
1806
- ibis_expressions.append(ibis_expr)
1807
- except Exception: # pragma: no cover
1808
- # Silent failure - we already tried both strategies
1809
- pass
1810
1722
 
1811
- # Combine expressions
1812
- if ibis_expressions: # pragma: no cover
1813
- try:
1814
- final_result = ibis_expressions[0]
1815
- for expr in ibis_expressions[1:]:
1816
- final_result = final_result & expr
1723
+ def _interrogate_comparison_base(
1724
+ tbl: FrameT, column: str, compare: any, na_pass: bool, operator: str
1725
+ ) -> FrameT:
1726
+ """
1727
+ Unified base function for comparison operations (gt, ge, lt, le, eq, ne).
1817
1728
 
1818
- # Create results table with boolean column
1819
- results_tbl = self.data_tbl.mutate(pb_is_good_=final_result)
1820
- return results_tbl
1821
- except Exception as e:
1822
- print(f"Error combining Ibis expressions: {e}")
1729
+ Parameters
1730
+ ----------
1731
+ tbl
1732
+ The table to interrogate.
1733
+ column
1734
+ The column to check.
1735
+ compare
1736
+ The value to compare against.
1737
+ na_pass
1738
+ Whether to pass null values.
1739
+ operator
1740
+ The comparison operator: 'gt', 'ge', 'lt', 'le', 'eq', 'ne'.
1823
1741
 
1824
- # Default case
1825
- results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
1826
- return results_tbl
1742
+ Returns
1743
+ -------
1744
+ FrameT
1745
+ The result table with `pb_is_good_` column indicating the passing test units.
1746
+ """
1827
1747
 
1828
- def _get_pyspark_results(self):
1829
- """Process expressions for PySpark DataFrames."""
1830
- from pyspark.sql import functions as F
1748
+ compare_expr = _get_compare_expr_nw(compare=compare)
1749
+
1750
+ nw_tbl = nw.from_native(tbl)
1751
+ compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)
1752
+
1753
+ # Create the comparison expression based on the operator
1754
+ column_expr = nw.col(column)
1755
+ if operator == "gt":
1756
+ comparison = column_expr > compare_expr
1757
+ elif operator == "ge":
1758
+ comparison = column_expr >= compare_expr
1759
+ elif operator == "lt":
1760
+ comparison = column_expr < compare_expr
1761
+ elif operator == "le":
1762
+ comparison = column_expr <= compare_expr
1763
+ elif operator == "eq":
1764
+ comparison = column_expr == compare_expr
1765
+ elif operator == "ne":
1766
+ comparison = column_expr != compare_expr
1767
+ else:
1768
+ raise ValueError( # pragma: no cover
1769
+ f"Invalid operator: {operator}. Must be one of: 'gt', 'ge', 'lt', 'le', 'eq', 'ne'"
1770
+ )
1831
1771
 
1832
- pyspark_columns = []
1772
+ result_tbl = nw_tbl.with_columns(
1773
+ pb_is_good_1=_safe_is_nan_or_null_expr(nw_tbl, nw.col(column), column) & na_pass,
1774
+ pb_is_good_2=(
1775
+ _safe_is_nan_or_null_expr(nw_tbl, nw.col(compare.name), compare.name) & na_pass
1776
+ if isinstance(compare, Column)
1777
+ else nw.lit(False)
1778
+ ),
1779
+ pb_is_good_3=comparison & ~_safe_is_nan_or_null_expr(nw_tbl, nw.col(column), column),
1780
+ )
1833
1781
 
1834
- for expr_fn in self.expressions:
1835
- try:
1836
- # First try direct evaluation with PySpark DataFrame
1837
- expr_result = expr_fn(self.data_tbl)
1838
-
1839
- # Check if it's a PySpark Column
1840
- if hasattr(expr_result, "_jc"): # PySpark Column has _jc attribute
1841
- pyspark_columns.append(expr_result)
1842
- else:
1843
- raise TypeError(
1844
- f"Expression returned {type(expr_result)}, expected PySpark Column"
1845
- )
1846
-
1847
- except Exception as e:
1848
- try:
1849
- # Try as a ColumnExpression (for pb.expr_col style)
1850
- col_expr = expr_fn(None)
1851
-
1852
- if hasattr(col_expr, "to_pyspark_expr"):
1853
- # Convert to PySpark expression
1854
- pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
1855
- pyspark_columns.append(pyspark_expr)
1856
- else:
1857
- raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
1858
- except Exception as nested_e:
1859
- print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
1860
-
1861
- # Combine results with AND logic
1862
- if pyspark_columns:
1863
- final_result = pyspark_columns[0]
1864
- for col in pyspark_columns[1:]:
1865
- final_result = final_result & col
1866
-
1867
- # Create results table with boolean column
1868
- results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
1869
- return results_tbl
1870
-
1871
- # Default case
1872
- results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
1873
- return results_tbl
1874
-
1875
-
1876
- class SpeciallyValidation:
1877
- def __init__(self, data_tbl, expression, threshold, tbl_type):
1878
- self.data_tbl = data_tbl
1879
- self.expression = expression
1880
- self.threshold = threshold
1881
-
1882
- # Detect the table type
1883
- if tbl_type in (None, "local"):
1884
- # Detect the table type using _get_tbl_type()
1885
- self.tbl_type = _get_tbl_type(data=data_tbl)
1886
- else:
1887
- self.tbl_type = tbl_type
1888
-
1889
- def get_test_results(self) -> any | list[bool]:
1890
- """Evaluate the expression get either a list of booleans or a results table."""
1891
-
1892
- # Get the expression and inspect whether there is a `data` argument
1893
- expression = self.expression
1894
-
1895
- import inspect
1896
-
1897
- # During execution of `specially` validation
1898
- sig = inspect.signature(expression)
1899
- params = list(sig.parameters.keys())
1900
-
1901
- # Execute the function based on its signature
1902
- if len(params) == 0:
1903
- # No parameters: call without arguments
1904
- result = expression()
1905
- elif len(params) == 1:
1906
- # One parameter: pass the data table
1907
- data_tbl = self.data_tbl
1908
- result = expression(data_tbl)
1909
- else:
1910
- # More than one parameter - this doesn't match either allowed signature
1911
- raise ValueError(
1912
- f"The function provided to 'specially()' should have either no parameters or a "
1913
- f"single 'data' parameter, but it has {len(params)} parameters: {params}"
1914
- )
1915
-
1916
- # Determine if the object is a DataFrame by inspecting the string version of its type
1917
- if (
1918
- "pandas" in str(type(result))
1919
- or "polars" in str(type(result))
1920
- or "ibis" in str(type(result))
1921
- ):
1922
- # Get the type of the table
1923
- tbl_type = _get_tbl_type(data=result)
1924
-
1925
- if "pandas" in tbl_type:
1926
- # If it's a Pandas DataFrame, check if the last column is a boolean column
1927
- last_col = result.iloc[:, -1]
1928
-
1929
- import pandas as pd
1930
-
1931
- if last_col.dtype == bool or pd.api.types.is_bool_dtype(last_col):
1932
- # If the last column is a boolean column, rename it as `pb_is_good_`
1933
- result.rename(columns={result.columns[-1]: "pb_is_good_"}, inplace=True)
1934
- elif "polars" in tbl_type:
1935
- # If it's a Polars DataFrame, check if the last column is a boolean column
1936
- last_col_name = result.columns[-1]
1937
- last_col_dtype = result.schema[last_col_name]
1782
+ result_tbl = result_tbl.with_columns(
1783
+ pb_is_good_3=(
1784
+ nw.when(nw.col("pb_is_good_3").is_null())
1785
+ .then(nw.lit(False))
1786
+ .otherwise(nw.col("pb_is_good_3"))
1787
+ )
1788
+ )
1938
1789
 
1939
- import polars as pl
1790
+ result_tbl = result_tbl.with_columns(
1791
+ pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
1792
+ ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
1940
1793
 
1941
- if last_col_dtype == pl.Boolean:
1942
- # If the last column is a boolean column, rename it as `pb_is_good_`
1943
- result = result.rename({last_col_name: "pb_is_good_"})
1944
- elif tbl_type in IBIS_BACKENDS:
1945
- # If it's an Ibis table, check if the last column is a boolean column
1946
- last_col_name = result.columns[-1]
1947
- result_schema = result.schema()
1948
- is_last_col_bool = str(result_schema[last_col_name]) == "boolean"
1794
+ return result_tbl.to_native()
1949
1795
 
1950
- if is_last_col_bool:
1951
- # If the last column is a boolean column, rename it as `pb_is_good_`
1952
- result = result.rename(pb_is_good_=last_col_name)
1953
1796
 
1954
- else: # pragma: no cover
1955
- raise NotImplementedError(f"Support for {tbl_type} is not yet implemented")
1956
-
1957
- elif isinstance(result, bool):
1958
- # If it's a single boolean, return that as a list
1959
- return [result]
1960
-
1961
- elif isinstance(result, list):
1962
- # If it's a list, check that it is a boolean list
1963
- if all(isinstance(x, bool) for x in result):
1964
- # If it's a list of booleans, return it as is
1965
- return result
1966
- else:
1967
- # If it's not a list of booleans, raise an error
1968
- raise TypeError("The result is not a list of booleans.")
1969
- else: # pragma: no cover
1970
- # If it's not a DataFrame or a list, raise an error
1971
- raise TypeError("The result is not a DataFrame or a list of booleans.")
1972
-
1973
- # Return the results table or list of booleans
1974
- return result
1975
-
1976
-
1977
- @dataclass
1978
- class NumberOfTestUnits:
1797
def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
    """
    Check if rows in a DataFrame are distinct.

    Parameters
    ----------
    data_tbl
        A data table.
    columns_subset
        A list of columns to check for distinctness. If `None`, all columns in the
        table are used.

    Returns
    -------
    FrameT
        A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
    """
    tbl = nw.from_native(data_tbl)

    # Use every column for the distinctness test when no subset is given
    if columns_subset is None:
        columns_subset = tbl.columns

    # Group by the columns of interest and count occurrences of each combination
    count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))

    # Join the counts back to the original table so each row carries its group size
    tbl = tbl.join(count_tbl, on=columns_subset, how="left")

    # A count of `1` means the row is distinct (True); any duplicate yields False
    tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")

    return tbl.to_native()
2083
1834
 
2084
- # Handle each combination of `compare_type` and `tgt_col_dtype`, coercing only the
2085
- # `compare_expr` to the type of the column
2086
- if compare_type == "datetime" and tgt_col_dtype == "date":
2087
- # Assume that `compare_expr` is a datetime.datetime object and strip the time part
2088
- # to get a date object
2089
- compare_expr = compare_val.date()
2090
1835
 
2091
- elif compare_type == "date" and tgt_col_dtype == "datetime":
2092
- import datetime
1836
def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
    """
    Flag rows that contain no null values across the selected columns.

    Adds a boolean `pb_is_good_` column that is True for rows with no nulls in
    the inspected columns and False otherwise, returning the native table type.
    """
    frame = nw.from_native(tbl)

    # `_check_nulls_across_columns_nw()` adds an `_any_is_null_` column and also
    # handles the column-subset selection (all columns when the subset is None)
    flagged = _check_nulls_across_columns_nw(table=frame, columns_subset=columns_subset)

    # A row passes when it has no nulls, i.e. the negation of `_any_is_null_`
    passing = flagged.with_columns(pb_is_good_=~nw.col("_any_is_null_")).drop("_any_is_null_")

    return passing.to_native()
+ return result_tbl.to_native()