pointblank 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +0 -2
- pointblank/_constants.py +2 -28
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +1483 -1735
- pointblank/column.py +6 -2
- pointblank/datascan.py +3 -2
- pointblank/schema.py +155 -1
- pointblank/validate.py +459 -222
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/METADATA +3 -2
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/RECORD +14 -15
- pointblank/tf.py +0 -287
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/WHEEL +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import functools
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
-
from typing import
|
|
5
|
+
from typing import Any
|
|
6
6
|
|
|
7
7
|
import narwhals as nw
|
|
8
8
|
from narwhals.dependencies import is_pandas_dataframe, is_polars_dataframe
|
|
@@ -10,17 +10,11 @@ from narwhals.typing import FrameT
|
|
|
10
10
|
|
|
11
11
|
from pointblank._constants import IBIS_BACKENDS
|
|
12
12
|
from pointblank._utils import (
|
|
13
|
-
_column_subset_test_prep,
|
|
14
13
|
_column_test_prep,
|
|
15
14
|
_convert_to_narwhals,
|
|
16
15
|
_get_tbl_type,
|
|
17
16
|
)
|
|
18
17
|
from pointblank.column import Column
|
|
19
|
-
from pointblank.schema import Schema
|
|
20
|
-
from pointblank.thresholds import _threshold_check
|
|
21
|
-
|
|
22
|
-
if TYPE_CHECKING:
|
|
23
|
-
from pointblank._typing import AbsoluteTolBounds
|
|
24
18
|
|
|
25
19
|
|
|
26
20
|
def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
|
|
@@ -91,885 +85,597 @@ def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val:
|
|
|
91
85
|
return compare_val
|
|
92
86
|
|
|
93
87
|
|
|
94
|
-
|
|
95
|
-
class Interrogator:
|
|
88
|
+
def _safe_is_nan_or_null_expr(data_frame: Any, column_expr: Any, column_name: str = None) -> Any:
|
|
96
89
|
"""
|
|
97
|
-
|
|
90
|
+
Create an expression that safely checks for both Null and NaN values.
|
|
91
|
+
|
|
92
|
+
This function handles the case where `is_nan()` is not supported for certain data types (like
|
|
93
|
+
strings) or backends (like `SQLite` via Ibis) by checking the backend type and column type
|
|
94
|
+
first.
|
|
98
95
|
|
|
99
96
|
Parameters
|
|
100
97
|
----------
|
|
101
|
-
|
|
102
|
-
The
|
|
103
|
-
|
|
104
|
-
The column to check.
|
|
105
|
-
|
|
106
|
-
The
|
|
107
|
-
compare
|
|
108
|
-
The value to compare against. Used in the following interrogations:
|
|
109
|
-
- 'gt' for greater than
|
|
110
|
-
- 'lt' for less than
|
|
111
|
-
- 'eq' for equal to
|
|
112
|
-
- 'ne' for not equal to
|
|
113
|
-
- 'ge' for greater than or equal to
|
|
114
|
-
- 'le' for less than or equal to
|
|
115
|
-
set
|
|
116
|
-
The set of values to compare against. Used in the following interrogations:
|
|
117
|
-
- 'isin' for values in the set
|
|
118
|
-
- 'notin' for values not in the set
|
|
119
|
-
pattern
|
|
120
|
-
The regular expression pattern to compare against. Used in the following:
|
|
121
|
-
- 'regex' for values that match the pattern
|
|
122
|
-
low
|
|
123
|
-
The lower bound of the range of values to compare against. Used in the following:
|
|
124
|
-
- 'between' for values between the range
|
|
125
|
-
- 'outside' for values outside the range
|
|
126
|
-
high
|
|
127
|
-
The upper bound of the range of values to compare against. Used in the following:
|
|
128
|
-
- 'between' for values between the range
|
|
129
|
-
- 'outside' for values outside the range
|
|
130
|
-
inclusive
|
|
131
|
-
A tuple of booleans that state which bounds are inclusive. The position of the boolean
|
|
132
|
-
corresponds to the value in the following order: (low, high). Used in the following:
|
|
133
|
-
- 'between' for values between the range
|
|
134
|
-
- 'outside' for values outside the range
|
|
135
|
-
na_pass
|
|
136
|
-
`True` to pass test units with missing values, `False` otherwise.
|
|
137
|
-
tbl_type
|
|
138
|
-
The type of table to use for the assertion. This is used to determine the backend for the
|
|
139
|
-
assertion. The default is 'local' but it can also be any of the table types in the
|
|
140
|
-
`IBIS_BACKENDS` constant.
|
|
98
|
+
data_frame
|
|
99
|
+
The data frame to get schema information from.
|
|
100
|
+
column_expr
|
|
101
|
+
The narwhals column expression to check.
|
|
102
|
+
column_name
|
|
103
|
+
The name of the column.
|
|
141
104
|
|
|
142
105
|
Returns
|
|
143
106
|
-------
|
|
144
|
-
|
|
145
|
-
A
|
|
107
|
+
Any
|
|
108
|
+
A narwhals expression that returns `True` for Null or NaN values.
|
|
146
109
|
"""
|
|
110
|
+
# Always check for null values
|
|
111
|
+
null_check = column_expr.is_null()
|
|
147
112
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
set: list[float | int] = None
|
|
153
|
-
pattern: str = None
|
|
154
|
-
low: float | int | list[float | int] = None
|
|
155
|
-
high: float | int | list[float | int] = None
|
|
156
|
-
inclusive: tuple[bool, bool] = None
|
|
157
|
-
na_pass: bool = False
|
|
158
|
-
tbl_type: str = "local"
|
|
159
|
-
|
|
160
|
-
def __post_init__(self):
|
|
161
|
-
"""
|
|
162
|
-
Post-initialization to process Ibis tables through Narwhals.
|
|
163
|
-
|
|
164
|
-
This converts Ibis tables to Narwhals-wrapped tables to unify
|
|
165
|
-
the processing pathway and reduce code branching.
|
|
166
|
-
"""
|
|
167
|
-
# Import the processing function
|
|
168
|
-
from pointblank._utils import _process_ibis_through_narwhals
|
|
169
|
-
|
|
170
|
-
# Process Ibis tables through Narwhals
|
|
171
|
-
self.x, self.tbl_type = _process_ibis_through_narwhals(self.x, self.tbl_type)
|
|
172
|
-
|
|
173
|
-
def gt(self) -> FrameT | Any:
|
|
174
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
175
|
-
|
|
176
|
-
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
177
|
-
|
|
178
|
-
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
179
|
-
|
|
180
|
-
return (
|
|
181
|
-
self.x.with_columns(
|
|
182
|
-
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
183
|
-
pb_is_good_2=(
|
|
184
|
-
nw.col(self.compare.name).is_null() & self.na_pass
|
|
185
|
-
if isinstance(self.compare, Column)
|
|
186
|
-
else nw.lit(False)
|
|
187
|
-
),
|
|
188
|
-
pb_is_good_3=nw.col(self.column) > compare_expr,
|
|
189
|
-
)
|
|
190
|
-
.with_columns(
|
|
191
|
-
pb_is_good_3=(
|
|
192
|
-
nw.when(nw.col("pb_is_good_3").is_null())
|
|
193
|
-
.then(nw.lit(False))
|
|
194
|
-
.otherwise(nw.col("pb_is_good_3"))
|
|
195
|
-
)
|
|
196
|
-
)
|
|
197
|
-
.with_columns(
|
|
198
|
-
pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
|
|
199
|
-
)
|
|
200
|
-
.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
201
|
-
.to_native()
|
|
202
|
-
)
|
|
203
|
-
|
|
204
|
-
def lt(self) -> FrameT | Any:
|
|
205
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
206
|
-
|
|
207
|
-
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
208
|
-
|
|
209
|
-
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
210
|
-
|
|
211
|
-
return (
|
|
212
|
-
self.x.with_columns(
|
|
213
|
-
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
214
|
-
pb_is_good_2=(
|
|
215
|
-
nw.col(self.compare.name).is_null() & self.na_pass
|
|
216
|
-
if isinstance(self.compare, Column)
|
|
217
|
-
else nw.lit(False)
|
|
218
|
-
),
|
|
219
|
-
pb_is_good_3=nw.col(self.column) < compare_expr,
|
|
220
|
-
)
|
|
221
|
-
.with_columns(
|
|
222
|
-
pb_is_good_3=(
|
|
223
|
-
nw.when(nw.col("pb_is_good_3").is_null())
|
|
224
|
-
.then(nw.lit(False))
|
|
225
|
-
.otherwise(nw.col("pb_is_good_3"))
|
|
226
|
-
)
|
|
227
|
-
)
|
|
228
|
-
.with_columns(
|
|
229
|
-
pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
|
|
230
|
-
)
|
|
231
|
-
.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
232
|
-
.to_native()
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
def eq(self) -> FrameT | Any:
|
|
236
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
237
|
-
|
|
238
|
-
if isinstance(self.compare, Column):
|
|
239
|
-
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
240
|
-
|
|
241
|
-
tbl = self.x.with_columns(
|
|
242
|
-
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
243
|
-
pb_is_good_2=(
|
|
244
|
-
nw.col(self.compare.name).is_null() & self.na_pass
|
|
245
|
-
if isinstance(self.compare, Column)
|
|
246
|
-
else nw.lit(False)
|
|
247
|
-
),
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
tbl = tbl.with_columns(
|
|
251
|
-
pb_is_good_3=(~nw.col(self.compare.name).is_null() & ~nw.col(self.column).is_null())
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
if is_pandas_dataframe(tbl.to_native()):
|
|
255
|
-
tbl = tbl.with_columns(
|
|
256
|
-
pb_is_good_4=nw.col(self.column) - compare_expr,
|
|
257
|
-
)
|
|
258
|
-
|
|
259
|
-
tbl = tbl.with_columns(
|
|
260
|
-
pb_is_good_=nw.col("pb_is_good_1")
|
|
261
|
-
| nw.col("pb_is_good_2")
|
|
262
|
-
| (nw.col("pb_is_good_4") == 0 & ~nw.col("pb_is_good_3").is_null())
|
|
263
|
-
)
|
|
264
|
-
|
|
265
|
-
else:
|
|
266
|
-
tbl = tbl.with_columns(
|
|
267
|
-
pb_is_good_4=nw.col(self.column) == compare_expr,
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
tbl = tbl.with_columns(
|
|
271
|
-
pb_is_good_=nw.col("pb_is_good_1")
|
|
272
|
-
| nw.col("pb_is_good_2")
|
|
273
|
-
| (nw.col("pb_is_good_4") & ~nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2"))
|
|
274
|
-
)
|
|
113
|
+
# For Ibis backends, many don't support `is_nan()` so we stick to Null checks only;
|
|
114
|
+
# use `narwhals.get_native_namespace()` for reliable backend detection
|
|
115
|
+
try:
|
|
116
|
+
native_namespace = nw.get_native_namespace(data_frame)
|
|
275
117
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
118
|
+
# If it's an Ibis backend, only check for null values
|
|
119
|
+
# The namespace is the actual module, so we check its name
|
|
120
|
+
if hasattr(native_namespace, "__name__") and "ibis" in native_namespace.__name__:
|
|
121
|
+
return null_check
|
|
122
|
+
except Exception:
|
|
123
|
+
pass
|
|
279
124
|
|
|
125
|
+
# For non-Ibis backends, try to use `is_nan()` if the column type supports it
|
|
126
|
+
try:
|
|
127
|
+
if hasattr(data_frame, "collect_schema"):
|
|
128
|
+
schema = data_frame.collect_schema()
|
|
129
|
+
elif hasattr(data_frame, "schema"):
|
|
130
|
+
schema = data_frame.schema
|
|
280
131
|
else:
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
284
|
-
|
|
285
|
-
tbl = self.x.with_columns(
|
|
286
|
-
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
287
|
-
pb_is_good_2=(
|
|
288
|
-
nw.col(self.compare.name).is_null() & self.na_pass
|
|
289
|
-
if isinstance(self.compare, Column)
|
|
290
|
-
else nw.lit(False)
|
|
291
|
-
),
|
|
292
|
-
)
|
|
132
|
+
schema = None
|
|
293
133
|
|
|
294
|
-
|
|
134
|
+
if schema and column_name:
|
|
135
|
+
column_dtype = schema.get(column_name)
|
|
136
|
+
if column_dtype:
|
|
137
|
+
dtype_str = str(column_dtype).lower()
|
|
295
138
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
.then(nw.lit(False))
|
|
300
|
-
.otherwise(nw.col("pb_is_good_3"))
|
|
139
|
+
# Check if it's a numeric type that supports NaN
|
|
140
|
+
is_numeric = any(
|
|
141
|
+
num_type in dtype_str for num_type in ["float", "double", "f32", "f64"]
|
|
301
142
|
)
|
|
302
|
-
)
|
|
303
143
|
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
144
|
+
if is_numeric:
|
|
145
|
+
try:
|
|
146
|
+
# For numeric types, try to check both Null and NaN
|
|
147
|
+
return null_check | column_expr.is_nan()
|
|
148
|
+
except Exception:
|
|
149
|
+
# If `is_nan()` fails for any reason, fall back to Null only
|
|
150
|
+
pass
|
|
151
|
+
except Exception:
|
|
152
|
+
pass
|
|
307
153
|
|
|
308
|
-
|
|
154
|
+
# Fallback: just check Null values
|
|
155
|
+
return null_check
|
|
309
156
|
|
|
310
|
-
def ne(self) -> FrameT | Any:
|
|
311
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
312
157
|
|
|
313
|
-
|
|
314
|
-
|
|
158
|
+
class ConjointlyValidation:
|
|
159
|
+
def __init__(self, data_tbl, expressions, threshold, tbl_type):
|
|
160
|
+
self.data_tbl = data_tbl
|
|
161
|
+
self.expressions = expressions
|
|
162
|
+
self.threshold = threshold
|
|
315
163
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
164
|
+
# Detect the table type
|
|
165
|
+
if tbl_type in (None, "local"):
|
|
166
|
+
# Detect the table type using _get_tbl_type()
|
|
167
|
+
self.tbl_type = _get_tbl_type(data=data_tbl)
|
|
319
168
|
else:
|
|
320
|
-
|
|
169
|
+
self.tbl_type = tbl_type
|
|
321
170
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
if not ref_col_has_null_vals and not cmp_col_has_null_vals:
|
|
325
|
-
if isinstance(self.compare, Column):
|
|
326
|
-
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
171
|
+
def get_test_results(self):
|
|
172
|
+
"""Evaluate all expressions and combine them conjointly."""
|
|
327
173
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
174
|
+
if "polars" in self.tbl_type:
|
|
175
|
+
return self._get_polars_results()
|
|
176
|
+
elif "pandas" in self.tbl_type:
|
|
177
|
+
return self._get_pandas_results()
|
|
178
|
+
elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
|
|
179
|
+
return self._get_ibis_results()
|
|
180
|
+
elif "pyspark" in self.tbl_type:
|
|
181
|
+
return self._get_pyspark_results()
|
|
182
|
+
else: # pragma: no cover
|
|
183
|
+
raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")
|
|
331
184
|
|
|
332
|
-
|
|
333
|
-
|
|
185
|
+
def _get_polars_results(self):
|
|
186
|
+
"""Process expressions for Polars DataFrames."""
|
|
187
|
+
import polars as pl
|
|
334
188
|
|
|
335
|
-
|
|
336
|
-
pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
|
|
337
|
-
).to_native()
|
|
189
|
+
polars_results = [] # Changed from polars_expressions to polars_results
|
|
338
190
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
191
|
+
for expr_fn in self.expressions:
|
|
192
|
+
try:
|
|
193
|
+
# First try direct evaluation with native expressions
|
|
194
|
+
expr_result = expr_fn(self.data_tbl)
|
|
195
|
+
if isinstance(expr_result, pl.Expr):
|
|
196
|
+
# This is a Polars expression, we'll evaluate it later
|
|
197
|
+
polars_results.append(("expr", expr_result))
|
|
198
|
+
elif isinstance(expr_result, pl.Series):
|
|
199
|
+
# This is a boolean Series from lambda function
|
|
200
|
+
polars_results.append(("series", expr_result))
|
|
201
|
+
else:
|
|
202
|
+
raise TypeError("Not a valid Polars expression or series")
|
|
203
|
+
except Exception as e:
|
|
204
|
+
try:
|
|
205
|
+
# Try to get a ColumnExpression
|
|
206
|
+
col_expr = expr_fn(None)
|
|
207
|
+
if hasattr(col_expr, "to_polars_expr"):
|
|
208
|
+
polars_expr = col_expr.to_polars_expr()
|
|
209
|
+
polars_results.append(("expr", polars_expr))
|
|
210
|
+
else: # pragma: no cover
|
|
211
|
+
raise TypeError(f"Cannot convert {type(col_expr)} to Polars expression")
|
|
212
|
+
except Exception as e: # pragma: no cover
|
|
213
|
+
print(f"Error evaluating expression: {e}")
|
|
342
214
|
|
|
343
|
-
|
|
344
|
-
|
|
215
|
+
# Combine results with AND logic
|
|
216
|
+
if polars_results:
|
|
217
|
+
# Convert everything to Series for consistent handling
|
|
218
|
+
series_results = []
|
|
219
|
+
for result_type, result_value in polars_results:
|
|
220
|
+
if result_type == "series":
|
|
221
|
+
series_results.append(result_value)
|
|
222
|
+
elif result_type == "expr":
|
|
223
|
+
# Evaluate the expression on the DataFrame to get a Series
|
|
224
|
+
evaluated_series = self.data_tbl.select(result_value).to_series()
|
|
225
|
+
series_results.append(evaluated_series)
|
|
226
|
+
|
|
227
|
+
# Combine all boolean Series with AND logic
|
|
228
|
+
final_result = series_results[0]
|
|
229
|
+
for series in series_results[1:]:
|
|
230
|
+
final_result = final_result & series
|
|
345
231
|
|
|
346
|
-
#
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
tbl = self.x.with_columns(
|
|
350
|
-
pb_is_good_1=nw.col(self.column).is_null(),
|
|
351
|
-
pb_is_good_2=nw.lit(self.column) != nw.col(self.compare.name),
|
|
352
|
-
)
|
|
232
|
+
# Create results table with boolean column
|
|
233
|
+
results_tbl = self.data_tbl.with_columns(pb_is_good_=final_result)
|
|
234
|
+
return results_tbl
|
|
353
235
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
pb_is_good_2=nw.col(self.column) != nw.col(self.compare.name),
|
|
358
|
-
)
|
|
236
|
+
# Default case
|
|
237
|
+
results_tbl = self.data_tbl.with_columns(pb_is_good_=pl.lit(True)) # pragma: no cover
|
|
238
|
+
return results_tbl # pragma: no cover
|
|
359
239
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
)
|
|
240
|
+
def _get_pandas_results(self):
|
|
241
|
+
"""Process expressions for pandas DataFrames."""
|
|
242
|
+
import pandas as pd
|
|
364
243
|
|
|
365
|
-
|
|
366
|
-
# There may be Null values in the pb_is_good_2 column, change those to
|
|
367
|
-
# True if na_pass is True, False otherwise
|
|
244
|
+
pandas_series = []
|
|
368
245
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
)
|
|
246
|
+
for expr_fn in self.expressions:
|
|
247
|
+
try:
|
|
248
|
+
# First try direct evaluation with pandas DataFrame
|
|
249
|
+
expr_result = expr_fn(self.data_tbl)
|
|
374
250
|
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
)
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
tbl = tbl.with_columns(
|
|
383
|
-
pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
251
|
+
# Check that it's a pandas Series with bool dtype
|
|
252
|
+
if isinstance(expr_result, pd.Series):
|
|
253
|
+
if expr_result.dtype == bool or pd.api.types.is_bool_dtype(expr_result):
|
|
254
|
+
pandas_series.append(expr_result)
|
|
255
|
+
else: # pragma: no cover
|
|
256
|
+
raise TypeError(
|
|
257
|
+
f"Expression returned Series of type {expr_result.dtype}, expected bool"
|
|
384
258
|
)
|
|
259
|
+
else: # pragma: no cover
|
|
260
|
+
raise TypeError(f"Expression returned {type(expr_result)}, expected pd.Series")
|
|
385
261
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
)
|
|
391
|
-
|
|
392
|
-
# CASE 2: the comparison column has null values but the reference column does not
|
|
393
|
-
elif not ref_col_has_null_vals and cmp_col_has_null_vals:
|
|
394
|
-
if is_pandas_dataframe(self.x.to_native()):
|
|
395
|
-
tbl = self.x.with_columns(
|
|
396
|
-
pb_is_good_1=nw.col(self.column) != nw.lit(self.compare.name),
|
|
397
|
-
pb_is_good_2=nw.col(self.compare.name).is_null(),
|
|
398
|
-
)
|
|
262
|
+
except Exception as e:
|
|
263
|
+
try:
|
|
264
|
+
# Try as a ColumnExpression (for pb.expr_col style)
|
|
265
|
+
col_expr = expr_fn(None)
|
|
399
266
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
267
|
+
if hasattr(col_expr, "to_pandas_expr"):
|
|
268
|
+
# Watch for NotImplementedError here and re-raise it
|
|
269
|
+
try:
|
|
270
|
+
pandas_expr = col_expr.to_pandas_expr(self.data_tbl)
|
|
271
|
+
pandas_series.append(pandas_expr)
|
|
272
|
+
except NotImplementedError as nie: # pragma: no cover
|
|
273
|
+
# Re-raise NotImplementedError with the original message
|
|
274
|
+
raise NotImplementedError(str(nie))
|
|
275
|
+
else: # pragma: no cover
|
|
276
|
+
raise TypeError(f"Cannot convert {type(col_expr)} to pandas Series")
|
|
277
|
+
except NotImplementedError as nie: # pragma: no cover
|
|
278
|
+
# Re-raise NotImplementedError
|
|
279
|
+
raise NotImplementedError(str(nie))
|
|
280
|
+
except Exception as nested_e: # pragma: no cover
|
|
281
|
+
print(f"Error evaluating pandas expression: {e} -> {nested_e}")
|
|
405
282
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
283
|
+
# Combine results with AND logic
|
|
284
|
+
if pandas_series:
|
|
285
|
+
final_result = pandas_series[0]
|
|
286
|
+
for series in pandas_series[1:]:
|
|
287
|
+
final_result = final_result & series
|
|
410
288
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
)
|
|
416
|
-
else:
|
|
417
|
-
# General case (non-Polars): handle na_pass=True properly
|
|
418
|
-
if self.na_pass:
|
|
419
|
-
tbl = tbl.with_columns(
|
|
420
|
-
pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
421
|
-
)
|
|
289
|
+
# Create results table with boolean column
|
|
290
|
+
results_tbl = self.data_tbl.copy()
|
|
291
|
+
results_tbl["pb_is_good_"] = final_result
|
|
292
|
+
return results_tbl
|
|
422
293
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
294
|
+
# Default case
|
|
295
|
+
results_tbl = self.data_tbl.copy() # pragma: no cover
|
|
296
|
+
results_tbl["pb_is_good_"] = pd.Series( # pragma: no cover
|
|
297
|
+
[True] * len(self.data_tbl), index=self.data_tbl.index
|
|
298
|
+
)
|
|
299
|
+
return results_tbl # pragma: no cover
|
|
428
300
|
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
tbl = self.x.with_columns(
|
|
433
|
-
pb_is_good_1=nw.col(self.column).is_null(),
|
|
434
|
-
pb_is_good_2=nw.col(self.compare.name).is_null(),
|
|
435
|
-
pb_is_good_3=nw.col(self.column) != nw.col(self.compare.name),
|
|
436
|
-
)
|
|
301
|
+
def _get_ibis_results(self):
|
|
302
|
+
"""Process expressions for Ibis tables (including DuckDB)."""
|
|
303
|
+
import ibis
|
|
437
304
|
|
|
438
|
-
|
|
439
|
-
tbl = tbl.with_columns(
|
|
440
|
-
pb_is_good_3=nw.col("pb_is_good_3")
|
|
441
|
-
& ~nw.col("pb_is_good_1")
|
|
442
|
-
& ~nw.col("pb_is_good_2")
|
|
443
|
-
)
|
|
305
|
+
ibis_expressions = []
|
|
444
306
|
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
450
|
-
.then(True)
|
|
451
|
-
.otherwise(False)
|
|
452
|
-
)
|
|
453
|
-
)
|
|
454
|
-
else:
|
|
455
|
-
# General case (non-Polars): handle na_pass=True properly
|
|
456
|
-
if self.na_pass:
|
|
457
|
-
tbl = tbl.with_columns(
|
|
458
|
-
pb_is_good_3=(
|
|
459
|
-
nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
460
|
-
.then(True)
|
|
461
|
-
.otherwise(nw.col("pb_is_good_3"))
|
|
462
|
-
)
|
|
463
|
-
)
|
|
307
|
+
for expr_fn in self.expressions:
|
|
308
|
+
# Strategy 1: Try direct evaluation with native Ibis expressions
|
|
309
|
+
try:
|
|
310
|
+
expr_result = expr_fn(self.data_tbl)
|
|
464
311
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
.
|
|
468
|
-
|
|
469
|
-
|
|
312
|
+
# Check if it's a valid Ibis expression
|
|
313
|
+
if hasattr(expr_result, "_ibis_expr"): # pragma: no cover
|
|
314
|
+
ibis_expressions.append(expr_result)
|
|
315
|
+
continue # Skip to next expression if this worked
|
|
316
|
+
except Exception: # pragma: no cover
|
|
317
|
+
pass # Silently continue to Strategy 2
|
|
470
318
|
|
|
471
|
-
|
|
472
|
-
#
|
|
473
|
-
|
|
474
|
-
|
|
319
|
+
# Strategy 2: Try with ColumnExpression
|
|
320
|
+
try: # pragma: no cover
|
|
321
|
+
# Skip this strategy if we don't have an expr_col implementation
|
|
322
|
+
if not hasattr(self, "to_ibis_expr"):
|
|
323
|
+
continue
|
|
475
324
|
|
|
476
|
-
|
|
325
|
+
col_expr = expr_fn(None)
|
|
477
326
|
|
|
478
|
-
if
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
pb_is_good_2=nw.lit(self.column) != nw.lit(compare_expr),
|
|
482
|
-
)
|
|
327
|
+
# Skip if we got None
|
|
328
|
+
if col_expr is None:
|
|
329
|
+
continue
|
|
483
330
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
331
|
+
# Convert ColumnExpression to Ibis expression
|
|
332
|
+
if hasattr(col_expr, "to_ibis_expr"):
|
|
333
|
+
ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
|
|
334
|
+
ibis_expressions.append(ibis_expr)
|
|
335
|
+
except Exception: # pragma: no cover
|
|
336
|
+
# Silent failure - we already tried both strategies
|
|
337
|
+
pass
|
|
488
338
|
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
339
|
+
# Combine expressions
|
|
340
|
+
if ibis_expressions: # pragma: no cover
|
|
341
|
+
try:
|
|
342
|
+
final_result = ibis_expressions[0]
|
|
343
|
+
for expr in ibis_expressions[1:]:
|
|
344
|
+
final_result = final_result & expr
|
|
494
345
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
346
|
+
# Create results table with boolean column
|
|
347
|
+
results_tbl = self.data_tbl.mutate(pb_is_good_=final_result)
|
|
348
|
+
return results_tbl
|
|
349
|
+
except Exception as e:
|
|
350
|
+
print(f"Error combining Ibis expressions: {e}")
|
|
500
351
|
|
|
501
|
-
|
|
352
|
+
# Default case
|
|
353
|
+
results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
|
|
354
|
+
return results_tbl
|
|
502
355
|
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
| (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
|
|
507
|
-
)
|
|
508
|
-
)
|
|
356
|
+
def _get_pyspark_results(self):
|
|
357
|
+
"""Process expressions for PySpark DataFrames."""
|
|
358
|
+
from pyspark.sql import functions as F
|
|
509
359
|
|
|
510
|
-
|
|
360
|
+
pyspark_columns = []
|
|
511
361
|
|
|
512
|
-
|
|
362
|
+
for expr_fn in self.expressions:
|
|
363
|
+
try:
|
|
364
|
+
# First try direct evaluation with PySpark DataFrame
|
|
365
|
+
expr_result = expr_fn(self.data_tbl)
|
|
513
366
|
|
|
367
|
+
# Check if it's a PySpark Column
|
|
368
|
+
if hasattr(expr_result, "_jc"): # PySpark Column has _jc attribute
|
|
369
|
+
pyspark_columns.append(expr_result)
|
|
514
370
|
else:
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
tbl = self.x.with_columns(
|
|
518
|
-
pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
|
|
519
|
-
pb_is_good_2=nw.lit(self.na_pass), # Pass if any Null in val or compare
|
|
371
|
+
raise TypeError(
|
|
372
|
+
f"Expression returned {type(expr_result)}, expected PySpark Column"
|
|
520
373
|
)
|
|
521
374
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
375
|
+
except Exception as e:
|
|
376
|
+
try:
|
|
377
|
+
# Try as a ColumnExpression (for pb.expr_col style)
|
|
378
|
+
col_expr = expr_fn(None)
|
|
379
|
+
|
|
380
|
+
if hasattr(col_expr, "to_pyspark_expr"):
|
|
381
|
+
# Convert to PySpark expression
|
|
382
|
+
pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
|
|
383
|
+
pyspark_columns.append(pyspark_expr)
|
|
384
|
+
else:
|
|
385
|
+
raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
|
|
386
|
+
except Exception as nested_e:
|
|
387
|
+
print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
|
|
530
388
|
|
|
531
|
-
|
|
389
|
+
# Combine results with AND logic
|
|
390
|
+
if pyspark_columns:
|
|
391
|
+
final_result = pyspark_columns[0]
|
|
392
|
+
for col in pyspark_columns[1:]:
|
|
393
|
+
final_result = final_result & col
|
|
532
394
|
|
|
533
|
-
|
|
534
|
-
|
|
395
|
+
# Create results table with boolean column
|
|
396
|
+
results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
|
|
397
|
+
return results_tbl
|
|
535
398
|
|
|
536
|
-
|
|
399
|
+
# Default case
|
|
400
|
+
results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
|
|
401
|
+
return results_tbl
|
|
537
402
|
|
|
538
|
-
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
539
403
|
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
if isinstance(self.compare, Column)
|
|
546
|
-
else nw.lit(False)
|
|
547
|
-
),
|
|
548
|
-
pb_is_good_3=nw.col(self.column) >= compare_expr,
|
|
549
|
-
)
|
|
550
|
-
.with_columns(
|
|
551
|
-
pb_is_good_3=(
|
|
552
|
-
nw.when(nw.col("pb_is_good_3").is_null())
|
|
553
|
-
.then(nw.lit(False))
|
|
554
|
-
.otherwise(nw.col("pb_is_good_3"))
|
|
555
|
-
)
|
|
556
|
-
)
|
|
557
|
-
.with_columns(
|
|
558
|
-
pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
|
|
559
|
-
)
|
|
560
|
-
)
|
|
404
|
+
class SpeciallyValidation:
|
|
405
|
+
def __init__(self, data_tbl, expression, threshold, tbl_type):
|
|
406
|
+
self.data_tbl = data_tbl
|
|
407
|
+
self.expression = expression
|
|
408
|
+
self.threshold = threshold
|
|
561
409
|
|
|
562
|
-
|
|
410
|
+
# Detect the table type
|
|
411
|
+
if tbl_type in (None, "local"):
|
|
412
|
+
# Detect the table type using _get_tbl_type()
|
|
413
|
+
self.tbl_type = _get_tbl_type(data=data_tbl)
|
|
414
|
+
else:
|
|
415
|
+
self.tbl_type = tbl_type
|
|
563
416
|
|
|
564
|
-
def
|
|
565
|
-
|
|
417
|
+
def get_test_results(self) -> any | list[bool]:
|
|
418
|
+
"""Evaluate the expression get either a list of booleans or a results table."""
|
|
566
419
|
|
|
567
|
-
|
|
420
|
+
# Get the expression and inspect whether there is a `data` argument
|
|
421
|
+
expression = self.expression
|
|
568
422
|
|
|
569
|
-
|
|
423
|
+
import inspect
|
|
570
424
|
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
.
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
)
|
|
587
|
-
|
|
588
|
-
.with_columns(
|
|
589
|
-
pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
|
|
425
|
+
# During execution of `specially` validation
|
|
426
|
+
sig = inspect.signature(expression)
|
|
427
|
+
params = list(sig.parameters.keys())
|
|
428
|
+
|
|
429
|
+
# Execute the function based on its signature
|
|
430
|
+
if len(params) == 0:
|
|
431
|
+
# No parameters: call without arguments
|
|
432
|
+
result = expression()
|
|
433
|
+
elif len(params) == 1:
|
|
434
|
+
# One parameter: pass the data table
|
|
435
|
+
data_tbl = self.data_tbl
|
|
436
|
+
result = expression(data_tbl)
|
|
437
|
+
else:
|
|
438
|
+
# More than one parameter - this doesn't match either allowed signature
|
|
439
|
+
raise ValueError(
|
|
440
|
+
f"The function provided to 'specially()' should have either no parameters or a "
|
|
441
|
+
f"single 'data' parameter, but it has {len(params)} parameters: {params}"
|
|
590
442
|
)
|
|
591
|
-
.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
592
|
-
.to_native()
|
|
593
|
-
)
|
|
594
443
|
|
|
595
|
-
|
|
596
|
-
|
|
444
|
+
# Determine if the object is a DataFrame by inspecting the string version of its type
|
|
445
|
+
if (
|
|
446
|
+
"pandas" in str(type(result))
|
|
447
|
+
or "polars" in str(type(result))
|
|
448
|
+
or "ibis" in str(type(result))
|
|
449
|
+
):
|
|
450
|
+
# Get the type of the table
|
|
451
|
+
tbl_type = _get_tbl_type(data=result)
|
|
597
452
|
|
|
598
|
-
|
|
599
|
-
|
|
453
|
+
if "pandas" in tbl_type:
|
|
454
|
+
# If it's a Pandas DataFrame, check if the last column is a boolean column
|
|
455
|
+
last_col = result.iloc[:, -1]
|
|
600
456
|
|
|
601
|
-
|
|
602
|
-
high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
|
|
457
|
+
import pandas as pd
|
|
603
458
|
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
),
|
|
612
|
-
pb_is_good_4=nw.lit(self.na_pass), # Pass if any Null in lb, val, or ub
|
|
613
|
-
)
|
|
459
|
+
if last_col.dtype == bool or pd.api.types.is_bool_dtype(last_col):
|
|
460
|
+
# If the last column is a boolean column, rename it as `pb_is_good_`
|
|
461
|
+
result.rename(columns={result.columns[-1]: "pb_is_good_"}, inplace=True)
|
|
462
|
+
elif "polars" in tbl_type:
|
|
463
|
+
# If it's a Polars DataFrame, check if the last column is a boolean column
|
|
464
|
+
last_col_name = result.columns[-1]
|
|
465
|
+
last_col_dtype = result.schema[last_col_name]
|
|
614
466
|
|
|
615
|
-
|
|
616
|
-
tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) >= low_val)
|
|
617
|
-
else:
|
|
618
|
-
tbl = tbl.with_columns(pb_is_good_5=nw.col(self.column) > low_val)
|
|
467
|
+
import polars as pl
|
|
619
468
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
469
|
+
if last_col_dtype == pl.Boolean:
|
|
470
|
+
# If the last column is a boolean column, rename it as `pb_is_good_`
|
|
471
|
+
result = result.rename({last_col_name: "pb_is_good_"})
|
|
472
|
+
elif tbl_type in IBIS_BACKENDS:
|
|
473
|
+
# If it's an Ibis table, check if the last column is a boolean column
|
|
474
|
+
last_col_name = result.columns[-1]
|
|
475
|
+
result_schema = result.schema()
|
|
476
|
+
is_last_col_bool = str(result_schema[last_col_name]) == "boolean"
|
|
624
477
|
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
.then(nw.lit(False))
|
|
629
|
-
.otherwise(nw.col("pb_is_good_5"))
|
|
630
|
-
)
|
|
631
|
-
)
|
|
478
|
+
if is_last_col_bool:
|
|
479
|
+
# If the last column is a boolean column, rename it as `pb_is_good_`
|
|
480
|
+
result = result.rename(pb_is_good_=last_col_name)
|
|
632
481
|
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
nw.when(nw.col("pb_is_good_6").is_null())
|
|
636
|
-
.then(nw.lit(False))
|
|
637
|
-
.otherwise(nw.col("pb_is_good_6"))
|
|
638
|
-
)
|
|
639
|
-
)
|
|
482
|
+
else: # pragma: no cover
|
|
483
|
+
raise NotImplementedError(f"Support for {tbl_type} is not yet implemented")
|
|
640
484
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
"
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
"pb_is_good_5",
|
|
657
|
-
"pb_is_good_6",
|
|
658
|
-
)
|
|
659
|
-
.to_native()
|
|
660
|
-
)
|
|
485
|
+
elif isinstance(result, bool):
|
|
486
|
+
# If it's a single boolean, return that as a list
|
|
487
|
+
return [result]
|
|
488
|
+
|
|
489
|
+
elif isinstance(result, list):
|
|
490
|
+
# If it's a list, check that it is a boolean list
|
|
491
|
+
if all(isinstance(x, bool) for x in result):
|
|
492
|
+
# If it's a list of booleans, return it as is
|
|
493
|
+
return result
|
|
494
|
+
else:
|
|
495
|
+
# If it's not a list of booleans, raise an error
|
|
496
|
+
raise TypeError("The result is not a list of booleans.")
|
|
497
|
+
else: # pragma: no cover
|
|
498
|
+
# If it's not a DataFrame or a list, raise an error
|
|
499
|
+
raise TypeError("The result is not a DataFrame or a list of booleans.")
|
|
661
500
|
|
|
662
|
-
|
|
501
|
+
# Return the results table or list of booleans
|
|
502
|
+
return result
|
|
663
503
|
|
|
664
|
-
def outside(self) -> FrameT | Any:
|
|
665
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
666
504
|
|
|
667
|
-
|
|
668
|
-
|
|
505
|
+
@dataclass
class NumberOfTestUnits:
    """
    Count the number of test units (rows) available for a validation on a column.
    """

    # The table being validated
    df: FrameT
    # Name of the column the validation step targets
    column: str

    def get_test_units(self, tbl_type: str) -> int:
        """
        Return the number of test units for the stored table.

        Parameters
        ----------
        tbl_type
            The table type label; this decides which counting strategy is used.

        Returns
        -------
        int
            For `"pandas"`, `"polars"`, `"pyspark"`, and `"local"` tables this is `len()` of the
            prepared (and, if lazy, collected) frame. For Ibis backends it is whatever
            `df.count().to_polars()` yields (NOTE(review): confirm this is a plain integer). For
            any other `tbl_type` the method falls through and returns `None`.
        """
        in_memory_types = ("pandas", "polars", "pyspark", "local")

        if tbl_type in in_memory_types:
            # Convert to a narwhals-compatible frame; the column-existence check is
            # explicitly disabled here (`check_exists=False`)
            prepared = _column_test_prep(
                df=self.df, column=self.column, allowed_types=None, check_exists=False
            )

            # LazyFrames don't support len(), so materialize them first
            frame = prepared.collect() if hasattr(prepared, "collect") else prepared

            return len(frame)

        if tbl_type in IBIS_BACKENDS:
            # Get the count of test units and convert to a native format
            # TODO: check whether pandas or polars is available
            return self.df.count().to_polars()

        # No branch matched: same result as the original implicit fall-through
        return None
|
|
705
537
|
|
|
706
|
-
tbl = (
|
|
707
|
-
tbl.with_columns(
|
|
708
|
-
pb_is_good_=(
|
|
709
|
-
(
|
|
710
|
-
(nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
|
|
711
|
-
& nw.col("pb_is_good_4")
|
|
712
|
-
)
|
|
713
|
-
| (
|
|
714
|
-
(nw.col("pb_is_good_5") & ~nw.col("pb_is_good_3"))
|
|
715
|
-
| (nw.col("pb_is_good_6")) & ~nw.col("pb_is_good_2")
|
|
716
|
-
)
|
|
717
|
-
)
|
|
718
|
-
)
|
|
719
|
-
.drop(
|
|
720
|
-
"pb_is_good_1",
|
|
721
|
-
"pb_is_good_2",
|
|
722
|
-
"pb_is_good_3",
|
|
723
|
-
"pb_is_good_4",
|
|
724
|
-
"pb_is_good_5",
|
|
725
|
-
"pb_is_good_6",
|
|
726
|
-
)
|
|
727
|
-
.to_native()
|
|
728
|
-
)
|
|
729
538
|
|
|
730
|
-
|
|
539
|
+
def _get_compare_expr_nw(compare: Any) -> Any:
    """
    Translate a comparison value into something usable in a narwhals expression.

    Anything that is not a `Column` is returned untouched. A `Column` is converted to
    `nw.col(<name>)`, where the name is taken from the column's `exprs` attribute.

    Raises
    ------
    ValueError
        If `compare` is a `Column` whose `exprs` attribute is not a string.
    """
    # Literal values (numbers, strings, dates, ...) need no translation
    if not isinstance(compare, Column):
        return compare

    column_name = compare.exprs
    if not isinstance(column_name, str):
        raise ValueError("The column expression must be a string.")  # pragma: no cover

    return nw.col(column_name)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def _column_has_null_values(table: FrameT, column: str) -> bool:
|
|
548
|
+
try:
|
|
549
|
+
# Try the standard null_count() method
|
|
550
|
+
null_count = (table.select(column).null_count())[column][0]
|
|
551
|
+
except AttributeError:
|
|
552
|
+
# For LazyFrames, collect first then get null count
|
|
553
|
+
try:
|
|
554
|
+
collected = table.select(column).collect()
|
|
555
|
+
null_count = (collected.null_count())[column][0]
|
|
556
|
+
except Exception:
|
|
557
|
+
# Fallback: check if any values are null
|
|
558
|
+
try:
|
|
559
|
+
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
560
|
+
null_count = result["null_count"][0]
|
|
561
|
+
except Exception:
|
|
562
|
+
# Last resort: return False (assume no nulls)
|
|
563
|
+
return False
|
|
564
|
+
|
|
565
|
+
if null_count is None or null_count == 0:
|
|
566
|
+
return False
|
|
731
567
|
|
|
732
|
-
|
|
733
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
568
|
+
return True
|
|
734
569
|
|
|
735
|
-
can_be_null: bool = None in self.set
|
|
736
570
|
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
571
|
+
def _check_nulls_across_columns_nw(table, columns_subset):
    """
    Add an `_any_is_null_` column flagging rows that have a null in any inspected column.

    Parameters
    ----------
    table
        The table to inspect; it must support `.columns` and `.with_columns()`.
    columns_subset
        Column names to inspect. When falsy (`None` or empty), every column of `table` is used.

    Returns
    -------
    The result of `table.with_columns(_any_is_null_=...)`, where the new column is the OR of
    `is_null()` over the inspected columns.
    """
    # Fall back to all table columns when no subset was given
    names_to_check = columns_subset if columns_subset else table.columns

    # OR together the per-column null checks; stays `None` if there are no columns
    any_null = None
    for name in names_to_check:
        this_is_null = nw.col(name).is_null()
        any_null = this_is_null if any_null is None else any_null | this_is_null

    # Add the combined expression as a new column to the table
    return table.with_columns(_any_is_null_=any_null)
|
|
753
586
|
|
|
754
|
-
def regex(self) -> FrameT | Any:
|
|
755
|
-
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
756
587
|
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
760
|
-
pb_is_good_2=nw.when(~nw.col(self.column).is_null())
|
|
761
|
-
.then(nw.col(self.column).str.contains(pattern=self.pattern))
|
|
762
|
-
.otherwise(False),
|
|
763
|
-
)
|
|
764
|
-
.with_columns(pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
765
|
-
.drop("pb_is_good_1", "pb_is_good_2")
|
|
766
|
-
.to_native()
|
|
767
|
-
)
|
|
588
|
+
def _modify_datetime_compare_val(tgt_column: any, compare_val: any) -> any:
|
|
589
|
+
tgt_col_dtype_str = str(tgt_column.dtype).lower()
|
|
768
590
|
|
|
769
|
-
|
|
770
|
-
|
|
591
|
+
if compare_val is isinstance(compare_val, Column): # pragma: no cover
|
|
592
|
+
return compare_val
|
|
771
593
|
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
).to_native()
|
|
594
|
+
# Get the type of `compare_expr` and convert, if necessary, to the type of the column
|
|
595
|
+
compare_type_str = str(type(compare_val)).lower()
|
|
775
596
|
|
|
776
|
-
|
|
777
|
-
|
|
597
|
+
if "datetime.datetime" in compare_type_str:
|
|
598
|
+
compare_type = "datetime"
|
|
599
|
+
elif "datetime.date" in compare_type_str:
|
|
600
|
+
compare_type = "date"
|
|
601
|
+
else:
|
|
602
|
+
compare_type = "other"
|
|
778
603
|
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
604
|
+
if "datetime" in tgt_col_dtype_str:
|
|
605
|
+
tgt_col_dtype = "datetime"
|
|
606
|
+
elif "date" in tgt_col_dtype_str or "object" in tgt_col_dtype_str:
|
|
607
|
+
# Object type is used for date columns in Pandas
|
|
608
|
+
tgt_col_dtype = "date"
|
|
609
|
+
else:
|
|
610
|
+
tgt_col_dtype = "other"
|
|
782
611
|
|
|
783
|
-
|
|
784
|
-
|
|
612
|
+
# Handle each combination of `compare_type` and `tgt_col_dtype`, coercing only the
|
|
613
|
+
# `compare_expr` to the type of the column
|
|
614
|
+
if compare_type == "datetime" and tgt_col_dtype == "date":
|
|
615
|
+
# Assume that `compare_expr` is a datetime.datetime object and strip the time part
|
|
616
|
+
# to get a date object
|
|
617
|
+
compare_expr = compare_val.date()
|
|
785
618
|
|
|
786
|
-
|
|
619
|
+
elif compare_type == "date" and tgt_col_dtype == "datetime":
|
|
620
|
+
import datetime
|
|
787
621
|
|
|
788
|
-
#
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
else:
|
|
792
|
-
columns_subset = self.columns_subset
|
|
622
|
+
# Assume that `compare_expr` is a `datetime.date` object so add in the time part
|
|
623
|
+
# to get a `datetime.datetime` object
|
|
624
|
+
compare_expr = datetime.datetime.combine(compare_val, datetime.datetime.min.time())
|
|
793
625
|
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
626
|
+
else:
|
|
627
|
+
return compare_val
|
|
797
628
|
|
|
798
|
-
|
|
799
|
-
tbl = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
629
|
+
return compare_expr
|
|
800
630
|
|
|
801
|
-
# Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
|
|
802
|
-
tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
803
631
|
|
|
804
|
-
|
|
632
|
+
def col_vals_expr(data_tbl: FrameT, expr, tbl_type: str = "local"):
    """
    Evaluate a predicate expression against a table and store the result in `pb_is_good_`.

    Parameters
    ----------
    data_tbl
        The table to evaluate the expression against.
    expr
        The predicate. Its kind is inferred from the string form of its type: a narwhals
        expression, a Polars expression, or (anything else) something handed to Pandas.
    tbl_type
        The table type label; only `"local"` tables are processed.

    Returns
    -------
    The table with an added `pb_is_good_` column when the expression kind and the table
    library are compatible; otherwise `data_tbl` is returned unchanged.
    """
    if tbl_type != "local":
        # For remote backends, return original table (placeholder)
        return data_tbl

    # Work out what kind of expression was provided from its type's string form
    expr_type_repr = str(type(expr))
    mentions_expr = "expr" in expr_type_repr
    if "narwhals" in expr_type_repr and mentions_expr:
        expr_flavor = "narwhals"
    elif "polars" in expr_type_repr and mentions_expr:
        expr_flavor = "polars"
    else:
        expr_flavor = "pandas"

    # Determine whether this is a Pandas or Polars table
    frame_is_polars = "polars" in _get_tbl_type(data=data_tbl)

    if expr_flavor == "narwhals":
        # narwhals expressions are applied through a narwhals frame, then converted back
        nw_frame = _convert_to_narwhals(df=data_tbl)
        return nw_frame.with_columns(pb_is_good_=expr).to_native()

    if frame_is_polars and expr_flavor == "polars":
        return data_tbl.with_columns(pb_is_good_=expr)

    if not frame_is_polars and expr_flavor == "pandas":
        return data_tbl.assign(pb_is_good_=expr)

    # Expression kind and table library don't line up: hand the table back untouched
    return data_tbl
|
|
822
660
|
|
|
823
661
|
|
|
824
|
-
|
|
825
|
-
class ColValsCompareOne:
|
|
662
|
+
def rows_complete(data_tbl: FrameT, columns_subset: list[str] | None):
    """
    Check if rows in a DataFrame are complete (no null values).

    Function-style replacement for the former `RowsComplete` dataclass: the table is
    converted with `_convert_to_narwhals()` and then passed, together with `columns_subset`,
    to `interrogate_rows_complete()`, whose result is returned as is.

    Parameters
    ----------
    data_tbl
        The table to check.
    columns_subset
        The columns to consider, or `None`; forwarded unchanged.
    """
    return interrogate_rows_complete(
        tbl=_convert_to_narwhals(df=data_tbl),
        columns_subset=columns_subset,
    )
|
|
967
674
|
|
|
968
675
|
|
|
969
|
-
|
|
970
|
-
class ColValsCompareTwo:
|
|
676
|
+
def col_exists(data_tbl: FrameT, column: str) -> bool:
    """
    Check if a column exists in a DataFrame.

    Parameters
    ----------
    data_tbl
        A data table.
    column
        The column to check.

    Returns
    -------
    bool
        `True` if the column exists, `False` otherwise.
    """
    # Membership is tested against the column names of the narwhals-converted table
    available_columns = _convert_to_narwhals(df=data_tbl).columns
    return column in available_columns
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def col_schema_match(
    data_tbl: FrameT,
    schema,
    complete: bool,
    in_order: bool,
    case_sensitive_colnames: bool,
    case_sensitive_dtypes: bool,
    full_match_dtypes: bool,
    threshold: int,
) -> bool:
    """
    Check if DataFrame schema matches expected schema.

    Delegates to `pointblank.schema._check_schema_match()`, forwarding the table, the
    expected schema, and the five matching options by keyword, and returns its result.

    Note that `threshold` is accepted as part of the signature but is not forwarded to
    `_check_schema_match()`.
    """
    # Imported at call time rather than at module level
    from pointblank.schema import _check_schema_match

    matching_options = {
        "complete": complete,
        "in_order": in_order,
        "case_sensitive_colnames": case_sensitive_colnames,
        "case_sensitive_dtypes": case_sensitive_dtypes,
        "full_match_dtypes": full_match_dtypes,
    }

    return _check_schema_match(data_tbl=data_tbl, schema=schema, **matching_options)
|
|
1005
720
|
|
|
1006
|
-
data_tbl: FrameT
|
|
1007
|
-
column: str
|
|
1008
|
-
value1: float | int
|
|
1009
|
-
value2: float | int
|
|
1010
|
-
inclusive: tuple[bool, bool]
|
|
1011
|
-
na_pass: bool
|
|
1012
|
-
threshold: int
|
|
1013
|
-
assertion_method: str
|
|
1014
|
-
allowed_types: list[str]
|
|
1015
|
-
tbl_type: str = "local"
|
|
1016
|
-
|
|
1017
|
-
def __post_init__(self):
|
|
1018
|
-
if self.tbl_type == "local":
|
|
1019
|
-
# Convert the DataFrame to a format that narwhals can work with, and:
|
|
1020
|
-
# - check if the `column=` exists
|
|
1021
|
-
# - check if the `column=` type is compatible with the test
|
|
1022
|
-
tbl = _column_test_prep(
|
|
1023
|
-
df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
|
|
1024
|
-
)
|
|
1025
|
-
|
|
1026
|
-
# TODO: For Ibis backends, check if the column exists and if the column type is compatible;
|
|
1027
|
-
# for now, just pass the table as is
|
|
1028
|
-
else:
|
|
1029
|
-
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1030
|
-
tbl = self.data_tbl
|
|
1031
|
-
|
|
1032
|
-
# Collect results for the test units; the results are a list of booleans where
|
|
1033
|
-
# `True` indicates a passing test unit
|
|
1034
|
-
if self.assertion_method == "between":
|
|
1035
|
-
self.test_unit_res = Interrogator(
|
|
1036
|
-
x=tbl,
|
|
1037
|
-
column=self.column,
|
|
1038
|
-
low=self.value1,
|
|
1039
|
-
high=self.value2,
|
|
1040
|
-
inclusive=self.inclusive,
|
|
1041
|
-
na_pass=self.na_pass,
|
|
1042
|
-
tbl_type=self.tbl_type,
|
|
1043
|
-
).between()
|
|
1044
|
-
elif self.assertion_method == "outside":
|
|
1045
|
-
self.test_unit_res = Interrogator(
|
|
1046
|
-
x=tbl,
|
|
1047
|
-
column=self.column,
|
|
1048
|
-
low=self.value1,
|
|
1049
|
-
high=self.value2,
|
|
1050
|
-
inclusive=self.inclusive,
|
|
1051
|
-
na_pass=self.na_pass,
|
|
1052
|
-
tbl_type=self.tbl_type,
|
|
1053
|
-
).outside()
|
|
1054
|
-
else:
|
|
1055
|
-
raise ValueError(
|
|
1056
|
-
"""Invalid assertion type. Use:
|
|
1057
|
-
- `between` for values between two values, or
|
|
1058
|
-
- `outside` for values outside two values."""
|
|
1059
|
-
)
|
|
1060
|
-
|
|
1061
|
-
def get_test_results(self):
|
|
1062
|
-
return self.test_unit_res
|
|
1063
721
|
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
722
|
+
def row_count_match(data_tbl: FrameT, count, inverse: bool, abs_tol_bounds) -> bool:
    """
    Check if DataFrame row count matches expected count.

    Parameters
    ----------
    data_tbl
        The table whose rows are counted (via `pointblank.validate.get_row_count()`).
    count
        The expected number of rows.
    inverse
        When truthy, the outcome is negated (pass when the count is *outside* the range).
    abs_tol_bounds
        A two-item `(lower, upper)` pair of absolute tolerances; the accepted range is
        `count - lower` through `count + upper`, both ends inclusive.
    """
    from pointblank.validate import get_row_count

    n_rows: int = get_row_count(data=data_tbl)

    tol_below, tol_above = abs_tol_bounds
    fewest_allowed: int = count - tol_below
    most_allowed: int = count + tol_above

    within_tolerance = fewest_allowed <= n_rows <= most_allowed

    return not within_tolerance if inverse else within_tolerance
|
|
1074
737
|
|
|
1075
738
|
|
|
1076
|
-
|
|
1077
|
-
|
|
739
|
+
def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
    """
    Check if DataFrame column count matches expected count.

    The number of columns is obtained from `pointblank.validate.get_column_count()`. With a
    falsy `inverse` the result is whether that number equals `count`; with a truthy
    `inverse` it is whether the number differs from `count`.
    """
    from pointblank.validate import get_column_count

    n_columns = get_column_count(data=data_tbl)

    return n_columns != count if inverse else n_columns == count
|
|
1098
749
|
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
otherwise.
|
|
750
|
+
|
|
751
|
+
def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
    """
    Perform conjoint validation using multiple expressions.

    Function-style entry point: builds a `ConjointlyValidation` from the four arguments
    (passed by keyword, unchanged) and returns what its `get_test_results()` method yields.
    """
    return ConjointlyValidation(
        data_tbl=data_tbl,
        expressions=expressions,
        threshold=threshold,
        tbl_type=tbl_type,
    ).get_test_results()
|
|
1136
764
|
|
|
1137
|
-
def get_test_results(self):
|
|
1138
|
-
return self.test_unit_res
|
|
1139
765
|
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
# test units to the threshold for failing test units
|
|
766
|
+
def interrogate_gt(tbl: FrameT, column: str, compare: Any, na_pass: bool) -> FrameT:
    """
    Greater than interrogation.

    Thin wrapper: forwards `tbl`, `column`, `compare`, and `na_pass` unchanged to
    `_interrogate_comparison_base()` with the operator code `"gt"` and returns its result.
    """
    return _interrogate_comparison_base(tbl, column, compare, na_pass, "gt")
|
|
1144
769
|
|
|
1145
|
-
results_list = nw.from_native(self.test_unit_res)["pb_is_good_"].to_list()
|
|
1146
770
|
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
771
|
+
def interrogate_lt(tbl: FrameT, column: str, compare: Any, na_pass: bool) -> FrameT:
    """
    Less than interrogation.

    Thin wrapper: forwards `tbl`, `column`, `compare`, and `na_pass` unchanged to
    `_interrogate_comparison_base()` with the operator code `"lt"` and returns its result.
    """
    return _interrogate_comparison_base(tbl, column, compare, na_pass, "lt")
|
|
1150
774
|
|
|
1151
775
|
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
""
|
|
1155
|
-
Check if values in a column match a regular expression pattern.
|
|
776
|
+
def interrogate_ge(tbl: FrameT, column: str, compare: Any, na_pass: bool) -> FrameT:
    """
    Greater than or equal interrogation.

    Thin wrapper: forwards `tbl`, `column`, `compare`, and `na_pass` unchanged to
    `_interrogate_comparison_base()` with the operator code `"ge"` and returns its result.
    """
    return _interrogate_comparison_base(tbl, column, compare, na_pass, "ge")
|
|
1156
779
|
|
|
1157
|
-
Parameters
|
|
1158
|
-
----------
|
|
1159
|
-
data_tbl
|
|
1160
|
-
A data table.
|
|
1161
|
-
column
|
|
1162
|
-
The column to check.
|
|
1163
|
-
pattern
|
|
1164
|
-
The regular expression pattern to check against.
|
|
1165
|
-
na_pass
|
|
1166
|
-
`True` to pass test units with missing values, `False` otherwise.
|
|
1167
|
-
threshold
|
|
1168
|
-
The maximum number of failing test units to allow.
|
|
1169
|
-
allowed_types
|
|
1170
|
-
The allowed data types for the column.
|
|
1171
|
-
tbl_type
|
|
1172
|
-
The type of table to use for the assertion.
|
|
1173
780
|
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
`True` when test units pass below the threshold level for failing test units, `False`
|
|
1178
|
-
otherwise.
|
|
1179
|
-
"""
|
|
781
|
+
def interrogate_le(tbl: FrameT, column: str, compare: Any, na_pass: bool) -> FrameT:
    """
    Less than or equal interrogation.

    Thin wrapper: forwards `tbl`, `column`, `compare`, and `na_pass` unchanged to
    `_interrogate_comparison_base()` with the operator code `"le"` and returns its result.
    """
    return _interrogate_comparison_base(tbl, column, compare, na_pass, "le")
|
|
1180
784
|
|
|
1181
|
-
data_tbl: FrameT
|
|
1182
|
-
column: str
|
|
1183
|
-
pattern: str
|
|
1184
|
-
na_pass: bool
|
|
1185
|
-
threshold: int
|
|
1186
|
-
allowed_types: list[str]
|
|
1187
|
-
tbl_type: str = "local"
|
|
1188
|
-
|
|
1189
|
-
def __post_init__(self):
|
|
1190
|
-
if self.tbl_type == "local":
|
|
1191
|
-
# Convert the DataFrame to a format that narwhals can work with, and:
|
|
1192
|
-
# - check if the `column=` exists
|
|
1193
|
-
# - check if the `column=` type is compatible with the test
|
|
1194
|
-
tbl = _column_test_prep(
|
|
1195
|
-
df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
|
|
1196
|
-
)
|
|
1197
|
-
else:
|
|
1198
|
-
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1199
|
-
tbl = self.data_tbl
|
|
1200
|
-
|
|
1201
|
-
# Collect results for the test units; the results are a list of booleans where
|
|
1202
|
-
# `True` indicates a passing test unit
|
|
1203
|
-
self.test_unit_res = Interrogator(
|
|
1204
|
-
x=tbl,
|
|
1205
|
-
column=self.column,
|
|
1206
|
-
pattern=self.pattern,
|
|
1207
|
-
na_pass=self.na_pass,
|
|
1208
|
-
tbl_type=self.tbl_type,
|
|
1209
|
-
).regex()
|
|
1210
785
|
|
|
1211
|
-
|
|
1212
|
-
|
|
786
|
+
def _eq_is_type_mismatch_message(error_msg: str) -> bool:
    """Return `True` if a lower-cased backend error message reads like a type incompatibility."""
    # `and` binds tighter than `or`; the parentheses make each alternative explicit
    return (
        "cannot compare" in error_msg
        or ("type" in error_msg and ("mismatch" in error_msg or "incompatible" in error_msg))
        or "dtype" in error_msg
        or ("conversion" in error_msg and "failed" in error_msg)
    )


def _eq_dtype_label(native_df: Any, name: str) -> str:
    """Best-effort dtype name of column `name` in a native table, `"unknown"` if unavailable."""
    try:
        # Use whichever of `dtypes`/`schema` offers a mapping-style `.get()`; a container
        # without `.get()` (e.g., a plain list of dtypes) is skipped instead of erroring out
        for attr in ("dtypes", "schema"):
            lookup = getattr(getattr(native_df, attr, None), "get", None)
            if callable(lookup):
                return str(lookup(name, "unknown"))
    except Exception:
        # Deliberately best-effort: this only decorates an error message and must never
        # mask the comparison error that is being reported
        return "unknown"
    return "unknown"


def _eq_column_mismatch_error(
    column: str, compare_name: str, col_dtype: str, compare_dtype: str
) -> TypeError:
    """Build the `TypeError` reported for an incompatible column-vs-column equality check."""
    return TypeError(
        f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
        f"'{compare_name}' (dtype: {compare_dtype}). "
        f"Column types are incompatible for equality comparison. "
        f"Ensure both columns have compatible data types (both numeric, "
        f"both string, or both datetime) before comparing."
    )


def interrogate_eq(tbl: FrameT, column: str, compare: Any, na_pass: bool) -> FrameT:
    """
    Equal interrogation.

    Adds a boolean `pb_is_good_` column to the table that marks, per row, whether the value in
    `column` equals `compare`.

    Parameters
    ----------
    tbl
        The table to interrogate; it is wrapped with `nw.from_native()`.
    column
        The name of the column whose values are checked.
    compare
        Either a `Column` (row-wise comparison against that column) or a literal value.
    na_pass
        When `True`, rows where `column` (or the comparison column) is Null are marked as
        passing.

    Returns
    -------
    FrameT
        The native table with the `pb_is_good_` column added; the temporary `pb_is_good_*`
        helper columns are dropped before returning.

    Raises
    ------
    TypeError
        If the backend reports that the two sides have incompatible types. Any other backend
        error is re-raised unchanged.
    """

    nw_tbl = nw.from_native(tbl)
    compare_expr = _get_compare_expr_nw(compare=compare)

    if isinstance(compare, Column):
        # Null checks on both sides; these only count as passing when `na_pass=True`
        result_tbl = nw_tbl.with_columns(
            pb_is_good_1=nw.col(column).is_null() & na_pass,
            pb_is_good_2=nw.col(compare.name).is_null() & na_pass,
        )

        if is_pandas_dataframe(result_tbl.to_native()):
            # For Pandas, handle potential NA comparison issues
            try:
                result_tbl = result_tbl.with_columns(
                    pb_is_good_4=nw.col(column) == compare_expr,
                )
            except (TypeError, ValueError) as e:
                if "boolean value of NA is ambiguous" in str(e):
                    # Work around the Pandas NA comparison issue by using Null checks first
                    both_null = nw.col(column).is_null() & nw.col(compare.name).is_null()
                    neither_null = ~nw.col(column).is_null() & ~nw.col(compare.name).is_null()
                    # Both not Null and values equal: string conversion is used as a fallback
                    same_as_text = nw.col(column).cast(nw.String) == nw.col(
                        compare.name
                    ).cast(nw.String)

                    result_tbl = result_tbl.with_columns(
                        pb_is_good_4_tmp=(both_null | (neither_null & same_as_text))
                    )
                    result_tbl = result_tbl.rename({"pb_is_good_4_tmp": "pb_is_good_4"})
                elif "cannot compare" in str(e).lower():
                    # Handle genuine type incompatibility
                    native_df = result_tbl.to_native()
                    raise _eq_column_mismatch_error(
                        column,
                        compare.name,
                        str(native_df[column].dtype),
                        str(native_df[compare.name].dtype),
                    ) from e
                else:
                    raise  # Re-raise unexpected errors
        else:
            # For non-Pandas backends (Polars, Ibis, etc.), handle type incompatibility
            try:
                result_tbl = result_tbl.with_columns(
                    pb_is_good_4=nw.col(column) == compare_expr,
                )
            except Exception as e:
                if not _eq_is_type_mismatch_message(str(e).lower()):
                    raise  # Re-raise unexpected errors

                native_df = result_tbl.to_native()
                raise _eq_column_mismatch_error(
                    column,
                    compare.name,
                    _eq_dtype_label(native_df, column),
                    _eq_dtype_label(native_df, compare.name),
                ) from e

        # A row passes if a Null was allowed through, or if neither Null flag is set and
        # the values compared equal
        result_tbl = result_tbl.with_columns(
            pb_is_good_=nw.col("pb_is_good_1")
            | nw.col("pb_is_good_2")
            | (nw.col("pb_is_good_4") & ~nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2"))
        )

        return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_4").to_native()

    # Literal comparison value
    compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)

    result_tbl = nw_tbl.with_columns(
        pb_is_good_1=nw.col(column).is_null() & na_pass,
        # There is no comparison column here, so it can never contribute a Null pass
        pb_is_good_2=nw.lit(False),
    )

    # Handle type incompatibility for literal value comparisons
    try:
        result_tbl = result_tbl.with_columns(pb_is_good_3=nw.col(column) == compare_expr)
    except Exception as e:
        if not _eq_is_type_mismatch_message(str(e).lower()):
            raise  # Re-raise unexpected errors

        col_dtype = _eq_dtype_label(result_tbl.to_native(), column)
        compare_type = type(compare).__name__
        compare_value = str(compare)

        raise TypeError(
            f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
            f"literal value '{compare_value}' (type: {compare_type}). "
            f"Column type and literal value type are incompatible for equality comparison. "
            f"Ensure the column data type is compatible with the comparison value "
            f"(e.g., numeric column with numeric value, string column with string value)."
        ) from e

    # A Null comparison result is treated as a failing test unit
    result_tbl = result_tbl.with_columns(
        pb_is_good_3=(
            nw.when(nw.col("pb_is_good_3").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_3"))
        )
    )

    result_tbl = result_tbl.with_columns(
        pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
    )

    return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
1279
974
|
|
|
1280
|
-
if df_lib_name == "pandas" and expression_type == "pandas":
|
|
1281
|
-
self.test_unit_res = self.data_tbl.assign(pb_is_good_=self.expr)
|
|
1282
975
|
|
|
1283
|
-
|
|
976
|
+
def interrogate_ne(tbl: FrameT, column: str, compare: any, na_pass: bool) -> FrameT:
|
|
977
|
+
"""Not equal interrogation."""
|
|
1284
978
|
|
|
1285
|
-
|
|
1286
|
-
return self.test_unit_res
|
|
979
|
+
nw_tbl = nw.from_native(tbl)
|
|
1287
980
|
|
|
981
|
+
# Determine if the reference and comparison columns have any null values
|
|
982
|
+
ref_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=column)
|
|
1288
983
|
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
984
|
+
if isinstance(compare, Column):
|
|
985
|
+
compare_name = compare.name if isinstance(compare, Column) else compare
|
|
986
|
+
cmp_col_has_null_vals = _column_has_null_values(table=nw_tbl, column=compare_name)
|
|
987
|
+
else:
|
|
988
|
+
cmp_col_has_null_vals = False
|
|
1293
989
|
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
The column to check.
|
|
1300
|
-
threshold
|
|
1301
|
-
The maximum number of failing test units to allow.
|
|
1302
|
-
assertion_method
|
|
1303
|
-
The type of assertion ('exists' for column existence).
|
|
1304
|
-
tbl_type
|
|
1305
|
-
The type of table to use for the assertion.
|
|
990
|
+
# If neither column has null values, we can proceed with the comparison
|
|
991
|
+
# without too many complications
|
|
992
|
+
if not ref_col_has_null_vals and not cmp_col_has_null_vals:
|
|
993
|
+
if isinstance(compare, Column):
|
|
994
|
+
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
1306
995
|
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
996
|
+
# Handle type incompatibility for column comparisons
|
|
997
|
+
try:
|
|
998
|
+
return nw_tbl.with_columns(
|
|
999
|
+
pb_is_good_=nw.col(column) != compare_expr,
|
|
1000
|
+
).to_native()
|
|
1001
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1002
|
+
# Handle type compatibility issues for column vs column comparisons
|
|
1003
|
+
error_msg = str(e).lower()
|
|
1004
|
+
if (
|
|
1005
|
+
"cannot compare" in error_msg
|
|
1006
|
+
or "type" in error_msg
|
|
1007
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1008
|
+
or "dtype" in error_msg
|
|
1009
|
+
or "conversion" in error_msg
|
|
1010
|
+
and "failed" in error_msg
|
|
1011
|
+
or "boolean value of na is ambiguous" in error_msg
|
|
1012
|
+
):
|
|
1013
|
+
# Get column types for a descriptive error message
|
|
1014
|
+
try:
|
|
1015
|
+
native_df = nw_tbl.to_native()
|
|
1016
|
+
if hasattr(native_df, "dtypes"):
|
|
1017
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1018
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1019
|
+
elif hasattr(native_df, "schema"):
|
|
1020
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1021
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1022
|
+
else:
|
|
1023
|
+
col_dtype = "unknown"
|
|
1024
|
+
compare_dtype = "unknown"
|
|
1025
|
+
except Exception:
|
|
1026
|
+
col_dtype = "unknown"
|
|
1027
|
+
compare_dtype = "unknown"
|
|
1028
|
+
|
|
1029
|
+
raise TypeError(
|
|
1030
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1031
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1032
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1033
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1034
|
+
f"both string, or both datetime) before comparing."
|
|
1035
|
+
) from e
|
|
1036
|
+
else:
|
|
1037
|
+
raise # Re-raise unexpected errors
|
|
1313
1038
|
|
|
1314
|
-
data_tbl: FrameT
|
|
1315
|
-
column: str
|
|
1316
|
-
threshold: int
|
|
1317
|
-
assertion_method: str
|
|
1318
|
-
tbl_type: str = "local"
|
|
1319
|
-
|
|
1320
|
-
def __post_init__(self):
|
|
1321
|
-
if self.tbl_type == "local":
|
|
1322
|
-
# Convert the DataFrame to a format that narwhals can work with, and:
|
|
1323
|
-
# - check if the `column=` exists
|
|
1324
|
-
# - check if the `column=` type is compatible with the test
|
|
1325
|
-
tbl = _convert_to_narwhals(df=self.data_tbl)
|
|
1326
1039
|
else:
|
|
1327
|
-
|
|
1328
|
-
tbl = _convert_to_narwhals(df=self.data_tbl)
|
|
1040
|
+
compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare)
|
|
1329
1041
|
|
|
1330
|
-
|
|
1331
|
-
|
|
1042
|
+
# Handle type incompatibility for literal comparisons
|
|
1043
|
+
try:
|
|
1044
|
+
return nw_tbl.with_columns(
|
|
1045
|
+
pb_is_good_=nw.col(column) != nw.lit(compare_expr),
|
|
1046
|
+
).to_native()
|
|
1047
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1048
|
+
# Handle type compatibility issues for column vs literal comparisons
|
|
1049
|
+
error_msg = str(e).lower()
|
|
1050
|
+
if (
|
|
1051
|
+
"cannot compare" in error_msg
|
|
1052
|
+
or "type" in error_msg
|
|
1053
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1054
|
+
or "dtype" in error_msg
|
|
1055
|
+
or "conversion" in error_msg
|
|
1056
|
+
and "failed" in error_msg
|
|
1057
|
+
):
|
|
1058
|
+
# Get column type for a descriptive error message
|
|
1059
|
+
try:
|
|
1060
|
+
native_df = nw_tbl.to_native()
|
|
1061
|
+
if hasattr(native_df, "dtypes"):
|
|
1062
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1063
|
+
elif hasattr(native_df, "schema"):
|
|
1064
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1065
|
+
else:
|
|
1066
|
+
col_dtype = "unknown"
|
|
1067
|
+
except Exception:
|
|
1068
|
+
col_dtype = "unknown"
|
|
1069
|
+
|
|
1070
|
+
compare_type = type(compare).__name__
|
|
1071
|
+
compare_value = str(compare)
|
|
1332
1072
|
|
|
1333
|
-
|
|
1073
|
+
raise TypeError(
|
|
1074
|
+
f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
|
|
1075
|
+
f"literal value '{compare_value}' (type: {compare_type}). "
|
|
1076
|
+
f"Column type and literal value type are incompatible for inequality comparison. "
|
|
1077
|
+
f"Ensure the column data type is compatible with the comparison value "
|
|
1078
|
+
f"(e.g., numeric column with numeric value, string column with string value)."
|
|
1079
|
+
) from e
|
|
1080
|
+
else:
|
|
1081
|
+
raise # Re-raise unexpected errors
|
|
1334
1082
|
|
|
1335
|
-
|
|
1336
|
-
|
|
1083
|
+
# If either column has Null values, we need to handle the comparison
|
|
1084
|
+
# much more carefully since we can't inadvertently compare Null values
|
|
1085
|
+
# to non-Null values
|
|
1337
1086
|
|
|
1087
|
+
if isinstance(compare, Column):
|
|
1088
|
+
compare_expr = _get_compare_expr_nw(compare=compare)
|
|
1338
1089
|
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1090
|
+
# CASE 1: the reference column has Null values but the comparison column does not
|
|
1091
|
+
if ref_col_has_null_vals and not cmp_col_has_null_vals:
|
|
1092
|
+
if is_pandas_dataframe(nw_tbl.to_native()):
|
|
1093
|
+
try:
|
|
1094
|
+
result_tbl = nw_tbl.with_columns(
|
|
1095
|
+
pb_is_good_1=nw.col(column).is_null(),
|
|
1096
|
+
pb_is_good_2=nw.col(column) != nw.col(compare.name),
|
|
1097
|
+
)
|
|
1098
|
+
except (TypeError, ValueError) as e:
|
|
1099
|
+
# Handle Pandas type compatibility issues
|
|
1100
|
+
if (
|
|
1101
|
+
"boolean value of NA is ambiguous" in str(e)
|
|
1102
|
+
or "cannot compare" in str(e).lower()
|
|
1103
|
+
):
|
|
1104
|
+
# Get column types for a descriptive error message
|
|
1105
|
+
native_df = nw_tbl.to_native()
|
|
1106
|
+
col_dtype = str(native_df[column].dtype)
|
|
1107
|
+
compare_dtype = str(native_df[compare.name].dtype)
|
|
1343
1108
|
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
The type of table to use for the assertion.
|
|
1109
|
+
raise TypeError(
|
|
1110
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1111
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1112
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1113
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1114
|
+
f"both string, or both datetime) before comparing."
|
|
1115
|
+
) from e
|
|
1116
|
+
else:
|
|
1117
|
+
raise # Re-raise unexpected errors
|
|
1354
1118
|
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1119
|
+
else:
|
|
1120
|
+
try:
|
|
1121
|
+
result_tbl = nw_tbl.with_columns(
|
|
1122
|
+
pb_is_good_1=nw.col(column).is_null(),
|
|
1123
|
+
pb_is_good_2=nw.col(column) != nw.col(compare.name),
|
|
1124
|
+
)
|
|
1125
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1126
|
+
# Handle type compatibility issues for non-Pandas backends
|
|
1127
|
+
error_msg = str(e).lower()
|
|
1128
|
+
if (
|
|
1129
|
+
"cannot compare" in error_msg
|
|
1130
|
+
or "type" in error_msg
|
|
1131
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1132
|
+
or "dtype" in error_msg
|
|
1133
|
+
or "conversion" in error_msg
|
|
1134
|
+
and "failed" in error_msg
|
|
1135
|
+
):
|
|
1136
|
+
# Get column types for a descriptive error message
|
|
1137
|
+
try:
|
|
1138
|
+
native_df = nw_tbl.to_native()
|
|
1139
|
+
if hasattr(native_df, "dtypes"):
|
|
1140
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1141
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1142
|
+
elif hasattr(native_df, "schema"):
|
|
1143
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1144
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1145
|
+
else:
|
|
1146
|
+
col_dtype = "unknown"
|
|
1147
|
+
compare_dtype = "unknown"
|
|
1148
|
+
except Exception:
|
|
1149
|
+
col_dtype = "unknown"
|
|
1150
|
+
compare_dtype = "unknown"
|
|
1361
1151
|
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1152
|
+
raise TypeError(
|
|
1153
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1154
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1155
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1156
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1157
|
+
f"both string, or both datetime) before comparing."
|
|
1158
|
+
) from e
|
|
1159
|
+
else:
|
|
1160
|
+
raise # Re-raise unexpected errors
|
|
1366
1161
|
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
# - check if the `column=` type is compatible with the test
|
|
1372
|
-
tbl = _column_subset_test_prep(df=self.data_tbl, columns_subset=self.columns_subset)
|
|
1162
|
+
if not na_pass:
|
|
1163
|
+
result_tbl = result_tbl.with_columns(
|
|
1164
|
+
pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
|
|
1165
|
+
)
|
|
1373
1166
|
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1378
|
-
tbl = self.data_tbl
|
|
1167
|
+
if is_polars_dataframe(nw_tbl.to_native()):
|
|
1168
|
+
# There may be Null values in the `pb_is_good_2` column, change those to
|
|
1169
|
+
# True if `na_pass=` is True, False otherwise
|
|
1379
1170
|
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
tbl_type=self.tbl_type,
|
|
1386
|
-
).rows_distinct()
|
|
1171
|
+
result_tbl = result_tbl.with_columns(
|
|
1172
|
+
pb_is_good_2=nw.when(nw.col("pb_is_good_2").is_null())
|
|
1173
|
+
.then(False)
|
|
1174
|
+
.otherwise(nw.col("pb_is_good_2")),
|
|
1175
|
+
)
|
|
1387
1176
|
|
|
1388
|
-
|
|
1389
|
-
|
|
1177
|
+
if na_pass:
|
|
1178
|
+
result_tbl = result_tbl.with_columns(
|
|
1179
|
+
pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1180
|
+
)
|
|
1181
|
+
else:
|
|
1182
|
+
# General case (non-Polars): handle na_pass=True properly
|
|
1183
|
+
if na_pass:
|
|
1184
|
+
result_tbl = result_tbl.with_columns(
|
|
1185
|
+
pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1186
|
+
)
|
|
1390
1187
|
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1188
|
+
return (
|
|
1189
|
+
result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
|
|
1190
|
+
.drop("pb_is_good_1", "pb_is_good_2")
|
|
1191
|
+
.to_native()
|
|
1192
|
+
)
|
|
1395
1193
|
|
|
1396
|
-
|
|
1194
|
+
# CASE 2: the comparison column has Null values but the reference column does not
|
|
1195
|
+
elif not ref_col_has_null_vals and cmp_col_has_null_vals:
|
|
1196
|
+
if is_pandas_dataframe(nw_tbl.to_native()):
|
|
1197
|
+
try:
|
|
1198
|
+
result_tbl = nw_tbl.with_columns(
|
|
1199
|
+
pb_is_good_1=nw.col(column) != nw.lit(compare.name),
|
|
1200
|
+
pb_is_good_2=nw.col(compare.name).is_null(),
|
|
1201
|
+
)
|
|
1202
|
+
except (TypeError, ValueError) as e:
|
|
1203
|
+
# Handle Pandas type compatibility issues
|
|
1204
|
+
if (
|
|
1205
|
+
"boolean value of NA is ambiguous" in str(e)
|
|
1206
|
+
or "cannot compare" in str(e).lower()
|
|
1207
|
+
):
|
|
1208
|
+
# Get column types for a descriptive error message
|
|
1209
|
+
native_df = nw_tbl.to_native()
|
|
1210
|
+
col_dtype = str(native_df[column].dtype)
|
|
1211
|
+
compare_dtype = str(native_df[compare.name].dtype)
|
|
1397
1212
|
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1213
|
+
raise TypeError(
|
|
1214
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1215
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1216
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1217
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1218
|
+
f"both string, or both datetime) before comparing."
|
|
1219
|
+
) from e
|
|
1220
|
+
else:
|
|
1221
|
+
raise # Re-raise unexpected errors
|
|
1401
1222
|
|
|
1223
|
+
else:
|
|
1224
|
+
try:
|
|
1225
|
+
result_tbl = nw_tbl.with_columns(
|
|
1226
|
+
pb_is_good_1=nw.col(column) != nw.col(compare.name),
|
|
1227
|
+
pb_is_good_2=nw.col(compare.name).is_null(),
|
|
1228
|
+
)
|
|
1229
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1230
|
+
# Handle type compatibility issues for non-Pandas backends
|
|
1231
|
+
error_msg = str(e).lower()
|
|
1232
|
+
if (
|
|
1233
|
+
"cannot compare" in error_msg
|
|
1234
|
+
or "type" in error_msg
|
|
1235
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1236
|
+
or "dtype" in error_msg
|
|
1237
|
+
or "conversion" in error_msg
|
|
1238
|
+
and "failed" in error_msg
|
|
1239
|
+
):
|
|
1240
|
+
# Get column types for a descriptive error message
|
|
1241
|
+
try:
|
|
1242
|
+
native_df = nw_tbl.to_native()
|
|
1243
|
+
if hasattr(native_df, "dtypes"):
|
|
1244
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1245
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1246
|
+
elif hasattr(native_df, "schema"):
|
|
1247
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1248
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1249
|
+
else:
|
|
1250
|
+
col_dtype = "unknown"
|
|
1251
|
+
compare_dtype = "unknown"
|
|
1252
|
+
except Exception:
|
|
1253
|
+
col_dtype = "unknown"
|
|
1254
|
+
compare_dtype = "unknown"
|
|
1402
1255
|
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1256
|
+
raise TypeError(
|
|
1257
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1258
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1259
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1260
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1261
|
+
f"both string, or both datetime) before comparing."
|
|
1262
|
+
) from e
|
|
1263
|
+
else:
|
|
1264
|
+
raise # Re-raise unexpected errors
|
|
1407
1265
|
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
columns_subset
|
|
1413
|
-
A list of columns to check for completeness.
|
|
1414
|
-
threshold
|
|
1415
|
-
The maximum number of failing test units to allow.
|
|
1416
|
-
tbl_type
|
|
1417
|
-
The type of table to use for the assertion.
|
|
1266
|
+
if not na_pass:
|
|
1267
|
+
result_tbl = result_tbl.with_columns(
|
|
1268
|
+
pb_is_good_1=nw.col("pb_is_good_1") & ~nw.col("pb_is_good_2")
|
|
1269
|
+
)
|
|
1418
1270
|
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1271
|
+
if is_polars_dataframe(nw_tbl.to_native()):
|
|
1272
|
+
if na_pass:
|
|
1273
|
+
result_tbl = result_tbl.with_columns(
|
|
1274
|
+
pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1275
|
+
)
|
|
1276
|
+
else:
|
|
1277
|
+
# General case (non-Polars): handle `na_pass=True` properly
|
|
1278
|
+
if na_pass:
|
|
1279
|
+
result_tbl = result_tbl.with_columns(
|
|
1280
|
+
pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1281
|
+
)
|
|
1425
1282
|
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1283
|
+
return (
|
|
1284
|
+
result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_1"))
|
|
1285
|
+
.drop("pb_is_good_1", "pb_is_good_2")
|
|
1286
|
+
.to_native()
|
|
1287
|
+
)
|
|
1430
1288
|
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1289
|
+
# CASE 3: both columns have Null values and there may potentially be cases where
|
|
1290
|
+
# there could even be Null/Null comparisons
|
|
1291
|
+
elif ref_col_has_null_vals and cmp_col_has_null_vals:
|
|
1292
|
+
try:
|
|
1293
|
+
result_tbl = nw_tbl.with_columns(
|
|
1294
|
+
pb_is_good_1=nw.col(column).is_null(),
|
|
1295
|
+
pb_is_good_2=nw.col(compare.name).is_null(),
|
|
1296
|
+
pb_is_good_3=nw.col(column) != nw.col(compare.name),
|
|
1297
|
+
)
|
|
1298
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1299
|
+
# Handle type compatibility issues for column vs column comparisons
|
|
1300
|
+
error_msg = str(e).lower()
|
|
1301
|
+
if (
|
|
1302
|
+
"cannot compare" in error_msg
|
|
1303
|
+
or "type" in error_msg
|
|
1304
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1305
|
+
or "dtype" in error_msg
|
|
1306
|
+
or "conversion" in error_msg
|
|
1307
|
+
and "failed" in error_msg
|
|
1308
|
+
or "boolean value of na is ambiguous" in error_msg
|
|
1309
|
+
):
|
|
1310
|
+
# Get column types for a descriptive error message
|
|
1311
|
+
try:
|
|
1312
|
+
native_df = nw_tbl.to_native()
|
|
1313
|
+
if hasattr(native_df, "dtypes"):
|
|
1314
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1315
|
+
compare_dtype = str(native_df.dtypes.get(compare.name, "unknown"))
|
|
1316
|
+
elif hasattr(native_df, "schema"):
|
|
1317
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1318
|
+
compare_dtype = str(native_df.schema.get(compare.name, "unknown"))
|
|
1319
|
+
else:
|
|
1320
|
+
col_dtype = "unknown"
|
|
1321
|
+
compare_dtype = "unknown"
|
|
1322
|
+
except Exception:
|
|
1323
|
+
col_dtype = "unknown"
|
|
1324
|
+
compare_dtype = "unknown"
|
|
1437
1325
|
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1326
|
+
raise TypeError(
|
|
1327
|
+
f"Cannot compare columns '{column}' (dtype: {col_dtype}) and "
|
|
1328
|
+
f"'{compare.name}' (dtype: {compare_dtype}). "
|
|
1329
|
+
f"Column types are incompatible for inequality comparison. "
|
|
1330
|
+
f"Ensure both columns have compatible data types (both numeric, "
|
|
1331
|
+
f"both string, or both datetime) before comparing."
|
|
1332
|
+
) from e
|
|
1333
|
+
else:
|
|
1334
|
+
raise # Re-raise unexpected errors
|
|
1443
1335
|
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
).rows_complete()
|
|
1336
|
+
if not na_pass:
|
|
1337
|
+
result_tbl = result_tbl.with_columns(
|
|
1338
|
+
pb_is_good_3=nw.col("pb_is_good_3")
|
|
1339
|
+
& ~nw.col("pb_is_good_1")
|
|
1340
|
+
& ~nw.col("pb_is_good_2")
|
|
1341
|
+
)
|
|
1451
1342
|
|
|
1452
|
-
|
|
1453
|
-
|
|
1343
|
+
if is_polars_dataframe(nw_tbl.to_native()):
|
|
1344
|
+
if na_pass:
|
|
1345
|
+
result_tbl = result_tbl.with_columns(
|
|
1346
|
+
pb_is_good_3=(
|
|
1347
|
+
nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1348
|
+
.then(True)
|
|
1349
|
+
.otherwise(False)
|
|
1350
|
+
)
|
|
1351
|
+
)
|
|
1352
|
+
else:
|
|
1353
|
+
# General case (non-Polars): handle na_pass=True properly
|
|
1354
|
+
if na_pass:
|
|
1355
|
+
result_tbl = result_tbl.with_columns(
|
|
1356
|
+
pb_is_good_3=(
|
|
1357
|
+
nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
1358
|
+
.then(True)
|
|
1359
|
+
.otherwise(nw.col("pb_is_good_3"))
|
|
1360
|
+
)
|
|
1361
|
+
)
|
|
1454
1362
|
|
|
1363
|
+
return (
|
|
1364
|
+
result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_3"))
|
|
1365
|
+
.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
1366
|
+
.to_native()
|
|
1367
|
+
)
|
|
1455
1368
|
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1369
|
+
else:
|
|
1370
|
+
# Case where the reference column contains null values
|
|
1371
|
+
if ref_col_has_null_vals:
|
|
1372
|
+
# Create individual cases for Pandas and Polars
|
|
1373
|
+
compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare)
|
|
1460
1374
|
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
The maximum number of failing test units to allow.
|
|
1479
|
-
tbl_type
|
|
1480
|
-
The type of table to use for the assertion.
|
|
1481
|
-
|
|
1482
|
-
Returns
|
|
1483
|
-
-------
|
|
1484
|
-
bool
|
|
1485
|
-
`True` when test units pass below the threshold level for failing test units, `False`
|
|
1486
|
-
otherwise.
|
|
1487
|
-
"""
|
|
1375
|
+
if is_pandas_dataframe(nw_tbl.to_native()):
|
|
1376
|
+
try:
|
|
1377
|
+
result_tbl = nw_tbl.with_columns(
|
|
1378
|
+
pb_is_good_1=nw.col(column).is_null(),
|
|
1379
|
+
pb_is_good_2=nw.col(column) != nw.lit(compare_expr),
|
|
1380
|
+
)
|
|
1381
|
+
except (TypeError, ValueError) as e:
|
|
1382
|
+
# Handle Pandas type compatibility issues for literal comparisons
|
|
1383
|
+
if (
|
|
1384
|
+
"boolean value of NA is ambiguous" in str(e)
|
|
1385
|
+
or "cannot compare" in str(e).lower()
|
|
1386
|
+
):
|
|
1387
|
+
# Get column type for a descriptive error message
|
|
1388
|
+
native_df = nw_tbl.to_native()
|
|
1389
|
+
col_dtype = str(native_df[column].dtype)
|
|
1390
|
+
compare_type = type(compare).__name__
|
|
1391
|
+
compare_value = str(compare)
|
|
1488
1392
|
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
def __post_init__(self):
|
|
1499
|
-
schema_expect = self.schema
|
|
1500
|
-
schema_actual = Schema(tbl=self.data_tbl)
|
|
1501
|
-
|
|
1502
|
-
if self.complete and self.in_order:
|
|
1503
|
-
# Check if the schema is complete and in order (most restrictive check)
|
|
1504
|
-
# complete: True, in_order: True
|
|
1505
|
-
res = schema_expect._compare_schema_columns_complete_in_order(
|
|
1506
|
-
other=schema_actual,
|
|
1507
|
-
case_sensitive_colnames=self.case_sensitive_colnames,
|
|
1508
|
-
case_sensitive_dtypes=self.case_sensitive_dtypes,
|
|
1509
|
-
full_match_dtypes=self.full_match_dtypes,
|
|
1510
|
-
)
|
|
1393
|
+
raise TypeError(
|
|
1394
|
+
f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
|
|
1395
|
+
f"literal value '{compare_value}' (type: {compare_type}). "
|
|
1396
|
+
f"Column type and literal value type are incompatible for inequality comparison. "
|
|
1397
|
+
f"Ensure the column data type is compatible with the comparison value "
|
|
1398
|
+
f"(e.g., numeric column with numeric value, string column with string value)."
|
|
1399
|
+
) from e
|
|
1400
|
+
else:
|
|
1401
|
+
raise # Re-raise unexpected errors
|
|
1511
1402
|
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
other=schema_actual,
|
|
1517
|
-
case_sensitive_colnames=self.case_sensitive_colnames,
|
|
1518
|
-
case_sensitive_dtypes=self.case_sensitive_dtypes,
|
|
1519
|
-
full_match_dtypes=self.full_match_dtypes,
|
|
1520
|
-
)
|
|
1403
|
+
if not na_pass:
|
|
1404
|
+
result_tbl = result_tbl.with_columns(
|
|
1405
|
+
pb_is_good_2=nw.col("pb_is_good_2") & ~nw.col("pb_is_good_1")
|
|
1406
|
+
)
|
|
1521
1407
|
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
case_sensitive_colnames=self.case_sensitive_colnames,
|
|
1528
|
-
case_sensitive_dtypes=self.case_sensitive_dtypes,
|
|
1529
|
-
full_match_dtypes=self.full_match_dtypes,
|
|
1530
|
-
)
|
|
1408
|
+
return (
|
|
1409
|
+
result_tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
|
|
1410
|
+
.drop("pb_is_good_1", "pb_is_good_2")
|
|
1411
|
+
.to_native()
|
|
1412
|
+
)
|
|
1531
1413
|
|
|
1532
|
-
|
|
1533
|
-
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
case_sensitive_colnames=self.case_sensitive_colnames,
|
|
1538
|
-
case_sensitive_dtypes=self.case_sensitive_dtypes,
|
|
1539
|
-
full_match_dtypes=self.full_match_dtypes,
|
|
1540
|
-
)
|
|
1414
|
+
elif is_polars_dataframe(nw_tbl.to_native()):
|
|
1415
|
+
result_tbl = nw_tbl.with_columns(
|
|
1416
|
+
pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
|
|
1417
|
+
pb_is_good_2=nw.lit(na_pass), # Pass if any Null in val or compare
|
|
1418
|
+
)
|
|
1541
1419
|
|
|
1542
|
-
|
|
1420
|
+
try:
|
|
1421
|
+
result_tbl = result_tbl.with_columns(
|
|
1422
|
+
pb_is_good_3=nw.col(column) != nw.lit(compare_expr)
|
|
1423
|
+
)
|
|
1424
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1425
|
+
# Handle type compatibility issues for literal comparisons
|
|
1426
|
+
error_msg = str(e).lower()
|
|
1427
|
+
if (
|
|
1428
|
+
"cannot compare" in error_msg
|
|
1429
|
+
or "type" in error_msg
|
|
1430
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1431
|
+
or "dtype" in error_msg
|
|
1432
|
+
or "conversion" in error_msg
|
|
1433
|
+
and "failed" in error_msg
|
|
1434
|
+
):
|
|
1435
|
+
# Get column type for a descriptive error message
|
|
1436
|
+
try:
|
|
1437
|
+
native_df = nw_tbl.to_native()
|
|
1438
|
+
if hasattr(native_df, "dtypes"):
|
|
1439
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1440
|
+
elif hasattr(native_df, "schema"):
|
|
1441
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1442
|
+
else:
|
|
1443
|
+
col_dtype = "unknown"
|
|
1444
|
+
except Exception:
|
|
1445
|
+
col_dtype = "unknown"
|
|
1446
|
+
|
|
1447
|
+
compare_type = type(compare).__name__
|
|
1448
|
+
compare_value = str(compare)
|
|
1543
1449
|
|
|
1544
|
-
|
|
1545
|
-
|
|
1450
|
+
raise TypeError(
|
|
1451
|
+
f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
|
|
1452
|
+
f"literal value '{compare_value}' (type: {compare_type}). "
|
|
1453
|
+
f"Column type and literal value type are incompatible for inequality comparison. "
|
|
1454
|
+
f"Ensure the column data type is compatible with the comparison value "
|
|
1455
|
+
f"(e.g., numeric column with numeric value, string column with string value)."
|
|
1456
|
+
) from e
|
|
1457
|
+
else:
|
|
1458
|
+
raise # Re-raise unexpected errors
|
|
1546
1459
|
|
|
1460
|
+
result_tbl = result_tbl.with_columns(
|
|
1461
|
+
pb_is_good_=(
|
|
1462
|
+
(nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
|
|
1463
|
+
| (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
|
|
1464
|
+
)
|
|
1465
|
+
)
|
|
1547
1466
|
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
Check if rows in a DataFrame either match or don't match a fixed value.
|
|
1467
|
+
result_tbl = result_tbl.drop(
|
|
1468
|
+
"pb_is_good_1", "pb_is_good_2", "pb_is_good_3"
|
|
1469
|
+
).to_native()
|
|
1552
1470
|
|
|
1553
|
-
|
|
1554
|
-
----------
|
|
1555
|
-
data_tbl
|
|
1556
|
-
A data table.
|
|
1557
|
-
count
|
|
1558
|
-
The fixed row count to check against.
|
|
1559
|
-
inverse
|
|
1560
|
-
`True` to check if the row count does not match the fixed value, `False` otherwise.
|
|
1561
|
-
threshold
|
|
1562
|
-
The maximum number of failing test units to allow.
|
|
1563
|
-
tbl_type
|
|
1564
|
-
The type of table to use for the assertion.
|
|
1471
|
+
return result_tbl
|
|
1565
1472
|
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
|
|
1569
|
-
|
|
1570
|
-
|
|
1571
|
-
|
|
1473
|
+
else:
|
|
1474
|
+
# Generic case for other DataFrame types (PySpark, etc.)
|
|
1475
|
+
# Use similar logic to Polars but handle potential differences
|
|
1476
|
+
result_tbl = nw_tbl.with_columns(
|
|
1477
|
+
pb_is_good_1=nw.col(column).is_null(), # val is Null in Column
|
|
1478
|
+
pb_is_good_2=nw.lit(na_pass), # Pass if any Null in val or compare
|
|
1479
|
+
)
|
|
1572
1480
|
|
|
1573
|
-
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1481
|
+
try:
|
|
1482
|
+
result_tbl = result_tbl.with_columns(
|
|
1483
|
+
pb_is_good_3=nw.col(column) != nw.lit(compare_expr)
|
|
1484
|
+
)
|
|
1485
|
+
except (TypeError, ValueError, Exception) as e:
|
|
1486
|
+
# Handle type compatibility issues for literal comparisons
|
|
1487
|
+
error_msg = str(e).lower()
|
|
1488
|
+
if (
|
|
1489
|
+
"cannot compare" in error_msg
|
|
1490
|
+
or "type" in error_msg
|
|
1491
|
+
and ("mismatch" in error_msg or "incompatible" in error_msg)
|
|
1492
|
+
or "dtype" in error_msg
|
|
1493
|
+
or "conversion" in error_msg
|
|
1494
|
+
and "failed" in error_msg
|
|
1495
|
+
):
|
|
1496
|
+
# Get column type for a descriptive error message
|
|
1497
|
+
try:
|
|
1498
|
+
native_df = nw_tbl.to_native()
|
|
1499
|
+
if hasattr(native_df, "dtypes"):
|
|
1500
|
+
col_dtype = str(native_df.dtypes.get(column, "unknown"))
|
|
1501
|
+
elif hasattr(native_df, "schema"):
|
|
1502
|
+
col_dtype = str(native_df.schema.get(column, "unknown"))
|
|
1503
|
+
else:
|
|
1504
|
+
col_dtype = "unknown"
|
|
1505
|
+
except Exception:
|
|
1506
|
+
col_dtype = "unknown"
|
|
1507
|
+
|
|
1508
|
+
compare_type = type(compare).__name__
|
|
1509
|
+
compare_value = str(compare)
|
|
1579
1510
|
|
|
1580
|
-
|
|
1581
|
-
|
|
1511
|
+
raise TypeError(
|
|
1512
|
+
f"Cannot compare column '{column}' (dtype: {col_dtype}) with "
|
|
1513
|
+
f"literal value '{compare_value}' (type: {compare_type}). "
|
|
1514
|
+
f"Column type and literal value type are incompatible for inequality comparison. "
|
|
1515
|
+
f"Ensure the column data type is compatible with the comparison value "
|
|
1516
|
+
f"(e.g., numeric column with numeric value, string column with string value)."
|
|
1517
|
+
) from e
|
|
1518
|
+
else:
|
|
1519
|
+
raise # Re-raise unexpected errors
|
|
1582
1520
|
|
|
1583
|
-
|
|
1521
|
+
result_tbl = result_tbl.with_columns(
|
|
1522
|
+
pb_is_good_=(
|
|
1523
|
+
(nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
|
|
1524
|
+
| (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
|
|
1525
|
+
)
|
|
1526
|
+
)
|
|
1584
1527
|
|
|
1585
|
-
|
|
1586
|
-
min_val: int = self.count - lower_abs_limit
|
|
1587
|
-
max_val: int = self.count + upper_abs_limit
|
|
1528
|
+
return result_tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
1588
1529
|
|
|
1589
|
-
if self.inverse:
|
|
1590
|
-
res: bool = not (row_count >= min_val and row_count <= max_val)
|
|
1591
|
-
else:
|
|
1592
|
-
res: bool = row_count >= min_val and row_count <= max_val
|
|
1593
1530
|
|
|
1594
|
-
|
|
1531
|
+
def interrogate_between(
    tbl: FrameT, column: str, low: Any, high: Any, inclusive: tuple, na_pass: bool
) -> FrameT:
    """
    Between interrogation.

    Flags each row according to whether the value in `column` lies between `low` and `high`.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The name of the column holding the values to check.
    low
        The lower bound: a literal value or a `Column` referring to another column.
    high
        The upper bound: a literal value or a `Column` referring to another column.
    inclusive
        A pair of flags. `inclusive[0]` selects `>=` (instead of `>`) for the lower bound and
        `inclusive[1]` selects `<=` (instead of `<`) for the upper bound.
    na_pass
        Whether a row passes when the value, or a column-based bound, is Null.

    Returns
    -------
    FrameT
        The native table with an added boolean `pb_is_good_` column; all intermediate
        `pb_is_good_1` ... `pb_is_good_6` helper columns are dropped.
    """

    low_val = _get_compare_expr_nw(compare=low)
    high_val = _get_compare_expr_nw(compare=high)

    nw_tbl = nw.from_native(tbl)

    # NOTE(review): presumably aligns date/datetime literals with the column's dtype; confirm
    # against the helper's definition
    low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
    high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)

    result_tbl = nw_tbl.with_columns(
        pb_is_good_1=nw.col(column).is_null(),  # val is Null in Column
        pb_is_good_2=(  # lb is Null in Column
            nw.col(low.name).is_null() if isinstance(low, Column) else nw.lit(False)
        ),
        pb_is_good_3=(  # ub is Null in Column
            nw.col(high.name).is_null() if isinstance(high, Column) else nw.lit(False)
        ),
        pb_is_good_4=nw.lit(na_pass),  # Pass if any Null in lb, val, or ub
    )

    # Lower-bound check: inclusive bounds accept a value equal to the bound
    if inclusive[0]:
        result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) >= low_val)
    else:
        result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) > low_val)

    # Upper-bound check: inclusive bounds accept a value equal to the bound
    if inclusive[1]:
        result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) <= high_val)
    else:
        result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) < high_val)

    # A comparison that yields Null must not count as a pass on its own; Null handling is
    # decided solely by the `na_pass` branch below
    result_tbl = result_tbl.with_columns(
        pb_is_good_5=(
            nw.when(nw.col("pb_is_good_5").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_5"))
        ),
        pb_is_good_6=(
            nw.when(nw.col("pb_is_good_6").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_6"))
        ),
    )

    # Pass when (any Null is involved and `na_pass` is set) or (both bound checks hold)
    result_tbl = result_tbl.with_columns(
        pb_is_good_=(
            (
                (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
                & nw.col("pb_is_good_4")
            )
            | (nw.col("pb_is_good_5") & nw.col("pb_is_good_6"))
        )
    ).drop(
        "pb_is_good_1",
        "pb_is_good_2",
        "pb_is_good_3",
        "pb_is_good_4",
        "pb_is_good_5",
        "pb_is_good_6",
    )

    return result_tbl.to_native()
|
|
1640
1598
|
|
|
1641
|
-
def get_test_results(self):
|
|
1642
|
-
return self.test_unit_res
|
|
1643
1599
|
|
|
1600
|
+
def interrogate_outside(
    tbl: FrameT, column: str, low: Any, high: Any, inclusive: tuple, na_pass: bool
) -> FrameT:
    """
    Outside range interrogation.

    Flags each row according to whether the value in `column` lies outside the range spanned by
    `low` and `high`.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The name of the column holding the values to check.
    low
        The lower bound: a literal value or a `Column` referring to another column.
    high
        The upper bound: a literal value or a `Column` referring to another column.
    inclusive
        A pair of flags stating whether each bound belongs to the range. With `inclusive[0]` set,
        only values strictly below `low` count as outside (otherwise `<=` is used); with
        `inclusive[1]` set, only values strictly above `high` count as outside (otherwise `>=`).
    na_pass
        Whether a row passes when the value, or a column-based bound, is Null.

    Returns
    -------
    FrameT
        The native table with an added boolean `pb_is_good_` column; all intermediate
        `pb_is_good_1` ... `pb_is_good_6` helper columns are dropped.
    """

    low_val = _get_compare_expr_nw(compare=low)
    high_val = _get_compare_expr_nw(compare=high)

    nw_tbl = nw.from_native(tbl)

    # NOTE(review): presumably aligns date/datetime literals with the column's dtype; confirm
    # against the helper's definition
    low_val = _safe_modify_datetime_compare_val(nw_tbl, column, low_val)
    high_val = _safe_modify_datetime_compare_val(nw_tbl, column, high_val)

    result_tbl = nw_tbl.with_columns(
        pb_is_good_1=nw.col(column).is_null(),  # val is Null in Column
        pb_is_good_2=(  # lb is Null in Column
            nw.col(low.name).is_null() if isinstance(low, Column) else nw.lit(False)
        ),
        pb_is_good_3=(  # ub is Null in Column
            nw.col(high.name).is_null() if isinstance(high, Column) else nw.lit(False)
        ),
        pb_is_good_4=nw.lit(na_pass),  # Pass if any Null in lb, val, or ub
    )

    # An inclusive bound is part of the range, so a value equal to it is NOT outside: the
    # operators here are the strict/non-strict complements of those used for "between"
    if inclusive[0]:
        result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
    else:
        result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) <= low_val)

    if inclusive[1]:
        result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) > high_val)
    else:
        result_tbl = result_tbl.with_columns(pb_is_good_6=nw.col(column) >= high_val)

    # A comparison that yields Null must not count as a pass on its own; Null handling is
    # decided solely by the `na_pass` branch below
    result_tbl = result_tbl.with_columns(
        pb_is_good_5=(
            nw.when(nw.col("pb_is_good_5").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_5"))
        ),
        pb_is_good_6=(
            nw.when(nw.col("pb_is_good_6").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_6"))
        ),
    )

    # Pass when:
    # - any Null is involved and `na_pass` is set, or
    # - the value is below the lower bound and the upper bound is not Null, or
    # - the value is above the upper bound and the lower bound is not Null
    # (each `&` group is parenthesized explicitly; `&` binds tighter than `|`)
    result_tbl = result_tbl.with_columns(
        pb_is_good_=(
            (
                (nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3"))
                & nw.col("pb_is_good_4")
            )
            | (
                (nw.col("pb_is_good_5") & ~nw.col("pb_is_good_3"))
                | (nw.col("pb_is_good_6") & ~nw.col("pb_is_good_2"))
            )
        )
    ).drop(
        "pb_is_good_1",
        "pb_is_good_2",
        "pb_is_good_3",
        "pb_is_good_4",
        "pb_is_good_5",
        "pb_is_good_6",
    )

    return result_tbl.to_native()
|
|
1697
1665
|
|
|
1698
|
-
# Combine results with AND logic
|
|
1699
|
-
if polars_expressions:
|
|
1700
|
-
final_result = polars_expressions[0]
|
|
1701
|
-
for expr in polars_expressions[1:]:
|
|
1702
|
-
final_result = final_result & expr
|
|
1703
1666
|
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
return results_tbl
|
|
1667
|
+
def interrogate_isin(tbl: FrameT, column: str, set_values: Any) -> FrameT:
    """
    In set interrogation.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The name of the column holding the values to check.
    set_values
        The collection of allowed values. If it contains `None`, Null values in `column` are
        also treated as passing.

    Returns
    -------
    FrameT
        The native table with an added `pb_is_good_` column that flags the rows whose value is
        a member of `set_values`.
    """

    nw_tbl = nw.from_native(tbl)

    # Membership testing alone is not relied on to match Nulls, so when `None` is one of the
    # allowed values the Null rows are OR-ed in explicitly
    can_be_null: bool = None in set_values
    base_expr: nw.Expr = nw.col(column).is_in(set_values)
    if can_be_null:
        base_expr = base_expr | nw.col(column).is_null()

    result_tbl = nw_tbl.with_columns(pb_is_good_=base_expr)
    return result_tbl.to_native()
|
|
1717
1679
|
|
|
1718
|
-
for expr_fn in self.expressions:
|
|
1719
|
-
try:
|
|
1720
|
-
# First try direct evaluation with pandas DataFrame
|
|
1721
|
-
expr_result = expr_fn(self.data_tbl)
|
|
1722
1680
|
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
if expr_result.dtype == bool or pd.api.types.is_bool_dtype(expr_result):
|
|
1726
|
-
pandas_series.append(expr_result)
|
|
1727
|
-
else: # pragma: no cover
|
|
1728
|
-
raise TypeError(
|
|
1729
|
-
f"Expression returned Series of type {expr_result.dtype}, expected bool"
|
|
1730
|
-
)
|
|
1731
|
-
else: # pragma: no cover
|
|
1732
|
-
raise TypeError(f"Expression returned {type(expr_result)}, expected pd.Series")
|
|
1681
|
+
def interrogate_notin(tbl: FrameT, column: str, set_values: Any) -> FrameT:
    """
    Not in set interrogation.

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The name of the column holding the values to check.
    set_values
        The collection of disallowed values.

    Returns
    -------
    FrameT
        The native table with an added `pb_is_good_` column holding the negation of the
        membership test of `column` against `set_values`.
    """

    nw_tbl = nw.from_native(tbl)

    # Negate the membership test directly instead of materializing it and inverting it in a
    # second pass
    result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_in(set_values))
    return result_tbl.to_native()
|
|
1738
1689
|
|
|
1739
|
-
if hasattr(col_expr, "to_pandas_expr"):
|
|
1740
|
-
# Watch for NotImplementedError here and re-raise it
|
|
1741
|
-
try:
|
|
1742
|
-
pandas_expr = col_expr.to_pandas_expr(self.data_tbl)
|
|
1743
|
-
pandas_series.append(pandas_expr)
|
|
1744
|
-
except NotImplementedError as nie: # pragma: no cover
|
|
1745
|
-
# Re-raise NotImplementedError with the original message
|
|
1746
|
-
raise NotImplementedError(str(nie))
|
|
1747
|
-
else: # pragma: no cover
|
|
1748
|
-
raise TypeError(f"Cannot convert {type(col_expr)} to pandas Series")
|
|
1749
|
-
except NotImplementedError as nie: # pragma: no cover
|
|
1750
|
-
# Re-raise NotImplementedError
|
|
1751
|
-
raise NotImplementedError(str(nie))
|
|
1752
|
-
except Exception as nested_e: # pragma: no cover
|
|
1753
|
-
print(f"Error evaluating pandas expression: {e} -> {nested_e}")
|
|
1754
1690
|
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
final_result = pandas_series[0]
|
|
1758
|
-
for series in pandas_series[1:]:
|
|
1759
|
-
final_result = final_result & series
|
|
1691
|
+
def interrogate_regex(tbl: FrameT, column: str, pattern: str, na_pass: bool) -> FrameT:
    """
    Regex interrogation.

    A row passes when the value in `column` contains a match for the regular expression
    `pattern`, or when the value is Null and `na_pass` is set. The returned native table carries
    the verdict in a `pb_is_good_` column.
    """

    target = nw.col(column)

    # First pass: one helper column per way of passing
    flagged = nw.from_native(tbl).with_columns(
        # Null value, accepted only when `na_pass` is set
        pb_is_good_1=target.is_null() & na_pass,
        # Pattern match; a Null match result counts as "no match"
        pb_is_good_2=target.str.contains(pattern, literal=False).fill_null(False),
    )

    # Second pass: either way of passing is sufficient
    verdict = nw.col("pb_is_good_1") | nw.col("pb_is_good_2")
    flagged = flagged.with_columns(pb_is_good_=verdict)

    return flagged.drop("pb_is_good_1", "pb_is_good_2").to_native()
|
|
1776
1705
|
|
|
1777
|
-
ibis_expressions = []
|
|
1778
1706
|
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
try:
|
|
1782
|
-
expr_result = expr_fn(self.data_tbl)
|
|
1707
|
+
def interrogate_null(tbl: FrameT, column: str) -> FrameT:
    """
    Null interrogation.

    Returns the native table with an added `pb_is_good_` column that is true for the rows in
    which `column` is Null.
    """

    is_missing = nw.col(column).is_null()
    return nw.from_native(tbl).with_columns(pb_is_good_=is_missing).to_native()
|
|
1790
1713
|
|
|
1791
|
-
# Strategy 2: Try with ColumnExpression
|
|
1792
|
-
try: # pragma: no cover
|
|
1793
|
-
# Skip this strategy if we don't have an expr_col implementation
|
|
1794
|
-
if not hasattr(self, "to_ibis_expr"):
|
|
1795
|
-
continue
|
|
1796
1714
|
|
|
1797
|
-
|
|
1715
|
+
def interrogate_not_null(tbl: FrameT, column: str) -> FrameT:
    """
    Not null interrogation.

    Returns the native table with an added `pb_is_good_` column that is true for the rows in
    which `column` is not Null.
    """

    is_present = ~nw.col(column).is_null()
    return nw.from_native(tbl).with_columns(pb_is_good_=is_present).to_native()
|
|
1802
1721
|
|
|
1803
|
-
# Convert ColumnExpression to Ibis expression
|
|
1804
|
-
if hasattr(col_expr, "to_ibis_expr"):
|
|
1805
|
-
ibis_expr = col_expr.to_ibis_expr(self.data_tbl)
|
|
1806
|
-
ibis_expressions.append(ibis_expr)
|
|
1807
|
-
except Exception: # pragma: no cover
|
|
1808
|
-
# Silent failure - we already tried both strategies
|
|
1809
|
-
pass
|
|
1810
1722
|
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
final_result = final_result & expr
|
|
1723
|
+
def _interrogate_comparison_base(
    tbl: FrameT, column: str, compare: Any, na_pass: bool, operator: str
) -> FrameT:
    """
    Unified base function for comparison operations (gt, ge, lt, le, eq, ne).

    Parameters
    ----------
    tbl
        The table to interrogate.
    column
        The column to check.
    compare
        The value to compare against: a literal value or a `Column` referring to another column.
    na_pass
        Whether to pass null values.
    operator
        The comparison operator: 'gt', 'ge', 'lt', 'le', 'eq', 'ne'.

    Returns
    -------
    FrameT
        The result table with `pb_is_good_` column indicating the passing test units.

    Raises
    ------
    ValueError
        If `operator` is not one of the supported operator names.
    """

    # Imported locally under an alias because the `operator` parameter shadows the module name
    import operator as _op

    comparators = {
        "gt": _op.gt,
        "ge": _op.ge,
        "lt": _op.lt,
        "le": _op.le,
        "eq": _op.eq,
        "ne": _op.ne,
    }

    # Reject an unknown operator before doing any work on the table
    compare_fn = comparators.get(operator) if isinstance(operator, str) else None
    if compare_fn is None:
        raise ValueError(  # pragma: no cover
            f"Invalid operator: {operator}. Must be one of: 'gt', 'ge', 'lt', 'le', 'eq', 'ne'"
        )

    compare_expr = _get_compare_expr_nw(compare=compare)

    nw_tbl = nw.from_native(tbl)

    # NOTE(review): presumably aligns date/datetime literals with the column's dtype; confirm
    # against the helper's definition
    compare_expr = _safe_modify_datetime_compare_val(nw_tbl, column, compare_expr)

    # Create the comparison expression based on the operator
    comparison = compare_fn(nw.col(column), compare_expr)

    # Missing-value test for the checked column (as defined by `_safe_is_nan_or_null_expr()`);
    # built once and reused below
    value_is_missing = _safe_is_nan_or_null_expr(nw_tbl, nw.col(column), column)

    result_tbl = nw_tbl.with_columns(
        # Value is missing and missing values are allowed to pass
        pb_is_good_1=value_is_missing & na_pass,
        # Column-based comparison value is missing and missing values are allowed to pass
        pb_is_good_2=(
            _safe_is_nan_or_null_expr(nw_tbl, nw.col(compare.name), compare.name) & na_pass
            if isinstance(compare, Column)
            else nw.lit(False)
        ),
        # The comparison itself holds for a non-missing value
        pb_is_good_3=comparison & ~value_is_missing,
    )

    # A comparison that yields Null must not count as a pass on its own
    result_tbl = result_tbl.with_columns(
        pb_is_good_3=(
            nw.when(nw.col("pb_is_good_3").is_null())
            .then(nw.lit(False))
            .otherwise(nw.col("pb_is_good_3"))
        )
    )

    result_tbl = result_tbl.with_columns(
        pb_is_good_=nw.col("pb_is_good_1") | nw.col("pb_is_good_2") | nw.col("pb_is_good_3")
    ).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")

    return result_tbl.to_native()
|
|
1949
1795
|
|
|
1950
|
-
if is_last_col_bool:
|
|
1951
|
-
# If the last column is a boolean column, rename it as `pb_is_good_`
|
|
1952
|
-
result = result.rename(pb_is_good_=last_col_name)
|
|
1953
1796
|
|
|
1954
|
-
|
|
1955
|
-
raise NotImplementedError(f"Support for {tbl_type} is not yet implemented")
|
|
1956
|
-
|
|
1957
|
-
elif isinstance(result, bool):
|
|
1958
|
-
# If it's a single boolean, return that as a list
|
|
1959
|
-
return [result]
|
|
1960
|
-
|
|
1961
|
-
elif isinstance(result, list):
|
|
1962
|
-
# If it's a list, check that it is a boolean list
|
|
1963
|
-
if all(isinstance(x, bool) for x in result):
|
|
1964
|
-
# If it's a list of booleans, return it as is
|
|
1965
|
-
return result
|
|
1966
|
-
else:
|
|
1967
|
-
# If it's not a list of booleans, raise an error
|
|
1968
|
-
raise TypeError("The result is not a list of booleans.")
|
|
1969
|
-
else: # pragma: no cover
|
|
1970
|
-
# If it's not a DataFrame or a list, raise an error
|
|
1971
|
-
raise TypeError("The result is not a DataFrame or a list of booleans.")
|
|
1972
|
-
|
|
1973
|
-
# Return the results table or list of booleans
|
|
1974
|
-
return result
|
|
1975
|
-
|
|
1976
|
-
|
|
1977
|
-
@dataclass
|
|
1978
|
-
class NumberOfTestUnits:
|
|
1797
|
+
def interrogate_rows_distinct(data_tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
|
|
1979
1798
|
"""
|
|
1980
|
-
|
|
1981
|
-
"""
|
|
1982
|
-
|
|
1983
|
-
df: FrameT
|
|
1984
|
-
column: str
|
|
1985
|
-
|
|
1986
|
-
def get_test_units(self, tbl_type: str) -> int:
|
|
1987
|
-
if (
|
|
1988
|
-
tbl_type == "pandas"
|
|
1989
|
-
or tbl_type == "polars"
|
|
1990
|
-
or tbl_type == "pyspark"
|
|
1991
|
-
or tbl_type == "local"
|
|
1992
|
-
):
|
|
1993
|
-
# Convert the DataFrame to a format that narwhals can work with and:
|
|
1994
|
-
# - check if the column exists
|
|
1995
|
-
dfn = _column_test_prep(
|
|
1996
|
-
df=self.df, column=self.column, allowed_types=None, check_exists=False
|
|
1997
|
-
)
|
|
1998
|
-
|
|
1999
|
-
# Handle LazyFrames which don't have len()
|
|
2000
|
-
if hasattr(dfn, "collect"):
|
|
2001
|
-
dfn = dfn.collect()
|
|
2002
|
-
|
|
2003
|
-
return len(dfn)
|
|
2004
|
-
|
|
2005
|
-
if tbl_type in IBIS_BACKENDS:
|
|
2006
|
-
# Get the count of test units and convert to a native format
|
|
2007
|
-
# TODO: check whether pandas or polars is available
|
|
2008
|
-
return self.df.count().to_polars()
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
def _get_compare_expr_nw(compare: Any) -> Any:
|
|
2012
|
-
if isinstance(compare, Column):
|
|
2013
|
-
if not isinstance(compare.exprs, str):
|
|
2014
|
-
raise ValueError("The column expression must be a string.") # pragma: no cover
|
|
2015
|
-
return nw.col(compare.exprs)
|
|
2016
|
-
return compare
|
|
2017
|
-
|
|
2018
|
-
|
|
2019
|
-
def _column_has_null_values(table: FrameT, column: str) -> bool:
|
|
2020
|
-
try:
|
|
2021
|
-
# Try the standard null_count() method
|
|
2022
|
-
null_count = (table.select(column).null_count())[column][0]
|
|
2023
|
-
except AttributeError:
|
|
2024
|
-
# For LazyFrames, collect first then get null count
|
|
2025
|
-
try:
|
|
2026
|
-
collected = table.select(column).collect()
|
|
2027
|
-
null_count = (collected.null_count())[column][0]
|
|
2028
|
-
except Exception:
|
|
2029
|
-
# Fallback: check if any values are null
|
|
2030
|
-
try:
|
|
2031
|
-
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
2032
|
-
null_count = result["null_count"][0]
|
|
2033
|
-
except Exception:
|
|
2034
|
-
# Last resort: return False (assume no nulls)
|
|
2035
|
-
return False
|
|
2036
|
-
|
|
2037
|
-
if null_count is None or null_count == 0:
|
|
2038
|
-
return False
|
|
2039
|
-
|
|
2040
|
-
return True
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
def _check_nulls_across_columns_nw(table, columns_subset):
|
|
2044
|
-
# Get all column names from the table
|
|
2045
|
-
column_names = columns_subset if columns_subset else table.columns
|
|
2046
|
-
|
|
2047
|
-
# Build the expression by combining each column's `is_null()` with OR operations
|
|
2048
|
-
null_expr = functools.reduce(
|
|
2049
|
-
lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
|
|
2050
|
-
column_names,
|
|
2051
|
-
None,
|
|
2052
|
-
)
|
|
2053
|
-
|
|
2054
|
-
# Add the expression as a new column to the table
|
|
2055
|
-
result = table.with_columns(_any_is_null_=null_expr)
|
|
1799
|
+
Check if rows in a DataFrame are distinct.
|
|
2056
1800
|
|
|
2057
|
-
|
|
1801
|
+
Parameters
|
|
1802
|
+
----------
|
|
1803
|
+
data_tbl
|
|
1804
|
+
A data table.
|
|
1805
|
+
columns_subset
|
|
1806
|
+
A list of columns to check for distinctness.
|
|
1807
|
+
threshold
|
|
1808
|
+
The maximum number of failing test units to allow.
|
|
1809
|
+
tbl_type
|
|
1810
|
+
The type of table to use for the assertion.
|
|
2058
1811
|
|
|
1812
|
+
Returns
|
|
1813
|
+
-------
|
|
1814
|
+
FrameT
|
|
1815
|
+
A DataFrame with a `pb_is_good_` column indicating which rows pass the test.
|
|
1816
|
+
"""
|
|
1817
|
+
tbl = nw.from_native(data_tbl)
|
|
2059
1818
|
|
|
2060
|
-
|
|
2061
|
-
|
|
1819
|
+
# Get the column subset to use for the test
|
|
1820
|
+
if columns_subset is None:
|
|
1821
|
+
columns_subset = tbl.columns
|
|
2062
1822
|
|
|
2063
|
-
|
|
2064
|
-
|
|
1823
|
+
# Create a count of duplicates using group_by approach
|
|
1824
|
+
# Group by the columns of interest and count occurrences
|
|
1825
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
2065
1826
|
|
|
2066
|
-
#
|
|
2067
|
-
|
|
1827
|
+
# Join back to original table to get count for each row
|
|
1828
|
+
tbl = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
2068
1829
|
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
elif "datetime.date" in compare_type_str:
|
|
2072
|
-
compare_type = "date"
|
|
2073
|
-
else:
|
|
2074
|
-
compare_type = "other"
|
|
1830
|
+
# Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
|
|
1831
|
+
tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
2075
1832
|
|
|
2076
|
-
|
|
2077
|
-
tgt_col_dtype = "datetime"
|
|
2078
|
-
elif "date" in tgt_col_dtype_str or "object" in tgt_col_dtype_str:
|
|
2079
|
-
# Object type is used for date columns in Pandas
|
|
2080
|
-
tgt_col_dtype = "date"
|
|
2081
|
-
else:
|
|
2082
|
-
tgt_col_dtype = "other"
|
|
1833
|
+
return tbl.to_native()
|
|
2083
1834
|
|
|
2084
|
-
# Handle each combination of `compare_type` and `tgt_col_dtype`, coercing only the
|
|
2085
|
-
# `compare_expr` to the type of the column
|
|
2086
|
-
if compare_type == "datetime" and tgt_col_dtype == "date":
|
|
2087
|
-
# Assume that `compare_expr` is a datetime.datetime object and strip the time part
|
|
2088
|
-
# to get a date object
|
|
2089
|
-
compare_expr = compare_val.date()
|
|
2090
1835
|
|
|
2091
|
-
|
|
2092
|
-
|
|
1836
|
+
def interrogate_rows_complete(tbl: FrameT, columns_subset: list[str] | None) -> FrameT:
|
|
1837
|
+
"""Rows complete interrogation."""
|
|
1838
|
+
nw_tbl = nw.from_native(tbl)
|
|
2093
1839
|
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
1840
|
+
# Determine the number of null values in each row (column subsets are handled in
|
|
1841
|
+
# the `_check_nulls_across_columns_nw()` function)
|
|
1842
|
+
result_tbl = _check_nulls_across_columns_nw(table=nw_tbl, columns_subset=columns_subset)
|
|
2097
1843
|
|
|
2098
|
-
|
|
2099
|
-
|
|
1844
|
+
# Failing rows will have the value `True` in the generated column, so we need to negate
|
|
1845
|
+
# the result to get the passing rows
|
|
1846
|
+
result_tbl = result_tbl.with_columns(pb_is_good_=~nw.col("_any_is_null_"))
|
|
1847
|
+
result_tbl = result_tbl.drop("_any_is_null_")
|
|
2100
1848
|
|
|
2101
|
-
return
|
|
1849
|
+
return result_tbl.to_native()
|