pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +0 -1
- pointblank/_interrogation.py +244 -606
- pointblank/_utils.py +65 -3
- pointblank/assistant.py +9 -0
- pointblank/cli.py +39 -24
- pointblank/data/api-docs.txt +658 -29
- pointblank/schema.py +17 -0
- pointblank/segments.py +163 -0
- pointblank/validate.py +344 -92
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/METADATA +59 -6
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/RECORD +16 -15
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/top_level.txt +0 -0
pointblank/_interrogation.py
CHANGED
|
@@ -15,7 +15,7 @@ from pointblank._utils import (
|
|
|
15
15
|
_convert_to_narwhals,
|
|
16
16
|
_get_tbl_type,
|
|
17
17
|
)
|
|
18
|
-
from pointblank.column import Column
|
|
18
|
+
from pointblank.column import Column
|
|
19
19
|
from pointblank.schema import Schema
|
|
20
20
|
from pointblank.thresholds import _threshold_check
|
|
21
21
|
|
|
@@ -23,6 +23,74 @@ if TYPE_CHECKING:
|
|
|
23
23
|
from pointblank._typing import AbsoluteTolBounds
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
class _DtypeOnlyColumn:
    """Minimal stand-in for a column that exposes nothing but a ``dtype`` attribute."""

    def __init__(self, dtype: Any) -> None:
        self.dtype = dtype


def _safe_modify_datetime_compare_val(data_frame: Any, column: str, compare_val: Any) -> Any:
    """
    Safely modify datetime comparison values for LazyFrame compatibility.

    LazyFrames cannot be sliced directly to read a column's dtype, so the dtype is looked up
    without touching the column itself and handed to `_modify_datetime_compare_val()` through a
    lightweight object carrying only a `dtype` attribute.

    The dtype is searched for in this order, stopping at the first hit:

    1. `data_frame.collect_schema().get(column)`, or `data_frame.schema.get(column)` when
       `collect_schema` is not available
    2. the dtypes of a one-row sample obtained with `data_frame.head(1).collect()`
    3. `data_frame.dtypes[column]` (direct access, for eager DataFrames)

    Parameters
    ----------
    data_frame
        The table holding the target column.
    column
        The name of the target column.
    compare_val
        The comparison value to be adjusted.

    Returns
    -------
    Any
        The result of `_modify_datetime_compare_val()` when a dtype could be determined. This is
        a best-effort helper: if no dtype is found, or any step raises, `compare_val` is
        returned unchanged.
    """
    try:
        # First try to get the column dtype from the schema (works for LazyFrames)
        column_dtype = None

        if hasattr(data_frame, "collect_schema"):
            column_dtype = data_frame.collect_schema().get(column)
        elif hasattr(data_frame, "schema"):
            column_dtype = data_frame.schema.get(column)

        if column_dtype is not None:
            return _modify_datetime_compare_val(
                tgt_column=_DtypeOnlyColumn(column_dtype), compare_val=compare_val
            )

        # Fallback: try collecting a small sample if possible
        try:
            sample = data_frame.head(1).collect()
            if hasattr(sample, "dtypes") and column in sample.columns:
                # For pandas-like dtypes
                column_dtype = sample.dtypes[column]

                # Compare against None explicitly (as the schema branch does): a dtype object
                # is not guaranteed to be truthy, so `if column_dtype:` could drop a valid dtype
                if column_dtype is not None:
                    return _modify_datetime_compare_val(
                        tgt_column=_DtypeOnlyColumn(column_dtype), compare_val=compare_val
                    )
        except Exception:
            # Best effort only: move on to the next strategy
            pass

        # Final fallback: try direct access (for eager DataFrames)
        try:
            if hasattr(data_frame, "dtypes") and column in data_frame.columns:
                return _modify_datetime_compare_val(
                    tgt_column=_DtypeOnlyColumn(data_frame.dtypes[column]),
                    compare_val=compare_val,
                )
        except Exception:
            # Best effort only: fall through to returning the original value
            pass

    except Exception:
        # Deliberately swallowed so that a failed dtype lookup never aborts the comparison
        pass

    # If all else fails, return the original compare_val
    return compare_val
|
|
92
|
+
|
|
93
|
+
|
|
26
94
|
@dataclass
|
|
27
95
|
class Interrogator:
|
|
28
96
|
"""
|
|
@@ -89,56 +157,25 @@ class Interrogator:
|
|
|
89
157
|
na_pass: bool = False
|
|
90
158
|
tbl_type: str = "local"
|
|
91
159
|
|
|
92
|
-
def
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
96
|
-
import ibis
|
|
97
|
-
|
|
98
|
-
if isinstance(self.compare, ColumnLiteral):
|
|
99
|
-
#
|
|
100
|
-
# Ibis column-to-column comparison
|
|
101
|
-
#
|
|
102
|
-
|
|
103
|
-
tbl = self.x.mutate(
|
|
104
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
105
|
-
& ibis.literal(self.na_pass),
|
|
106
|
-
pb_is_good_2=self.x[self.column] > self.x[self.compare.name],
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
tbl = tbl.mutate(
|
|
110
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
114
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
else:
|
|
118
|
-
#
|
|
119
|
-
# Ibis column-to-literal comparison
|
|
120
|
-
#
|
|
121
|
-
|
|
122
|
-
tbl = self.x.mutate(
|
|
123
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
124
|
-
pb_is_good_2=self.x[self.column] > ibis.literal(self.compare),
|
|
125
|
-
)
|
|
160
|
+
def __post_init__(self):
|
|
161
|
+
"""
|
|
162
|
+
Post-initialization to process Ibis tables through Narwhals.
|
|
126
163
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
164
|
+
This converts Ibis tables to Narwhals-wrapped tables to unify
|
|
165
|
+
the processing pathway and reduce code branching.
|
|
166
|
+
"""
|
|
167
|
+
# Import the processing function
|
|
168
|
+
from pointblank._utils import _process_ibis_through_narwhals
|
|
130
169
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
)
|
|
170
|
+
# Process Ibis tables through Narwhals
|
|
171
|
+
self.x, self.tbl_type = _process_ibis_through_narwhals(self.x, self.tbl_type)
|
|
134
172
|
|
|
135
|
-
|
|
173
|
+
def gt(self) -> FrameT | Any:
|
|
174
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
136
175
|
|
|
137
176
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
138
177
|
|
|
139
|
-
compare_expr =
|
|
140
|
-
tgt_column=self.x[self.column], compare_val=compare_expr
|
|
141
|
-
)
|
|
178
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
142
179
|
|
|
143
180
|
return (
|
|
144
181
|
self.x.with_columns(
|
|
@@ -165,55 +202,11 @@ class Interrogator:
|
|
|
165
202
|
)
|
|
166
203
|
|
|
167
204
|
def lt(self) -> FrameT | Any:
|
|
168
|
-
#
|
|
169
|
-
|
|
170
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
171
|
-
import ibis
|
|
172
|
-
|
|
173
|
-
if isinstance(self.compare, Column):
|
|
174
|
-
#
|
|
175
|
-
# Ibis column-to-column comparison
|
|
176
|
-
#
|
|
177
|
-
|
|
178
|
-
tbl = self.x.mutate(
|
|
179
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
180
|
-
& ibis.literal(self.na_pass),
|
|
181
|
-
pb_is_good_2=self.x[self.column] < self.x[self.compare.name],
|
|
182
|
-
)
|
|
183
|
-
|
|
184
|
-
tbl = tbl.mutate(
|
|
185
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
189
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
190
|
-
)
|
|
191
|
-
|
|
192
|
-
else:
|
|
193
|
-
#
|
|
194
|
-
# Ibis column-to-literal comparison
|
|
195
|
-
#
|
|
196
|
-
|
|
197
|
-
tbl = self.x.mutate(
|
|
198
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
199
|
-
pb_is_good_2=self.x[self.column] < ibis.literal(self.compare),
|
|
200
|
-
)
|
|
201
|
-
|
|
202
|
-
tbl = tbl.mutate(
|
|
203
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
204
|
-
)
|
|
205
|
-
|
|
206
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
207
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
# Local backends (Narwhals) ---------------------------------
|
|
205
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
211
206
|
|
|
212
207
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
213
208
|
|
|
214
|
-
compare_expr =
|
|
215
|
-
tgt_column=self.x[self.column], compare_val=compare_expr
|
|
216
|
-
)
|
|
209
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
217
210
|
|
|
218
211
|
return (
|
|
219
212
|
self.x.with_columns(
|
|
@@ -240,49 +233,7 @@ class Interrogator:
|
|
|
240
233
|
)
|
|
241
234
|
|
|
242
235
|
def eq(self) -> FrameT | Any:
|
|
243
|
-
#
|
|
244
|
-
|
|
245
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
246
|
-
import ibis
|
|
247
|
-
|
|
248
|
-
if isinstance(self.compare, Column):
|
|
249
|
-
#
|
|
250
|
-
# Ibis column-to-column comparison
|
|
251
|
-
#
|
|
252
|
-
|
|
253
|
-
tbl = self.x.mutate(
|
|
254
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
255
|
-
& ibis.literal(self.na_pass),
|
|
256
|
-
pb_is_good_2=self.x[self.column] == self.x[self.compare.name],
|
|
257
|
-
)
|
|
258
|
-
|
|
259
|
-
tbl = tbl.mutate(
|
|
260
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
261
|
-
)
|
|
262
|
-
|
|
263
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
264
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
265
|
-
)
|
|
266
|
-
|
|
267
|
-
else:
|
|
268
|
-
#
|
|
269
|
-
# Ibis column-to-literal comparison
|
|
270
|
-
#
|
|
271
|
-
|
|
272
|
-
tbl = self.x.mutate(
|
|
273
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
274
|
-
pb_is_good_2=self.x[self.column] == ibis.literal(self.compare),
|
|
275
|
-
)
|
|
276
|
-
|
|
277
|
-
tbl = tbl.mutate(
|
|
278
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
279
|
-
)
|
|
280
|
-
|
|
281
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
282
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
283
|
-
)
|
|
284
|
-
|
|
285
|
-
# Local backends (Narwhals) ---------------------------------
|
|
236
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
286
237
|
|
|
287
238
|
if isinstance(self.compare, Column):
|
|
288
239
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
@@ -329,9 +280,7 @@ class Interrogator:
|
|
|
329
280
|
else:
|
|
330
281
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
331
282
|
|
|
332
|
-
compare_expr =
|
|
333
|
-
tgt_column=self.x[self.column], compare_val=compare_expr
|
|
334
|
-
)
|
|
283
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
335
284
|
|
|
336
285
|
tbl = self.x.with_columns(
|
|
337
286
|
pb_is_good_1=nw.col(self.column).is_null() & self.na_pass,
|
|
@@ -359,47 +308,7 @@ class Interrogator:
|
|
|
359
308
|
return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
360
309
|
|
|
361
310
|
def ne(self) -> FrameT | Any:
|
|
362
|
-
#
|
|
363
|
-
|
|
364
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
365
|
-
import ibis
|
|
366
|
-
|
|
367
|
-
if isinstance(self.compare, Column):
|
|
368
|
-
#
|
|
369
|
-
# Ibis column-to-column comparison
|
|
370
|
-
#
|
|
371
|
-
|
|
372
|
-
tbl = self.x.mutate(
|
|
373
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
374
|
-
& ibis.literal(self.na_pass),
|
|
375
|
-
pb_is_good_2=self.x[self.column] != self.x[self.compare.name],
|
|
376
|
-
)
|
|
377
|
-
|
|
378
|
-
tbl = tbl.mutate(
|
|
379
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
380
|
-
)
|
|
381
|
-
|
|
382
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
383
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
#
|
|
387
|
-
# Ibis column-to-literal comparison
|
|
388
|
-
#
|
|
389
|
-
tbl = self.x.mutate(
|
|
390
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
391
|
-
pb_is_good_2=ibis.ifelse(
|
|
392
|
-
self.x[self.column].notnull(),
|
|
393
|
-
self.x[self.column] != ibis.literal(self.compare),
|
|
394
|
-
ibis.literal(False),
|
|
395
|
-
),
|
|
396
|
-
)
|
|
397
|
-
|
|
398
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
399
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
# Local backends (Narwhals) ---------------------------------
|
|
311
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
403
312
|
|
|
404
313
|
# Determine if the reference and comparison columns have any null values
|
|
405
314
|
ref_col_has_null_vals = _column_has_null_values(table=self.x, column=self.column)
|
|
@@ -421,9 +330,7 @@ class Interrogator:
|
|
|
421
330
|
).to_native()
|
|
422
331
|
|
|
423
332
|
else:
|
|
424
|
-
compare_expr =
|
|
425
|
-
tgt_column=self.x[self.column], compare_val=self.compare
|
|
426
|
-
)
|
|
333
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
|
|
427
334
|
|
|
428
335
|
return self.x.with_columns(
|
|
429
336
|
pb_is_good_=nw.col(self.column) != nw.lit(compare_expr),
|
|
@@ -469,6 +376,12 @@ class Interrogator:
|
|
|
469
376
|
tbl = tbl.with_columns(
|
|
470
377
|
pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
471
378
|
)
|
|
379
|
+
else:
|
|
380
|
+
# General case (non-Polars): handle na_pass=True properly
|
|
381
|
+
if self.na_pass:
|
|
382
|
+
tbl = tbl.with_columns(
|
|
383
|
+
pb_is_good_2=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
384
|
+
)
|
|
472
385
|
|
|
473
386
|
return (
|
|
474
387
|
tbl.with_columns(pb_is_good_=nw.col("pb_is_good_2"))
|
|
@@ -500,6 +413,12 @@ class Interrogator:
|
|
|
500
413
|
tbl = tbl.with_columns(
|
|
501
414
|
pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
502
415
|
)
|
|
416
|
+
else:
|
|
417
|
+
# General case (non-Polars): handle na_pass=True properly
|
|
418
|
+
if self.na_pass:
|
|
419
|
+
tbl = tbl.with_columns(
|
|
420
|
+
pb_is_good_1=(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
421
|
+
)
|
|
503
422
|
|
|
504
423
|
return (
|
|
505
424
|
tbl.with_columns(pb_is_good_=nw.col("pb_is_good_1"))
|
|
@@ -532,6 +451,16 @@ class Interrogator:
|
|
|
532
451
|
.otherwise(False)
|
|
533
452
|
)
|
|
534
453
|
)
|
|
454
|
+
else:
|
|
455
|
+
# General case (non-Polars): handle na_pass=True properly
|
|
456
|
+
if self.na_pass:
|
|
457
|
+
tbl = tbl.with_columns(
|
|
458
|
+
pb_is_good_3=(
|
|
459
|
+
nw.when(nw.col("pb_is_good_1") | nw.col("pb_is_good_2"))
|
|
460
|
+
.then(True)
|
|
461
|
+
.otherwise(nw.col("pb_is_good_3"))
|
|
462
|
+
)
|
|
463
|
+
)
|
|
535
464
|
|
|
536
465
|
return (
|
|
537
466
|
tbl.with_columns(pb_is_good_=nw.col("pb_is_good_3"))
|
|
@@ -544,9 +473,7 @@ class Interrogator:
|
|
|
544
473
|
if ref_col_has_null_vals:
|
|
545
474
|
# Create individual cases for Pandas and Polars
|
|
546
475
|
|
|
547
|
-
compare_expr =
|
|
548
|
-
tgt_column=self.x[self.column], compare_val=self.compare
|
|
549
|
-
)
|
|
476
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, self.compare)
|
|
550
477
|
|
|
551
478
|
if is_pandas_dataframe(self.x.to_native()):
|
|
552
479
|
tbl = self.x.with_columns(
|
|
@@ -584,54 +511,31 @@ class Interrogator:
|
|
|
584
511
|
|
|
585
512
|
return tbl
|
|
586
513
|
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
#
|
|
595
|
-
# Ibis column-to-column comparison
|
|
596
|
-
#
|
|
597
|
-
|
|
598
|
-
tbl = self.x.mutate(
|
|
599
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
600
|
-
& ibis.literal(self.na_pass),
|
|
601
|
-
pb_is_good_2=self.x[self.column] >= self.x[self.compare.name],
|
|
602
|
-
)
|
|
603
|
-
|
|
604
|
-
tbl = tbl.mutate(
|
|
605
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
606
|
-
)
|
|
607
|
-
|
|
608
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
609
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
610
|
-
)
|
|
514
|
+
else:
|
|
515
|
+
# Generic case for other DataFrame types (PySpark, etc.)
|
|
516
|
+
# Use similar logic to Polars but handle potential differences
|
|
517
|
+
tbl = self.x.with_columns(
|
|
518
|
+
pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
|
|
519
|
+
pb_is_good_2=nw.lit(self.na_pass), # Pass if any Null in val or compare
|
|
520
|
+
)
|
|
611
521
|
|
|
612
|
-
|
|
613
|
-
# Ibis column-to-literal comparison
|
|
614
|
-
#
|
|
615
|
-
tbl = self.x.mutate(
|
|
616
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
617
|
-
pb_is_good_2=self.x[self.column] >= ibis.literal(self.compare),
|
|
618
|
-
)
|
|
522
|
+
tbl = tbl.with_columns(pb_is_good_3=nw.col(self.column) != nw.lit(compare_expr))
|
|
619
523
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
524
|
+
tbl = tbl.with_columns(
|
|
525
|
+
pb_is_good_=(
|
|
526
|
+
(nw.col("pb_is_good_1") & nw.col("pb_is_good_2"))
|
|
527
|
+
| (nw.col("pb_is_good_3") & ~nw.col("pb_is_good_1"))
|
|
528
|
+
)
|
|
529
|
+
)
|
|
623
530
|
|
|
624
|
-
|
|
625
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
626
|
-
)
|
|
531
|
+
return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
627
532
|
|
|
628
|
-
|
|
533
|
+
def ge(self) -> FrameT | Any:
|
|
534
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
629
535
|
|
|
630
536
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
631
537
|
|
|
632
|
-
compare_expr =
|
|
633
|
-
tgt_column=self.x[self.column], compare_val=compare_expr
|
|
634
|
-
)
|
|
538
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
635
539
|
|
|
636
540
|
tbl = (
|
|
637
541
|
self.x.with_columns(
|
|
@@ -658,53 +562,11 @@ class Interrogator:
|
|
|
658
562
|
return tbl.drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3").to_native()
|
|
659
563
|
|
|
660
564
|
def le(self) -> FrameT | Any:
|
|
661
|
-
#
|
|
662
|
-
|
|
663
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
664
|
-
import ibis
|
|
665
|
-
|
|
666
|
-
if isinstance(self.compare, Column):
|
|
667
|
-
#
|
|
668
|
-
# Ibis column-to-column comparison
|
|
669
|
-
#
|
|
670
|
-
|
|
671
|
-
tbl = self.x.mutate(
|
|
672
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.compare.name].isnull())
|
|
673
|
-
& ibis.literal(self.na_pass),
|
|
674
|
-
pb_is_good_2=self.x[self.column] <= self.x[self.compare.name],
|
|
675
|
-
)
|
|
676
|
-
|
|
677
|
-
tbl = tbl.mutate(
|
|
678
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
679
|
-
)
|
|
680
|
-
|
|
681
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
682
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
683
|
-
)
|
|
684
|
-
|
|
685
|
-
#
|
|
686
|
-
# Ibis column-to-literal comparison
|
|
687
|
-
#
|
|
688
|
-
tbl = self.x.mutate(
|
|
689
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
690
|
-
pb_is_good_2=self.x[self.column] <= ibis.literal(self.compare),
|
|
691
|
-
)
|
|
692
|
-
|
|
693
|
-
tbl = tbl.mutate(
|
|
694
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
695
|
-
)
|
|
696
|
-
|
|
697
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
698
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
699
|
-
)
|
|
700
|
-
|
|
701
|
-
# Local backends (Narwhals) ---------------------------------
|
|
565
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
702
566
|
|
|
703
567
|
compare_expr = _get_compare_expr_nw(compare=self.compare)
|
|
704
568
|
|
|
705
|
-
compare_expr =
|
|
706
|
-
tgt_column=self.x[self.column], compare_val=compare_expr
|
|
707
|
-
)
|
|
569
|
+
compare_expr = _safe_modify_datetime_compare_val(self.x, self.column, compare_expr)
|
|
708
570
|
|
|
709
571
|
return (
|
|
710
572
|
self.x.with_columns(
|
|
@@ -731,113 +593,13 @@ class Interrogator:
|
|
|
731
593
|
)
|
|
732
594
|
|
|
733
595
|
def between(self) -> FrameT | Any:
|
|
734
|
-
#
|
|
735
|
-
|
|
736
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
737
|
-
import ibis
|
|
738
|
-
|
|
739
|
-
if isinstance(self.low, Column) or isinstance(self.high, Column):
|
|
740
|
-
#
|
|
741
|
-
# Ibis column-to-column/column or column-to-column/literal comparison
|
|
742
|
-
#
|
|
743
|
-
|
|
744
|
-
if isinstance(self.low, Column):
|
|
745
|
-
low_val = self.x[self.low.name]
|
|
746
|
-
else:
|
|
747
|
-
low_val = ibis.literal(self.low)
|
|
748
|
-
|
|
749
|
-
if isinstance(self.high, Column):
|
|
750
|
-
high_val = self.x[self.high.name]
|
|
751
|
-
else:
|
|
752
|
-
high_val = ibis.literal(self.high)
|
|
753
|
-
|
|
754
|
-
if isinstance(self.low, Column) and isinstance(self.high, Column):
|
|
755
|
-
tbl = self.x.mutate(
|
|
756
|
-
pb_is_good_1=(
|
|
757
|
-
self.x[self.column].isnull()
|
|
758
|
-
| self.x[self.low.name].isnull()
|
|
759
|
-
| self.x[self.high.name].isnull()
|
|
760
|
-
)
|
|
761
|
-
& ibis.literal(self.na_pass)
|
|
762
|
-
)
|
|
763
|
-
elif isinstance(self.low, Column):
|
|
764
|
-
tbl = self.x.mutate(
|
|
765
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.low.name].isnull())
|
|
766
|
-
& ibis.literal(self.na_pass)
|
|
767
|
-
)
|
|
768
|
-
elif isinstance(self.high, Column):
|
|
769
|
-
tbl = self.x.mutate(
|
|
770
|
-
pb_is_good_1=(
|
|
771
|
-
self.x[self.column].isnull() | self.x[self.high.name].isnull()
|
|
772
|
-
)
|
|
773
|
-
& ibis.literal(self.na_pass)
|
|
774
|
-
)
|
|
775
|
-
|
|
776
|
-
if self.inclusive[0]:
|
|
777
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] >= low_val)
|
|
778
|
-
else:
|
|
779
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] > low_val)
|
|
780
|
-
|
|
781
|
-
tbl = tbl.mutate(
|
|
782
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
783
|
-
)
|
|
784
|
-
|
|
785
|
-
if self.inclusive[1]:
|
|
786
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] <= high_val)
|
|
787
|
-
else:
|
|
788
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] < high_val)
|
|
789
|
-
|
|
790
|
-
tbl = tbl.mutate(
|
|
791
|
-
pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
|
|
792
|
-
)
|
|
793
|
-
|
|
794
|
-
return tbl.mutate(
|
|
795
|
-
pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 & tbl.pb_is_good_3)
|
|
796
|
-
).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
797
|
-
|
|
798
|
-
else:
|
|
799
|
-
#
|
|
800
|
-
# Ibis column-to-literal/literal comparison
|
|
801
|
-
#
|
|
802
|
-
|
|
803
|
-
low_val = ibis.literal(self.low)
|
|
804
|
-
high_val = ibis.literal(self.high)
|
|
805
|
-
|
|
806
|
-
tbl = self.x.mutate(
|
|
807
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass)
|
|
808
|
-
)
|
|
809
|
-
|
|
810
|
-
if self.inclusive[0]:
|
|
811
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] >= low_val)
|
|
812
|
-
else:
|
|
813
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] > low_val)
|
|
814
|
-
|
|
815
|
-
tbl = tbl.mutate(
|
|
816
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
817
|
-
)
|
|
818
|
-
|
|
819
|
-
if self.inclusive[1]:
|
|
820
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] <= high_val)
|
|
821
|
-
else:
|
|
822
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] < high_val)
|
|
823
|
-
|
|
824
|
-
tbl = tbl.mutate(
|
|
825
|
-
pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
|
|
826
|
-
)
|
|
827
|
-
|
|
828
|
-
return tbl.mutate(
|
|
829
|
-
pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 & tbl.pb_is_good_3)
|
|
830
|
-
).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
831
|
-
|
|
832
|
-
# Local backends (Narwhals) ---------------------------------
|
|
596
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
833
597
|
|
|
834
598
|
low_val = _get_compare_expr_nw(compare=self.low)
|
|
835
599
|
high_val = _get_compare_expr_nw(compare=self.high)
|
|
836
600
|
|
|
837
|
-
low_val =
|
|
838
|
-
high_val =
|
|
839
|
-
tgt_column=self.x[self.column], compare_val=high_val
|
|
840
|
-
)
|
|
601
|
+
low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
|
|
602
|
+
high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
|
|
841
603
|
|
|
842
604
|
tbl = self.x.with_columns(
|
|
843
605
|
pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
|
|
@@ -900,136 +662,16 @@ class Interrogator:
|
|
|
900
662
|
return tbl
|
|
901
663
|
|
|
902
664
|
def outside(self) -> FrameT | Any:
|
|
903
|
-
#
|
|
904
|
-
|
|
905
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
906
|
-
import ibis
|
|
907
|
-
|
|
908
|
-
if isinstance(self.low, Column) or isinstance(self.high, Column):
|
|
909
|
-
#
|
|
910
|
-
# Ibis column-to-column/column or column-to-column/literal comparison
|
|
911
|
-
#
|
|
912
|
-
|
|
913
|
-
if isinstance(self.low, Column):
|
|
914
|
-
low_val = self.x[self.low.name]
|
|
915
|
-
else:
|
|
916
|
-
low_val = ibis.literal(self.low)
|
|
917
|
-
|
|
918
|
-
if isinstance(self.high, Column):
|
|
919
|
-
high_val = self.x[self.high.name]
|
|
920
|
-
else:
|
|
921
|
-
high_val = ibis.literal(self.high)
|
|
922
|
-
|
|
923
|
-
if isinstance(self.low, Column) and isinstance(self.high, Column):
|
|
924
|
-
tbl = self.x.mutate(
|
|
925
|
-
pb_is_good_1=(
|
|
926
|
-
self.x[self.column].isnull()
|
|
927
|
-
| self.x[self.low.name].isnull()
|
|
928
|
-
| self.x[self.high.name].isnull()
|
|
929
|
-
)
|
|
930
|
-
& ibis.literal(self.na_pass)
|
|
931
|
-
)
|
|
665
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
932
666
|
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
pb_is_good_1=(self.x[self.column].isnull() | self.x[self.low.name].isnull())
|
|
936
|
-
& ibis.literal(self.na_pass)
|
|
937
|
-
)
|
|
938
|
-
elif isinstance(self.high, Column):
|
|
939
|
-
tbl = self.x.mutate(
|
|
940
|
-
pb_is_good_1=(
|
|
941
|
-
self.x[self.column].isnull() | self.x[self.high.name].isnull()
|
|
942
|
-
)
|
|
943
|
-
& ibis.literal(self.na_pass)
|
|
944
|
-
)
|
|
945
|
-
|
|
946
|
-
if self.inclusive[0]:
|
|
947
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] < low_val)
|
|
948
|
-
else:
|
|
949
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] <= low_val)
|
|
950
|
-
|
|
951
|
-
if self.inclusive[1]:
|
|
952
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] > high_val)
|
|
953
|
-
else:
|
|
954
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] >= high_val)
|
|
955
|
-
|
|
956
|
-
tbl = tbl.mutate(
|
|
957
|
-
pb_is_good_2=ibis.ifelse(
|
|
958
|
-
tbl.pb_is_good_3.isnull(),
|
|
959
|
-
False,
|
|
960
|
-
tbl.pb_is_good_2,
|
|
961
|
-
)
|
|
962
|
-
)
|
|
963
|
-
|
|
964
|
-
tbl = tbl.mutate(
|
|
965
|
-
pb_is_good_3=ibis.ifelse(
|
|
966
|
-
tbl.pb_is_good_2.isnull(),
|
|
967
|
-
False,
|
|
968
|
-
tbl.pb_is_good_3,
|
|
969
|
-
)
|
|
970
|
-
)
|
|
971
|
-
|
|
972
|
-
tbl = tbl.mutate(
|
|
973
|
-
pb_is_good_2=ibis.ifelse(
|
|
974
|
-
tbl.pb_is_good_2.isnull(),
|
|
975
|
-
False,
|
|
976
|
-
tbl.pb_is_good_2,
|
|
977
|
-
)
|
|
978
|
-
)
|
|
979
|
-
|
|
980
|
-
tbl = tbl.mutate(
|
|
981
|
-
pb_is_good_3=ibis.ifelse(
|
|
982
|
-
tbl.pb_is_good_3.isnull(),
|
|
983
|
-
False,
|
|
984
|
-
tbl.pb_is_good_3,
|
|
985
|
-
)
|
|
986
|
-
)
|
|
987
|
-
|
|
988
|
-
return tbl.mutate(
|
|
989
|
-
pb_is_good_=tbl.pb_is_good_1 | (tbl.pb_is_good_2 | tbl.pb_is_good_3)
|
|
990
|
-
).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
991
|
-
|
|
992
|
-
#
|
|
993
|
-
# Ibis column-to-literal/literal comparison
|
|
994
|
-
#
|
|
995
|
-
low_val = ibis.literal(self.low)
|
|
996
|
-
high_val = ibis.literal(self.high)
|
|
997
|
-
|
|
998
|
-
tbl = self.x.mutate(
|
|
999
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass)
|
|
1000
|
-
)
|
|
1001
|
-
|
|
1002
|
-
if self.inclusive[0]:
|
|
1003
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] < low_val)
|
|
1004
|
-
else:
|
|
1005
|
-
tbl = tbl.mutate(pb_is_good_2=tbl[self.column] <= low_val)
|
|
1006
|
-
|
|
1007
|
-
tbl = tbl.mutate(
|
|
1008
|
-
pb_is_good_2=ibis.ifelse(tbl.pb_is_good_2.notnull(), tbl.pb_is_good_2, False)
|
|
1009
|
-
)
|
|
1010
|
-
|
|
1011
|
-
if self.inclusive[1]:
|
|
1012
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] > high_val)
|
|
1013
|
-
else:
|
|
1014
|
-
tbl = tbl.mutate(pb_is_good_3=tbl[self.column] >= high_val)
|
|
1015
|
-
|
|
1016
|
-
tbl = tbl.mutate(
|
|
1017
|
-
pb_is_good_3=ibis.ifelse(tbl.pb_is_good_3.notnull(), tbl.pb_is_good_3, False)
|
|
1018
|
-
)
|
|
1019
|
-
|
|
1020
|
-
return tbl.mutate(
|
|
1021
|
-
pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2 | tbl.pb_is_good_3
|
|
1022
|
-
).drop("pb_is_good_1", "pb_is_good_2", "pb_is_good_3")
|
|
1023
|
-
|
|
1024
|
-
# Local backends (Narwhals) ---------------------------------
|
|
667
|
+
low_val = _get_compare_expr_nw(compare=self.low)
|
|
668
|
+
high_val = _get_compare_expr_nw(compare=self.high)
|
|
1025
669
|
|
|
1026
670
|
low_val = _get_compare_expr_nw(compare=self.low)
|
|
1027
671
|
high_val = _get_compare_expr_nw(compare=self.high)
|
|
1028
672
|
|
|
1029
|
-
low_val =
|
|
1030
|
-
high_val =
|
|
1031
|
-
tgt_column=self.x[self.column], compare_val=high_val
|
|
1032
|
-
)
|
|
673
|
+
low_val = _safe_modify_datetime_compare_val(self.x, self.column, low_val)
|
|
674
|
+
high_val = _safe_modify_datetime_compare_val(self.x, self.column, high_val)
|
|
1033
675
|
|
|
1034
676
|
tbl = self.x.with_columns(
|
|
1035
677
|
pb_is_good_1=nw.col(self.column).is_null(), # val is Null in Column
|
|
@@ -1088,17 +730,10 @@ class Interrogator:
|
|
|
1088
730
|
return tbl
|
|
1089
731
|
|
|
1090
732
|
def isin(self) -> FrameT | Any:
|
|
1091
|
-
#
|
|
733
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1092
734
|
|
|
1093
735
|
can_be_null: bool = None in self.set
|
|
1094
736
|
|
|
1095
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1096
|
-
base_expr = self.x[self.column].isin(self.set)
|
|
1097
|
-
if can_be_null:
|
|
1098
|
-
base_expr = base_expr | self.x[self.column].isnull()
|
|
1099
|
-
return self.x.mutate(pb_is_good_=base_expr)
|
|
1100
|
-
|
|
1101
|
-
# Local backends (Narwhals) ---------------------------------
|
|
1102
737
|
base_expr: nw.Expr = nw.col(self.column).is_in(self.set)
|
|
1103
738
|
if can_be_null:
|
|
1104
739
|
base_expr = base_expr | nw.col(self.column).is_null()
|
|
@@ -1106,12 +741,7 @@ class Interrogator:
|
|
|
1106
741
|
return self.x.with_columns(pb_is_good_=base_expr).to_native()
|
|
1107
742
|
|
|
1108
743
|
def notin(self) -> FrameT | Any:
|
|
1109
|
-
#
|
|
1110
|
-
|
|
1111
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1112
|
-
return self.x.mutate(pb_is_good_=self.x[self.column].notin(self.set))
|
|
1113
|
-
|
|
1114
|
-
# Local backends (Narwhals) ---------------------------------
|
|
744
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1115
745
|
|
|
1116
746
|
return (
|
|
1117
747
|
self.x.with_columns(
|
|
@@ -1122,21 +752,7 @@ class Interrogator:
|
|
|
1122
752
|
)
|
|
1123
753
|
|
|
1124
754
|
def regex(self) -> FrameT | Any:
|
|
1125
|
-
#
|
|
1126
|
-
|
|
1127
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1128
|
-
import ibis
|
|
1129
|
-
|
|
1130
|
-
tbl = self.x.mutate(
|
|
1131
|
-
pb_is_good_1=self.x[self.column].isnull() & ibis.literal(self.na_pass),
|
|
1132
|
-
pb_is_good_2=self.x[self.column].re_search(self.pattern),
|
|
1133
|
-
)
|
|
1134
|
-
|
|
1135
|
-
return tbl.mutate(pb_is_good_=tbl.pb_is_good_1 | tbl.pb_is_good_2).drop(
|
|
1136
|
-
"pb_is_good_1", "pb_is_good_2"
|
|
1137
|
-
)
|
|
1138
|
-
|
|
1139
|
-
# Local backends (Narwhals) ---------------------------------
|
|
755
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1140
756
|
|
|
1141
757
|
return (
|
|
1142
758
|
self.x.with_columns(
|
|
@@ -1151,55 +767,21 @@ class Interrogator:
|
|
|
1151
767
|
)
|
|
1152
768
|
|
|
1153
769
|
def null(self) -> FrameT | Any:
|
|
1154
|
-
#
|
|
1155
|
-
|
|
1156
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1157
|
-
return self.x.mutate(
|
|
1158
|
-
pb_is_good_=self.x[self.column].isnull(),
|
|
1159
|
-
)
|
|
1160
|
-
|
|
1161
|
-
# Local backends (Narwhals) ---------------------------------
|
|
770
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1162
771
|
|
|
1163
772
|
return self.x.with_columns(
|
|
1164
773
|
pb_is_good_=nw.col(self.column).is_null(),
|
|
1165
774
|
).to_native()
|
|
1166
775
|
|
|
1167
776
|
def not_null(self) -> FrameT | Any:
|
|
1168
|
-
#
|
|
1169
|
-
|
|
1170
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1171
|
-
return self.x.mutate(
|
|
1172
|
-
pb_is_good_=~self.x[self.column].isnull(),
|
|
1173
|
-
)
|
|
1174
|
-
|
|
1175
|
-
# Local backends (Narwhals) ---------------------------------
|
|
777
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1176
778
|
|
|
1177
779
|
return self.x.with_columns(
|
|
1178
780
|
pb_is_good_=~nw.col(self.column).is_null(),
|
|
1179
781
|
).to_native()
|
|
1180
782
|
|
|
1181
783
|
def rows_distinct(self) -> FrameT | Any:
|
|
1182
|
-
#
|
|
1183
|
-
|
|
1184
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1185
|
-
import ibis
|
|
1186
|
-
|
|
1187
|
-
tbl = self.x
|
|
1188
|
-
|
|
1189
|
-
# Get the column subset to use for the test
|
|
1190
|
-
if self.columns_subset is None:
|
|
1191
|
-
columns_subset = tbl.columns
|
|
1192
|
-
else:
|
|
1193
|
-
columns_subset = self.columns_subset
|
|
1194
|
-
|
|
1195
|
-
# Create a subset of the table with only the columns of interest and count the
|
|
1196
|
-
# number of times each unique row (or portion thereof) appears
|
|
1197
|
-
tbl = tbl.group_by(columns_subset).mutate(pb_count_=ibis._.count())
|
|
1198
|
-
|
|
1199
|
-
# Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
|
|
1200
|
-
return tbl.mutate(pb_is_good_=tbl["pb_count_"] == 1).drop("pb_count_")
|
|
1201
|
-
|
|
1202
|
-
# Local backends (Narwhals) ---------------------------------
|
|
784
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1203
785
|
|
|
1204
786
|
tbl = self.x
|
|
1205
787
|
|
|
@@ -1209,32 +791,20 @@ class Interrogator:
|
|
|
1209
791
|
else:
|
|
1210
792
|
columns_subset = self.columns_subset
|
|
1211
793
|
|
|
1212
|
-
# Create a
|
|
1213
|
-
|
|
794
|
+
# Create a count of duplicates using group_by approach like Ibis backend
|
|
795
|
+
# Group by the columns of interest and count occurrences
|
|
796
|
+
count_tbl = tbl.group_by(columns_subset).agg(nw.len().alias("pb_count_"))
|
|
1214
797
|
|
|
1215
|
-
#
|
|
1216
|
-
|
|
798
|
+
# Join back to original table to get count for each row
|
|
799
|
+
tbl = tbl.join(count_tbl, on=columns_subset, how="left")
|
|
1217
800
|
|
|
1218
|
-
#
|
|
1219
|
-
tbl = tbl.with_columns(pb_is_good_
|
|
801
|
+
# Passing rows will have the value `1` (no duplicates, so True), otherwise False applies
|
|
802
|
+
tbl = tbl.with_columns(pb_is_good_=nw.col("pb_count_") == 1).drop("pb_count_")
|
|
1220
803
|
|
|
1221
804
|
return tbl.to_native()
|
|
1222
805
|
|
|
1223
806
|
def rows_complete(self) -> FrameT | Any:
|
|
1224
|
-
#
|
|
1225
|
-
|
|
1226
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1227
|
-
tbl = self.x
|
|
1228
|
-
|
|
1229
|
-
# Determine the number of null values in each row (column subsets are handled in
|
|
1230
|
-
# the `_check_nulls_across_columns_ibis()` function)
|
|
1231
|
-
tbl = _check_nulls_across_columns_ibis(table=tbl, columns_subset=self.columns_subset)
|
|
1232
|
-
|
|
1233
|
-
# Failing rows will have the value `True` in the generated column, so we need to negate
|
|
1234
|
-
# the result to get the passing rows
|
|
1235
|
-
return tbl.mutate(pb_is_good_=~tbl["_any_is_null_"]).drop("_any_is_null_")
|
|
1236
|
-
|
|
1237
|
-
# Local backends (Narwhals) ---------------------------------
|
|
807
|
+
# All backends now use Narwhals (including former Ibis tables) ---------
|
|
1238
808
|
|
|
1239
809
|
tbl = self.x
|
|
1240
810
|
|
|
@@ -1299,10 +869,8 @@ class ColValsCompareOne:
|
|
|
1299
869
|
tbl = _column_test_prep(
|
|
1300
870
|
df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
|
|
1301
871
|
)
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
# for now, just pass the table as is
|
|
1305
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
872
|
+
else:
|
|
873
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1306
874
|
tbl = self.data_tbl
|
|
1307
875
|
|
|
1308
876
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -1457,7 +1025,8 @@ class ColValsCompareTwo:
|
|
|
1457
1025
|
|
|
1458
1026
|
# TODO: For Ibis backends, check if the column exists and if the column type is compatible;
|
|
1459
1027
|
# for now, just pass the table as is
|
|
1460
|
-
|
|
1028
|
+
else:
|
|
1029
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1461
1030
|
tbl = self.data_tbl
|
|
1462
1031
|
|
|
1463
1032
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -1550,10 +1119,8 @@ class ColValsCompareSet:
|
|
|
1550
1119
|
tbl = _column_test_prep(
|
|
1551
1120
|
df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
|
|
1552
1121
|
)
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
# for now, just pass the table as is
|
|
1556
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1122
|
+
else:
|
|
1123
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1557
1124
|
tbl = self.data_tbl
|
|
1558
1125
|
|
|
1559
1126
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -1627,10 +1194,8 @@ class ColValsRegex:
|
|
|
1627
1194
|
tbl = _column_test_prep(
|
|
1628
1195
|
df=self.data_tbl, column=self.column, allowed_types=self.allowed_types
|
|
1629
1196
|
)
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
# for now, just pass the table as is
|
|
1633
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1197
|
+
else:
|
|
1198
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1634
1199
|
tbl = self.data_tbl
|
|
1635
1200
|
|
|
1636
1201
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -1758,11 +1323,9 @@ class ColExistsHasType:
|
|
|
1758
1323
|
# - check if the `column=` exists
|
|
1759
1324
|
# - check if the `column=` type is compatible with the test
|
|
1760
1325
|
tbl = _convert_to_narwhals(df=self.data_tbl)
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
if self.tbl_type in IBIS_BACKENDS:
|
|
1765
|
-
tbl = self.data_tbl
|
|
1326
|
+
else:
|
|
1327
|
+
# For remote backends (Ibis), pass the table as is since Narwhals can handle it
|
|
1328
|
+
tbl = _convert_to_narwhals(df=self.data_tbl)
|
|
1766
1329
|
|
|
1767
1330
|
if self.assertion_method == "exists":
|
|
1768
1331
|
res = int(self.column in tbl.columns)
|
|
@@ -1810,7 +1373,8 @@ class RowsDistinct:
|
|
|
1810
1373
|
|
|
1811
1374
|
# TODO: For Ibis backends, check if the column exists and if the column type is compatible;
|
|
1812
1375
|
# for now, just pass the table as is
|
|
1813
|
-
|
|
1376
|
+
else:
|
|
1377
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1814
1378
|
tbl = self.data_tbl
|
|
1815
1379
|
|
|
1816
1380
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -1862,7 +1426,8 @@ class RowsComplete:
|
|
|
1862
1426
|
|
|
1863
1427
|
# TODO: For Ibis backends, check if the column exists and if the column type is compatible;
|
|
1864
1428
|
# for now, just pass the table as is
|
|
1865
|
-
|
|
1429
|
+
else:
|
|
1430
|
+
# For remote backends (Ibis), pass the table as is since Interrogator now handles Ibis through Narwhals
|
|
1866
1431
|
tbl = self.data_tbl
|
|
1867
1432
|
|
|
1868
1433
|
# Collect results for the test units; the results are a list of booleans where
|
|
@@ -2088,6 +1653,8 @@ class ConjointlyValidation:
|
|
|
2088
1653
|
return self._get_pandas_results()
|
|
2089
1654
|
elif "duckdb" in self.tbl_type or "ibis" in self.tbl_type:
|
|
2090
1655
|
return self._get_ibis_results()
|
|
1656
|
+
elif "pyspark" in self.tbl_type:
|
|
1657
|
+
return self._get_pyspark_results()
|
|
2091
1658
|
else: # pragma: no cover
|
|
2092
1659
|
raise NotImplementedError(f"Support for {self.tbl_type} is not yet implemented")
|
|
2093
1660
|
|
|
@@ -2247,6 +1814,53 @@ class ConjointlyValidation:
|
|
|
2247
1814
|
results_tbl = self.data_tbl.mutate(pb_is_good_=ibis.literal(True))
|
|
2248
1815
|
return results_tbl
|
|
2249
1816
|
|
|
1817
|
+
def _get_pyspark_results(self):
|
|
1818
|
+
"""Process expressions for PySpark DataFrames."""
|
|
1819
|
+
from pyspark.sql import functions as F
|
|
1820
|
+
|
|
1821
|
+
pyspark_columns = []
|
|
1822
|
+
|
|
1823
|
+
for expr_fn in self.expressions:
|
|
1824
|
+
try:
|
|
1825
|
+
# First try direct evaluation with PySpark DataFrame
|
|
1826
|
+
expr_result = expr_fn(self.data_tbl)
|
|
1827
|
+
|
|
1828
|
+
# Check if it's a PySpark Column
|
|
1829
|
+
if hasattr(expr_result, "_jc"): # PySpark Column has _jc attribute
|
|
1830
|
+
pyspark_columns.append(expr_result)
|
|
1831
|
+
else:
|
|
1832
|
+
raise TypeError(
|
|
1833
|
+
f"Expression returned {type(expr_result)}, expected PySpark Column"
|
|
1834
|
+
)
|
|
1835
|
+
|
|
1836
|
+
except Exception as e:
|
|
1837
|
+
try:
|
|
1838
|
+
# Try as a ColumnExpression (for pb.expr_col style)
|
|
1839
|
+
col_expr = expr_fn(None)
|
|
1840
|
+
|
|
1841
|
+
if hasattr(col_expr, "to_pyspark_expr"):
|
|
1842
|
+
# Convert to PySpark expression
|
|
1843
|
+
pyspark_expr = col_expr.to_pyspark_expr(self.data_tbl)
|
|
1844
|
+
pyspark_columns.append(pyspark_expr)
|
|
1845
|
+
else:
|
|
1846
|
+
raise TypeError(f"Cannot convert {type(col_expr)} to PySpark Column")
|
|
1847
|
+
except Exception as nested_e:
|
|
1848
|
+
print(f"Error evaluating PySpark expression: {e} -> {nested_e}")
|
|
1849
|
+
|
|
1850
|
+
# Combine results with AND logic
|
|
1851
|
+
if pyspark_columns:
|
|
1852
|
+
final_result = pyspark_columns[0]
|
|
1853
|
+
for col in pyspark_columns[1:]:
|
|
1854
|
+
final_result = final_result & col
|
|
1855
|
+
|
|
1856
|
+
# Create results table with boolean column
|
|
1857
|
+
results_tbl = self.data_tbl.withColumn("pb_is_good_", final_result)
|
|
1858
|
+
return results_tbl
|
|
1859
|
+
|
|
1860
|
+
# Default case
|
|
1861
|
+
results_tbl = self.data_tbl.withColumn("pb_is_good_", F.lit(True))
|
|
1862
|
+
return results_tbl
|
|
1863
|
+
|
|
2250
1864
|
|
|
2251
1865
|
class SpeciallyValidation:
|
|
2252
1866
|
def __init__(self, data_tbl, expression, threshold, tbl_type):
|
|
@@ -2359,13 +1973,22 @@ class NumberOfTestUnits:
|
|
|
2359
1973
|
column: str
|
|
2360
1974
|
|
|
2361
1975
|
def get_test_units(self, tbl_type: str) -> int:
|
|
2362
|
-
if
|
|
1976
|
+
if (
|
|
1977
|
+
tbl_type == "pandas"
|
|
1978
|
+
or tbl_type == "polars"
|
|
1979
|
+
or tbl_type == "pyspark"
|
|
1980
|
+
or tbl_type == "local"
|
|
1981
|
+
):
|
|
2363
1982
|
# Convert the DataFrame to a format that narwhals can work with and:
|
|
2364
1983
|
# - check if the column exists
|
|
2365
1984
|
dfn = _column_test_prep(
|
|
2366
1985
|
df=self.df, column=self.column, allowed_types=None, check_exists=False
|
|
2367
1986
|
)
|
|
2368
1987
|
|
|
1988
|
+
# Handle LazyFrames which don't have len()
|
|
1989
|
+
if hasattr(dfn, "collect"):
|
|
1990
|
+
dfn = dfn.collect()
|
|
1991
|
+
|
|
2369
1992
|
return len(dfn)
|
|
2370
1993
|
|
|
2371
1994
|
if tbl_type in IBIS_BACKENDS:
|
|
@@ -2383,7 +2006,22 @@ def _get_compare_expr_nw(compare: Any) -> Any:
|
|
|
2383
2006
|
|
|
2384
2007
|
|
|
2385
2008
|
def _column_has_null_values(table: FrameT, column: str) -> bool:
|
|
2386
|
-
|
|
2009
|
+
try:
|
|
2010
|
+
# Try the standard null_count() method
|
|
2011
|
+
null_count = (table.select(column).null_count())[column][0]
|
|
2012
|
+
except AttributeError:
|
|
2013
|
+
# For LazyFrames, collect first then get null count
|
|
2014
|
+
try:
|
|
2015
|
+
collected = table.select(column).collect()
|
|
2016
|
+
null_count = (collected.null_count())[column][0]
|
|
2017
|
+
except Exception:
|
|
2018
|
+
# Fallback: check if any values are null
|
|
2019
|
+
try:
|
|
2020
|
+
result = table.select(nw.col(column).is_null().sum().alias("null_count")).collect()
|
|
2021
|
+
null_count = result["null_count"][0]
|
|
2022
|
+
except Exception:
|
|
2023
|
+
# Last resort: return False (assume no nulls)
|
|
2024
|
+
return False
|
|
2387
2025
|
|
|
2388
2026
|
if null_count is None or null_count == 0:
|
|
2389
2027
|
return False
|
|
@@ -2414,7 +2052,7 @@ def _check_nulls_across_columns_nw(table, columns_subset):
|
|
|
2414
2052
|
|
|
2415
2053
|
# Build the expression by combining each column's `is_null()` with OR operations
|
|
2416
2054
|
null_expr = functools.reduce(
|
|
2417
|
-
lambda acc, col: acc |
|
|
2055
|
+
lambda acc, col: acc | nw.col(col).is_null() if acc is not None else nw.col(col).is_null(),
|
|
2418
2056
|
column_names,
|
|
2419
2057
|
None,
|
|
2420
2058
|
)
|