cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +60 -15
- cudf_polars/containers/column.py +137 -77
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +256 -114
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +33 -3
- cudf_polars/dsl/expressions/unary.py +126 -64
- cudf_polars/dsl/ir.py +1053 -350
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +307 -107
- cudf_polars/dsl/utils/aggregations.py +43 -30
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +55 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +792 -2
- cudf_polars/experimental/benchmarks/utils.py +596 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +30 -15
- cudf_polars/experimental/groupby.py +25 -4
- cudf_polars/experimental/io.py +156 -124
- cudf_polars/experimental/join.py +53 -23
- cudf_polars/experimental/parallel.py +68 -19
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
- cudf_polars/experimental/rapidsmpf/core.py +488 -0
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
- cudf_polars/experimental/rapidsmpf/io.py +696 -0
- cudf_polars/experimental/rapidsmpf/join.py +322 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
- cudf_polars/experimental/rapidsmpf/union.py +115 -0
- cudf_polars/experimental/rapidsmpf/utils.py +374 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +46 -12
- cudf_polars/experimental/sort.py +100 -26
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +93 -17
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +473 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +5 -4
- cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
- cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
from decimal import Decimal
|
|
9
10
|
from functools import partial
|
|
10
11
|
from typing import TYPE_CHECKING, Any, ClassVar
|
|
11
12
|
|
|
@@ -16,23 +17,31 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
|
|
|
16
17
|
from cudf_polars.dsl.expressions.literal import Literal
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
20
|
+
from rmm.pylibrmm.stream import Stream
|
|
21
|
+
|
|
19
22
|
from cudf_polars.containers import DataFrame, DataType
|
|
20
23
|
|
|
21
24
|
__all__ = ["Agg"]
|
|
22
25
|
|
|
23
26
|
|
|
24
27
|
class Agg(Expr):
|
|
25
|
-
__slots__ = ("name", "op", "options", "request")
|
|
26
|
-
_non_child = ("dtype", "name", "options")
|
|
28
|
+
__slots__ = ("context", "name", "op", "options", "request")
|
|
29
|
+
_non_child = ("dtype", "name", "options", "context")
|
|
27
30
|
|
|
28
31
|
def __init__(
|
|
29
|
-
self,
|
|
32
|
+
self,
|
|
33
|
+
dtype: DataType,
|
|
34
|
+
name: str,
|
|
35
|
+
options: Any,
|
|
36
|
+
context: ExecutionContext,
|
|
37
|
+
*children: Expr,
|
|
30
38
|
) -> None:
|
|
31
39
|
self.dtype = dtype
|
|
32
40
|
self.name = name
|
|
33
41
|
self.options = options
|
|
34
42
|
self.is_pointwise = False
|
|
35
43
|
self.children = children
|
|
44
|
+
self.context = context
|
|
36
45
|
if name not in Agg._SUPPORTED:
|
|
37
46
|
raise NotImplementedError(
|
|
38
47
|
f"Unsupported aggregation {name=}"
|
|
@@ -71,7 +80,7 @@ class Agg(Expr):
|
|
|
71
80
|
raise NotImplementedError("Only support literal quantile values")
|
|
72
81
|
if options == "equiprobable":
|
|
73
82
|
raise NotImplementedError("Quantile with equiprobable interpolation")
|
|
74
|
-
if plc.traits.is_duration(child.dtype.
|
|
83
|
+
if plc.traits.is_duration(child.dtype.plc_type):
|
|
75
84
|
raise NotImplementedError("Quantile with duration data type")
|
|
76
85
|
req = plc.aggregation.quantile(
|
|
77
86
|
quantiles=[quantile.value], interp=Agg.interp_mapping[options]
|
|
@@ -80,9 +89,19 @@ class Agg(Expr):
|
|
|
80
89
|
raise NotImplementedError(
|
|
81
90
|
f"Unreachable, {name=} is incorrectly listed in _SUPPORTED"
|
|
82
91
|
) # pragma: no cover
|
|
92
|
+
if (
|
|
93
|
+
context == ExecutionContext.FRAME
|
|
94
|
+
and req is not None
|
|
95
|
+
and not plc.aggregation.is_valid_aggregation(dtype.plc_type, req)
|
|
96
|
+
):
|
|
97
|
+
# TODO: Check which cases polars raises vs returns all-NULL column.
|
|
98
|
+
# For the all-NULL column cases, we could build it using Column.all_null_like
|
|
99
|
+
# at evaluation time.
|
|
100
|
+
raise NotImplementedError(f"Invalid aggregation {req} with dtype {dtype}")
|
|
83
101
|
self.request = req
|
|
84
102
|
op = getattr(self, f"_{name}", None)
|
|
85
103
|
if op is None:
|
|
104
|
+
assert req is not None # Ensure req is not None for _reduce
|
|
86
105
|
op = partial(self._reduce, request=req)
|
|
87
106
|
elif name in {"min", "max"}:
|
|
88
107
|
op = partial(op, propagate_nans=options)
|
|
@@ -136,77 +155,110 @@ class Agg(Expr):
|
|
|
136
155
|
return self.request
|
|
137
156
|
|
|
138
157
|
def _reduce(
|
|
139
|
-
self, column: Column, *, request: plc.aggregation.Aggregation
|
|
158
|
+
self, column: Column, *, request: plc.aggregation.Aggregation, stream: Stream
|
|
140
159
|
) -> Column:
|
|
160
|
+
if (
|
|
161
|
+
# For sum, this condition can only pass
|
|
162
|
+
# after expression decomposition in the streaming
|
|
163
|
+
# engine
|
|
164
|
+
self.name in {"sum", "mean", "median"}
|
|
165
|
+
and plc.traits.is_fixed_point(column.dtype.plc_type)
|
|
166
|
+
and self.dtype.plc_type.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}
|
|
167
|
+
):
|
|
168
|
+
column = column.astype(self.dtype, stream=stream)
|
|
141
169
|
return Column(
|
|
142
170
|
plc.Column.from_scalar(
|
|
143
|
-
plc.reduce.reduce(
|
|
171
|
+
plc.reduce.reduce(
|
|
172
|
+
column.obj, request, self.dtype.plc_type, stream=stream
|
|
173
|
+
),
|
|
144
174
|
1,
|
|
175
|
+
stream=stream,
|
|
145
176
|
),
|
|
146
177
|
name=column.name,
|
|
147
178
|
dtype=self.dtype,
|
|
148
179
|
)
|
|
149
180
|
|
|
150
|
-
def _count(self, column: Column, *, include_nulls: bool) -> Column:
|
|
181
|
+
def _count(self, column: Column, *, include_nulls: bool, stream: Stream) -> Column:
|
|
151
182
|
null_count = column.null_count if not include_nulls else 0
|
|
152
183
|
return Column(
|
|
153
184
|
plc.Column.from_scalar(
|
|
154
|
-
plc.Scalar.from_py(
|
|
185
|
+
plc.Scalar.from_py(
|
|
186
|
+
column.size - null_count, self.dtype.plc_type, stream=stream
|
|
187
|
+
),
|
|
155
188
|
1,
|
|
189
|
+
stream=stream,
|
|
156
190
|
),
|
|
157
191
|
name=column.name,
|
|
158
192
|
dtype=self.dtype,
|
|
159
193
|
)
|
|
160
194
|
|
|
161
|
-
def _sum(self, column: Column) -> Column:
|
|
195
|
+
def _sum(self, column: Column, stream: Stream) -> Column:
|
|
162
196
|
if column.size == 0 or column.null_count == column.size:
|
|
197
|
+
dtype = self.dtype.plc_type
|
|
163
198
|
return Column(
|
|
164
199
|
plc.Column.from_scalar(
|
|
165
|
-
plc.Scalar.from_py(
|
|
200
|
+
plc.Scalar.from_py(
|
|
201
|
+
Decimal(0).scaleb(dtype.scale())
|
|
202
|
+
if plc.traits.is_fixed_point(dtype)
|
|
203
|
+
else 0,
|
|
204
|
+
dtype,
|
|
205
|
+
stream=stream,
|
|
206
|
+
),
|
|
166
207
|
1,
|
|
208
|
+
stream=stream,
|
|
167
209
|
),
|
|
168
210
|
name=column.name,
|
|
169
211
|
dtype=self.dtype,
|
|
170
212
|
)
|
|
171
|
-
return self._reduce(column, request=plc.aggregation.sum())
|
|
213
|
+
return self._reduce(column, request=plc.aggregation.sum(), stream=stream)
|
|
172
214
|
|
|
173
|
-
def _min(self, column: Column, *, propagate_nans: bool) -> Column:
|
|
174
|
-
|
|
215
|
+
def _min(self, column: Column, *, propagate_nans: bool, stream: Stream) -> Column:
|
|
216
|
+
nan_count = column.nan_count(stream=stream)
|
|
217
|
+
if propagate_nans and nan_count > 0:
|
|
175
218
|
return Column(
|
|
176
219
|
plc.Column.from_scalar(
|
|
177
|
-
plc.Scalar.from_py(
|
|
220
|
+
plc.Scalar.from_py(
|
|
221
|
+
float("nan"), self.dtype.plc_type, stream=stream
|
|
222
|
+
),
|
|
178
223
|
1,
|
|
224
|
+
stream=stream,
|
|
179
225
|
),
|
|
180
226
|
name=column.name,
|
|
181
227
|
dtype=self.dtype,
|
|
182
228
|
)
|
|
183
|
-
if
|
|
184
|
-
column = column.mask_nans()
|
|
185
|
-
return self._reduce(column, request=plc.aggregation.min())
|
|
229
|
+
if nan_count > 0:
|
|
230
|
+
column = column.mask_nans(stream=stream)
|
|
231
|
+
return self._reduce(column, request=plc.aggregation.min(), stream=stream)
|
|
186
232
|
|
|
187
|
-
def _max(self, column: Column, *, propagate_nans: bool) -> Column:
|
|
188
|
-
|
|
233
|
+
def _max(self, column: Column, *, propagate_nans: bool, stream: Stream) -> Column:
|
|
234
|
+
nan_count = column.nan_count(stream=stream)
|
|
235
|
+
if propagate_nans and nan_count > 0:
|
|
189
236
|
return Column(
|
|
190
237
|
plc.Column.from_scalar(
|
|
191
|
-
plc.Scalar.from_py(
|
|
238
|
+
plc.Scalar.from_py(
|
|
239
|
+
float("nan"), self.dtype.plc_type, stream=stream
|
|
240
|
+
),
|
|
192
241
|
1,
|
|
242
|
+
stream=stream,
|
|
193
243
|
),
|
|
194
244
|
name=column.name,
|
|
195
245
|
dtype=self.dtype,
|
|
196
246
|
)
|
|
197
|
-
if
|
|
198
|
-
column = column.mask_nans()
|
|
199
|
-
return self._reduce(column, request=plc.aggregation.max())
|
|
247
|
+
if nan_count > 0:
|
|
248
|
+
column = column.mask_nans(stream=stream)
|
|
249
|
+
return self._reduce(column, request=plc.aggregation.max(), stream=stream)
|
|
200
250
|
|
|
201
|
-
def _first(self, column: Column) -> Column:
|
|
251
|
+
def _first(self, column: Column, stream: Stream) -> Column:
|
|
202
252
|
return Column(
|
|
203
|
-
plc.copying.slice(column.obj, [0, 1])[0],
|
|
253
|
+
plc.copying.slice(column.obj, [0, 1], stream=stream)[0],
|
|
254
|
+
name=column.name,
|
|
255
|
+
dtype=self.dtype,
|
|
204
256
|
)
|
|
205
257
|
|
|
206
|
-
def _last(self, column: Column) -> Column:
|
|
258
|
+
def _last(self, column: Column, stream: Stream) -> Column:
|
|
207
259
|
n = column.size
|
|
208
260
|
return Column(
|
|
209
|
-
plc.copying.slice(column.obj, [n - 1, n])[0],
|
|
261
|
+
plc.copying.slice(column.obj, [n - 1, n], stream=stream)[0],
|
|
210
262
|
name=column.name,
|
|
211
263
|
dtype=self.dtype,
|
|
212
264
|
)
|
|
@@ -223,4 +275,4 @@ class Agg(Expr):
|
|
|
223
275
|
# Aggregations like quantiles may have additional children that were
|
|
224
276
|
# preprocessed into pylibcudf requests.
|
|
225
277
|
child = self.children[0]
|
|
226
|
-
return self.op(child.evaluate(df, context=context))
|
|
278
|
+
return self.op(child.evaluate(df, context=context), stream=df.stream)
|
|
@@ -8,13 +8,15 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
from typing import TYPE_CHECKING, ClassVar
|
|
10
10
|
|
|
11
|
-
from polars
|
|
11
|
+
from polars import polars # type: ignore[attr-defined]
|
|
12
12
|
|
|
13
13
|
import pylibcudf as plc
|
|
14
14
|
|
|
15
15
|
from cudf_polars.containers import Column
|
|
16
16
|
from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
|
|
17
17
|
|
|
18
|
+
pl_expr = polars._expr_nodes
|
|
19
|
+
|
|
18
20
|
if TYPE_CHECKING:
|
|
19
21
|
from cudf_polars.containers import DataFrame, DataType
|
|
20
22
|
|
|
@@ -33,7 +35,7 @@ class BinOp(Expr):
|
|
|
33
35
|
right: Expr,
|
|
34
36
|
) -> None:
|
|
35
37
|
self.dtype = dtype
|
|
36
|
-
if plc.traits.is_boolean(self.dtype.
|
|
38
|
+
if plc.traits.is_boolean(self.dtype.plc_type):
|
|
37
39
|
# For boolean output types, bitand and bitor implement
|
|
38
40
|
# boolean logic, so translate. bitxor also does, but the
|
|
39
41
|
# default behaviour is correct.
|
|
@@ -42,7 +44,7 @@ class BinOp(Expr):
|
|
|
42
44
|
self.children = (left, right)
|
|
43
45
|
self.is_pointwise = True
|
|
44
46
|
if not plc.binaryop.is_supported_operation(
|
|
45
|
-
self.dtype.
|
|
47
|
+
self.dtype.plc_type, left.dtype.plc_type, right.dtype.plc_type, op
|
|
46
48
|
):
|
|
47
49
|
raise NotImplementedError(
|
|
48
50
|
f"Operation {op.name} not supported "
|
|
@@ -59,7 +61,9 @@ class BinOp(Expr):
|
|
|
59
61
|
plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
|
|
60
62
|
}
|
|
61
63
|
|
|
62
|
-
_MAPPING: ClassVar[
|
|
64
|
+
_MAPPING: ClassVar[
|
|
65
|
+
dict[polars._expr_nodes.Operator, plc.binaryop.BinaryOperator]
|
|
66
|
+
] = {
|
|
63
67
|
pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
|
|
64
68
|
pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS,
|
|
65
69
|
pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL,
|
|
@@ -87,20 +91,25 @@ class BinOp(Expr):
|
|
|
87
91
|
) -> Column:
|
|
88
92
|
"""Evaluate this expression given a dataframe for context."""
|
|
89
93
|
left, right = (child.evaluate(df, context=context) for child in self.children)
|
|
90
|
-
lop = left.obj
|
|
91
|
-
rop = right.obj
|
|
94
|
+
lop: plc.Column | plc.Scalar = left.obj
|
|
95
|
+
rop: plc.Column | plc.Scalar = right.obj
|
|
92
96
|
if left.size != right.size:
|
|
93
97
|
if left.is_scalar:
|
|
94
|
-
lop = left.obj_scalar
|
|
98
|
+
lop = left.obj_scalar(stream=df.stream)
|
|
95
99
|
elif right.is_scalar:
|
|
96
|
-
rop = right.obj_scalar
|
|
97
|
-
if plc.traits.is_integral_not_bool(self.dtype.
|
|
100
|
+
rop = right.obj_scalar(stream=df.stream)
|
|
101
|
+
if plc.traits.is_integral_not_bool(self.dtype.plc_type) and self.op in {
|
|
98
102
|
plc.binaryop.BinaryOperator.FLOOR_DIV,
|
|
99
103
|
plc.binaryop.BinaryOperator.PYMOD,
|
|
100
104
|
}:
|
|
101
|
-
if
|
|
105
|
+
if (
|
|
106
|
+
right.obj.size() == 1
|
|
107
|
+
and right.obj.to_scalar(stream=df.stream).to_py(stream=df.stream) == 0
|
|
108
|
+
):
|
|
102
109
|
return Column(
|
|
103
|
-
plc.Column.all_null_like(
|
|
110
|
+
plc.Column.all_null_like(
|
|
111
|
+
left.obj, left.obj.size(), stream=df.stream
|
|
112
|
+
),
|
|
104
113
|
dtype=self.dtype,
|
|
105
114
|
)
|
|
106
115
|
|
|
@@ -108,13 +117,24 @@ class BinOp(Expr):
|
|
|
108
117
|
rop = plc.replace.find_and_replace_all(
|
|
109
118
|
right.obj,
|
|
110
119
|
plc.Column.from_scalar(
|
|
111
|
-
plc.Scalar.from_py(
|
|
120
|
+
plc.Scalar.from_py(
|
|
121
|
+
0, dtype=self.dtype.plc_type, stream=df.stream
|
|
122
|
+
),
|
|
123
|
+
1,
|
|
124
|
+
stream=df.stream,
|
|
112
125
|
),
|
|
113
126
|
plc.Column.from_scalar(
|
|
114
|
-
plc.Scalar.from_py(
|
|
127
|
+
plc.Scalar.from_py(
|
|
128
|
+
None, dtype=self.dtype.plc_type, stream=df.stream
|
|
129
|
+
),
|
|
130
|
+
1,
|
|
131
|
+
stream=df.stream,
|
|
115
132
|
),
|
|
133
|
+
stream=df.stream,
|
|
116
134
|
)
|
|
117
135
|
return Column(
|
|
118
|
-
plc.binaryop.binary_operation(
|
|
136
|
+
plc.binaryop.binary_operation(
|
|
137
|
+
lop, rop, self.op, self.dtype.plc_type, stream=df.stream
|
|
138
|
+
),
|
|
119
139
|
dtype=self.dtype,
|
|
120
140
|
)
|
|
@@ -8,7 +8,9 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
from enum import IntEnum, auto
|
|
10
10
|
from functools import partial, reduce
|
|
11
|
-
from typing import TYPE_CHECKING, Any, ClassVar
|
|
11
|
+
from typing import TYPE_CHECKING, Any, ClassVar, cast
|
|
12
|
+
|
|
13
|
+
import polars as pl
|
|
12
14
|
|
|
13
15
|
import pylibcudf as plc
|
|
14
16
|
|
|
@@ -22,7 +24,9 @@ if TYPE_CHECKING:
|
|
|
22
24
|
from typing_extensions import Self
|
|
23
25
|
|
|
24
26
|
import polars.type_aliases as pl_types
|
|
25
|
-
from polars
|
|
27
|
+
from polars import polars # type: ignore[attr-defined]
|
|
28
|
+
|
|
29
|
+
from rmm.pylibrmm.stream import Stream
|
|
26
30
|
|
|
27
31
|
from cudf_polars.containers import DataFrame
|
|
28
32
|
|
|
@@ -53,7 +57,7 @@ class BooleanFunction(Expr):
|
|
|
53
57
|
Not = auto()
|
|
54
58
|
|
|
55
59
|
@classmethod
|
|
56
|
-
def from_polars(cls, obj:
|
|
60
|
+
def from_polars(cls, obj: polars._expr_nodes.BooleanFunction) -> Self:
|
|
57
61
|
"""Convert from polars' `BooleanFunction`."""
|
|
58
62
|
try:
|
|
59
63
|
function, name = str(obj).split(".", maxsplit=1)
|
|
@@ -101,6 +105,7 @@ class BooleanFunction(Expr):
|
|
|
101
105
|
keep: plc.stream_compaction.DuplicateKeepOption,
|
|
102
106
|
source_value: plc.Scalar,
|
|
103
107
|
target_value: plc.Scalar,
|
|
108
|
+
stream: Stream,
|
|
104
109
|
) -> Column:
|
|
105
110
|
table = plc.Table([column.obj])
|
|
106
111
|
indices = plc.stream_compaction.distinct_indices(
|
|
@@ -109,12 +114,20 @@ class BooleanFunction(Expr):
|
|
|
109
114
|
# TODO: polars doesn't expose options for these
|
|
110
115
|
plc.types.NullEquality.EQUAL,
|
|
111
116
|
plc.types.NanEquality.ALL_EQUAL,
|
|
117
|
+
stream=stream,
|
|
112
118
|
)
|
|
113
119
|
return Column(
|
|
114
120
|
plc.copying.scatter(
|
|
115
121
|
[source_value],
|
|
116
122
|
indices,
|
|
117
|
-
plc.Table(
|
|
123
|
+
plc.Table(
|
|
124
|
+
[
|
|
125
|
+
plc.Column.from_scalar(
|
|
126
|
+
target_value, table.num_rows(), stream=stream
|
|
127
|
+
)
|
|
128
|
+
]
|
|
129
|
+
),
|
|
130
|
+
stream=stream,
|
|
118
131
|
).columns()[0],
|
|
119
132
|
dtype=dtype,
|
|
120
133
|
)
|
|
@@ -153,31 +166,36 @@ class BooleanFunction(Expr):
|
|
|
153
166
|
):
|
|
154
167
|
# Avoid evaluating the child if the dtype tells us it's unnecessary.
|
|
155
168
|
(child,) = self.children
|
|
156
|
-
|
|
157
|
-
is_float =
|
|
169
|
+
values = child.evaluate(df, context=context)
|
|
170
|
+
is_float = values.obj.type().id() in (
|
|
158
171
|
plc.TypeId.FLOAT32,
|
|
159
172
|
plc.TypeId.FLOAT64,
|
|
160
173
|
)
|
|
161
174
|
is_finite = self.name is BooleanFunction.Name.IsFinite
|
|
162
175
|
if not is_float:
|
|
163
176
|
base = plc.Column.from_scalar(
|
|
164
|
-
plc.Scalar.from_py(py_val=is_finite
|
|
177
|
+
plc.Scalar.from_py(py_val=is_finite, stream=df.stream),
|
|
178
|
+
values.size,
|
|
179
|
+
stream=df.stream,
|
|
165
180
|
)
|
|
166
|
-
out = base.with_mask(
|
|
181
|
+
out = base.with_mask(values.obj.null_mask(), values.null_count)
|
|
167
182
|
return Column(out, dtype=self.dtype)
|
|
168
183
|
to_search = [-float("inf"), float("inf")]
|
|
169
184
|
if is_finite:
|
|
170
185
|
# NaN is neither finite not infinite
|
|
171
186
|
to_search.append(float("nan"))
|
|
172
|
-
|
|
187
|
+
nonfinite_values = plc.Column.from_iterable_of_py(
|
|
173
188
|
to_search,
|
|
174
|
-
dtype=
|
|
189
|
+
dtype=values.obj.type(),
|
|
190
|
+
stream=df.stream,
|
|
175
191
|
)
|
|
176
|
-
result = plc.search.contains(
|
|
192
|
+
result = plc.search.contains(nonfinite_values, values.obj, stream=df.stream)
|
|
177
193
|
if is_finite:
|
|
178
|
-
result = plc.unary.unary_operation(
|
|
194
|
+
result = plc.unary.unary_operation(
|
|
195
|
+
result, plc.unary.UnaryOperator.NOT, stream=df.stream
|
|
196
|
+
)
|
|
179
197
|
return Column(
|
|
180
|
-
result.with_mask(
|
|
198
|
+
result.with_mask(values.obj.null_mask(), values.null_count),
|
|
181
199
|
dtype=self.dtype,
|
|
182
200
|
)
|
|
183
201
|
columns = [child.evaluate(df, context=context) for child in self.children]
|
|
@@ -188,7 +206,9 @@ class BooleanFunction(Expr):
|
|
|
188
206
|
(column,) = columns
|
|
189
207
|
is_any = self.name is BooleanFunction.Name.Any
|
|
190
208
|
agg = plc.aggregation.any() if is_any else plc.aggregation.all()
|
|
191
|
-
|
|
209
|
+
scalar_result = plc.reduce.reduce(
|
|
210
|
+
column.obj, agg, self.dtype.plc_type, stream=df.stream
|
|
211
|
+
)
|
|
192
212
|
if not ignore_nulls and column.null_count > 0:
|
|
193
213
|
# Truth tables
|
|
194
214
|
# Any All
|
|
@@ -200,20 +220,28 @@ class BooleanFunction(Expr):
|
|
|
200
220
|
#
|
|
201
221
|
# If the input null count was non-zero, we must
|
|
202
222
|
# post-process the result to insert the correct value.
|
|
203
|
-
h_result =
|
|
223
|
+
h_result = scalar_result.to_py(stream=df.stream)
|
|
204
224
|
if (is_any and not h_result) or (not is_any and h_result):
|
|
205
225
|
# Any All
|
|
206
226
|
# False || Null => Null True && Null => Null
|
|
207
227
|
return Column(
|
|
208
|
-
plc.Column.all_null_like(column.obj, 1
|
|
228
|
+
plc.Column.all_null_like(column.obj, 1, stream=df.stream),
|
|
229
|
+
dtype=self.dtype,
|
|
209
230
|
)
|
|
210
|
-
return Column(
|
|
231
|
+
return Column(
|
|
232
|
+
plc.Column.from_scalar(scalar_result, 1, stream=df.stream),
|
|
233
|
+
dtype=self.dtype,
|
|
234
|
+
)
|
|
211
235
|
if self.name is BooleanFunction.Name.IsNull:
|
|
212
236
|
(column,) = columns
|
|
213
|
-
return Column(
|
|
237
|
+
return Column(
|
|
238
|
+
plc.unary.is_null(column.obj, stream=df.stream), dtype=self.dtype
|
|
239
|
+
)
|
|
214
240
|
elif self.name is BooleanFunction.Name.IsNotNull:
|
|
215
241
|
(column,) = columns
|
|
216
|
-
return Column(
|
|
242
|
+
return Column(
|
|
243
|
+
plc.unary.is_valid(column.obj, stream=df.stream), dtype=self.dtype
|
|
244
|
+
)
|
|
217
245
|
elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
|
|
218
246
|
(column,) = columns
|
|
219
247
|
is_float = column.obj.type().id() in (
|
|
@@ -230,9 +258,11 @@ class BooleanFunction(Expr):
|
|
|
230
258
|
else:
|
|
231
259
|
base = plc.Column.from_scalar(
|
|
232
260
|
plc.Scalar.from_py(
|
|
233
|
-
py_val=self.name is not BooleanFunction.Name.IsNan
|
|
261
|
+
py_val=self.name is not BooleanFunction.Name.IsNan,
|
|
262
|
+
stream=df.stream,
|
|
234
263
|
),
|
|
235
264
|
column.size,
|
|
265
|
+
stream=df.stream,
|
|
236
266
|
)
|
|
237
267
|
out = base.with_mask(column.obj.null_mask(), column.null_count)
|
|
238
268
|
return Column(out, dtype=self.dtype)
|
|
@@ -242,8 +272,13 @@ class BooleanFunction(Expr):
|
|
|
242
272
|
column,
|
|
243
273
|
dtype=self.dtype,
|
|
244
274
|
keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
|
|
245
|
-
source_value=plc.Scalar.from_py(
|
|
246
|
-
|
|
275
|
+
source_value=plc.Scalar.from_py(
|
|
276
|
+
py_val=True, dtype=self.dtype.plc_type, stream=df.stream
|
|
277
|
+
),
|
|
278
|
+
target_value=plc.Scalar.from_py(
|
|
279
|
+
py_val=False, dtype=self.dtype.plc_type, stream=df.stream
|
|
280
|
+
),
|
|
281
|
+
stream=df.stream,
|
|
247
282
|
)
|
|
248
283
|
elif self.name is BooleanFunction.Name.IsLastDistinct:
|
|
249
284
|
(column,) = columns
|
|
@@ -251,8 +286,15 @@ class BooleanFunction(Expr):
|
|
|
251
286
|
column,
|
|
252
287
|
dtype=self.dtype,
|
|
253
288
|
keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
|
|
254
|
-
source_value=plc.Scalar.from_py(
|
|
255
|
-
|
|
289
|
+
source_value=plc.Scalar.from_py(
|
|
290
|
+
py_val=True, dtype=self.dtype.plc_type, stream=df.stream
|
|
291
|
+
),
|
|
292
|
+
target_value=plc.Scalar.from_py(
|
|
293
|
+
py_val=False,
|
|
294
|
+
dtype=self.dtype.plc_type,
|
|
295
|
+
stream=df.stream,
|
|
296
|
+
),
|
|
297
|
+
stream=df.stream,
|
|
256
298
|
)
|
|
257
299
|
elif self.name is BooleanFunction.Name.IsUnique:
|
|
258
300
|
(column,) = columns
|
|
@@ -260,8 +302,13 @@ class BooleanFunction(Expr):
|
|
|
260
302
|
column,
|
|
261
303
|
dtype=self.dtype,
|
|
262
304
|
keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
|
|
263
|
-
source_value=plc.Scalar.from_py(
|
|
264
|
-
|
|
305
|
+
source_value=plc.Scalar.from_py(
|
|
306
|
+
py_val=True, dtype=self.dtype.plc_type, stream=df.stream
|
|
307
|
+
),
|
|
308
|
+
target_value=plc.Scalar.from_py(
|
|
309
|
+
py_val=False, dtype=self.dtype.plc_type, stream=df.stream
|
|
310
|
+
),
|
|
311
|
+
stream=df.stream,
|
|
265
312
|
)
|
|
266
313
|
elif self.name is BooleanFunction.Name.IsDuplicated:
|
|
267
314
|
(column,) = columns
|
|
@@ -269,8 +316,13 @@ class BooleanFunction(Expr):
|
|
|
269
316
|
column,
|
|
270
317
|
dtype=self.dtype,
|
|
271
318
|
keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
|
|
272
|
-
source_value=plc.Scalar.from_py(
|
|
273
|
-
|
|
319
|
+
source_value=plc.Scalar.from_py(
|
|
320
|
+
py_val=False, dtype=self.dtype.plc_type, stream=df.stream
|
|
321
|
+
),
|
|
322
|
+
target_value=plc.Scalar.from_py(
|
|
323
|
+
py_val=True, dtype=self.dtype.plc_type, stream=df.stream
|
|
324
|
+
),
|
|
325
|
+
stream=df.stream,
|
|
274
326
|
)
|
|
275
327
|
elif self.name is BooleanFunction.Name.AllHorizontal:
|
|
276
328
|
return Column(
|
|
@@ -278,7 +330,7 @@ class BooleanFunction(Expr):
|
|
|
278
330
|
partial(
|
|
279
331
|
plc.binaryop.binary_operation,
|
|
280
332
|
op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
|
|
281
|
-
output_type=self.dtype.
|
|
333
|
+
output_type=self.dtype.plc_type,
|
|
282
334
|
),
|
|
283
335
|
(c.obj for c in columns),
|
|
284
336
|
),
|
|
@@ -290,7 +342,7 @@ class BooleanFunction(Expr):
|
|
|
290
342
|
partial(
|
|
291
343
|
plc.binaryop.binary_operation,
|
|
292
344
|
op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
|
|
293
|
-
output_type=self.dtype.
|
|
345
|
+
output_type=self.dtype.plc_type,
|
|
294
346
|
),
|
|
295
347
|
(c.obj for c in columns),
|
|
296
348
|
),
|
|
@@ -300,24 +352,45 @@ class BooleanFunction(Expr):
|
|
|
300
352
|
needles, haystack = columns
|
|
301
353
|
if haystack.obj.type().id() == plc.TypeId.LIST:
|
|
302
354
|
# Unwrap values from the list column
|
|
303
|
-
#
|
|
304
|
-
# which always has an inner attribute.
|
|
355
|
+
# .inner returns DataTypeClass | DataType, need to cast to DataType
|
|
305
356
|
haystack = Column(
|
|
306
357
|
haystack.obj.children()[1],
|
|
307
|
-
dtype=DataType(
|
|
308
|
-
|
|
358
|
+
dtype=DataType(
|
|
359
|
+
cast(
|
|
360
|
+
pl.DataType, cast(pl.List, haystack.dtype.polars_type).inner
|
|
361
|
+
)
|
|
362
|
+
),
|
|
363
|
+
).astype(needles.dtype, stream=df.stream)
|
|
309
364
|
if haystack.size:
|
|
310
365
|
return Column(
|
|
311
|
-
plc.search.contains(
|
|
366
|
+
plc.search.contains(
|
|
367
|
+
haystack.obj,
|
|
368
|
+
needles.obj,
|
|
369
|
+
stream=df.stream,
|
|
370
|
+
),
|
|
371
|
+
dtype=self.dtype,
|
|
312
372
|
)
|
|
313
373
|
return Column(
|
|
314
|
-
plc.Column.from_scalar(
|
|
374
|
+
plc.Column.from_scalar(
|
|
375
|
+
plc.Scalar.from_py(py_val=False, stream=df.stream),
|
|
376
|
+
needles.size,
|
|
377
|
+
stream=df.stream,
|
|
378
|
+
),
|
|
315
379
|
dtype=self.dtype,
|
|
316
380
|
)
|
|
317
381
|
elif self.name is BooleanFunction.Name.Not:
|
|
318
382
|
(column,) = columns
|
|
383
|
+
# Polars semantics:
|
|
384
|
+
# integer input: NOT => bitwise invert.
|
|
385
|
+
# boolean input: NOT => logical NOT.
|
|
319
386
|
return Column(
|
|
320
|
-
plc.unary.unary_operation(
|
|
387
|
+
plc.unary.unary_operation(
|
|
388
|
+
column.obj,
|
|
389
|
+
plc.unary.UnaryOperator.NOT
|
|
390
|
+
if column.obj.type().id() == plc.TypeId.BOOL8
|
|
391
|
+
else plc.unary.UnaryOperator.BIT_INVERT,
|
|
392
|
+
stream=df.stream,
|
|
393
|
+
),
|
|
321
394
|
dtype=self.dtype,
|
|
322
395
|
)
|
|
323
396
|
else:
|