cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
--- cudf_polars/dsl/expressions/aggregation.py (25.2.2)
+++ cudf_polars/dsl/expressions/aggregation.py (25.6.0)
@@ -9,22 +9,13 @@ from __future__ import annotations
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import (
-    AggInfo,
-    ExecutionContext,
-    Expr,
-)
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
-from cudf_polars.dsl.expressions.unary import UnaryFunction
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Agg"]
@@ -75,11 +66,15 @@ class Agg(Expr):
                 else plc.types.NullPolicy.INCLUDE
             )
         elif name == "quantile":
-            _, quantile = self.children
+            child, quantile = self.children
             if not isinstance(quantile, Literal):
                 raise NotImplementedError("Only support literal quantile values")
+            if options == "equiprobable":
+                raise NotImplementedError("Quantile with equiprobable interpolation")
+            if plc.traits.is_duration(child.dtype):
+                raise NotImplementedError("Quantile with duration data type")
             req = plc.aggregation.quantile(
-                quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+                quantiles=[quantile.value], interp=Agg.interp_mapping[options]
             )
         else:
             raise NotImplementedError(
@@ -91,7 +86,9 @@ class Agg(Expr):
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
-        elif name in {"count", "sum", "first", "last"}:
+        elif name == "count":
+            op = partial(op, include_nulls=options)
+        elif name in {"sum", "first", "last"}:
             pass
         else:
             raise NotImplementedError(
@@ -124,38 +121,19 @@ class Agg(Expr):
         "linear": plc.types.Interpolation.LINEAR,
     }
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError(
-                "Nested aggregations in groupby"
-            )  # pragma: no cover; check_agg trips first
-        if (isminmax := self.name in {"min", "max"}) and self.options:
-            raise NotImplementedError("Nan propagation in groupby for min/max")
-        (child,) = self.children
-        ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
-        request = self.request
-        # These are handled specially here because we don't set up the
-        # request for the whole-frame agg because we can avoid a
-        # reduce for these.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
         if self.name == "first":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 0, null_handling=plc.types.NullPolicy.INCLUDE
             )
         elif self.name == "last":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 -1, null_handling=plc.types.NullPolicy.INCLUDE
             )
-        if request is None:
-            raise NotImplementedError(
-                f"Aggregation {self.name} in groupby"
-            )  # pragma: no cover; __init__ trips first
-        if isminmax and plc.traits.is_floating_point(self.dtype):
-            assert expr is not None
-            # Ignore nans in these groupby aggs, do this by masking
-            # nans in the input
-            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
-        return AggInfo([(expr, request, self)])
+        else:
+            assert self.request is not None, "Init should have raised"
+            return self.request
 
     def _reduce(
         self, column: Column, *, request: plc.aggregation.Aggregation
@@ -167,26 +145,20 @@ class Agg(Expr):
             )
         )
 
-    def _count(self, column: Column) -> Column:
+    def _count(self, column: Column, *, include_nulls: bool) -> Column:
+        null_count = column.null_count if not include_nulls else 0
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(
-                        column.obj.size() - column.obj.null_count(),
-                        type=plc.interop.to_arrow(self.dtype),
-                    ),
-                ),
+                plc.Scalar.from_py(column.size - null_count, self.dtype),
                 1,
             )
         )
 
     def _sum(self, column: Column) -> Column:
-        if column.obj.size() == 0 or column.obj.null_count() == column.obj.size():
+        if column.size == 0 or column.null_count == column.size:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(0, type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(0, self.dtype),
                     1,
                 )
             )
@@ -196,9 +168,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -210,9 +180,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -224,15 +192,11 @@ class Agg(Expr):
         return Column(plc.copying.slice(column.obj, [0, 1])[0])
 
     def _last(self, column: Column) -> Column:
-        n = column.obj.size()
+        n = column.size
         return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
@@ -243,4 +207,4 @@ class Agg(Expr):
         # Aggregations like quantiles may have additional children that were
         # preprocessed into pylibcudf requests.
         child = self.children[0]
-        return self.op(child.evaluate(df, context=context, mapping=mapping))
+        return self.op(child.evaluate(df, context=context))
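The dominant pattern in this file is the removal of the pyarrow round-trip when building device scalars. A minimal before/after sketch, using only the calls that appear in the hunks above (the INT64 dtype is an illustrative choice, not from the diff):

    # Scalar construction, 25.2.2 vs 25.6.0.
    import pylibcudf as plc

    dtype = plc.DataType(plc.TypeId.INT64)

    # 25.2.2: build a host-side pyarrow scalar, then convert it.
    # import pyarrow as pa
    # scalar = plc.interop.from_arrow(
    #     pa.scalar(0, type=plc.interop.to_arrow(dtype))
    # )

    # 25.6.0: construct the scalar directly from a Python value.
    scalar = plc.Scalar.from_py(0, dtype)

    # Either form can then back a length-1 broadcast column.
    column = plc.Column.from_scalar(scalar, 1)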
--- cudf_polars/dsl/expressions/base.py (25.2.2)
+++ cudf_polars/dsl/expressions/base.py (25.6.0)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -16,7 +16,7 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.nodebase import Node
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from typing_extensions import Self
 
     from cudf_polars.containers import Column, DataFrame
 
@@ -46,11 +46,7 @@ class Expr(Node["Expr"]):
     """Names of non-child data (not Exprs) for reconstruction."""
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -61,15 +57,10 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
-        Do not call this function directly, but rather
-        :meth:`evaluate` which handles the mapping lookups.
+        Do not call this function directly, but rather :meth:`evaluate`.
 
         Returns
         -------
@@ -87,11 +78,7 @@ class Expr(Node["Expr"]):
         )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -102,10 +89,6 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
@@ -124,37 +107,28 @@ class Expr(Node["Expr"]):
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        if mapping is None:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-        try:
-            return mapping[self]
-        except KeyError:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """
-        Collect information about aggregations in groupbys.
+        return self.do_evaluate(df, context=context)
 
-        Parameters
-        ----------
-        depth
-            The depth of aggregating (`agg` or `implode`)
-            expressions we are currently at.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:
+        """
+        The aggregation for this expression in a grouped aggregation.
 
         Returns
         -------
-        Aggregation info describing the expression to aggregate in the
-        groupby.
+        Aggregation request. Default is to collect the expression.
+
+        Notes
+        -----
+        This presumes that the IR translation has decomposed groupby
+        reductions only into cases we can handle.
 
         Raises
         ------
         NotImplementedError
-            If we can't currently perform the aggregation request, for
-            example nested aggregations like ``a.max().min()``.
+            If requesting an aggregation from an unexpected expression.
         """
-        raise NotImplementedError(
-            f"Collecting aggregation info for {type(self).__name__}"
-        )  # pragma: no cover; check_agg trips first
+        return plc.aggregation.collect_list()
 
 
 class ErrorExpr(Expr):
@@ -166,7 +140,7 @@ class ErrorExpr(Expr):
         self.dtype = dtype
         self.error = error
         self.children = ()
-        self.is_pointwise = True
+        self.is_pointwise = False
 
 
 class NamedExpr:
@@ -202,11 +176,7 @@ class NamedExpr:
         return not self.__eq__(other)
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -217,8 +187,6 @@ class NamedExpr:
             DataFrame providing context
         context
             Execution context
-        mapping
-            Substitution mapping
 
         Returns
         -------
@@ -229,13 +197,25 @@ class NamedExpr:
         :meth:`Expr.evaluate` for details, this function just adds the
         name to a column produced from an expression.
         """
-        return self.value.evaluate(df, context=context, mapping=mapping).rename(
-            self.name
-        )
+        return self.value.evaluate(df, context=context).rename(self.name)
+
+    def reconstruct(self, expr: Expr) -> Self:
+        """
+        Rebuild with a new `Expr` value.
+
+        Parameters
+        ----------
+        expr
+            New `Expr` value
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return self.value.collect_agg(depth=depth)
+        Returns
+        -------
+        New `NamedExpr` with `expr` as the underlying expression.
+        The name of the original `NamedExpr` is preserved.
+        """
+        if expr is self.value:
+            return self
+        return type(self)(self.name, expr)
 
 
 class Col(Expr):
@@ -250,21 +230,13 @@ class Col(Expr):
         self.children = ()
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # Deliberately remove the name here so that we guarantee
         # evaluation of the IR produces names.
         return df.column_map[self.name].rename(None)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([(self, plc.aggregation.collect_list(), self)])
-
 
 class ColRef(Expr):
     __slots__ = ("index", "table_ref")
@@ -288,11 +260,7 @@ class ColRef(Expr):
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         raise NotImplementedError(
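Here the per-expression `collect_agg` walkers give way to an `agg_request` property, and `NamedExpr` gains `reconstruct` for rebuilding nodes during translation. A small sketch of the `reconstruct` contract as defined above; the `Col(dtype, name)` constructor signature is an assumption, not shown in this diff:

    import pylibcudf as plc
    from cudf_polars.dsl.expressions.base import Col, NamedExpr

    dtype = plc.DataType(plc.TypeId.INT64)
    named = NamedExpr("a", Col(dtype, "a"))  # Col(dtype, name) assumed

    # Rebuilding with the identical Expr is a no-op that returns self...
    assert named.reconstruct(named.value) is named

    # ...while a different Expr yields a new NamedExpr keeping the old name.
    rebuilt = named.reconstruct(Col(dtype, "b"))
    assert rebuilt.name == "a"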
--- cudf_polars/dsl/expressions/binaryop.py (25.2.2)
+++ cudf_polars/dsl/expressions/binaryop.py (25.6.0)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -13,11 +13,9 @@ from polars.polars import _expr_nodes as pl_expr
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["BinOp"]
@@ -85,20 +83,13 @@ class BinOp(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        left, right = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        left, right = (child.evaluate(df, context=context) for child in self.children)
         lop = left.obj
         rop = right.obj
-        if left.obj.size() != right.obj.size():
+        if left.size != right.size:
             if left.is_scalar:
                 lop = left.obj_scalar
             elif right.is_scalar:
@@ -106,30 +97,3 @@ class BinOp(Expr):
         return Column(
             plc.binaryop.binary_operation(lop, rop, self.op, self.dtype),
         )
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate,
-            # groupby construction has checked that we don't have
-            # nested aggs, so stop the recursion and return ourselves
-            # for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            left_info, right_info = (
-                child.collect_agg(depth=depth) for child in self.children
-            )
-            requests = [*left_info.requests, *right_info.requests]
-            # TODO: Hack, if there were no reductions inside this
-            # binary expression then we want to pre-evaluate and
-            # collect ourselves. Otherwise we want to collect the
-            # aggregations inside and post-evaluate. This is a bad way
-            # of checking that we are in case 1.
-            if all(
-                agg.kind() == plc.aggregation.Kind.COLLECT_LIST
-                for _, agg, _ in requests
-            ):
-                return AggInfo([(self, plc.aggregation.collect_list(), self)])
-            return AggInfo(
-                [*left_info.requests, *right_info.requests],
-            )
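Alongside the deleted `collect_agg` recursion (per the file list, groupby decomposition helpers land in the new cudf_polars/dsl/utils/aggregations.py), the hunks above also swap raw pylibcudf calls such as `column.obj.size()` for container properties. A sketch of that property shift, assuming the 25.6 `Column` container accepts a bare `plc.Column` as the hunks show:

    import pylibcudf as plc
    from cudf_polars.containers import Column

    dtype = plc.DataType(plc.TypeId.FLOAT64)
    col = Column(plc.Column.from_scalar(plc.Scalar.from_py(1.5, dtype), 4))

    # The container now exposes what 25.2.2 fetched from the raw column.
    assert col.size == col.obj.size()
    assert col.null_count == col.obj.null_count()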
--- cudf_polars/dsl/expressions/boolean.py (25.2.2)
+++ cudf_polars/dsl/expressions/boolean.py (25.6.0)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -10,8 +10,6 @@ from enum import IntEnum, auto
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -19,10 +17,9 @@ from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
     Expr,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     import polars.type_aliases as pl_types
@@ -89,9 +86,11 @@ class BooleanFunction(Expr):
             BooleanFunction.Name.IsLastDistinct,
             BooleanFunction.Name.IsUnique,
         )
-        if self.name is BooleanFunction.Name.IsIn and not all(
-            c.dtype == self.children[0].dtype for c in self.children
-        ):
+        if (
+            POLARS_VERSION_LT_128
+            and self.name is BooleanFunction.Name.IsIn
+            and not all(c.dtype == self.children[0].dtype for c in self.children)
+        ):  # pragma: no cover
             # TODO: If polars IR doesn't put the casts in, we need to
             # mimic the supertype promotion rules.
             raise NotImplementedError("IsIn doesn't support supertype casting")
@@ -145,11 +144,7 @@ class BooleanFunction(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name in (
@@ -160,29 +155,22 @@ class BooleanFunction(Expr):
             (child,) = self.children
             is_finite = self.name is BooleanFunction.Name.IsFinite
             if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-                value = plc.interop.from_arrow(
-                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
-                )
+                value = plc.Scalar.from_py(is_finite)
                 return Column(plc.Column.from_scalar(value, df.num_rows))
-            needles = child.evaluate(df, context=context, mapping=mapping)
+            needles = child.evaluate(df, context=context)
            to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
                 to_search.append(float("nan"))
-            haystack = plc.interop.from_arrow(
-                pa.array(
-                    to_search,
-                    type=plc.interop.to_arrow(needles.obj.type()),
-                )
+            haystack = plc.Column.from_iterable_of_py(
+                to_search,
+                dtype=needles.obj.type(),
             )
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
             return Column(result)
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
         if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
@@ -191,7 +179,7 @@ class BooleanFunction(Expr):
             is_any = self.name is BooleanFunction.Name.Any
             agg = plc.aggregation.any() if is_any else plc.aggregation.all()
             result = plc.reduce.reduce(column.obj, agg, self.dtype)
-            if not ignore_nulls and column.obj.null_count() > 0:
+            if not ignore_nulls and column.null_count > 0:
                 # Truth tables
                 #   Any         All
                 #   | F U T   | F U T
@@ -218,14 +206,14 @@ class BooleanFunction(Expr):
             (column,) = columns
             return Column(
                 plc.unary.is_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsNotNan:
             (column,) = columns
             return Column(
                 plc.unary.is_not_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsFirstDistinct:
@@ -233,48 +221,32 @@ class BooleanFunction(Expr):
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
            )
         elif self.name is BooleanFunction.Name.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.AllHorizontal:
             return Column(