cudf-polars-cu12 25.4.0__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +35 -50
- cudf_polars/containers/column.py +38 -0
- cudf_polars/containers/dataframe.py +11 -16
- cudf_polars/dsl/expressions/aggregation.py +25 -61
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +3 -39
- cudf_polars/dsl/expressions/boolean.py +21 -49
- cudf_polars/dsl/expressions/datetime.py +59 -17
- cudf_polars/dsl/expressions/literal.py +24 -24
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +6 -24
- cudf_polars/dsl/expressions/slicing.py +2 -8
- cudf_polars/dsl/expressions/sorting.py +4 -17
- cudf_polars/dsl/expressions/string.py +29 -32
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +32 -73
- cudf_polars/dsl/ir.py +575 -167
- cudf_polars/dsl/nodebase.py +1 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +272 -152
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +0 -8
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +109 -167
- cudf_polars/experimental/io.py +53 -26
- cudf_polars/experimental/join.py +59 -24
- cudf_polars/experimental/parallel.py +155 -133
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +109 -9
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +55 -42
- cudf_polars/typing/__init__.py +27 -5
- cudf_polars/utils/config.py +317 -102
- cudf_polars/utils/dtypes.py +8 -1
- cudf_polars/utils/timer.py +1 -1
- cudf_polars/utils/versions.py +4 -4
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +7 -5
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -73
- cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/expressions/binaryop.py

@@ -13,11 +13,9 @@ from polars.polars import _expr_nodes as pl_expr
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["BinOp"]
@@ -85,17 +83,10 @@ class BinOp(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        left, right = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        left, right = (child.evaluate(df, context=context) for child in self.children)
         lop = left.obj
         rop = right.obj
         if left.size != right.size:
@@ -106,30 +97,3 @@ class BinOp(Expr):
         return Column(
             plc.binaryop.binary_operation(lop, rop, self.op, self.dtype),
         )
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate,
-            # groupby construction has checked that we don't have
-            # nested aggs, so stop the recursion and return ourselves
-            # for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            left_info, right_info = (
-                child.collect_agg(depth=depth) for child in self.children
-            )
-            requests = [*left_info.requests, *right_info.requests]
-            # TODO: Hack, if there were no reductions inside this
-            # binary expression then we want to pre-evaluate and
-            # collect ourselves. Otherwise we want to collect the
-            # aggregations inside and post-evaluate. This is a bad way
-            # of checking that we are in case 1.
-            if all(
-                agg.kind() == plc.aggregation.Kind.COLLECT_LIST
-                for _, agg, _ in requests
-            ):
-                return AggInfo([(self, plc.aggregation.collect_list(), self)])
-            return AggInfo(
-                [*left_info.requests, *right_info.requests],
-            )
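Note: the rewritten do_evaluate above drops the mapping parameter but keeps the same pylibcudf call pattern. A minimal standalone sketch of that pattern, assuming pylibcudf 25.6 as shipped with this wheel (the column values and dtype here are illustrative, not from the source):

```python
import pylibcudf as plc

int64 = plc.DataType(plc.TypeId.INT64)
# Build two small device columns from Python values; from_iterable_of_py
# is the same constructor used elsewhere in this diff.
lop = plc.Column.from_iterable_of_py([1, 2, 3], dtype=int64)
rop = plc.Column.from_iterable_of_py([10, 20, 30], dtype=int64)
# The core call BinOp.do_evaluate makes: operator and output dtype are explicit.
result = plc.binaryop.binary_operation(
    lop, rop, plc.binaryop.BinaryOperator.ADD, int64
)
```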
cudf_polars/dsl/expressions/boolean.py

@@ -10,8 +10,6 @@ from enum import IntEnum, auto
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -19,10 +17,9 @@ from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
     Expr,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     import polars.type_aliases as pl_types
@@ -89,9 +86,11 @@ class BooleanFunction(Expr):
            BooleanFunction.Name.IsLastDistinct,
            BooleanFunction.Name.IsUnique,
        )
-        if self.name is BooleanFunction.Name.IsIn and not all(
-            c.dtype == self.children[0].dtype for c in self.children
-        ):
+        if (
+            POLARS_VERSION_LT_128
+            and self.name is BooleanFunction.Name.IsIn
+            and not all(c.dtype == self.children[0].dtype for c in self.children)
+        ):  # pragma: no cover
             # TODO: If polars IR doesn't put the casts in, we need to
             # mimic the supertype promotion rules.
             raise NotImplementedError("IsIn doesn't support supertype casting")
@@ -145,11 +144,7 @@ class BooleanFunction(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name in (
@@ -160,29 +155,22 @@ class BooleanFunction(Expr):
             (child,) = self.children
             is_finite = self.name is BooleanFunction.Name.IsFinite
             if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-                value = plc.interop.from_arrow(
-                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
-                )
+                value = plc.Scalar.from_py(is_finite)
                 return Column(plc.Column.from_scalar(value, df.num_rows))
-            needles = child.evaluate(df, context=context, mapping=mapping)
+            needles = child.evaluate(df, context=context)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
                 to_search.append(float("nan"))
-            haystack = plc.interop.from_arrow(
-                pa.array(
-                    to_search,
-                    type=plc.interop.to_arrow(needles.obj.type()),
-                )
+            haystack = plc.Column.from_iterable_of_py(
+                to_search,
+                dtype=needles.obj.type(),
             )
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
             return Column(result)
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
         if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
@@ -233,48 +221,32 @@ class BooleanFunction(Expr):
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
            )
         elif self.name is BooleanFunction.Name.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.AllHorizontal:
             return Column(
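Note: the recurring change in this file (and in datetime.py, literal.py, and selection.py below) replaces pyarrow-mediated scalar construction with pylibcudf's direct constructor. A side-by-side sketch of the two styles, assuming pylibcudf 25.6:

```python
import pyarrow as pa
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.BOOL8)
# 25.4 style (removed above): round-trip the value through pyarrow.
old = plc.interop.from_arrow(
    pa.scalar(value=True, type=plc.interop.to_arrow(dtype))
)
# 25.6 style: build the device scalar directly from a Python value.
new = plc.Scalar.from_py(py_val=True, dtype=dtype)
```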
cudf_polars/dsl/expressions/datetime.py

@@ -17,8 +17,6 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
@@ -108,8 +106,12 @@ class TemporalFunction(Expr):
         *_COMPONENT_MAP.keys(),
         Name.IsLeapYear,
         Name.OrdinalDay,
+        Name.ToString,
+        Name.Week,
+        Name.IsoYear,
         Name.MonthStart,
         Name.MonthEnd,
+        Name.CastTimeUnit,
     }
 
     def __init__(
@@ -127,26 +129,66 @@ class TemporalFunction(Expr):
         if self.name not in self._valid_ops:
             raise NotImplementedError(f"Temporal function {self.name}")
 
+        if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
+            self.children[0].dtype
+        ):
+            raise NotImplementedError("ToString is not supported on duration types")
+
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         (column,) = columns
+        if self.name is TemporalFunction.Name.CastTimeUnit:
+            (unit,) = self.options
+            if plc.traits.is_timestamp(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.timestamp(unit))
+            elif plc.traits.is_duration(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.duration(unit))
+            result = plc.unary.cast(column.obj, dtype)
+            return Column(result)
+        if self.name == TemporalFunction.Name.ToString:
+            return Column(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    self.options[0],
+                    plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                )
+            )
+        if self.name is TemporalFunction.Name.Week:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%V",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT8),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.IsoYear:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%G",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(result)
         if self.name is TemporalFunction.Name.MonthStart:
             ends = plc.datetime.last_day_of_month(column.obj)
             days_to_subtract = plc.datetime.days_in_month(column.obj)
             # must subtract 1 to avoid rolling over to the previous month
             days_to_subtract = plc.binaryop.binary_operation(
                 days_to_subtract,
-                plc.interop.from_arrow(pa.scalar(1, type=pa.int32())),
+                plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.SUB,
                 plc.DataType(plc.TypeId.DURATION_DAYS),
             )
@@ -179,7 +221,7 @@ class TemporalFunction(Expr):
         )
         millis_as_micros = plc.binaryop.binary_operation(
             millis,
-            plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+            plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
             plc.binaryop.BinaryOperator.MUL,
             plc.DataType(plc.TypeId.INT32),
         )
@@ -202,15 +244,15 @@ class TemporalFunction(Expr):
         )
         millis_as_nanos = plc.binaryop.binary_operation(
             millis,
-            plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())),
+            plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
             plc.binaryop.BinaryOperator.MUL,
-            plc.types.DataType(plc.types.TypeId.INT32),
+            plc.DataType(plc.TypeId.INT32),
         )
         micros_as_nanos = plc.binaryop.binary_operation(
             micros,
-            plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+            plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
             plc.binaryop.BinaryOperator.MUL,
-            plc.types.DataType(plc.types.TypeId.INT32),
+            plc.DataType(plc.TypeId.INT32),
         )
         total_nanos = plc.binaryop.binary_operation(
             nanos,
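Note: the new Week and IsoYear branches above work by formatting timestamps as strings ("%V" and "%G") and parsing the strings back to integers. The semantics of those format codes, shown with the Python standard library:

```python
from datetime import date

d = date(2021, 1, 1)     # a Friday; d.isocalendar() == (2020, 53, 5)
print(d.strftime("%V"))  # "53": ISO week number, what Name.Week yields (as INT8)
print(d.strftime("%G"))  # "2020": ISO year, what Name.IsoYear yields (as INT32)
```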
cudf_polars/dsl/expressions/literal.py

@@ -6,15 +6,15 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, NoReturn
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Hashable, Mapping
+    from collections.abc import Hashable
 
     import pyarrow as pa
 
@@ -26,29 +26,31 @@ __all__ = ["Literal", "LiteralColumn"]
 class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Scalar[Any]
+    value: Any  # Python scalar
 
-    def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
+    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+        if value is None and dtype.id() == plc.TypeId.EMPTY:
+            # TypeId.EMPTY not supported by libcudf
+            # cuDF Python also maps EMPTY to INT8
+            dtype = plc.DataType(plc.TypeId.INT8)
         self.dtype = dtype
-        assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # datatype of pyarrow scalar is correct by construction.
-        return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
+        return Column(
+            plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype), 1)
+        )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
 
 
 class LiteralColumn(Expr):
@@ -70,16 +72,14 @@ class LiteralColumn(Expr):
         return (type(self), self.dtype, id(self.value))
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
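Note: Literal now stores a plain Python value and materializes it on demand instead of holding a pyarrow scalar. A one-line sketch of the new evaluation path, assuming pylibcudf 25.6 (the value 42 is illustrative):

```python
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.INT64)
# Broadcast a Python scalar into a 1-row device column,
# exactly as Literal.do_evaluate does above.
col = plc.Column.from_scalar(plc.Scalar.from_py(42, dtype), 1)
```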
cudf_polars/dsl/expressions/rolling.py

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -8,24 +8,125 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.expressions.base import Expr
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.utils.windows import range_window_bounds
 
 if TYPE_CHECKING:
-    import pylibcudf as plc
+    import pyarrow as pa
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.typing import ClosedInterval
+
+__all__ = ["GroupedRollingWindow", "RollingWindow", "to_request"]
+
+
+def to_request(
+    value: expr.Expr, orderby: Column, df: DataFrame
+) -> plc.rolling.RollingRequest:
+    """
+    Produce a rolling request for evaluation with pylibcudf.
 
-__all__ = ["GroupedRollingWindow", "RollingWindow"]
+    Parameters
+    ----------
+    value
+        The expression to perform the rolling aggregation on.
+    orderby
+        Orderby column, used as input to the request when the aggregation is Len.
+    df
+        DataFrame used to evaluate the inputs to the aggregation.
+    """
+    min_periods = 1
+    if isinstance(value, expr.Len):
+        # A count aggregation, we need a column so use the orderby column
+        col = orderby
+    elif isinstance(value, expr.Agg):
+        child = value.children[0]
+        col = child.evaluate(df, context=ExecutionContext.ROLLING)
+        if value.name == "var":
+            # Polars variance produces null if nvalues <= ddof
+            # libcudf produces NaN. However, we can get the polars
+            # behaviour by setting the minimum window size to ddof +
+            # 1.
+            min_periods = value.options + 1
+    else:
+        col = value.evaluate(
+            df, context=ExecutionContext.ROLLING
+        )  # pragma: no cover; raise before we get here because we
+        # don't do correct handling of empty groups
+    return plc.rolling.RollingRequest(col.obj, min_periods, value.agg_request)
 
 
 class RollingWindow(Expr):
-    __slots__ = ("options",)
-    _non_child = ("dtype", "options")
+    __slots__ = ("closed_window", "following", "orderby", "preceding")
+    _non_child = ("dtype", "preceding", "following", "closed_window", "orderby")
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        preceding: pa.Scalar,
+        following: pa.Scalar,
+        closed_window: ClosedInterval,
+        orderby: str,
+        agg: Expr,
+    ) -> None:
         self.dtype = dtype
-        self.options = options
+        self.preceding = preceding
+        self.following = following
+        self.closed_window = closed_window
+        self.orderby = orderby
         self.children = (agg,)
         self.is_pointwise = False
-        raise NotImplementedError("rolling window/groupby")
+        if agg.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST:
+            raise NotImplementedError(
+                "Incorrect handling of empty groups for list collection"
+            )
+        if not plc.rolling.is_valid_rolling_aggregation(agg.dtype, agg.agg_request):
+            raise NotImplementedError(f"Unsupported rolling aggregation {agg}")
+
+    def do_evaluate(  # noqa: D102
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        if context != ExecutionContext.FRAME:
+            raise RuntimeError(
+                "Rolling aggregation inside groupby/over/rolling"
+            )  # pragma: no cover; translation raises first
+        (agg,) = self.children
+        orderby = df.column_map[self.orderby]
+        # Polars casts integral orderby to int64, but only for calculating window bounds
+        if (
+            plc.traits.is_integral(orderby.obj.type())
+            and orderby.obj.type().id() != plc.TypeId.INT64
+        ):
+            orderby_obj = plc.unary.cast(orderby.obj, plc.DataType(plc.TypeId.INT64))
+        else:
+            orderby_obj = orderby.obj
+        preceding, following = range_window_bounds(
+            self.preceding, self.following, self.closed_window
+        )
+        if orderby.obj.null_count() != 0:
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling may not contain nulls"
+            )
+        if not orderby.check_sorted(
+            order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE
+        ):
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling is not sorted, please sort first"
+            )
+        (result,) = plc.rolling.grouped_range_rolling_window(
+            plc.Table([]),
+            orderby_obj,
+            plc.types.Order.ASCENDING,
+            plc.types.NullOrder.BEFORE,
+            preceding,
+            following,
+            [to_request(agg, orderby, df)],
+        ).columns()
+        return Column(result)
 
 
 class GroupedRollingWindow(Expr):
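Note: the min_periods = ddof + 1 workaround in to_request encodes a real behavioural gap between the two engines. A sketch of the polars side of it (assumes polars is installed; per the comment above, a window with nvalues <= ddof yields null in polars, where libcudf would produce NaN):

```python
import polars as pl

s = pl.Series("x", [1.0, 2.0, 3.0])
# Each window holds a single value, so nvalues (1) <= ddof (1):
# polars yields null for every window.
print(s.rolling_var(window_size=1, ddof=1).to_list())  # [None, None, None]
```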
cudf_polars/dsl/expressions/selection.py

@@ -8,16 +8,12 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Filter", "Gather"]
@@ -33,16 +29,11 @@ class Gather(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, indices = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         lo, hi = plc.reduce.minmax(indices.obj)
         lo = plc.interop.to_arrow(lo).as_py()
@@ -54,9 +45,7 @@ class Gather(Expr):
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
-                ),
+                plc.Scalar.from_py(n, dtype=indices.obj.type()),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -72,20 +61,13 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
-        self.is_pointwise = True
+        self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values, mask = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        values, mask = (child.evaluate(df, context=context) for child in self.children)
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
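Note: Gather above first replaces null indices with the out-of-range sentinel n, so that the NULLIFY bounds policy turns them (along with any genuinely out-of-range index) into nulls in the output. A standalone sketch of that gather pattern, assuming pylibcudf 25.6 (the values are illustrative):

```python
import pylibcudf as plc

values = plc.Column.from_iterable_of_py(
    [10, 20, 30], dtype=plc.DataType(plc.TypeId.INT64)
)
indices = plc.Column.from_iterable_of_py(
    [0, 2, 3], dtype=plc.DataType(plc.TypeId.INT32)
)
# Index 3 is out of bounds for a 3-row column; NULLIFY maps it to null
# instead of raising or reading garbage.
(out,) = plc.copying.gather(
    plc.Table([values]), indices, plc.copying.OutOfBoundsPolicy.NULLIFY
).columns()
```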
cudf_polars/dsl/expressions/slicing.py

@@ -14,8 +14,6 @@ from cudf_polars.dsl.expressions.base import (
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     import pylibcudf as plc
 
     from cudf_polars.containers import Column, DataFrame
@@ -41,13 +39,9 @@ class Slice(Expr):
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         return column.slice((self.offset, self.length))