cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
```diff
--- a/cudf_polars/dsl/expressions/datetime.py
+++ b/cudf_polars/dsl/expressions/datetime.py
@@ -17,8 +17,6 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
```
```diff
@@ -104,6 +102,18 @@ class TemporalFunction(Expr):
         Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
     }
 
+    _valid_ops: ClassVar[set[Name]] = {
+        *_COMPONENT_MAP.keys(),
+        Name.IsLeapYear,
+        Name.OrdinalDay,
+        Name.ToString,
+        Name.Week,
+        Name.IsoYear,
+        Name.MonthStart,
+        Name.MonthEnd,
+        Name.CastTimeUnit,
+    }
+
     def __init__(
         self,
         dtype: plc.DataType,
```
```diff
@@ -116,22 +126,92 @@ class TemporalFunction(Expr):
         self.name = name
         self.children = children
         self.is_pointwise = True
-        if self.name not in self._COMPONENT_MAP:
+        if self.name not in self._valid_ops:
             raise NotImplementedError(f"Temporal function {self.name}")
 
+        if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
+            self.children[0].dtype
+        ):
+            raise NotImplementedError("ToString is not supported on duration types")
+
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         (column,) = columns
+        if self.name is TemporalFunction.Name.CastTimeUnit:
+            (unit,) = self.options
+            if plc.traits.is_timestamp(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.timestamp(unit))
+            elif plc.traits.is_duration(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.duration(unit))
+            result = plc.unary.cast(column.obj, dtype)
+            return Column(result)
+        if self.name == TemporalFunction.Name.ToString:
+            return Column(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    self.options[0],
+                    plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                )
+            )
+        if self.name is TemporalFunction.Name.Week:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%V",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT8),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.IsoYear:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%G",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.MonthStart:
+            ends = plc.datetime.last_day_of_month(column.obj)
+            days_to_subtract = plc.datetime.days_in_month(column.obj)
+            # must subtract 1 to avoid rolling over to the previous month
+            days_to_subtract = plc.binaryop.binary_operation(
+                days_to_subtract,
+                plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
+                plc.binaryop.BinaryOperator.SUB,
+                plc.DataType(plc.TypeId.DURATION_DAYS),
+            )
+            result = plc.binaryop.binary_operation(
+                ends,
+                days_to_subtract,
+                plc.binaryop.BinaryOperator.SUB,
+                column.obj.type(),
+            )
+
+            return Column(result)
+        if self.name is TemporalFunction.Name.MonthEnd:
+            return Column(
+                plc.unary.cast(
+                    plc.datetime.last_day_of_month(column.obj), column.obj.type()
+                )
+            )
+        if self.name is TemporalFunction.Name.IsLeapYear:
+            return Column(
+                plc.datetime.is_leap_year(column.obj),
+            )
+        if self.name is TemporalFunction.Name.OrdinalDay:
+            return Column(plc.datetime.day_of_year(column.obj))
         if self.name is TemporalFunction.Name.Microsecond:
             millis = plc.datetime.extract_datetime_component(
                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
```
```diff
@@ -141,7 +221,7 @@ class TemporalFunction(Expr):
             )
             millis_as_micros = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
                 plc.DataType(plc.TypeId.INT32),
             )
```
```diff
@@ -164,15 +244,15 @@ class TemporalFunction(Expr):
             )
             millis_as_nanos = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             micros_as_nanos = plc.binaryop.binary_operation(
                 micros,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             total_nanos = plc.binaryop.binary_operation(
                 nanos,
```
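
The net effect of the datetime.py changes is that `TemporalFunction` now dispatches many more temporal operations on the GPU. A minimal sketch of what this enables from the polars side, assuming a working cudf-polars 25.6.0 install; data and column names are illustrative, not from the diff:

```python
# Illustrative: temporal expressions mapping onto the new
# TemporalFunction._valid_ops entries above.
import datetime

import polars as pl

lf = pl.LazyFrame(
    {"ts": [datetime.datetime(2024, 2, 29), datetime.datetime(2025, 1, 15)]}
)
out = lf.select(
    pl.col("ts").dt.month_start().alias("month_start"),  # Name.MonthStart
    pl.col("ts").dt.month_end().alias("month_end"),      # Name.MonthEnd
    pl.col("ts").dt.is_leap_year().alias("leap"),        # Name.IsLeapYear
    pl.col("ts").dt.ordinal_day().alias("doy"),          # Name.OrdinalDay
    pl.col("ts").dt.week().alias("week"),                # Name.Week
    pl.col("ts").dt.iso_year().alias("iso_year"),        # Name.IsoYear
    pl.col("ts").dt.cast_time_unit("ms").alias("ts_ms"), # Name.CastTimeUnit
    pl.col("ts").dt.to_string("%Y-%m-%d").alias("str"),  # Name.ToString
).collect(engine="gpu")
```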
```diff
--- a/cudf_polars/dsl/expressions/literal.py
+++ b/cudf_polars/dsl/expressions/literal.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
```
```diff
@@ -6,23 +6,18 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
-
-import pyarrow as pa
+from typing import TYPE_CHECKING, Any, NoReturn
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
-from cudf_polars.utils import dtypes
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Hashable, Mapping
+    from collections.abc import Hashable
 
     import pyarrow as pa
 
-    import polars as pl
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Literal", "LiteralColumn"]
```
```diff
@@ -31,29 +26,31 @@ __all__ = ["Literal", "LiteralColumn"]
 class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Scalar[Any]
+    value: Any  # Python scalar
 
-    def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
+    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+        if value is None and dtype.id() == plc.TypeId.EMPTY:
+            # TypeId.EMPTY not supported by libcudf
+            # cuDF Python also maps EMPTY to INT8
+            dtype = plc.DataType(plc.TypeId.INT8)
         self.dtype = dtype
-        assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # datatype of pyarrow scalar is correct by construction.
-        return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
+        return Column(
+            plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype), 1)
+        )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([(None, plc.aggregation.min(), self)])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
 
 
 class LiteralColumn(Expr):
```
```diff
@@ -61,10 +58,9 @@ class LiteralColumn(Expr):
     _non_child = ("dtype", "value")
     value: pa.Array[Any]
 
-    def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
+    def __init__(self, dtype: plc.DataType, value: pa.Array) -> None:
         self.dtype = dtype
-        data = value.to_arrow()
-        self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+        self.value = value
         self.children = ()
         self.is_pointwise = True
 
```
```diff
@@ -76,16 +72,14 @@ class LiteralColumn(Expr):
         return (type(self), self.dtype, id(self.value))
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        raise NotImplementedError("Not expecting to require aggregation of literal")
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
```
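
The recurring pattern in literal.py (and in datetime.py above) is replacing the pyarrow round trip `plc.interop.from_arrow(pa.scalar(...))` with direct construction via `plc.Scalar.from_py`. A minimal before/after sketch, assuming pylibcudf 25.6; the value and dtype are illustrative:

```python
import pyarrow as pa
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.INT64)

# 25.2.2 style: build a host-side pyarrow scalar, then convert it.
old_scalar = plc.interop.from_arrow(pa.scalar(42, type=pa.int64()))

# 25.6.0 style: construct the device scalar directly from a Python value,
# then broadcast it to a one-row column as Literal.do_evaluate does above.
new_scalar = plc.Scalar.from_py(42, dtype)
column = plc.Column.from_scalar(new_scalar, 1)
```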
```diff
--- a/cudf_polars/dsl/expressions/rolling.py
+++ b/cudf_polars/dsl/expressions/rolling.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
```
```diff
@@ -8,24 +8,125 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.expressions.base import Expr
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.utils.windows import range_window_bounds
 
 if TYPE_CHECKING:
-    import pylibcudf as plc
+    import pyarrow as pa
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.typing import ClosedInterval
+
+__all__ = ["GroupedRollingWindow", "RollingWindow", "to_request"]
+
+
+def to_request(
+    value: expr.Expr, orderby: Column, df: DataFrame
+) -> plc.rolling.RollingRequest:
+    """
+    Produce a rolling request for evaluation with pylibcudf.
 
-__all__ = ["GroupedRollingWindow", "RollingWindow"]
+    Parameters
+    ----------
+    value
+        The expression to perform the rolling aggregation on.
+    orderby
+        Orderby column, used as input to the request when the aggregation is Len.
+    df
+        DataFrame used to evaluate the inputs to the aggregation.
+    """
+    min_periods = 1
+    if isinstance(value, expr.Len):
+        # A count aggregation, we need a column so use the orderby column
+        col = orderby
+    elif isinstance(value, expr.Agg):
+        child = value.children[0]
+        col = child.evaluate(df, context=ExecutionContext.ROLLING)
+        if value.name == "var":
+            # Polars variance produces null if nvalues <= ddof
+            # libcudf produces NaN. However, we can get the polars
+            # behaviour by setting the minimum window size to ddof +
+            # 1.
+            min_periods = value.options + 1
+    else:
+        col = value.evaluate(
+            df, context=ExecutionContext.ROLLING
+        )  # pragma: no cover; raise before we get here because we
+        # don't do correct handling of empty groups
+    return plc.rolling.RollingRequest(col.obj, min_periods, value.agg_request)
 
 
 class RollingWindow(Expr):
-    __slots__ = ("options",)
-    _non_child = ("dtype", "options")
+    __slots__ = ("closed_window", "following", "orderby", "preceding")
+    _non_child = ("dtype", "preceding", "following", "closed_window", "orderby")
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        preceding: pa.Scalar,
+        following: pa.Scalar,
+        closed_window: ClosedInterval,
+        orderby: str,
+        agg: Expr,
+    ) -> None:
         self.dtype = dtype
-        self.options = options
+        self.preceding = preceding
+        self.following = following
+        self.closed_window = closed_window
+        self.orderby = orderby
         self.children = (agg,)
         self.is_pointwise = False
-        raise NotImplementedError("Rolling window not implemented")
+        if agg.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST:
+            raise NotImplementedError(
+                "Incorrect handling of empty groups for list collection"
+            )
+        if not plc.rolling.is_valid_rolling_aggregation(agg.dtype, agg.agg_request):
+            raise NotImplementedError(f"Unsupported rolling aggregation {agg}")
+
+    def do_evaluate(  # noqa: D102
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        if context != ExecutionContext.FRAME:
+            raise RuntimeError(
+                "Rolling aggregation inside groupby/over/rolling"
+            )  # pragma: no cover; translation raises first
+        (agg,) = self.children
+        orderby = df.column_map[self.orderby]
+        # Polars casts integral orderby to int64, but only for calculating window bounds
+        if (
+            plc.traits.is_integral(orderby.obj.type())
+            and orderby.obj.type().id() != plc.TypeId.INT64
+        ):
+            orderby_obj = plc.unary.cast(orderby.obj, plc.DataType(plc.TypeId.INT64))
+        else:
+            orderby_obj = orderby.obj
+        preceding, following = range_window_bounds(
+            self.preceding, self.following, self.closed_window
+        )
+        if orderby.obj.null_count() != 0:
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling may not contain nulls"
+            )
+        if not orderby.check_sorted(
+            order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE
+        ):
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling is not sorted, please sort first"
+            )
+        (result,) = plc.rolling.grouped_range_rolling_window(
+            plc.Table([]),
+            orderby_obj,
+            plc.types.Order.ASCENDING,
+            plc.types.NullOrder.BEFORE,
+            preceding,
+            following,
+            [to_request(agg, orderby, df)],
+        ).columns()
+        return Column(result)
 
 
 class GroupedRollingWindow(Expr):
```
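
`RollingWindow` thus goes from an unconditional `NotImplementedError` in 25.2.2 to a working `grouped_range_rolling_window` evaluation. A sketch of the polars-level operation it now serves, with illustrative data; as the `RuntimeError`s above require, the index column must be sorted and null-free:

```python
import datetime

import polars as pl

lf = pl.LazyFrame(
    {
        "ts": pl.datetime_range(
            datetime.datetime(2025, 1, 1),
            datetime.datetime(2025, 1, 5),
            interval="1d",
            eager=True,
        ),
        "x": [1, 2, 3, 4, 5],
    }
)
out = (
    lf.rolling(index_column="ts", period="2d")
    .agg(pl.col("x").sum().alias("rolling_sum"))
    .collect(engine="gpu")
)
```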
```diff
--- a/cudf_polars/dsl/expressions/selection.py
+++ b/cudf_polars/dsl/expressions/selection.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
```
```diff
@@ -8,16 +8,12 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Filter", "Gather"]
```
```diff
@@ -33,16 +29,11 @@ class Gather(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, indices = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         lo, hi = plc.reduce.minmax(indices.obj)
         lo = plc.interop.to_arrow(lo).as_py()
```
```diff
@@ -50,13 +41,11 @@ class Gather(Expr):
         n = df.num_rows
         if hi >= n or lo < -n:
             raise ValueError("gather indices are out of bounds")
-        if indices.obj.null_count():
+        if indices.null_count:
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
-                ),
+                plc.Scalar.from_py(n, dtype=indices.obj.type()),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
```
```diff
@@ -72,20 +61,13 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
-        self.is_pointwise = True
+        self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values, mask = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        values, mask = (child.evaluate(df, context=context) for child in self.children)
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
```
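
The `Gather` change above is behaviour-preserving: null gather indices are first replaced with the out-of-range value `n`, and the `NULLIFY` policy then turns those rows into nulls. From the polars side, an illustrative sketch:

```python
import polars as pl

lf = pl.LazyFrame({"a": [10, 20, 30], "idx": [2, None, 0]})
# A null index yields a null output row: [30, null, 10]
out = lf.select(pl.col("a").gather(pl.col("idx"))).collect(engine="gpu")
```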
```diff
--- /dev/null
+++ b/cudf_polars/dsl/expressions/slicing.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""Slicing DSL nodes."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.expressions.base import (
+    ExecutionContext,
+    Expr,
+)
+
+if TYPE_CHECKING:
+    import pylibcudf as plc
+
+    from cudf_polars.containers import Column, DataFrame
+
+
+__all__ = ["Slice"]
+
+
+class Slice(Expr):
+    __slots__ = ("length", "offset")
+    _non_child = ("dtype", "offset", "length")
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        offset: int,
+        length: int,
+        column: Expr,
+    ) -> None:
+        self.dtype = dtype
+        self.offset = offset
+        self.length = length
+        self.children = (column,)
+
+    def do_evaluate(
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context)
+        return column.slice((self.offset, self.length))
```
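
This new node gives the GPU engine a direct implementation of expression-level slicing, delegating to `Column.slice`. An illustrative sketch of the polars expression it translates:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]})
# Expression-level slice: take 3 elements starting at offset 1 -> [2, 3, 4]
out = lf.select(pl.col("a").slice(1, 3)).collect(engine="gpu")
```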
```diff
--- a/cudf_polars/dsl/expressions/sorting.py
+++ b/cudf_polars/dsl/expressions/sorting.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
```
```diff
@@ -15,8 +15,6 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Sort", "SortBy"]
```
```diff
@@ -35,15 +33,11 @@ class Sort(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             [descending], nulls_last=[nulls_last], num_keys=1
```
```diff
@@ -75,17 +69,10 @@ class SortBy(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        column, *by = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        column, *by = (child.evaluate(df, context=context) for child in self.children)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             descending, nulls_last=nulls_last, num_keys=len(by)
```