cudf-polars-cu12 25.4.0-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +35 -50
  3. cudf_polars/containers/column.py +38 -0
  4. cudf_polars/containers/dataframe.py +11 -16
  5. cudf_polars/dsl/expressions/aggregation.py +25 -61
  6. cudf_polars/dsl/expressions/base.py +40 -72
  7. cudf_polars/dsl/expressions/binaryop.py +3 -39
  8. cudf_polars/dsl/expressions/boolean.py +21 -49
  9. cudf_polars/dsl/expressions/datetime.py +59 -17
  10. cudf_polars/dsl/expressions/literal.py +24 -24
  11. cudf_polars/dsl/expressions/rolling.py +110 -9
  12. cudf_polars/dsl/expressions/selection.py +6 -24
  13. cudf_polars/dsl/expressions/slicing.py +2 -8
  14. cudf_polars/dsl/expressions/sorting.py +4 -17
  15. cudf_polars/dsl/expressions/string.py +29 -32
  16. cudf_polars/dsl/expressions/ternary.py +3 -10
  17. cudf_polars/dsl/expressions/unary.py +32 -73
  18. cudf_polars/dsl/ir.py +575 -167
  19. cudf_polars/dsl/nodebase.py +1 -1
  20. cudf_polars/dsl/to_ast.py +5 -3
  21. cudf_polars/dsl/translate.py +272 -152
  22. cudf_polars/dsl/utils/__init__.py +8 -0
  23. cudf_polars/dsl/utils/aggregations.py +292 -0
  24. cudf_polars/dsl/utils/groupby.py +97 -0
  25. cudf_polars/dsl/utils/naming.py +34 -0
  26. cudf_polars/dsl/utils/replace.py +46 -0
  27. cudf_polars/dsl/utils/rolling.py +113 -0
  28. cudf_polars/dsl/utils/windows.py +186 -0
  29. cudf_polars/experimental/base.py +0 -8
  30. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  31. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  32. cudf_polars/experimental/dask_registers.py +196 -0
  33. cudf_polars/experimental/distinct.py +174 -0
  34. cudf_polars/experimental/explain.py +127 -0
  35. cudf_polars/experimental/expressions.py +521 -0
  36. cudf_polars/experimental/groupby.py +109 -167
  37. cudf_polars/experimental/io.py +53 -26
  38. cudf_polars/experimental/join.py +59 -24
  39. cudf_polars/experimental/parallel.py +155 -133
  40. cudf_polars/experimental/repartition.py +69 -0
  41. cudf_polars/experimental/scheduler.py +155 -0
  42. cudf_polars/experimental/select.py +92 -7
  43. cudf_polars/experimental/shuffle.py +109 -9
  44. cudf_polars/experimental/sort.py +45 -0
  45. cudf_polars/experimental/spilling.py +151 -0
  46. cudf_polars/experimental/utils.py +100 -0
  47. cudf_polars/testing/asserts.py +146 -6
  48. cudf_polars/testing/io.py +72 -0
  49. cudf_polars/testing/plugin.py +55 -42
  50. cudf_polars/typing/__init__.py +27 -5
  51. cudf_polars/utils/config.py +317 -102
  52. cudf_polars/utils/dtypes.py +8 -1
  53. cudf_polars/utils/timer.py +1 -1
  54. cudf_polars/utils/versions.py +4 -4
  55. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +7 -5
  56. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  57. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  58. cudf_polars/experimental/dask_serialize.py +0 -73
  59. cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
  60. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/licenses/LICENSE +0 -0
  61. {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/expressions/binaryop.py
@@ -13,11 +13,9 @@ from polars.polars import _expr_nodes as pl_expr
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["BinOp"]
@@ -85,17 +83,10 @@ class BinOp(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        left, right = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        left, right = (child.evaluate(df, context=context) for child in self.children)
         lop = left.obj
         rop = right.obj
         if left.size != right.size:
@@ -106,30 +97,3 @@ class BinOp(Expr):
         return Column(
             plc.binaryop.binary_operation(lop, rop, self.op, self.dtype),
         )
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate,
-            # groupby construction has checked that we don't have
-            # nested aggs, so stop the recursion and return ourselves
-            # for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            left_info, right_info = (
-                child.collect_agg(depth=depth) for child in self.children
-            )
-            requests = [*left_info.requests, *right_info.requests]
-            # TODO: Hack, if there were no reductions inside this
-            # binary expression then we want to pre-evaluate and
-            # collect ourselves. Otherwise we want to collect the
-            # aggregations inside and post-evaluate. This is a bad way
-            # of checking that we are in case 1.
-            if all(
-                agg.kind() == plc.aggregation.Kind.COLLECT_LIST
-                for _, agg, _ in requests
-            ):
-                return AggInfo([(self, plc.aggregation.collect_list(), self)])
-            return AggInfo(
-                [*left_info.requests, *right_info.requests],
-            )
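Editor's note: the `collect_agg` protocol removed here is replaced by a per-expression `agg_request` property (visible in the literal.py hunks below), with group-by decomposition moving to the new `cudf_polars/dsl/utils/aggregations.py` (+292 lines in the file list). A minimal sketch of the new shape; `SumAgg` is a hypothetical leaf for illustration — only the raising `Literal.agg_request` actually appears in this diff:

```python
import pylibcudf as plc

class SumAgg:
    """Hypothetical leaf: expressions now expose their pylibcudf aggregation directly."""

    @property
    def agg_request(self) -> plc.aggregation.Aggregation:
        # Group-by/rolling translation collects these requests instead of
        # recursing through collect_agg(depth=...).
        return plc.aggregation.sum()
```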
cudf_polars/dsl/expressions/boolean.py
@@ -10,8 +10,6 @@ from enum import IntEnum, auto
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -19,10 +17,9 @@ from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
     Expr,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     import polars.type_aliases as pl_types
@@ -89,9 +86,11 @@ class BooleanFunction(Expr):
             BooleanFunction.Name.IsLastDistinct,
             BooleanFunction.Name.IsUnique,
         )
-        if self.name is BooleanFunction.Name.IsIn and not all(
-            c.dtype == self.children[0].dtype for c in self.children
-        ):
+        if (
+            POLARS_VERSION_LT_128
+            and self.name is BooleanFunction.Name.IsIn
+            and not all(c.dtype == self.children[0].dtype for c in self.children)
+        ):  # pragma: no cover
             # TODO: If polars IR doesn't put the casts in, we need to
             # mimic the supertype promotion rules.
             raise NotImplementedError("IsIn doesn't support supertype casting")
@@ -145,11 +144,7 @@ class BooleanFunction(Expr):
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name in (
@@ -160,29 +155,22 @@ class BooleanFunction(Expr):
             (child,) = self.children
             is_finite = self.name is BooleanFunction.Name.IsFinite
             if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-                value = plc.interop.from_arrow(
-                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
-                )
+                value = plc.Scalar.from_py(is_finite)
                 return Column(plc.Column.from_scalar(value, df.num_rows))
-            needles = child.evaluate(df, context=context, mapping=mapping)
+            needles = child.evaluate(df, context=context)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
                 to_search.append(float("nan"))
-            haystack = plc.interop.from_arrow(
-                pa.array(
-                    to_search,
-                    type=plc.interop.to_arrow(needles.obj.type()),
-                )
+            haystack = plc.Column.from_iterable_of_py(
+                to_search,
+                dtype=needles.obj.type(),
             )
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
             return Column(result)
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         # Kleene logic for Any (OR) and All (AND) if ignore_nulls is
         # False
         if self.name in (BooleanFunction.Name.Any, BooleanFunction.Name.All):
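Editor's note: a self-contained sketch of the IsFinite/IsInfinite path above, assuming a pylibcudf build matching this release — the haystack is now built directly from Python floats and membership is tested with `plc.search.contains`:

```python
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.FLOAT64)
needles = plc.Column.from_iterable_of_py(
    [1.0, float("inf"), float("nan")], dtype=dtype
)
# IsFinite searches for +/-inf and NaN, then negates the containment result.
haystack = plc.Column.from_iterable_of_py(
    [-float("inf"), float("inf"), float("nan")], dtype=dtype
)
contained = plc.search.contains(haystack, needles)
is_finite = plc.unary.unary_operation(contained, plc.unary.UnaryOperator.NOT)
```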
@@ -233,48 +221,32 @@ class BooleanFunction(Expr):
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.AllHorizontal:
             return Column(
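Editor's note: the recurring migration in this file (and in datetime.py and selection.py below) is that scalar construction no longer round-trips through pyarrow. Before/after, as a sketch using calls taken from the hunks above:

```python
import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.BOOL8)

# 25.4.0 style (pyarrow round-trip):
#   import pyarrow as pa
#   s = plc.interop.from_arrow(
#       pa.scalar(value=True, type=plc.interop.to_arrow(dtype))
#   )

# 25.6.0 style:
s = plc.Scalar.from_py(py_val=True, dtype=dtype)
```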
cudf_polars/dsl/expressions/datetime.py
@@ -17,8 +17,6 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
@@ -108,8 +106,12 @@ class TemporalFunction(Expr):
         *_COMPONENT_MAP.keys(),
         Name.IsLeapYear,
         Name.OrdinalDay,
+        Name.ToString,
+        Name.Week,
+        Name.IsoYear,
         Name.MonthStart,
         Name.MonthEnd,
+        Name.CastTimeUnit,
     }
 
     def __init__(
@@ -127,26 +129,66 @@ class TemporalFunction(Expr):
         if self.name not in self._valid_ops:
             raise NotImplementedError(f"Temporal function {self.name}")
 
+        if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
+            self.children[0].dtype
+        ):
+            raise NotImplementedError("ToString is not supported on duration types")
+
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         (column,) = columns
+        if self.name is TemporalFunction.Name.CastTimeUnit:
+            (unit,) = self.options
+            if plc.traits.is_timestamp(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.timestamp(unit))
+            elif plc.traits.is_duration(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.duration(unit))
+            result = plc.unary.cast(column.obj, dtype)
+            return Column(result)
+        if self.name == TemporalFunction.Name.ToString:
+            return Column(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    self.options[0],
+                    plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                )
+            )
+        if self.name is TemporalFunction.Name.Week:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%V",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT8),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.IsoYear:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%G",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(result)
         if self.name is TemporalFunction.Name.MonthStart:
             ends = plc.datetime.last_day_of_month(column.obj)
             days_to_subtract = plc.datetime.days_in_month(column.obj)
             # must subtract 1 to avoid rolling over to the previous month
             days_to_subtract = plc.binaryop.binary_operation(
                 days_to_subtract,
-                plc.interop.from_arrow(pa.scalar(1, type=pa.int32())),
+                plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.SUB,
                 plc.DataType(plc.TypeId.DURATION_DAYS),
             )
@@ -179,7 +221,7 @@ class TemporalFunction(Expr):
             )
             millis_as_micros = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
                 plc.DataType(plc.TypeId.INT32),
             )
@@ -202,15 +244,15 @@ class TemporalFunction(Expr):
             )
             millis_as_nanos = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             micros_as_nanos = plc.binaryop.binary_operation(
                 micros,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             total_nanos = plc.binaryop.binary_operation(
                 nanos,
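Editor's note: the Week and IsoYear branches added above share one technique — format timestamps with the ISO strftime codes (%V for week number, %G for ISO year) and parse the resulting strings back to integers. Extracted as a sketch, using the same calls as the hunk:

```python
import pylibcudf as plc

def iso_week(timestamps: plc.Column) -> plc.Column:
    # Render each timestamp as its ISO week number ("01".."53")...
    strings = plc.strings.convert.convert_datetime.from_timestamps(
        timestamps,
        format="%V",
        input_strings_names=plc.Column.from_iterable_of_py(
            [], dtype=plc.DataType(plc.TypeId.STRING)
        ),
    )
    # ...then parse the digits back into an integer column.
    return plc.strings.convert.convert_integers.to_integers(
        strings, plc.DataType(plc.TypeId.INT8)
    )
```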
cudf_polars/dsl/expressions/literal.py
@@ -6,15 +6,15 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, NoReturn
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Hashable, Mapping
+    from collections.abc import Hashable
 
     import pyarrow as pa
 
@@ -26,29 +26,31 @@ __all__ = ["Literal", "LiteralColumn"]
 class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Scalar[Any]
+    value: Any  # Python scalar
 
-    def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
+    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+        if value is None and dtype.id() == plc.TypeId.EMPTY:
+            # TypeId.EMPTY not supported by libcudf
+            # cuDF Python also maps EMPTY to INT8
+            dtype = plc.DataType(plc.TypeId.INT8)
         self.dtype = dtype
-        assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # datatype of pyarrow scalar is correct by construction.
-        return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
+        return Column(
+            plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype), 1)
+        )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
 
 
 class LiteralColumn(Expr):
@@ -70,16 +72,14 @@ class LiteralColumn(Expr):
         return (type(self), self.dtype, id(self.value))
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
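Editor's note: Literal now stores a plain Python value instead of a pyarrow scalar; a None carrying TypeId.EMPTY is remapped to INT8 at construction, since libcudf has no EMPTY type. The new evaluation path in isolation, as a sketch:

```python
import pylibcudf as plc

value, dtype = 42, plc.DataType(plc.TypeId.INT64)
# One-row column broadcast from a Python scalar; previously this required
# plc.interop.from_arrow(pa.scalar(...)).
col = plc.Column.from_scalar(plc.Scalar.from_py(value, dtype), 1)
```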
cudf_polars/dsl/expressions/rolling.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -8,24 +8,125 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.expressions.base import Expr
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.utils.windows import range_window_bounds
 
 if TYPE_CHECKING:
-    import pylibcudf as plc
+    import pyarrow as pa
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.typing import ClosedInterval
+
+__all__ = ["GroupedRollingWindow", "RollingWindow", "to_request"]
+
+
+def to_request(
+    value: expr.Expr, orderby: Column, df: DataFrame
+) -> plc.rolling.RollingRequest:
+    """
+    Produce a rolling request for evaluation with pylibcudf.
 
-__all__ = ["GroupedRollingWindow", "RollingWindow"]
+    Parameters
+    ----------
+    value
+        The expression to perform the rolling aggregation on.
+    orderby
+        Orderby column, used as input to the request when the aggregation is Len.
+    df
+        DataFrame used to evaluate the inputs to the aggregation.
+    """
+    min_periods = 1
+    if isinstance(value, expr.Len):
+        # A count aggregation, we need a column so use the orderby column
+        col = orderby
+    elif isinstance(value, expr.Agg):
+        child = value.children[0]
+        col = child.evaluate(df, context=ExecutionContext.ROLLING)
+        if value.name == "var":
+            # Polars variance produces null if nvalues <= ddof
+            # libcudf produces NaN. However, we can get the polars
+            # behaviour by setting the minimum window size to ddof +
+            # 1.
+            min_periods = value.options + 1
+    else:
+        col = value.evaluate(
+            df, context=ExecutionContext.ROLLING
+        )  # pragma: no cover; raise before we get here because we
+        # don't do correct handling of empty groups
+    return plc.rolling.RollingRequest(col.obj, min_periods, value.agg_request)
 
 
 class RollingWindow(Expr):
-    __slots__ = ("options",)
-    _non_child = ("dtype", "options")
+    __slots__ = ("closed_window", "following", "orderby", "preceding")
+    _non_child = ("dtype", "preceding", "following", "closed_window", "orderby")
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        preceding: pa.Scalar,
+        following: pa.Scalar,
+        closed_window: ClosedInterval,
+        orderby: str,
+        agg: Expr,
+    ) -> None:
         self.dtype = dtype
-        self.options = options
+        self.preceding = preceding
+        self.following = following
+        self.closed_window = closed_window
+        self.orderby = orderby
         self.children = (agg,)
         self.is_pointwise = False
-        raise NotImplementedError("Rolling window not implemented")
+        if agg.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST:
+            raise NotImplementedError(
+                "Incorrect handling of empty groups for list collection"
+            )
+        if not plc.rolling.is_valid_rolling_aggregation(agg.dtype, agg.agg_request):
+            raise NotImplementedError(f"Unsupported rolling aggregation {agg}")
+
+    def do_evaluate(  # noqa: D102
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        if context != ExecutionContext.FRAME:
+            raise RuntimeError(
+                "Rolling aggregation inside groupby/over/rolling"
+            )  # pragma: no cover; translation raises first
+        (agg,) = self.children
+        orderby = df.column_map[self.orderby]
+        # Polars casts integral orderby to int64, but only for calculating window bounds
+        if (
+            plc.traits.is_integral(orderby.obj.type())
+            and orderby.obj.type().id() != plc.TypeId.INT64
+        ):
+            orderby_obj = plc.unary.cast(orderby.obj, plc.DataType(plc.TypeId.INT64))
+        else:
+            orderby_obj = orderby.obj
+        preceding, following = range_window_bounds(
+            self.preceding, self.following, self.closed_window
+        )
+        if orderby.obj.null_count() != 0:
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling may not contain nulls"
+            )
+        if not orderby.check_sorted(
+            order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE
+        ):
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling is not sorted, please sort first"
+            )
+        (result,) = plc.rolling.grouped_range_rolling_window(
+            plc.Table([]),
+            orderby_obj,
+            plc.types.Order.ASCENDING,
+            plc.types.NullOrder.BEFORE,
+            preceding,
+            following,
+            [to_request(agg, orderby, df)],
+        ).columns()
+        return Column(result)
 
 
 class GroupedRollingWindow(Expr):
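Editor's note: at the user level, the newly implemented RollingWindow backs polars' by-column rolling aggregations. A usage sketch, assuming a polars version paired with this release and a working GPU engine:

```python
import polars as pl

lf = pl.LazyFrame({"t": [1, 2, 4, 7], "x": [1.0, 2.0, 3.0, 4.0]})
# The orderby column ("t" here) must be sorted and null-free, per the
# RuntimeErrors raised in do_evaluate above.
out = lf.select(
    pl.col("x").rolling_sum_by("t", window_size="3i")
).collect(engine="gpu")
```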
cudf_polars/dsl/expressions/selection.py
@@ -8,16 +8,12 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Filter", "Gather"]
@@ -33,16 +29,11 @@ class Gather(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, indices = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         lo, hi = plc.reduce.minmax(indices.obj)
         lo = plc.interop.to_arrow(lo).as_py()
@@ -54,9 +45,7 @@ class Gather(Expr):
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
-                ),
+                plc.Scalar.from_py(n, dtype=indices.obj.type()),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -72,20 +61,13 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
-        self.is_pointwise = True
+        self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values, mask = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        values, mask = (child.evaluate(df, context=context) for child in self.children)
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
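Editor's note: the Filter hunk also fixes a bug — `is_pointwise` flips to False, since masking changes the row count and is therefore not a pointwise operation. The stream-compaction call in isolation, assuming a matching pylibcudf build:

```python
import pylibcudf as plc

values = plc.Column.from_iterable_of_py(
    [10, 20, 30], dtype=plc.DataType(plc.TypeId.INT64)
)
mask = plc.Column.from_iterable_of_py(
    [True, False, True], dtype=plc.DataType(plc.TypeId.BOOL8)
)
# Keep only rows where the mask is true: yields [10, 30].
(filtered,) = plc.stream_compaction.apply_boolean_mask(
    plc.Table([values]), mask
).columns()
```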
cudf_polars/dsl/expressions/slicing.py
@@ -14,8 +14,6 @@ from cudf_polars.dsl.expressions.base import (
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     import pylibcudf as plc
 
     from cudf_polars.containers import Column, DataFrame
@@ -41,13 +39,9 @@ class Slice(Expr):
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         return column.slice((self.offset, self.length))