cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
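For orientation: cudf-polars implements the GPU engine behind the polars lazy API, so none of the modules above is imported directly in normal use. A minimal sketch of how the package is exercised (the query is illustrative; `engine="gpu"` is the public polars switch that dispatches to cudf-polars, falling back to the CPU engine for unsupported queries):

import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}).select(pl.col("a").sum())
result = q.collect(engine="gpu")  # dispatches through cudf_polars/callback.py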
cudf_polars/dsl/expressions/datetime.py
@@ -17,8 +17,6 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
@@ -104,6 +102,18 @@ class TemporalFunction(Expr):
         Name.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND,
     }
 
+    _valid_ops: ClassVar[set[Name]] = {
+        *_COMPONENT_MAP.keys(),
+        Name.IsLeapYear,
+        Name.OrdinalDay,
+        Name.ToString,
+        Name.Week,
+        Name.IsoYear,
+        Name.MonthStart,
+        Name.MonthEnd,
+        Name.CastTimeUnit,
+    }
+
     def __init__(
         self,
         dtype: plc.DataType,
@@ -116,22 +126,92 @@ class TemporalFunction(Expr):
         self.name = name
         self.children = children
         self.is_pointwise = True
-        if self.name not in self._COMPONENT_MAP:
+        if self.name not in self._valid_ops:
             raise NotImplementedError(f"Temporal function {self.name}")
 
+        if self.name is TemporalFunction.Name.ToString and plc.traits.is_duration(
+            self.children[0].dtype
+        ):
+            raise NotImplementedError("ToString is not supported on duration types")
+
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         (column,) = columns
+        if self.name is TemporalFunction.Name.CastTimeUnit:
+            (unit,) = self.options
+            if plc.traits.is_timestamp(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.timestamp(unit))
+            elif plc.traits.is_duration(column.obj.type()):
+                dtype = plc.interop.from_arrow(pa.duration(unit))
+            result = plc.unary.cast(column.obj, dtype)
+            return Column(result)
+        if self.name == TemporalFunction.Name.ToString:
+            return Column(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    self.options[0],
+                    plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                )
+            )
+        if self.name is TemporalFunction.Name.Week:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%V",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT8),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.IsoYear:
+            result = plc.strings.convert.convert_integers.to_integers(
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    column.obj,
+                    format="%G",
+                    input_strings_names=plc.Column.from_iterable_of_py(
+                        [], dtype=plc.DataType(plc.TypeId.STRING)
+                    ),
+                ),
+                plc.types.DataType(plc.types.TypeId.INT32),
+            )
+            return Column(result)
+        if self.name is TemporalFunction.Name.MonthStart:
+            ends = plc.datetime.last_day_of_month(column.obj)
+            days_to_subtract = plc.datetime.days_in_month(column.obj)
+            # must subtract 1 to avoid rolling over to the previous month
+            days_to_subtract = plc.binaryop.binary_operation(
+                days_to_subtract,
+                plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT32)),
+                plc.binaryop.BinaryOperator.SUB,
+                plc.DataType(plc.TypeId.DURATION_DAYS),
+            )
+            result = plc.binaryop.binary_operation(
+                ends,
+                days_to_subtract,
+                plc.binaryop.BinaryOperator.SUB,
+                column.obj.type(),
+            )
+
+            return Column(result)
+        if self.name is TemporalFunction.Name.MonthEnd:
+            return Column(
+                plc.unary.cast(
+                    plc.datetime.last_day_of_month(column.obj), column.obj.type()
+                )
+            )
+        if self.name is TemporalFunction.Name.IsLeapYear:
+            return Column(
+                plc.datetime.is_leap_year(column.obj),
+            )
+        if self.name is TemporalFunction.Name.OrdinalDay:
+            return Column(plc.datetime.day_of_year(column.obj))
         if self.name is TemporalFunction.Name.Microsecond:
             millis = plc.datetime.extract_datetime_component(
                 column.obj, plc.datetime.DatetimeComponent.MILLISECOND
@@ -141,7 +221,7 @@ class TemporalFunction(Expr):
             )
             millis_as_micros = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
                 plc.DataType(plc.TypeId.INT32),
             )
@@ -164,15 +244,15 @@ class TemporalFunction(Expr):
             )
             millis_as_nanos = plc.binaryop.binary_operation(
                 millis,
-                plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             micros_as_nanos = plc.binaryop.binary_operation(
                 micros,
-                plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())),
+                plc.Scalar.from_py(1_000, plc.DataType(plc.TypeId.INT32)),
                 plc.binaryop.BinaryOperator.MUL,
-                plc.types.DataType(plc.types.TypeId.INT32),
+                plc.DataType(plc.TypeId.INT32),
             )
             total_nanos = plc.binaryop.binary_operation(
                 nanos,
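A note on the Week and IsoYear branches added above: libcudf has no direct extractor for ISO week or ISO year, so the implementation formats timestamps with the strftime codes "%V" and "%G" and parses the strings back to integers. A standalone sketch of the same technique, using the identical pylibcudf calls on illustrative data:

from datetime import datetime

import pyarrow as pa
import pylibcudf as plc

ts = plc.interop.from_arrow(
    pa.array([datetime(2025, 1, 1), datetime(2025, 6, 30)], type=pa.timestamp("us"))
)
# Render each timestamp as its ISO week number ("%V"), then parse the
# resulting strings back into INT8 values.
weeks = plc.strings.convert.convert_integers.to_integers(
    plc.strings.convert.convert_datetime.from_timestamps(
        ts,
        format="%V",
        input_strings_names=plc.Column.from_iterable_of_py(
            [], dtype=plc.DataType(plc.TypeId.STRING)
        ),
    ),
    plc.types.DataType(plc.types.TypeId.INT8),
)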
cudf_polars/dsl/expressions/literal.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -6,23 +6,18 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
-
-import pyarrow as pa
+from typing import TYPE_CHECKING, Any, NoReturn
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
-from cudf_polars.utils import dtypes
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Hashable, Mapping
+    from collections.abc import Hashable
 
     import pyarrow as pa
 
-    import polars as pl
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Literal", "LiteralColumn"]
@@ -31,29 +26,31 @@ __all__ = ["Literal", "LiteralColumn"]
 class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Scalar[Any]
+    value: Any  # Python scalar
 
-    def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
+    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+        if value is None and dtype.id() == plc.TypeId.EMPTY:
+            # TypeId.EMPTY not supported by libcudf
+            # cuDF Python also maps EMPTY to INT8
+            dtype = plc.DataType(plc.TypeId.INT8)
         self.dtype = dtype
-        assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # datatype of pyarrow scalar is correct by construction.
-        return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
+        return Column(
+            plc.Column.from_scalar(plc.Scalar.from_py(self.value, self.dtype), 1)
+        )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
 
 
 class LiteralColumn(Expr):
@@ -61,10 +58,9 @@ class LiteralColumn(Expr):
     _non_child = ("dtype", "value")
     value: pa.Array[Any]
 
-    def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
+    def __init__(self, dtype: plc.DataType, value: pa.Array) -> None:
         self.dtype = dtype
-        data = value.to_arrow()
-        self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+        self.value = value
         self.children = ()
         self.is_pointwise = True
 
@@ -76,16 +72,14 @@ class LiteralColumn(Expr):
         return (type(self), self.dtype, id(self.value))
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([])
+    @property
+    def agg_request(self) -> NoReturn:  # noqa: D102
+        raise NotImplementedError(
+            "Not expecting to require agg request of literal"
+        )  # pragma: no cover
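The net effect of the Literal changes: the node now carries a plain Python scalar plus a pylibcudf dtype, and materialization goes through plc.Scalar.from_py instead of a pyarrow scalar round-trip. A minimal sketch (value and dtype are illustrative):

import pylibcudf as plc

dtype = plc.DataType(plc.TypeId.INT32)
scalar = plc.Scalar.from_py(42, dtype)      # replaces pa.scalar(...) + plc.interop.from_arrow(...)
column = plc.Column.from_scalar(scalar, 1)  # the one-row column built in Literal.do_evaluate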
cudf_polars/dsl/expressions/rolling.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -8,24 +8,125 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.expressions.base import Expr
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl import expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.utils.windows import range_window_bounds
 
 if TYPE_CHECKING:
-    import pylibcudf as plc
+    import pyarrow as pa
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.typing import ClosedInterval
+
+__all__ = ["GroupedRollingWindow", "RollingWindow", "to_request"]
+
+
+def to_request(
+    value: expr.Expr, orderby: Column, df: DataFrame
+) -> plc.rolling.RollingRequest:
+    """
+    Produce a rolling request for evaluation with pylibcudf.
 
-__all__ = ["GroupedRollingWindow", "RollingWindow"]
+    Parameters
+    ----------
+    value
+        The expression to perform the rolling aggregation on.
+    orderby
+        Orderby column, used as input to the request when the aggregation is Len.
+    df
+        DataFrame used to evaluate the inputs to the aggregation.
+    """
+    min_periods = 1
+    if isinstance(value, expr.Len):
+        # A count aggregation, we need a column so use the orderby column
+        col = orderby
+    elif isinstance(value, expr.Agg):
+        child = value.children[0]
+        col = child.evaluate(df, context=ExecutionContext.ROLLING)
+        if value.name == "var":
+            # Polars variance produces null if nvalues <= ddof
+            # libcudf produces NaN. However, we can get the polars
+            # behaviour by setting the minimum window size to ddof +
+            # 1.
+            min_periods = value.options + 1
+    else:
+        col = value.evaluate(
+            df, context=ExecutionContext.ROLLING
+        )  # pragma: no cover; raise before we get here because we
+        # don't do correct handling of empty groups
+    return plc.rolling.RollingRequest(col.obj, min_periods, value.agg_request)
 
 
 class RollingWindow(Expr):
-    __slots__ = ("options",)
-    _non_child = ("dtype", "options")
+    __slots__ = ("closed_window", "following", "orderby", "preceding")
+    _non_child = ("dtype", "preceding", "following", "closed_window", "orderby")
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        preceding: pa.Scalar,
+        following: pa.Scalar,
+        closed_window: ClosedInterval,
+        orderby: str,
+        agg: Expr,
+    ) -> None:
         self.dtype = dtype
-        self.options = options
+        self.preceding = preceding
+        self.following = following
+        self.closed_window = closed_window
+        self.orderby = orderby
         self.children = (agg,)
         self.is_pointwise = False
-        raise NotImplementedError("Rolling window not implemented")
+        if agg.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST:
+            raise NotImplementedError(
+                "Incorrect handling of empty groups for list collection"
+            )
+        if not plc.rolling.is_valid_rolling_aggregation(agg.dtype, agg.agg_request):
+            raise NotImplementedError(f"Unsupported rolling aggregation {agg}")
+
+    def do_evaluate(  # noqa: D102
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        if context != ExecutionContext.FRAME:
+            raise RuntimeError(
+                "Rolling aggregation inside groupby/over/rolling"
+            )  # pragma: no cover; translation raises first
+        (agg,) = self.children
+        orderby = df.column_map[self.orderby]
+        # Polars casts integral orderby to int64, but only for calculating window bounds
+        if (
+            plc.traits.is_integral(orderby.obj.type())
+            and orderby.obj.type().id() != plc.TypeId.INT64
+        ):
+            orderby_obj = plc.unary.cast(orderby.obj, plc.DataType(plc.TypeId.INT64))
+        else:
+            orderby_obj = orderby.obj
+        preceding, following = range_window_bounds(
+            self.preceding, self.following, self.closed_window
+        )
+        if orderby.obj.null_count() != 0:
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling may not contain nulls"
+            )
+        if not orderby.check_sorted(
+            order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE
+        ):
+            raise RuntimeError(
+                f"Index column '{self.orderby}' in rolling is not sorted, please sort first"
+            )
+        (result,) = plc.rolling.grouped_range_rolling_window(
+            plc.Table([]),
+            orderby_obj,
+            plc.types.Order.ASCENDING,
+            plc.types.NullOrder.BEFORE,
+            preceding,
+            following,
+            [to_request(agg, orderby, df)],
+        ).columns()
+        return Column(result)
 
 
 class GroupedRollingWindow(Expr):
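RollingWindow thus goes from a hard NotImplementedError to a full lowering onto plc.rolling.grouped_range_rolling_window, requiring a sorted, null-free orderby column. A sketch of the kind of query this presumably enables on the GPU engine (the data, names, and the choice of the LazyFrame.rolling context are illustrative assumptions):

import polars as pl

q = (
    pl.LazyFrame({"t": [1, 2, 3, 5, 8], "x": [1.0, 2.0, 3.0, 4.0, 5.0]})
    .rolling(index_column="t", period="3i")  # "3i": a window of 3 index units
    .agg(pl.col("x").sum())
)
result = q.collect(engine="gpu")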
cudf_polars/dsl/expressions/selection.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -8,16 +8,12 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Filter", "Gather"]
@@ -33,16 +29,11 @@ class Gather(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
    ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, indices = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         lo, hi = plc.reduce.minmax(indices.obj)
         lo = plc.interop.to_arrow(lo).as_py()
@@ -50,13 +41,11 @@
         n = df.num_rows
         if hi >= n or lo < -n:
             raise ValueError("gather indices are out of bounds")
-        if indices.obj.null_count():
+        if indices.null_count:
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
-                ),
+                plc.Scalar.from_py(n, dtype=indices.obj.type()),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -72,20 +61,13 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
-        self.is_pointwise = True
+        self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values, mask = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        values, mask = (child.evaluate(df, context=context) for child in self.children)
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
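One detail of Gather.do_evaluate worth spelling out: null gather indices are replaced with the deliberately out-of-bounds value n and gathered under the NULLIFY policy, so they come back as nulls rather than erroring. A standalone sketch of the pattern on illustrative data (the final plc.copying.gather call is an assumption here, since the hunk ends before it):

import pyarrow as pa
import pylibcudf as plc

values = plc.interop.from_arrow(pa.array([10, 20, 30]))
indices = plc.interop.from_arrow(pa.array([0, None, 2], type=pa.int32()))
n = 3  # row count; index n is out of bounds by construction
filled = plc.replace.replace_nulls(
    indices, plc.Scalar.from_py(n, dtype=indices.type())
)
# NULLIFY converts the out-of-bounds placeholder rows into nulls.
(out,) = plc.copying.gather(
    plc.Table([values]), filled, plc.copying.OutOfBoundsPolicy.NULLIFY
).columns()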
cudf_polars/dsl/expressions/slicing.py (new file)
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""Slicing DSL nodes."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.expressions.base import (
+    ExecutionContext,
+    Expr,
+)
+
+if TYPE_CHECKING:
+    import pylibcudf as plc
+
+    from cudf_polars.containers import Column, DataFrame
+
+
+__all__ = ["Slice"]
+
+
+class Slice(Expr):
+    __slots__ = ("length", "offset")
+    _non_child = ("dtype", "offset", "length")
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        offset: int,
+        length: int,
+        column: Expr,
+    ) -> None:
+        self.dtype = dtype
+        self.offset = offset
+        self.length = length
+        self.children = (column,)
+
+    def do_evaluate(
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context)
+        return column.slice((self.offset, self.length))
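Slice is a brand-new expression node backed by Column.slice; on the polars side it presumably corresponds to Expr.slice, e.g. (illustrative):

import polars as pl

q = pl.LazyFrame({"a": [10, 20, 30, 40]}).select(pl.col("a").slice(1, 2))
result = q.collect(engine="gpu")  # offset=1, length=2 -> [20, 30]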
cudf_polars/dsl/expressions/sorting.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -15,8 +15,6 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Sort", "SortBy"]
@@ -35,15 +33,11 @@ class Sort(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             [descending], nulls_last=[nulls_last], num_keys=1
@@ -75,17 +69,10 @@ class SortBy(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        column, *by = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        column, *by = (child.evaluate(df, context=context) for child in self.children)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             descending, nulls_last=nulls_last, num_keys=len(by)