cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

This diff compares publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (63)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/callback.py +82 -65
  3. cudf_polars/containers/column.py +138 -7
  4. cudf_polars/containers/dataframe.py +26 -39
  5. cudf_polars/dsl/expr.py +3 -1
  6. cudf_polars/dsl/expressions/aggregation.py +27 -63
  7. cudf_polars/dsl/expressions/base.py +40 -72
  8. cudf_polars/dsl/expressions/binaryop.py +5 -41
  9. cudf_polars/dsl/expressions/boolean.py +25 -53
  10. cudf_polars/dsl/expressions/datetime.py +97 -17
  11. cudf_polars/dsl/expressions/literal.py +27 -33
  12. cudf_polars/dsl/expressions/rolling.py +110 -9
  13. cudf_polars/dsl/expressions/selection.py +8 -26
  14. cudf_polars/dsl/expressions/slicing.py +47 -0
  15. cudf_polars/dsl/expressions/sorting.py +5 -18
  16. cudf_polars/dsl/expressions/string.py +33 -36
  17. cudf_polars/dsl/expressions/ternary.py +3 -10
  18. cudf_polars/dsl/expressions/unary.py +35 -75
  19. cudf_polars/dsl/ir.py +749 -212
  20. cudf_polars/dsl/nodebase.py +8 -1
  21. cudf_polars/dsl/to_ast.py +5 -3
  22. cudf_polars/dsl/translate.py +319 -171
  23. cudf_polars/dsl/utils/__init__.py +8 -0
  24. cudf_polars/dsl/utils/aggregations.py +292 -0
  25. cudf_polars/dsl/utils/groupby.py +97 -0
  26. cudf_polars/dsl/utils/naming.py +34 -0
  27. cudf_polars/dsl/utils/replace.py +46 -0
  28. cudf_polars/dsl/utils/rolling.py +113 -0
  29. cudf_polars/dsl/utils/windows.py +186 -0
  30. cudf_polars/experimental/base.py +17 -19
  31. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  32. cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
  33. cudf_polars/experimental/dask_registers.py +196 -0
  34. cudf_polars/experimental/distinct.py +174 -0
  35. cudf_polars/experimental/explain.py +127 -0
  36. cudf_polars/experimental/expressions.py +521 -0
  37. cudf_polars/experimental/groupby.py +288 -0
  38. cudf_polars/experimental/io.py +58 -29
  39. cudf_polars/experimental/join.py +353 -0
  40. cudf_polars/experimental/parallel.py +166 -93
  41. cudf_polars/experimental/repartition.py +69 -0
  42. cudf_polars/experimental/scheduler.py +155 -0
  43. cudf_polars/experimental/select.py +92 -7
  44. cudf_polars/experimental/shuffle.py +294 -0
  45. cudf_polars/experimental/sort.py +45 -0
  46. cudf_polars/experimental/spilling.py +151 -0
  47. cudf_polars/experimental/utils.py +100 -0
  48. cudf_polars/testing/asserts.py +146 -6
  49. cudf_polars/testing/io.py +72 -0
  50. cudf_polars/testing/plugin.py +78 -76
  51. cudf_polars/typing/__init__.py +59 -6
  52. cudf_polars/utils/config.py +353 -0
  53. cudf_polars/utils/conversion.py +40 -0
  54. cudf_polars/utils/dtypes.py +22 -5
  55. cudf_polars/utils/timer.py +39 -0
  56. cudf_polars/utils/versions.py +5 -4
  57. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
  58. cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
  59. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
  60. cudf_polars/experimental/dask_serialize.py +0 -59
  61. cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
  62. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
  63. {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/expressions/aggregation.py

@@ -9,22 +9,13 @@ from __future__ import annotations
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import (
-    AggInfo,
-    ExecutionContext,
-    Expr,
-)
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
-from cudf_polars.dsl.expressions.unary import UnaryFunction
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Agg"]
@@ -75,11 +66,15 @@ class Agg(Expr):
                 else plc.types.NullPolicy.INCLUDE
             )
         elif name == "quantile":
-            _, quantile = self.children
+            child, quantile = self.children
             if not isinstance(quantile, Literal):
                 raise NotImplementedError("Only support literal quantile values")
+            if options == "equiprobable":
+                raise NotImplementedError("Quantile with equiprobable interpolation")
+            if plc.traits.is_duration(child.dtype):
+                raise NotImplementedError("Quantile with duration data type")
             req = plc.aggregation.quantile(
-                quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options]
+                quantiles=[quantile.value], interp=Agg.interp_mapping[options]
             )
         else:
             raise NotImplementedError(
@@ -91,7 +86,9 @@ class Agg(Expr):
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
-        elif name in {"count", "sum", "first", "last"}:
+        elif name == "count":
+            op = partial(op, include_nulls=options)
+        elif name in {"sum", "first", "last"}:
             pass
         else:
             raise NotImplementedError(
@@ -124,38 +121,19 @@ class Agg(Expr):
         "linear": plc.types.Interpolation.LINEAR,
     }
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError(
-                "Nested aggregations in groupby"
-            )  # pragma: no cover; check_agg trips first
-        if (isminmax := self.name in {"min", "max"}) and self.options:
-            raise NotImplementedError("Nan propagation in groupby for min/max")
-        (child,) = self.children
-        ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
-        request = self.request
-        # These are handled specially here because we don't set up the
-        # request for the whole-frame agg because we can avoid a
-        # reduce for these.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
         if self.name == "first":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 0, null_handling=plc.types.NullPolicy.INCLUDE
             )
         elif self.name == "last":
-            request = plc.aggregation.nth_element(
+            return plc.aggregation.nth_element(
                 -1, null_handling=plc.types.NullPolicy.INCLUDE
             )
-        if request is None:
-            raise NotImplementedError(
-                f"Aggregation {self.name} in groupby"
-            )  # pragma: no cover; __init__ trips first
-        if isminmax and plc.traits.is_floating_point(self.dtype):
-            assert expr is not None
-            # Ignore nans in these groupby aggs, do this by masking
-            # nans in the input
-            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
-        return AggInfo([(expr, request, self)])
+        else:
+            assert self.request is not None, "Init should have raised"
+            return self.request
 
     def _reduce(
         self, column: Column, *, request: plc.aggregation.Aggregation
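The recursive `collect_agg` walk is gone: IR translation (presumably via the new `dsl/utils/aggregations.py`) now decomposes grouped reductions up front, so each expression only reports the single pylibcudf request it needs through the `agg_request` property. A minimal standalone sketch of the dispatch in the hunk above, hedged in that `name` and `request` merely stand in for `Agg.name` and `Agg.request`:

    import pylibcudf as plc

    def agg_request_for(name: str, request: plc.aggregation.Aggregation | None):
        # first/last become positional nth_element requests that keep nulls,
        # matching polars' positional semantics for grouped first/last.
        if name == "first":
            return plc.aggregation.nth_element(
                0, null_handling=plc.types.NullPolicy.INCLUDE
            )
        if name == "last":
            return plc.aggregation.nth_element(
                -1, null_handling=plc.types.NullPolicy.INCLUDE
            )
        assert request is not None, "Init should have raised"
        return request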
@@ -167,26 +145,20 @@ class Agg(Expr):
             )
         )
 
-    def _count(self, column: Column) -> Column:
+    def _count(self, column: Column, *, include_nulls: bool) -> Column:
+        null_count = column.null_count if not include_nulls else 0
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(
-                        column.obj.size() - column.obj.null_count(),
-                        type=plc.interop.to_arrow(self.dtype),
-                    ),
-                ),
+                plc.Scalar.from_py(column.size - null_count, self.dtype),
                 1,
             )
         )
 
     def _sum(self, column: Column) -> Column:
-        if column.obj.size() == 0:
+        if column.size == 0 or column.null_count == column.size:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(0, type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(0, self.dtype),
                     1,
                 )
            )
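This hunk shows the pattern repeated throughout the release: device scalars are built directly from Python values instead of round-tripping through pyarrow, which is why the `import pyarrow as pa` lines disappear. A minimal before/after sketch, assuming the pylibcudf that ships with this release and an int64 result dtype:

    import pylibcudf as plc

    dtype = plc.DataType(plc.TypeId.INT64)

    # Old path (removed above): host value -> pyarrow scalar -> device scalar.
    #   scalar = plc.interop.from_arrow(
    #       pa.scalar(0, type=plc.interop.to_arrow(dtype))
    #   )

    # New path: host value -> device scalar, no pyarrow dependency.
    scalar = plc.Scalar.from_py(0, dtype)
    column = plc.Column.from_scalar(scalar, 1)  # broadcast into a 1-row column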
@@ -196,9 +168,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -210,9 +180,7 @@ class Agg(Expr):
         if propagate_nans and column.nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.interop.from_arrow(
-                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
-                    ),
+                    plc.Scalar.from_py(float("nan"), self.dtype),
                     1,
                 )
             )
@@ -224,15 +192,11 @@ class Agg(Expr):
         return Column(plc.copying.slice(column.obj, [0, 1])[0])
 
     def _last(self, column: Column) -> Column:
-        n = column.obj.size()
+        n = column.size
         return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
@@ -243,4 +207,4 @@ class Agg(Expr):
         # Aggregations like quantiles may have additional children that were
         # preprocessed into pylibcudf requests.
         child = self.children[0]
-        return self.op(child.evaluate(df, context=context, mapping=mapping))
+        return self.op(child.evaluate(df, context=context))
cudf_polars/dsl/expressions/base.py

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -16,7 +16,7 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.nodebase import Node
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
+    from typing_extensions import Self
 
     from cudf_polars.containers import Column, DataFrame
 
@@ -46,11 +46,7 @@ class Expr(Node["Expr"]):
     """Names of non-child data (not Exprs) for reconstruction."""
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -61,15 +57,10 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
-        Do not call this function directly, but rather
-        :meth:`evaluate` which handles the mapping lookups.
+        Do not call this function directly, but rather :meth:`evaluate`.
 
         Returns
         -------
@@ -87,11 +78,7 @@ class Expr(Node["Expr"]):
         )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -102,10 +89,6 @@ class Expr(Node["Expr"]):
             DataFrame that will provide columns.
         context
             What context are we performing this evaluation in?
-        mapping
-            Substitution mapping from expressions to Columns, used to
-            override the evaluation of a given expression if we're
-            performing a simple rewritten evaluation.
 
         Notes
         -----
@@ -124,37 +107,28 @@
         are returned during translation to the IR, but for now we
         are not perfect.
         """
-        if mapping is None:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-        try:
-            return mapping[self]
-        except KeyError:
-            return self.do_evaluate(df, context=context, mapping=mapping)
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """
-        Collect information about aggregations in groupbys.
+        return self.do_evaluate(df, context=context)
 
-        Parameters
-        ----------
-        depth
-            The depth of aggregating (reduction or sampling)
-            expressions we are currently at.
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:
+        """
+        The aggregation for this expression in a grouped aggregation.
 
         Returns
         -------
-        Aggregation info describing the expression to aggregate in the
-        groupby.
+        Aggregation request. Default is to collect the expression.
+
+        Notes
+        -----
+        This presumes that the IR translation has decomposed groupby
+        reductions only into cases we can handle.
 
         Raises
         ------
         NotImplementedError
-            If we can't currently perform the aggregation request, for
-            example nested aggregations like ``a.max().min()``.
+            If requesting an aggregation from an unexpected expression.
         """
-        raise NotImplementedError(
-            f"Collecting aggregation info for {type(self).__name__}"
-        )  # pragma: no cover; check_agg trips first
+        return plc.aggregation.collect_list()
 
 
 class ErrorExpr(Expr):
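With the substitution `mapping` removed, `Expr.evaluate` is a plain recursive descent; the old pre-evaluation escape hatch is handled by IR rewrites instead. A hedged sketch of the resulting calling convention, where `expr` and `df` are assumed to be an already-translated expression and a `DataFrame` container:

    import pylibcudf as plc

    column = expr.evaluate(df)  # context defaults to ExecutionContext.FRAME

    # For a non-reduction expression, the default grouped request is now
    # "collect the values", per the base-class property in the hunk above.
    assert expr.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST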
@@ -166,7 +140,7 @@ class ErrorExpr(Expr):
         self.dtype = dtype
         self.error = error
         self.children = ()
-        self.is_pointwise = True
+        self.is_pointwise = False
 
 
 class NamedExpr:
@@ -202,11 +176,7 @@ class NamedExpr:
         return not self.__eq__(other)
 
     def evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
@@ -217,8 +187,6 @@ class NamedExpr:
             DataFrame providing context
         context
             Execution context
-        mapping
-            Substitution mapping
 
         Returns
         -------
@@ -229,13 +197,25 @@ class NamedExpr:
         :meth:`Expr.evaluate` for details, this function just adds the
         name to a column produced from an expression.
         """
-        return self.value.evaluate(df, context=context, mapping=mapping).rename(
-            self.name
-        )
+        return self.value.evaluate(df, context=context).rename(self.name)
+
+    def reconstruct(self, expr: Expr) -> Self:
+        """
+        Rebuild with a new `Expr` value.
+
+        Parameters
+        ----------
+        expr
+            New `Expr` value
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return self.value.collect_agg(depth=depth)
+        Returns
+        -------
+        New `NamedExpr` with `expr` as the underlying expression.
+        The name of the original `NamedExpr` is preserved.
+        """
+        if expr is self.value:
+            return self
+        return type(self)(self.name, expr)
 
 
 class Col(Expr):
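`reconstruct` gives rewrite passes (plausibly the new groupby decomposition utilities) a way to swap the underlying expression while keeping the output name, and the identity short-circuit avoids allocating new nodes. A hypothetical usage sketch; `named` (a `NamedExpr`) and `new_expr` (an `Expr`) are assumed to exist:

    rewritten = named.reconstruct(new_expr)
    assert rewritten.name == named.name  # the name is preserved
    assert named.reconstruct(named.value) is named  # identity rewrite is a no-op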
@@ -250,21 +230,13 @@
         self.children = ()
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # Deliberately remove the name here so that we guarantee
         # evaluation of the IR produces names.
         return df.column_map[self.name].rename(None)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo([(self, plc.aggregation.collect_list(), self)])
-
 
 class ColRef(Expr):
     __slots__ = ("index", "table_ref")
@@ -288,11 +260,7 @@
         self.children = (column,)
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         raise NotImplementedError(

cudf_polars/dsl/expressions/binaryop.py

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -13,11 +13,9 @@ from polars.polars import _expr_nodes as pl_expr
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["BinOp"]
@@ -85,20 +83,13 @@
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        left, right = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        left, right = (child.evaluate(df, context=context) for child in self.children)
         lop = left.obj
         rop = right.obj
-        if left.obj.size() != right.obj.size():
+        if left.size != right.size:
             if left.is_scalar:
                 lop = left.obj_scalar
             elif right.is_scalar:
@@ -106,30 +97,3 @@
         return Column(
             plc.binaryop.binary_operation(lop, rop, self.op, self.dtype),
         )
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate,
-            # groupby construction has checked that we don't have
-            # nested aggs, so stop the recursion and return ourselves
-            # for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            left_info, right_info = (
-                child.collect_agg(depth=depth) for child in self.children
-            )
-            requests = [*left_info.requests, *right_info.requests]
-            # TODO: Hack, if there were no reductions inside this
-            # binary expression then we want to pre-evaluate and
-            # collect ourselves. Otherwise we want to collect the
-            # aggregations inside and post-evaluate. This is a bad way
-            # of checking that we are in case 1.
-            if all(
-                agg.kind() == plc.aggregation.Kind.COLLECT_LIST
-                for _, agg, _ in requests
-            ):
-                return AggInfo([(self, plc.aggregation.collect_list(), self)])
-            return AggInfo(
-                [*left_info.requests, *right_info.requests],
-            )
cudf_polars/dsl/expressions/boolean.py

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -10,8 +10,6 @@ from enum import IntEnum, auto
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
@@ -19,10 +17,9 @@ from cudf_polars.dsl.expressions.base import (
     ExecutionContext,
     Expr,
 )
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     import polars.type_aliases as pl_types
@@ -89,9 +86,11 @@ class BooleanFunction(Expr):
             BooleanFunction.Name.IsLastDistinct,
             BooleanFunction.Name.IsUnique,
         )
-        if self.name is BooleanFunction.Name.IsIn and not all(
-            c.dtype == self.children[0].dtype for c in self.children
-        ):
+        if (
+            POLARS_VERSION_LT_128
+            and self.name is BooleanFunction.Name.IsIn
+            and not all(c.dtype == self.children[0].dtype for c in self.children)
+        ):  # pragma: no cover
            # TODO: If polars IR doesn't put the casts in, we need to
            # mimic the supertype promotion rules.
            raise NotImplementedError("IsIn doesn't support supertype casting")
@@ -145,11 +144,7 @@
     }
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name in (
@@ -160,29 +155,22 @@
             (child,) = self.children
             is_finite = self.name is BooleanFunction.Name.IsFinite
             if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-                value = plc.interop.from_arrow(
-                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
-                )
+                value = plc.Scalar.from_py(is_finite)
                 return Column(plc.Column.from_scalar(value, df.num_rows))
-            needles = child.evaluate(df, context=context, mapping=mapping)
+            needles = child.evaluate(df, context=context)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite nor infinite
                 to_search.append(float("nan"))
-            haystack = plc.interop.from_arrow(
-                pa.array(
-                    to_search,
-                    type=plc.interop.to_arrow(needles.obj.type()),
-                )
+            haystack = plc.Column.from_iterable_of_py(
+                to_search,
+                dtype=needles.obj.type(),
             )
             result = plc.search.contains(haystack, needles.obj)
             if is_finite:
                 result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
             return Column(result)
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
@@ -191,7 +179,7 @@
             is_any = self.name is BooleanFunction.Name.Any
             agg = plc.aggregation.any() if is_any else plc.aggregation.all()
             result = plc.reduce.reduce(column.obj, agg, self.dtype)
-            if not ignore_nulls and column.obj.null_count() > 0:
+            if not ignore_nulls and column.null_count > 0:
                 # Truth tables
                 #  Any        All
                 #   | F U T    | F U T
@@ -218,14 +206,14 @@
             (column,) = columns
             return Column(
                 plc.unary.is_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsNotNan:
             (column,) = columns
             return Column(
                 plc.unary.is_not_nan(column.obj).with_mask(
-                    column.obj.null_mask(), column.obj.null_count()
+                    column.obj.null_mask(), column.null_count
                 )
             )
         elif self.name is BooleanFunction.Name.IsFirstDistinct:
@@ -233,48 +221,32 @@
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
             )
         elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
            )
        elif self.name is BooleanFunction.Name.IsDuplicated:
            (column,) = columns
            return self._distinct(
                column,
                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(
-                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
-                ),
-                target_value=plc.interop.from_arrow(
-                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
-                ),
+                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype),
+                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype),
            )
        elif self.name is BooleanFunction.Name.AllHorizontal:
            return Column(