cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/expressions/aggregation.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+from decimal import Decimal
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -16,23 +17,31 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
 
 if TYPE_CHECKING:
+    from rmm.pylibrmm.stream import Stream
+
     from cudf_polars.containers import DataFrame, DataType
 
 __all__ = ["Agg"]
 
 
 class Agg(Expr):
-    __slots__ = ("name", "op", "options", "request")
-    _non_child = ("dtype", "name", "options")
+    __slots__ = ("context", "name", "op", "options", "request")
+    _non_child = ("dtype", "name", "options", "context")
 
     def __init__(
-        self, dtype: DataType, name: str, options: Any, *children: Expr
+        self,
+        dtype: DataType,
+        name: str,
+        options: Any,
+        context: ExecutionContext,
+        *children: Expr,
     ) -> None:
         self.dtype = dtype
         self.name = name
         self.options = options
         self.is_pointwise = False
         self.children = children
+        self.context = context
        if name not in Agg._SUPPORTED:
             raise NotImplementedError(
                 f"Unsupported aggregation {name=}"
@@ -71,7 +80,7 @@ class Agg(Expr):
             raise NotImplementedError("Only support literal quantile values")
         if options == "equiprobable":
             raise NotImplementedError("Quantile with equiprobable interpolation")
-        if plc.traits.is_duration(child.dtype.plc):
+        if plc.traits.is_duration(child.dtype.plc_type):
             raise NotImplementedError("Quantile with duration data type")
         req = plc.aggregation.quantile(
             quantiles=[quantile.value], interp=Agg.interp_mapping[options]
@@ -80,9 +89,19 @@ class Agg(Expr):
             raise NotImplementedError(
                 f"Unreachable, {name=} is incorrectly listed in _SUPPORTED"
             )  # pragma: no cover
+        if (
+            context == ExecutionContext.FRAME
+            and req is not None
+            and not plc.aggregation.is_valid_aggregation(dtype.plc_type, req)
+        ):
+            # TODO: Check which cases polars raises vs returns all-NULL column.
+            # For the all-NULL column cases, we could build it using Column.all_null_like
+            # at evaluation time.
+            raise NotImplementedError(f"Invalid aggregation {req} with dtype {dtype}")
         self.request = req
         op = getattr(self, f"_{name}", None)
         if op is None:
+            assert req is not None  # Ensure req is not None for _reduce
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
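The `getattr(self, f"_{name}", None)` lookup that follows the new validity guard is plain name-based dispatch: a hand-written `_<name>` method wins, and anything else falls back to the generic `_reduce` with the pylibcudf request bound in. A toy model of that pattern (not the real class):

```python
from functools import partial

class MiniAgg:
    """Toy model of Agg's name-based dispatch."""

    def _reduce(self, value, *, request):
        return f"generic reduce({value}) via {request!r}"

    def _sum(self, value):
        return f"specialised sum({value})"

    def op_for(self, name, request):
        # Prefer a hand-written _<name>; otherwise bind the request
        # into the generic reduction, as Agg.__init__ does above.
        op = getattr(self, f"_{name}", None)
        return op if op is not None else partial(self._reduce, request=request)

m = MiniAgg()
print(m.op_for("sum", None)(3))      # specialised sum(3)
print(m.op_for("median", "req")(3))  # generic reduce(3) via 'req'
```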
@@ -136,77 +155,110 @@ class Agg(Expr):
         return self.request
 
     def _reduce(
-        self, column: Column, *, request: plc.aggregation.Aggregation
+        self, column: Column, *, request: plc.aggregation.Aggregation, stream: Stream
     ) -> Column:
+        if (
+            # For sum, this condition can only pass
+            # after expression decomposition in the streaming
+            # engine
+            self.name in {"sum", "mean", "median"}
+            and plc.traits.is_fixed_point(column.dtype.plc_type)
+            and self.dtype.plc_type.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}
+        ):
+            column = column.astype(self.dtype, stream=stream)
         return Column(
             plc.Column.from_scalar(
-                plc.reduce.reduce(column.obj, request, self.dtype.plc),
+                plc.reduce.reduce(
+                    column.obj, request, self.dtype.plc_type, stream=stream
+                ),
                 1,
+                stream=stream,
             ),
             name=column.name,
             dtype=self.dtype,
         )
 
-    def _count(self, column: Column, *, include_nulls: bool) -> Column:
+    def _count(self, column: Column, *, include_nulls: bool, stream: Stream) -> Column:
         null_count = column.null_count if not include_nulls else 0
         return Column(
             plc.Column.from_scalar(
-                plc.Scalar.from_py(column.size - null_count, self.dtype.plc),
+                plc.Scalar.from_py(
+                    column.size - null_count, self.dtype.plc_type, stream=stream
+                ),
                 1,
+                stream=stream,
             ),
             name=column.name,
             dtype=self.dtype,
         )
 
-    def _sum(self, column: Column) -> Column:
+    def _sum(self, column: Column, stream: Stream) -> Column:
         if column.size == 0 or column.null_count == column.size:
+            dtype = self.dtype.plc_type
             return Column(
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(0, self.dtype.plc),
+                    plc.Scalar.from_py(
+                        Decimal(0).scaleb(dtype.scale())
+                        if plc.traits.is_fixed_point(dtype)
+                        else 0,
+                        dtype,
+                        stream=stream,
+                    ),
                     1,
+                    stream=stream,
                 ),
                 name=column.name,
                 dtype=self.dtype,
             )
-        return self._reduce(column, request=plc.aggregation.sum())
+        return self._reduce(column, request=plc.aggregation.sum(), stream=stream)
 
-    def _min(self, column: Column, *, propagate_nans: bool) -> Column:
-        if propagate_nans and column.nan_count > 0:
+    def _min(self, column: Column, *, propagate_nans: bool, stream: Stream) -> Column:
+        nan_count = column.nan_count(stream=stream)
+        if propagate_nans and nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(float("nan"), self.dtype.plc),
+                    plc.Scalar.from_py(
+                        float("nan"), self.dtype.plc_type, stream=stream
+                    ),
                     1,
+                    stream=stream,
                 ),
                 name=column.name,
                 dtype=self.dtype,
             )
-        if column.nan_count > 0:
-            column = column.mask_nans()
-        return self._reduce(column, request=plc.aggregation.min())
+        if nan_count > 0:
+            column = column.mask_nans(stream=stream)
+        return self._reduce(column, request=plc.aggregation.min(), stream=stream)
 
-    def _max(self, column: Column, *, propagate_nans: bool) -> Column:
-        if propagate_nans and column.nan_count > 0:
+    def _max(self, column: Column, *, propagate_nans: bool, stream: Stream) -> Column:
+        nan_count = column.nan_count(stream=stream)
+        if propagate_nans and nan_count > 0:
             return Column(
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(float("nan"), self.dtype.plc),
+                    plc.Scalar.from_py(
+                        float("nan"), self.dtype.plc_type, stream=stream
+                    ),
                     1,
+                    stream=stream,
                 ),
                 name=column.name,
                 dtype=self.dtype,
             )
-        if column.nan_count > 0:
-            column = column.mask_nans()
-        return self._reduce(column, request=plc.aggregation.max())
+        if nan_count > 0:
+            column = column.mask_nans(stream=stream)
+        return self._reduce(column, request=plc.aggregation.max(), stream=stream)
 
-    def _first(self, column: Column) -> Column:
+    def _first(self, column: Column, stream: Stream) -> Column:
         return Column(
-            plc.copying.slice(column.obj, [0, 1])[0], name=column.name, dtype=self.dtype
+            plc.copying.slice(column.obj, [0, 1], stream=stream)[0],
+            name=column.name,
+            dtype=self.dtype,
         )
 
-    def _last(self, column: Column) -> Column:
+    def _last(self, column: Column, stream: Stream) -> Column:
         n = column.size
         return Column(
-            plc.copying.slice(column.obj, [n - 1, n])[0],
+            plc.copying.slice(column.obj, [n - 1, n], stream=stream)[0],
             name=column.name,
             dtype=self.dtype,
         )
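Worth noting in `_sum`: the identity value for an empty or all-null decimal column is no longer a bare integer `0` but a `Decimal` zero carrying the column's scale, since pylibcudf fixed-point types encode fractional digits as a negative scale. A quick CPU-side illustration of what `Decimal(0).scaleb(scale)` produces:

```python
from decimal import Decimal

# pylibcudf fixed-point scales are powers of ten;
# scale() == -2 means two fractional digits.
scale = -2
zero = Decimal(0).scaleb(scale)
print(zero)             # 0.00
print(zero.as_tuple())  # DecimalTuple(sign=0, digits=(0,), exponent=-2)
```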
@@ -223,4 +275,4 @@ class Agg(Expr):
         # Aggregations like quantiles may have additional children that were
         # preprocessed into pylibcudf requests.
         child = self.children[0]
-        return self.op(child.evaluate(df, context=context))
+        return self.op(child.evaluate(df, context=context), stream=df.stream)
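This hunk shows the pattern repeated throughout the release: every pylibcudf call that touches device memory now receives the evaluating DataFrame's CUDA stream explicitly rather than relying on an implicit default. Schematically, using the names from the diff above:

```python
# 25.10.0: device work ran on an implicit default stream.
scalar = plc.reduce.reduce(column.obj, request, self.dtype.plc)

# 26.2.0: the stream is threaded from DataFrame.stream through every call,
# which keeps all work for one evaluation ordered on one stream.
scalar = plc.reduce.reduce(
    column.obj, request, self.dtype.plc_type, stream=df.stream
)
```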
cudf_polars/dsl/expressions/binaryop.py
@@ -8,13 +8,15 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, ClassVar
 
-from polars.polars import _expr_nodes as pl_expr
+from polars import polars  # type: ignore[attr-defined]
 
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 
+pl_expr = polars._expr_nodes
+
 if TYPE_CHECKING:
     from cudf_polars.containers import DataFrame, DataType
 
@@ -33,7 +35,7 @@ class BinOp(Expr):
         right: Expr,
     ) -> None:
         self.dtype = dtype
-        if plc.traits.is_boolean(self.dtype.plc):
+        if plc.traits.is_boolean(self.dtype.plc_type):
             # For boolean output types, bitand and bitor implement
             # boolean logic, so translate. bitxor also does, but the
             # default behaviour is correct.
@@ -42,7 +44,7 @@ class BinOp(Expr):
         self.children = (left, right)
         self.is_pointwise = True
         if not plc.binaryop.is_supported_operation(
-            self.dtype.plc, left.dtype.plc, right.dtype.plc, op
+            self.dtype.plc_type, left.dtype.plc_type, right.dtype.plc_type, op
         ):
             raise NotImplementedError(
                 f"Operation {op.name} not supported "
@@ -59,7 +61,9 @@ class BinOp(Expr):
         plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
     }
 
-    _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
+    _MAPPING: ClassVar[
+        dict[polars._expr_nodes.Operator, plc.binaryop.BinaryOperator]
+    ] = {
         pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
         pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS,
         pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL,
@@ -87,20 +91,25 @@ class BinOp(Expr):
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         left, right = (child.evaluate(df, context=context) for child in self.children)
-        lop = left.obj
-        rop = right.obj
+        lop: plc.Column | plc.Scalar = left.obj
+        rop: plc.Column | plc.Scalar = right.obj
         if left.size != right.size:
             if left.is_scalar:
-                lop = left.obj_scalar
+                lop = left.obj_scalar(stream=df.stream)
             elif right.is_scalar:
-                rop = right.obj_scalar
-        if plc.traits.is_integral_not_bool(self.dtype.plc) and self.op in {
+                rop = right.obj_scalar(stream=df.stream)
+        if plc.traits.is_integral_not_bool(self.dtype.plc_type) and self.op in {
             plc.binaryop.BinaryOperator.FLOOR_DIV,
             plc.binaryop.BinaryOperator.PYMOD,
         }:
-            if right.obj.size() == 1 and right.obj.to_scalar().to_py() == 0:
+            if (
+                right.obj.size() == 1
+                and right.obj.to_scalar(stream=df.stream).to_py(stream=df.stream) == 0
+            ):
                 return Column(
-                    plc.Column.all_null_like(left.obj, left.obj.size()),
+                    plc.Column.all_null_like(
+                        left.obj, left.obj.size(), stream=df.stream
+                    ),
                     dtype=self.dtype,
                 )
 
@@ -108,13 +117,24 @@ class BinOp(Expr):
             rop = plc.replace.find_and_replace_all(
                 right.obj,
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(0, dtype=self.dtype.plc), 1
+                    plc.Scalar.from_py(
+                        0, dtype=self.dtype.plc_type, stream=df.stream
+                    ),
+                    1,
+                    stream=df.stream,
                 ),
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(None, dtype=self.dtype.plc), 1
+                    plc.Scalar.from_py(
+                        None, dtype=self.dtype.plc_type, stream=df.stream
+                    ),
+                    1,
+                    stream=df.stream,
                 ),
+                stream=df.stream,
             )
         return Column(
-            plc.binaryop.binary_operation(lop, rop, self.op, self.dtype.plc),
+            plc.binaryop.binary_operation(
+                lop, rop, self.op, self.dtype.plc_type, stream=df.stream
+            ),
             dtype=self.dtype,
         )
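The `FLOOR_DIV`/`PYMOD` special-casing mirrors Polars, where integer division or modulo by zero produces null instead of raising: a scalar zero divisor short-circuits to an all-null column, and a column divisor has its zeros replaced with nulls before libcudf sees them. The CPU engine behaves the same way (checked against recent Polars; exact output formatting may vary):

```python
import polars as pl

df = pl.DataFrame({"a": [7, 8, 9], "b": [2, 0, 3]})
out = df.select(
    floordiv=pl.col("a") // pl.col("b"),
    mod=pl.col("a") % pl.col("b"),
)
print(out)  # floordiv: [3, null, 3], mod: [1, null, 0]
```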
cudf_polars/dsl/expressions/boolean.py
@@ -8,7 +8,9 @@ from __future__ import annotations
 
 from enum import IntEnum, auto
 from functools import partial, reduce
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+
+import polars as pl
 
 import pylibcudf as plc
 
@@ -22,7 +24,9 @@ if TYPE_CHECKING:
     from typing_extensions import Self
 
     import polars.type_aliases as pl_types
-    from polars.polars import _expr_nodes as pl_expr
+    from polars import polars  # type: ignore[attr-defined]
+
+    from rmm.pylibrmm.stream import Stream
 
     from cudf_polars.containers import DataFrame
 
@@ -53,7 +57,7 @@ class BooleanFunction(Expr):
         Not = auto()
 
     @classmethod
-    def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self:
+    def from_polars(cls, obj: polars._expr_nodes.BooleanFunction) -> Self:
         """Convert from polars' `BooleanFunction`."""
         try:
             function, name = str(obj).split(".", maxsplit=1)
@@ -101,6 +105,7 @@ class BooleanFunction(Expr):
         keep: plc.stream_compaction.DuplicateKeepOption,
         source_value: plc.Scalar,
         target_value: plc.Scalar,
+        stream: Stream,
     ) -> Column:
         table = plc.Table([column.obj])
         indices = plc.stream_compaction.distinct_indices(
@@ -109,12 +114,20 @@ class BooleanFunction(Expr):
             # TODO: polars doesn't expose options for these
             plc.types.NullEquality.EQUAL,
             plc.types.NanEquality.ALL_EQUAL,
+            stream=stream,
         )
         return Column(
             plc.copying.scatter(
                 [source_value],
                 indices,
-                plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
+                plc.Table(
+                    [
+                        plc.Column.from_scalar(
+                            target_value, table.num_rows(), stream=stream
+                        )
+                    ]
+                ),
+                stream=stream,
             ).columns()[0],
             dtype=dtype,
         )
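`_distinct` backs four predicates (IsFirstDistinct, IsLastDistinct, IsUnique, IsDuplicated) with one primitive: `distinct_indices` yields the row positions kept under the given policy, and those positions get `source_value` scattered into a column pre-filled with `target_value`. A pure-Python sketch of the same idea for the KEEP_FIRST case (i.e. `is_first_distinct`):

```python
def is_first_distinct(values):
    # Positions distinct_indices would keep under KEEP_FIRST.
    seen, keep = set(), []
    for i, v in enumerate(values):
        if v not in seen:
            seen.add(v)
            keep.append(i)
    # Scatter: start from target_value (False), write source_value (True).
    out = [False] * len(values)
    for i in keep:
        out[i] = True
    return out

print(is_first_distinct([1, 2, 1, 3, 2]))  # [True, True, False, True, False]
```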
@@ -153,31 +166,36 @@ class BooleanFunction(Expr):
         ):
             # Avoid evaluating the child if the dtype tells us it's unnecessary.
             (child,) = self.children
-            needles = child.evaluate(df, context=context)
-            is_float = needles.obj.type().id() in (
+            values = child.evaluate(df, context=context)
+            is_float = values.obj.type().id() in (
                 plc.TypeId.FLOAT32,
                 plc.TypeId.FLOAT64,
             )
             is_finite = self.name is BooleanFunction.Name.IsFinite
             if not is_float:
                 base = plc.Column.from_scalar(
-                    plc.Scalar.from_py(py_val=is_finite), needles.size
+                    plc.Scalar.from_py(py_val=is_finite, stream=df.stream),
+                    values.size,
+                    stream=df.stream,
                 )
-                out = base.with_mask(needles.obj.null_mask(), needles.null_count)
+                out = base.with_mask(values.obj.null_mask(), values.null_count)
                 return Column(out, dtype=self.dtype)
             to_search = [-float("inf"), float("inf")]
             if is_finite:
                 # NaN is neither finite not infinite
                 to_search.append(float("nan"))
-            haystack = plc.Column.from_iterable_of_py(
+            nonfinite_values = plc.Column.from_iterable_of_py(
                 to_search,
-                dtype=needles.obj.type(),
+                dtype=values.obj.type(),
+                stream=df.stream,
             )
-            result = plc.search.contains(haystack, needles.obj)
+            result = plc.search.contains(nonfinite_values, values.obj, stream=df.stream)
             if is_finite:
-                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+                result = plc.unary.unary_operation(
+                    result, plc.unary.UnaryOperator.NOT, stream=df.stream
+                )
             return Column(
-                result.with_mask(needles.obj.null_mask(), needles.null_count),
+                result.with_mask(values.obj.null_mask(), values.null_count),
                 dtype=self.dtype,
             )
         columns = [child.evaluate(df, context=context) for child in self.children]
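The IsFinite/IsInfinite path avoids per-element classification: it builds the two-or-three element set of non-finite values for the column's dtype and runs a membership test, negating the result for IsFinite. The scalar equivalent of that logic (NaN handled explicitly here because `nan != nan` in Python):

```python
import math

NONFINITE = (float("-inf"), float("inf"))

def is_infinite(x):
    return x in NONFINITE

def is_finite(x):
    # NaN is neither finite nor infinite, so IsFinite must also
    # exclude it, matching the extra float("nan") in to_search.
    return not (x in NONFINITE or math.isnan(x))

print([is_finite(v) for v in (1.0, float("inf"), float("nan"))])
# [True, False, False]
print([is_infinite(v) for v in (1.0, float("inf"), float("nan"))])
# [False, True, False]
```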
@@ -188,7 +206,9 @@ class BooleanFunction(Expr):
             (column,) = columns
             is_any = self.name is BooleanFunction.Name.Any
             agg = plc.aggregation.any() if is_any else plc.aggregation.all()
-            result = plc.reduce.reduce(column.obj, agg, self.dtype.plc)
+            scalar_result = plc.reduce.reduce(
+                column.obj, agg, self.dtype.plc_type, stream=df.stream
+            )
             if not ignore_nulls and column.null_count > 0:
                 # Truth tables
                 #               Any         All
@@ -200,20 +220,28 @@ class BooleanFunction(Expr):
                 #
                 # If the input null count was non-zero, we must
                 # post-process the result to insert the correct value.
-                h_result = result.to_py()
+                h_result = scalar_result.to_py(stream=df.stream)
                 if (is_any and not h_result) or (not is_any and h_result):
                     # Any                    All
                     # False || Null => Null  True && Null => Null
                     return Column(
-                        plc.Column.all_null_like(column.obj, 1), dtype=self.dtype
+                        plc.Column.all_null_like(column.obj, 1, stream=df.stream),
+                        dtype=self.dtype,
                     )
-            return Column(plc.Column.from_scalar(result, 1), dtype=self.dtype)
+            return Column(
+                plc.Column.from_scalar(scalar_result, 1, stream=df.stream),
+                dtype=self.dtype,
+            )
         if self.name is BooleanFunction.Name.IsNull:
             (column,) = columns
-            return Column(plc.unary.is_null(column.obj), dtype=self.dtype)
+            return Column(
+                plc.unary.is_null(column.obj, stream=df.stream), dtype=self.dtype
+            )
         elif self.name is BooleanFunction.Name.IsNotNull:
             (column,) = columns
-            return Column(plc.unary.is_valid(column.obj), dtype=self.dtype)
+            return Column(
+                plc.unary.is_valid(column.obj, stream=df.stream), dtype=self.dtype
+            )
         elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan):
             (column,) = columns
             is_float = column.obj.type().id() in (
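The truth-table handling above reproduces Polars' Kleene logic for `any`/`all` when `ignore_nulls=False`: a null only changes the outcome when the null-free reduction lands on the identity element (False for any, True for all). Observable on the CPU engine with recent Polars:

```python
import polars as pl

s = pl.Series([False, None])
print(s.any(ignore_nulls=True))   # False  (null skipped)
print(s.any(ignore_nulls=False))  # None   (False || null => null)

t = pl.Series([True, None])
print(t.all(ignore_nulls=False))  # None   (True && null => null)
print(t.any(ignore_nulls=False))  # True   (short-circuits on True)
```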
@@ -230,9 +258,11 @@ class BooleanFunction(Expr):
             else:
                 base = plc.Column.from_scalar(
                     plc.Scalar.from_py(
-                        py_val=self.name is not BooleanFunction.Name.IsNan
+                        py_val=self.name is not BooleanFunction.Name.IsNan,
+                        stream=df.stream,
                     ),
                     column.size,
+                    stream=df.stream,
                 )
                 out = base.with_mask(column.obj.null_mask(), column.null_count)
                 return Column(out, dtype=self.dtype)
@@ -242,8 +272,13 @@ class BooleanFunction(Expr):
                 column,
                 dtype=self.dtype,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
-                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+                source_value=plc.Scalar.from_py(
+                    py_val=True, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                target_value=plc.Scalar.from_py(
+                    py_val=False, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                stream=df.stream,
             )
         elif self.name is BooleanFunction.Name.IsLastDistinct:
             (column,) = columns
@@ -251,8 +286,15 @@ class BooleanFunction(Expr):
                 column,
                 dtype=self.dtype,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
-                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+                source_value=plc.Scalar.from_py(
+                    py_val=True, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                target_value=plc.Scalar.from_py(
+                    py_val=False,
+                    dtype=self.dtype.plc_type,
+                    stream=df.stream,
+                ),
+                stream=df.stream,
             )
         elif self.name is BooleanFunction.Name.IsUnique:
             (column,) = columns
@@ -260,8 +302,13 @@ class BooleanFunction(Expr):
                 column,
                 dtype=self.dtype,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
-                target_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
+                source_value=plc.Scalar.from_py(
+                    py_val=True, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                target_value=plc.Scalar.from_py(
+                    py_val=False, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                stream=df.stream,
             )
         elif self.name is BooleanFunction.Name.IsDuplicated:
             (column,) = columns
@@ -269,8 +316,13 @@ class BooleanFunction(Expr):
                 column,
                 dtype=self.dtype,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.Scalar.from_py(py_val=False, dtype=self.dtype.plc),
-                target_value=plc.Scalar.from_py(py_val=True, dtype=self.dtype.plc),
+                source_value=plc.Scalar.from_py(
+                    py_val=False, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                target_value=plc.Scalar.from_py(
+                    py_val=True, dtype=self.dtype.plc_type, stream=df.stream
+                ),
+                stream=df.stream,
             )
         elif self.name is BooleanFunction.Name.AllHorizontal:
             return Column(
@@ -278,7 +330,7 @@ class BooleanFunction(Expr):
                 reduce(
                     partial(
                         plc.binaryop.binary_operation,
                         op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
-                        output_type=self.dtype.plc,
+                        output_type=self.dtype.plc_type,
                     ),
                     (c.obj for c in columns),
@@ -290,7 +342,7 @@ class BooleanFunction(Expr):
                 partial(
                     plc.binaryop.binary_operation,
                     op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
-                    output_type=self.dtype.plc,
+                    output_type=self.dtype.plc_type,
                 ),
                 (c.obj for c in columns),
             ),
@@ -300,24 +352,45 @@ class BooleanFunction(Expr):
             needles, haystack = columns
             if haystack.obj.type().id() == plc.TypeId.LIST:
                 # Unwrap values from the list column
-                # the type: ignore is safe because we know that the type ID is LIST,
-                # which always has an inner attribute.
+                # .inner returns DataTypeClass | DataType, need to cast to DataType
                 haystack = Column(
                     haystack.obj.children()[1],
-                    dtype=DataType(haystack.dtype.polars.inner),  # type: ignore[attr-defined]
-                ).astype(needles.dtype)
+                    dtype=DataType(
+                        cast(
+                            pl.DataType, cast(pl.List, haystack.dtype.polars_type).inner
+                        )
+                    ),
+                ).astype(needles.dtype, stream=df.stream)
             if haystack.size:
                 return Column(
-                    plc.search.contains(haystack.obj, needles.obj), dtype=self.dtype
+                    plc.search.contains(
+                        haystack.obj,
+                        needles.obj,
+                        stream=df.stream,
+                    ),
+                    dtype=self.dtype,
                 )
             return Column(
-                plc.Column.from_scalar(plc.Scalar.from_py(py_val=False), needles.size),
+                plc.Column.from_scalar(
+                    plc.Scalar.from_py(py_val=False, stream=df.stream),
+                    needles.size,
+                    stream=df.stream,
+                ),
                 dtype=self.dtype,
             )
         elif self.name is BooleanFunction.Name.Not:
             (column,) = columns
+            # Polars semantics:
+            # integer input: NOT => bitwise invert.
+            # boolean input: NOT => logical NOT.
             return Column(
-                plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT),
+                plc.unary.unary_operation(
+                    column.obj,
+                    plc.unary.UnaryOperator.NOT
+                    if column.obj.type().id() == plc.TypeId.BOOL8
+                    else plc.unary.UnaryOperator.BIT_INVERT,
+                    stream=df.stream,
+                ),
                 dtype=self.dtype,
             )
         else:
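The final change fixes `Not` for integer inputs to match the Polars semantics spelled out in the new comment: logical NOT only for BOOL8 columns, bitwise invert otherwise, so in two's complement `~1 == -2`. On the CPU engine with recent Polars:

```python
import polars as pl

print(pl.select(~pl.lit(True)).item())              # False (logical NOT)
print(pl.select(~pl.lit(1, dtype=pl.Int8)).item())  # -2    (bitwise invert)
```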