cudf-polars-cu13 25.10.0__py3-none-any.whl → 25.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +32 -8
- cudf_polars/containers/column.py +94 -59
- cudf_polars/containers/dataframe.py +123 -34
- cudf_polars/containers/datatype.py +134 -13
- cudf_polars/dsl/expr.py +0 -2
- cudf_polars/dsl/expressions/aggregation.py +80 -28
- cudf_polars/dsl/expressions/binaryop.py +34 -14
- cudf_polars/dsl/expressions/boolean.py +110 -37
- cudf_polars/dsl/expressions/datetime.py +59 -30
- cudf_polars/dsl/expressions/literal.py +11 -5
- cudf_polars/dsl/expressions/rolling.py +460 -119
- cudf_polars/dsl/expressions/selection.py +9 -8
- cudf_polars/dsl/expressions/slicing.py +1 -1
- cudf_polars/dsl/expressions/string.py +235 -102
- cudf_polars/dsl/expressions/struct.py +19 -7
- cudf_polars/dsl/expressions/ternary.py +9 -3
- cudf_polars/dsl/expressions/unary.py +117 -58
- cudf_polars/dsl/ir.py +923 -290
- cudf_polars/dsl/to_ast.py +30 -13
- cudf_polars/dsl/tracing.py +194 -0
- cudf_polars/dsl/translate.py +294 -97
- cudf_polars/dsl/utils/aggregations.py +34 -26
- cudf_polars/dsl/utils/reshape.py +14 -2
- cudf_polars/dsl/utils/rolling.py +12 -8
- cudf_polars/dsl/utils/windows.py +35 -20
- cudf_polars/experimental/base.py +45 -2
- cudf_polars/experimental/benchmarks/pdsds.py +12 -126
- cudf_polars/experimental/benchmarks/pdsh.py +791 -1
- cudf_polars/experimental/benchmarks/utils.py +515 -39
- cudf_polars/experimental/dask_registers.py +47 -20
- cudf_polars/experimental/dispatch.py +9 -3
- cudf_polars/experimental/explain.py +15 -2
- cudf_polars/experimental/expressions.py +22 -10
- cudf_polars/experimental/groupby.py +23 -4
- cudf_polars/experimental/io.py +93 -83
- cudf_polars/experimental/join.py +39 -22
- cudf_polars/experimental/parallel.py +60 -14
- cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
- cudf_polars/experimental/rapidsmpf/core.py +361 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +150 -0
- cudf_polars/experimental/rapidsmpf/io.py +604 -0
- cudf_polars/experimental/rapidsmpf/join.py +237 -0
- cudf_polars/experimental/rapidsmpf/lower.py +74 -0
- cudf_polars/experimental/rapidsmpf/nodes.py +494 -0
- cudf_polars/experimental/rapidsmpf/repartition.py +151 -0
- cudf_polars/experimental/rapidsmpf/shuffle.py +277 -0
- cudf_polars/experimental/rapidsmpf/union.py +96 -0
- cudf_polars/experimental/rapidsmpf/utils.py +162 -0
- cudf_polars/experimental/repartition.py +9 -2
- cudf_polars/experimental/select.py +177 -14
- cudf_polars/experimental/shuffle.py +28 -8
- cudf_polars/experimental/sort.py +92 -25
- cudf_polars/experimental/statistics.py +24 -5
- cudf_polars/experimental/utils.py +25 -7
- cudf_polars/testing/asserts.py +13 -8
- cudf_polars/testing/io.py +2 -1
- cudf_polars/testing/plugin.py +88 -15
- cudf_polars/typing/__init__.py +86 -32
- cudf_polars/utils/config.py +406 -58
- cudf_polars/utils/cuda_stream.py +70 -0
- cudf_polars/utils/versions.py +3 -2
- cudf_polars_cu13-25.12.0.dist-info/METADATA +182 -0
- cudf_polars_cu13-25.12.0.dist-info/RECORD +104 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
- cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/WHEEL +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-25.12.0.dist-info}/top_level.txt +0 -0
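
Recurring themes in the hunks below: pylibcudf calls gain an explicit `stream=df.stream` argument, `DataType` now exposes separate `plc_type`/`polars_type` accessors, and new `rapidsmpf` and `cuda_stream` modules appear. The user-facing entry point is unchanged; a minimal, standalone sketch of driving cudf-polars through the Polars GPU engine (assuming a CUDA-capable environment with this wheel installed):

```python
import polars as pl

# cudf-polars executes Polars queries via the GPU engine; by default
# Polars warns and falls back to the CPU engine if GPU execution fails.
q = (
    pl.LazyFrame({"key": [1, 2, 2, 3], "val": [0.5, 1.5, 2.5, 3.5]})
    .group_by("key")
    .agg(pl.col("val").sum())
)
print(q.collect(engine="gpu"))
```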
cudf_polars/dsl/expressions/struct.py

@@ -8,7 +8,9 @@ from __future__ import annotations
 
 from enum import IntEnum, auto
 from io import StringIO
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+
+import polars as pl
 
 import pylibcudf as plc
 
@@ -18,7 +20,7 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-    from polars
+    from polars import polars  # type: ignore[attr-defined]
 
     from cudf_polars.containers import DataFrame, DataType
 
@@ -42,7 +44,7 @@ class StructFunction(Expr):
     )  # https://github.com/pola-rs/polars/pull/23022#issuecomment-2933910958
 
     @classmethod
-    def from_polars(cls, obj:
+    def from_polars(cls, obj: polars._expr_nodes.StructFunction) -> Self:
         """Convert from polars' `StructFunction`."""
         try:
            function, name = str(obj).split(".", maxsplit=1)
@@ -87,11 +89,14 @@ class StructFunction(Expr):
         """Evaluate this expression given a dataframe for context."""
         columns = [child.evaluate(df, context=context) for child in self.children]
         (column,) = columns
+        # Type checker doesn't know polars only calls StructFunction with struct types
         if self.name == StructFunction.Name.FieldByName:
             field_index = next(
                 (
                     i
-                    for i, field in enumerate(
+                    for i, field in enumerate(
+                        cast(pl.Struct, self.children[0].dtype.polars_type).fields
+                    )
                     if field.name == self.options[0]
                 ),
                 None,
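
The `cast(pl.Struct, ...)` above exists purely to satisfy the type checker: `.fields` is defined on `pl.Struct`, not on the general `DataType` the node is annotated with. A small standalone illustration of the underlying polars API (not cudf-polars code):

```python
from typing import cast

import polars as pl

dtype: pl.DataType = pl.Struct({"x": pl.Int64, "y": pl.String})
# .fields only exists on pl.Struct, so a statically typed DataType must be
# narrowed first, the same pattern the diff applies to dtype.polars_type.
print([field.name for field in cast(pl.Struct, dtype).fields])  # ['x', 'y']
```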
@@ -109,7 +114,12 @@ class StructFunction(Expr):
             table = plc.Table(column.obj.children())
             metadata = plc.io.TableWithMetadata(
                 table,
-                [
+                [
+                    (field.name, [])
+                    for field in cast(
+                        pl.Struct, self.children[0].dtype.polars_type
+                    ).fields
+                ],
             )
             options = (
                 plc.io.json.JsonWriterOptions.builder(target, table)
@@ -120,9 +130,11 @@ class StructFunction(Expr):
                 .utf8_escaped(val=False)
                 .build()
             )
-            plc.io.json.write_json(options)
+            plc.io.json.write_json(options, stream=df.stream)
             return Column(
-                plc.Column.from_iterable_of_py(
+                plc.Column.from_iterable_of_py(
+                    buff.getvalue().split(), stream=df.stream
+                ),
                 dtype=self.dtype,
             )
         elif self.name in {
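
This branch implements JSON encoding of struct columns: the struct's children are written as a JSON table into a `StringIO`, then split on whitespace back into one document per row, with both the write and the column construction now issued on `df.stream`. The polars-level operation being lowered, as a standalone sketch:

```python
import polars as pl

df = pl.DataFrame({"s": [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]})
# struct.json_encode is the expression this branch evaluates on the GPU.
print(df.select(pl.col("s").struct.json_encode()))
```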
cudf_polars/dsl/expressions/ternary.py

@@ -41,9 +41,15 @@ class Ternary(Expr):
         when, then, otherwise = (
             child.evaluate(df, context=context) for child in self.children
         )
-        then_obj = then.obj_scalar if then.is_scalar else then.obj
-        otherwise_obj =
+        then_obj = then.obj_scalar(stream=df.stream) if then.is_scalar else then.obj
+        otherwise_obj = (
+            otherwise.obj_scalar(stream=df.stream)
+            if otherwise.is_scalar
+            else otherwise.obj
+        )
         return Column(
-            plc.copying.copy_if_else(
+            plc.copying.copy_if_else(
+                then_obj, otherwise_obj, when.obj, stream=df.stream
+            ),
             dtype=self.dtype,
         )
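
`Ternary` lowers polars' `when/then/otherwise` onto a single `plc.copying.copy_if_else`, materialising scalar branches through the now stream-aware `obj_scalar(stream=...)`. The polars expression that reaches this node, as a standalone sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, 4]})
# when/then/otherwise lowers to one copy_if_else over the boolean mask.
print(df.select(pl.when(pl.col("a") > 2).then(pl.col("a")).otherwise(0)))
```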
cudf_polars/dsl/expressions/unary.py

@@ -15,7 +15,6 @@ from cudf_polars.containers import Column
 from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
 from cudf_polars.utils import dtypes
-from cudf_polars.utils.versions import POLARS_VERSION_LT_129
 
 if TYPE_CHECKING:
     from cudf_polars.containers import DataFrame, DataType
@@ -33,7 +32,7 @@ class Cast(Expr):
         self.dtype = dtype
         self.children = (value,)
         self.is_pointwise = True
-        if not dtypes.can_cast(value.dtype.
+        if not dtypes.can_cast(value.dtype.plc_type, self.dtype.plc_type):
             raise NotImplementedError(
                 f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
             )
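
`Cast` rejects unsupported type pairs up front via `dtypes.can_cast` (raising `NotImplementedError`, which triggers CPU fallback) and otherwise defers to the stream-aware `Column.astype` below. The corresponding user-level operation, as a standalone sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})
# A plain dtype cast; pairs rejected by dtypes.can_cast raise
# NotImplementedError inside cudf-polars and fall back to the CPU engine.
print(df.select(pl.col("a").cast(pl.Float64)))
```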
@@ -44,7 +43,7 @@ class Cast(Expr):
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
         column = child.evaluate(df, context=context)
-        return column.astype(self.dtype)
+        return column.astype(self.dtype, stream=df.stream)
 
 
 class Len(Expr):
@@ -61,8 +60,9 @@ class Len(Expr):
         """Evaluate this expression given a dataframe for context."""
         return Column(
             plc.Column.from_scalar(
-                plc.Scalar.from_py(df.num_rows, self.dtype.
+                plc.Scalar.from_py(df.num_rows, self.dtype.plc_type, stream=df.stream),
                 1,
+                stream=df.stream,
             ),
             dtype=self.dtype,
         )
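
`Len` broadcasts the frame's row count into a one-row column, with both the scalar and the column now allocated on `df.stream`. At the polars level this is simply:

```python
import polars as pl

df = pl.DataFrame({"a": [10, 20, 30]})
# pl.len() lowers to Len: a single-row UInt32 column holding num_rows.
print(df.select(pl.len()))
```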
@@ -150,7 +150,7 @@ class UnaryFunction(Expr):
         )
 
         if self.name not in UnaryFunction._supported_fns:
-            raise NotImplementedError(f"Unary function {name=}")
+            raise NotImplementedError(f"Unary function {name=}")  # pragma: no cover
         if self.name in UnaryFunction._supported_cum_aggs:
             (reverse,) = self.options
             if reverse:
@@ -174,26 +174,25 @@ class UnaryFunction(Expr):
         """Evaluate this expression given a dataframe for context."""
         if self.name == "mask_nans":
             (child,) = self.children
-            return child.evaluate(df, context=context).mask_nans()
+            return child.evaluate(df, context=context).mask_nans(stream=df.stream)
         if self.name == "null_count":
             (column,) = (child.evaluate(df, context=context) for child in self.children)
             return Column(
                 plc.Column.from_scalar(
-                    plc.Scalar.from_py(
+                    plc.Scalar.from_py(
+                        column.null_count, self.dtype.plc_type, stream=df.stream
+                    ),
                     1,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
+        arg: plc.Column | plc.Scalar
         if self.name == "round":
-
-
-
-
-            # pragma: no cover
-                (
-                    decimal_places,
-                    round_mode,
-                ) = self.options
+            (
+                decimal_places,
+                round_mode,
+            ) = self.options
             (values,) = (child.evaluate(df, context=context) for child in self.children)
             return Column(
                 plc.round.round(
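
`mask_nans` and `null_count` follow the same pattern as the rest of the release: the result scalar and column construction pick up `stream=df.stream`, and a shared `arg: plc.Column | plc.Scalar` annotation is introduced for the branches below. The user-level form of `null_count`, as a standalone sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1.0, None, 3.0, None]})
# Expr.null_count lowers to a one-row scalar column built from
# plc.Scalar.from_py(column.null_count, ...).
print(df.select(pl.col("a").null_count()))
```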
@@ -204,6 +203,7 @@ class UnaryFunction(Expr):
                         if round_mode == "half_to_even"
                         else plc.round.RoundingMethod.HALF_UP
                     ),
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             ).sorted_like(values)  # pragma: no cover
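
Rounding keeps polars' mode semantics: `"half_to_even"` maps to `RoundingMethod.HALF_EVEN` and everything else to `HALF_UP`, with the kernel now launched on `df.stream`. A standalone sketch of the expression being lowered:

```python
import polars as pl

df = pl.DataFrame({"x": [0.5, 1.5, 2.675]})
# polars' default rounding mode is "half_to_even" (banker's rounding),
# which the diff above maps to plc.round.RoundingMethod.HALF_EVEN.
print(df.select(pl.col("x").round(1)))
```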
@@ -215,30 +215,31 @@ class UnaryFunction(Expr):
             keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
             if values.is_sorted:
                 maintain_order = True
-
+                (compacted,) = plc.stream_compaction.unique(
                     plc.Table([values.obj]),
                     [0],
                     keep,
                     plc.types.NullEquality.EQUAL,
-
+                    stream=df.stream,
+                ).columns()
             else:
                 distinct = (
                     plc.stream_compaction.stable_distinct
                     if maintain_order
                     else plc.stream_compaction.distinct
                 )
-
+                (compacted,) = distinct(
                     plc.Table([values.obj]),
                     [0],
                     keep,
                     plc.types.NullEquality.EQUAL,
                     plc.types.NanEquality.ALL_EQUAL,
-
-
-
+                    stream=df.stream,
+                ).columns()
+            column = Column(compacted, dtype=self.dtype)
             if maintain_order:
-
-                return
+                column = column.sorted_like(values)
+            return column
         elif self.name == "set_sorted":
             (column,) = (child.evaluate(df, context=context) for child in self.children)
             (asc,) = self.options
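
The `unique` branch picks `plc.stream_compaction.unique` for pre-sorted input and `(stable_)distinct` otherwise; the refactor also routes the result through a single `Column` so the sortedness flags can be copied over before returning. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [3, 1, 3, 2, 1]})
# maintain_order=True selects the stable_distinct path in the diff above.
print(df.select(pl.col("a").unique(maintain_order=True)))
```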
@@ -250,9 +251,11 @@ class UnaryFunction(Expr):
             null_order = plc.types.NullOrder.BEFORE
             if column.null_count > 0 and (n := column.size) > 1:
                 # PERF: This invokes four stream synchronisations!
-                has_nulls_first = not plc.copying.get_element(
+                has_nulls_first = not plc.copying.get_element(
+                    column.obj, 0, stream=df.stream
+                ).is_valid()
                 has_nulls_last = not plc.copying.get_element(
-                    column.obj, n - 1
+                    column.obj, n - 1, stream=df.stream
                 ).is_valid()
                 if (order == plc.types.Order.DESCENDING and has_nulls_first) or (
                     order == plc.types.Order.ASCENDING and has_nulls_last
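
`set_sorted` validates the claim by probing the first and last elements (each `get_element` is a stream synchronisation, as the PERF comment notes, now issued on `df.stream`). A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3, None]})
# set_sorted is a promise, not a sort; cudf-polars probes the column ends
# to decide where nulls sit relative to the claimed order.
print(df.select(pl.col("a").set_sorted()))
```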
@@ -280,30 +283,43 @@ class UnaryFunction(Expr):
                 counts_table,
                 [plc.types.Order.DESCENDING],
                 [plc.types.NullOrder.BEFORE],
+                stream=df.stream,
             )
             counts_table = plc.copying.gather(
-                counts_table,
+                counts_table,
+                sort_indices,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                stream=df.stream,
             )
             keys_table = plc.copying.gather(
-                keys_table,
+                keys_table,
+                sort_indices,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                stream=df.stream,
             )
             keys_col = keys_table.columns()[0]
             counts_col = counts_table.columns()[0]
             if normalize:
                 total_counts = plc.reduce.reduce(
-                    counts_col,
+                    counts_col,
+                    plc.aggregation.sum(),
+                    plc.DataType(plc.TypeId.UINT64),
+                    stream=df.stream,
                 )
                 counts_col = plc.binaryop.binary_operation(
                     counts_col,
                     total_counts,
                     plc.binaryop.BinaryOperator.DIV,
                     plc.DataType(plc.TypeId.FLOAT64),
+                    stream=df.stream,
                 )
             elif counts_col.type().id() == plc.TypeId.INT32:
-                counts_col = plc.unary.cast(
+                counts_col = plc.unary.cast(
+                    counts_col, plc.DataType(plc.TypeId.UINT32), stream=df.stream
+                )
 
             plc_column = plc.Column(
-                self.dtype.
+                self.dtype.plc_type,
                 counts_col.size(),
                 None,
                 None,
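
`value_counts` sorts by descending count, gathers keys and counts with the same indices, and divides by a UINT64 total when normalising (plain counts are also widened INT32 to UINT32 to match polars). A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": ["x", "y", "x", "x"]})
# normalize=True exercises the UINT64 sum + FLOAT64 division path above.
print(df.select(pl.col("a").value_counts(sort=True, normalize=True)))
```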
@@ -318,7 +334,7 @@ class UnaryFunction(Expr):
                 return column
             return Column(
                 plc.stream_compaction.drop_nulls(
-                    plc.Table([column.obj]), [0], 1
+                    plc.Table([column.obj]), [0], 1, stream=df.stream
                 ).columns()[0],
                 dtype=self.dtype,
             )
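
`drop_nulls` is a single stream-compaction call, short-circuiting when the column has no nulls. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, None, 3]})
# Lowers to plc.stream_compaction.drop_nulls on a one-column table.
print(df.select(pl.col("a").drop_nulls()))
```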
@@ -328,19 +344,31 @@ class UnaryFunction(Expr):
                 return column
             fill_value = self.children[1]
             if isinstance(fill_value, Literal):
-                arg = plc.Scalar.from_py(
+                arg = plc.Scalar.from_py(
+                    fill_value.value, fill_value.dtype.plc_type, stream=df.stream
+                )
             else:
                 evaluated = fill_value.evaluate(df, context=context)
-                arg =
+                arg = (
+                    evaluated.obj_scalar(stream=df.stream)
+                    if evaluated.is_scalar
+                    else evaluated.obj
+                )
             if isinstance(arg, plc.Scalar) and dtypes.can_cast(
-                column.dtype.
+                column.dtype.plc_type, arg.type()
             ):  # pragma: no cover
                 arg = (
-                    Column(
-
-
+                    Column(
+                        plc.Column.from_scalar(arg, 1, stream=df.stream),
+                        dtype=fill_value.dtype,
+                    )
+                    .astype(column.dtype, stream=df.stream)
+                    .obj.to_scalar(stream=df.stream)
                 )
-            return Column(
+            return Column(
+                plc.replace.replace_nulls(column.obj, arg, stream=df.stream),
+                dtype=self.dtype,
+            )
         elif self.name == "fill_null_with_strategy":
             column = self.children[0].evaluate(df, context=context)
             strategy, limit = self.options
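
`fill_null` with a literal builds the replacement scalar directly (now on `df.stream`); a non-literal fill value is evaluated and, where the types allow, cast to the column's dtype via a one-row round trip before `replace_nulls`. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, None, 3]})
# A literal fill value becomes plc.Scalar.from_py(...) -> replace_nulls.
print(df.select(pl.col("a").fill_null(0)))
```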
@@ -352,6 +380,8 @@ class UnaryFunction(Expr):
                 )
             ):
                 return column
+
+            replacement: plc.replace.ReplacePolicy | plc.Scalar
             if strategy == "forward":
                 replacement = plc.replace.ReplacePolicy.PRECEDING
             elif strategy == "backward":
@@ -360,37 +390,49 @@ class UnaryFunction(Expr):
                 replacement = plc.reduce.reduce(
                     column.obj,
                     plc.aggregation.min(),
-                    column.dtype.
+                    column.dtype.plc_type,
+                    stream=df.stream,
                 )
             elif strategy == "max":
                 replacement = plc.reduce.reduce(
                     column.obj,
                     plc.aggregation.max(),
-                    column.dtype.
+                    column.dtype.plc_type,
+                    stream=df.stream,
                 )
             elif strategy == "mean":
                 replacement = plc.reduce.reduce(
                     column.obj,
                     plc.aggregation.mean(),
                     plc.DataType(plc.TypeId.FLOAT64),
+                    stream=df.stream,
                 )
             elif strategy == "zero":
-                replacement = plc.scalar.Scalar.from_py(
+                replacement = plc.scalar.Scalar.from_py(
+                    0, dtype=column.dtype.plc_type, stream=df.stream
+                )
             elif strategy == "one":
-                replacement = plc.scalar.Scalar.from_py(
+                replacement = plc.scalar.Scalar.from_py(
+                    1, dtype=column.dtype.plc_type, stream=df.stream
+                )
             else:
                 assert_never(strategy)  # pragma: no cover
 
             if strategy == "mean":
                 return Column(
                     plc.replace.replace_nulls(
-                        plc.unary.cast(
+                        plc.unary.cast(
+                            column.obj,
+                            plc.DataType(plc.TypeId.FLOAT64),
+                            stream=df.stream,
+                        ),
                         replacement,
+                        stream=df.stream,
                     ),
                     dtype=self.dtype,
-                ).astype(self.dtype)
+                ).astype(self.dtype, stream=df.stream)
             return Column(
-                plc.replace.replace_nulls(column.obj, replacement),
+                plc.replace.replace_nulls(column.obj, replacement, stream=df.stream),
                 dtype=self.dtype,
             )
         elif self.name == "as_struct":
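
Strategy-based filling maps `forward`/`backward` to `ReplacePolicy` values and `min`/`max`/`mean`/`zero`/`one` to reductions or scalars, with `mean` going through FLOAT64 and casting back. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [None, 2, None, 4]})
# "forward" maps to plc.replace.ReplacePolicy.PRECEDING in the diff above.
print(df.select(pl.col("a").fill_null(strategy="forward")))
```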
@@ -399,7 +441,7 @@ class UnaryFunction(Expr):
             ]
             return Column(
                 plc.Column(
-                    data_type=self.dtype.
+                    data_type=self.dtype.plc_type,
                     size=children[0].size(),
                     data=None,
                     mask=None,
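
`as_struct` assembles the evaluated children as the children of a new struct column of the expression's dtype. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
# pl.struct lowers to the as_struct branch: one struct column whose
# children are the evaluated inputs.
print(df.select(pl.struct("a", "b").alias("s")))
```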
@@ -432,19 +474,24 @@ class UnaryFunction(Expr):
                 plc.types.NullPolicy.EXCLUDE,
                 plc.types.NullOrder.BEFORE if descending else plc.types.NullOrder.AFTER,
                 percentage=False,
+                stream=df.stream,
             )
 
             # Min/Max/Dense/Ordinal -> IDX_DTYPE
             # See https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/rank.rs
             if method_str in {"min", "max", "dense", "ordinal"}:
-                dest = self.dtype.
+                dest = self.dtype.plc_type.id()
                 src = ranked.type().id()
                 if dest == plc.TypeId.UINT32 and src != plc.TypeId.UINT32:
-                    ranked = plc.unary.cast(
+                    ranked = plc.unary.cast(
+                        ranked, plc.DataType(plc.TypeId.UINT32), stream=df.stream
+                    )
                 elif (
                     dest == plc.TypeId.UINT64 and src != plc.TypeId.UINT64
                 ):  # pragma: no cover
-                    ranked = plc.unary.cast(
+                    ranked = plc.unary.cast(
+                        ranked, plc.DataType(plc.TypeId.UINT64), stream=df.stream
+                    )
 
             return Column(ranked, dtype=self.dtype)
         elif self.name == "top_k":
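
`rank` post-casts the libcudf result so min/max/dense/ordinal ranks come back in polars' IDX dtype (UINT32, or UINT64 on big-index builds). A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [10, 30, 20, 30]})
# method="min" takes the UINT32 cast path in the diff above.
print(df.select(pl.col("a").rank(method="min")))
```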
@@ -459,23 +506,26 @@ class UnaryFunction(Expr):
                     plc.types.Order.ASCENDING
                     if reverse
                     else plc.types.Order.DESCENDING,
+                    stream=df.stream,
                 ),
                 dtype=self.dtype,
             )
         elif self.name in self._OP_MAPPING:
             column = self.children[0].evaluate(df, context=context)
-            if column.dtype.
-                arg = plc.unary.cast(column.obj, self.dtype.
+            if column.dtype.plc_type.id() != self.dtype.id():
+                arg = plc.unary.cast(column.obj, self.dtype.plc_type, stream=df.stream)
             else:
                 arg = column.obj
             return Column(
-                plc.unary.unary_operation(
+                plc.unary.unary_operation(
+                    arg, self._OP_MAPPING[self.name], stream=df.stream
+                ),
                 dtype=self.dtype,
             )
         elif self.name in UnaryFunction._supported_cum_aggs:
             column = self.children[0].evaluate(df, context=context)
             plc_col = column.obj
-            col_type = column.dtype.
+            col_type = column.dtype.plc_type
             # cum_sum casts
             # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention
             # Bool -> UInt32
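
Simple elementwise functions go through `_OP_MAPPING` to a single `plc.unary.unary_operation`, pre-casting the input when its type differs from the output dtype. A user-level sketch:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 4, 9]})
# sqrt on an integer column: the input is cast to the Float64 output
# dtype first, then a single unary kernel runs on df.stream.
print(df.select(pl.col("a").sqrt()))
```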
@@ -496,9 +546,16 @@ class UnaryFunction(Expr):
                 and plc.traits.is_integral(col_type)
                 and plc.types.size_of(col_type) <= 4
             ):
-                plc_col = plc.unary.cast(
-
-
+                plc_col = plc.unary.cast(
+                    plc_col, plc.DataType(plc.TypeId.INT64), stream=df.stream
+                )
+            elif (
+                self.name == "cum_sum"
+                and column.dtype.plc_type.id() == plc.TypeId.BOOL8
+            ):
+                plc_col = plc.unary.cast(
+                    plc_col, plc.DataType(plc.TypeId.UINT32), stream=df.stream
+                )
             if self.name == "cum_sum":
                 agg = plc.aggregation.sum()
             elif self.name == "cum_prod":
@@ -509,7 +566,9 @@ class UnaryFunction(Expr):
                 agg = plc.aggregation.max()
 
             return Column(
-                plc.reduce.scan(
+                plc.reduce.scan(
+                    plc_col, agg, plc.reduce.ScanType.INCLUSIVE, stream=df.stream
+                ),
                 dtype=self.dtype,
             )
         raise NotImplementedError(