cudf-polars-cu12 25.4.0-py3-none-any.whl → 25.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +35 -50
- cudf_polars/containers/column.py +38 -0
- cudf_polars/containers/dataframe.py +11 -16
- cudf_polars/dsl/expressions/aggregation.py +25 -61
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +3 -39
- cudf_polars/dsl/expressions/boolean.py +21 -49
- cudf_polars/dsl/expressions/datetime.py +59 -17
- cudf_polars/dsl/expressions/literal.py +24 -24
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +6 -24
- cudf_polars/dsl/expressions/slicing.py +2 -8
- cudf_polars/dsl/expressions/sorting.py +4 -17
- cudf_polars/dsl/expressions/string.py +29 -32
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +32 -73
- cudf_polars/dsl/ir.py +575 -167
- cudf_polars/dsl/nodebase.py +1 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +272 -152
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +0 -8
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +109 -167
- cudf_polars/experimental/io.py +53 -26
- cudf_polars/experimental/join.py +59 -24
- cudf_polars/experimental/parallel.py +155 -133
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +109 -9
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +55 -42
- cudf_polars/typing/__init__.py +27 -5
- cudf_polars/utils/config.py +317 -102
- cudf_polars/utils/dtypes.py +8 -1
- cudf_polars/utils/timer.py +1 -1
- cudf_polars/utils/versions.py +4 -4
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +7 -5
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -73
- cudf_polars_cu12-25.4.0.dist-info/RECORD +0 -55
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu12-25.4.0.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
--- a/cudf_polars/dsl/expressions/sorting.py
+++ b/cudf_polars/dsl/expressions/sorting.py
@@ -15,8 +15,6 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Sort", "SortBy"]
@@ -35,15 +33,11 @@ class Sort(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             [descending], nulls_last=[nulls_last], num_keys=1
@@ -75,17 +69,10 @@ class SortBy(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        column, *by = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        )
+        column, *by = (child.evaluate(df, context=context) for child in self.children)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
             descending, nulls_last=nulls_last, num_keys=len(by)
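Across the expression modules in this release, `Expr.do_evaluate` loses its `mapping` keyword (previously an optional `Mapping[Expr, Column]` of pre-evaluated results), so every call site collapses to the two-argument form. A minimal sketch of the call-site change, where `expr` and `df` stand in for any translated cudf-polars expression and its input `DataFrame`:

    from cudf_polars.dsl.expressions.base import ExecutionContext

    # 25.4.0 additionally accepted an optional mapping of pre-evaluated
    # expressions:
    #   column = expr.evaluate(df, context=ExecutionContext.FRAME, mapping=None)
    # 25.6.0 drops that keyword; evaluation is driven by context alone:
    column = expr.evaluate(df, context=ExecutionContext.FRAME)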
--- a/cudf_polars/dsl/expressions/string.py
+++ b/cudf_polars/dsl/expressions/string.py
@@ -21,8 +21,6 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
@@ -107,10 +105,10 @@ class StringFunction(Expr):
         self.options = options
         self.name = name
         self.children = children
-        self.is_pointwise =
+        self.is_pointwise = self.name != StringFunction.Name.ConcatVertical
         self._validate_input()
 
-    def _validate_input(self):
+    def _validate_input(self) -> None:
         if self.name not in (
             StringFunction.Name.ConcatVertical,
             StringFunction.Name.Contains,
@@ -138,7 +136,7 @@ class StringFunction(Expr):
                 raise NotImplementedError(
                     "Regex contains only supports a scalar pattern"
                 )
-            pattern = self.children[1].value.as_py()
+            pattern = self.children[1].value
            try:
                 self._regex_program = plc.strings.regex_program.RegexProgram.create(
                     pattern,
@@ -155,7 +153,9 @@ class StringFunction(Expr):
             if not all(isinstance(expr, Literal) for expr in self.children[1:]):
                 raise NotImplementedError("replace only supports scalar target")
             target = self.children[1]
-            if target.value == pa.scalar("", type=pa.string()):
+            # Above, we raise NotImplementedError if the target is not a Literal,
+            # so we can safely access .value here.
+            if target.value == "":  # type: ignore[attr-defined]
                 raise NotImplementedError(
                     "libcudf replace does not support empty strings"
                 )
@@ -170,7 +170,14 @@ class StringFunction(Expr):
             ):
                 raise NotImplementedError("replace_many only supports literal inputs")
             target = self.children[1]
-            if pc.any(pc.equal(target.value, "")).as_py():
+            # Above, we raise NotImplementedError if the target is not a Literal,
+            # so we can safely access .value here.
+            if (isinstance(target, Literal) and target.value == "") or (
+                isinstance(target, LiteralColumn)
+                and pc.any(
+                    pc.equal(target.value.cast(pa.string()), "")  # type: ignore[attr-defined]
+                ).as_py()
+            ):
                 raise NotImplementedError(
                     "libcudf replace_many is implemented differently from polars "
                     "for empty strings"
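The new guard rejects empty replacement targets for both scalar `Literal` and columnar `LiteralColumn` inputs. A standalone sketch of the pyarrow side of that test (pyarrow only; the array contents are illustrative):

    import pyarrow as pa
    import pyarrow.compute as pc

    targets = pa.array(["foo", "", "bar"])
    # True if any target is the empty string, which the GPU engine must
    # reject because libcudf's replace_many semantics differ from polars
    # for empty strings.
    has_empty = pc.any(pc.equal(targets.cast(pa.string()), "")).as_py()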
@@ -199,33 +206,29 @@ class StringFunction(Expr):
         )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name is StringFunction.Name.ConcatVertical:
             (child,) = self.children
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
             delimiter, ignore_nulls = self.options
             if column.null_count > 0 and not ignore_nulls:
                 return Column(plc.Column.all_null_like(column.obj, 1))
             return Column(
                 plc.strings.combine.join_strings(
                     column.obj,
-                    plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
-                    plc.interop.from_arrow(pa.scalar(None, type=pa.string())),
+                    plc.Scalar.from_py(delimiter, plc.DataType(plc.TypeId.STRING)),
+                    plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
                 )
             )
         elif self.name is StringFunction.Name.Contains:
             child, arg = self.children
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
 
             literal, _ = self.options
             if literal:
-                pat = arg.evaluate(df, context=context, mapping=mapping)
+                pat = arg.evaluate(df, context=context)
                 pattern = (
                     pat.obj_scalar
                     if pat.is_scalar and pat.size != column.size
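A recurring substitution in this hunk (and in several below) replaces the pyarrow round-trip with direct `pylibcudf.Scalar` construction. A minimal sketch of the two styles side by side (the delimiter value is illustrative):

    import pyarrow as pa
    import pylibcudf as plc

    # 25.4.0 style: build a host-side pyarrow scalar, then convert it.
    old = plc.interop.from_arrow(pa.scalar("-", type=pa.string()))

    # 25.6.0 style: construct the device scalar directly from a Python value.
    new = plc.Scalar.from_py("-", plc.DataType(plc.TypeId.STRING))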
@@ -241,15 +244,15 @@ class StringFunction(Expr):
             assert isinstance(expr_offset, Literal)
             assert isinstance(expr_length, Literal)
 
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
             # libcudf slices via [start,stop).
             # polars slices with offset + length where start == offset
             # stop = start + length. Negative values for start look backward
             # from the last element of the string. If the end index would be
             # below zero, an empty string is returned.
             # Do this maths on the host
-            start = expr_offset.value.as_py()
-            length = expr_length.value.as_py()
+            start = expr_offset.value
+            length = expr_length.value
 
             if length == 0:
                 stop = start
@@ -262,8 +265,8 @@ class StringFunction(Expr):
             return Column(
                 plc.strings.slice.slice_strings(
                     column.obj,
-                    plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
-                    plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
+                    plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
+                    plc.Scalar.from_py(stop, plc.DataType(plc.TypeId.INT32)),
                 )
             )
         elif self.name in {
@@ -271,9 +274,7 @@ class StringFunction(Expr):
             StringFunction.Name.StripCharsStart,
             StringFunction.Name.StripCharsEnd,
         }:
-            column, chars = (
-                c.evaluate(df, context=context, mapping=mapping) for c in self.children
-            )
+            column, chars = (c.evaluate(df, context=context) for c in self.children)
             if self.name is StringFunction.Name.StripCharsStart:
                 side = plc.strings.SideType.LEFT
             elif self.name is StringFunction.Name.StripCharsEnd:
@@ -282,10 +283,7 @@ class StringFunction(Expr):
             side = plc.strings.SideType.BOTH
             return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar))
 
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         if self.name is StringFunction.Name.Lowercase:
             (column,) = columns
             return Column(plc.strings.case.to_lower(column.obj))
@@ -315,7 +313,7 @@ class StringFunction(Expr):
         elif self.name is StringFunction.Name.Strptime:
             # TODO: ignores ambiguous
             format, strict, exact, cache = self.options
-            col = self.children[0].evaluate(df, context=context, mapping=mapping)
+            col = self.children[0].evaluate(df, context=context)
 
             is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
                 col.obj, format
@@ -334,8 +332,7 @@ class StringFunction(Expr):
             not_timestamps = plc.unary.unary_operation(
                 is_timestamps, plc.unary.UnaryOperator.NOT
             )
-
-            null = plc.interop.from_arrow(pa.scalar(None, type=pa.string()))
+            null = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
             res = plc.copying.boolean_mask_scatter(
                 [null], plc.Table([col.obj]), not_timestamps
             )
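Strptime now builds its sentinel null with `Scalar.from_py` as well; the scatter step it feeds is unchanged. A sketch of masking selected rows to null with `boolean_mask_scatter` (toy inputs; shapes assumed):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["2024-01-01", "oops", "2024-01-02"]))
    not_timestamps = plc.interop.from_arrow(pa.array([False, True, False]))
    null = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
    # Rows where the boolean mask is True receive the null scalar.
    res = plc.copying.boolean_mask_scatter([null], plc.Table([col]), not_timestamps)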
--- a/cudf_polars/dsl/expressions/ternary.py
+++ b/cudf_polars/dsl/expressions/ternary.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -17,8 +17,6 @@ from cudf_polars.dsl.expressions.base import (
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 
@@ -37,16 +35,11 @@ class Ternary(Expr):
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         when, then, otherwise = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         then_obj = then.obj_scalar if then.is_scalar else then.obj
         otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj
--- a/cudf_polars/dsl/expressions/unary.py
+++ b/cudf_polars/dsl/expressions/unary.py
@@ -7,18 +7,15 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
 from cudf_polars.utils import dtypes
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Cast", "Len", "UnaryFunction"]
@@ -40,23 +37,13 @@ class Cast(Expr):
             )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         return column.astype(self.dtype)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        # TODO: Could do with sort-based groupby and segmented filter
-        (child,) = self.children
-        return child.collect_agg(depth=depth)
-
 
 class Len(Expr):
     """Class representing the length of an expression."""
@@ -67,28 +54,19 @@ class Len(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
-                ),
+                plc.Scalar.from_py(df.num_rows, self.dtype),
                 1,
             )
         )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-
-        return AggInfo(
-            [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
-        )
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
+        return plc.aggregation.count(plc.types.NullPolicy.INCLUDE)
 
 
 class UnaryFunction(Expr):
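`Len` also moves from the removed `collect_agg` protocol to a simple `agg_request` property returning a pylibcudf aggregation. The broadcast it performs with the new scalar API can be sketched standalone (the dtype is an assumed stand-in; polars `len()` yields an unsigned integer):

    import pylibcudf as plc

    num_rows = 5  # stand-in for df.num_rows
    dtype = plc.DataType(plc.TypeId.UINT32)  # assumed dtype for illustration
    # Broadcast the host row count into a single-row device column.
    col = plc.Column.from_scalar(plc.Scalar.from_py(num_rows, dtype), 1)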
@@ -169,22 +147,15 @@ class UnaryFunction(Expr):
         )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name == "mask_nans":
             (child,) = self.children
-            return child.evaluate(df, context=context, mapping=mapping).mask_nans()
+            return child.evaluate(df, context=context).mask_nans()
         if self.name == "round":
             (decimal_places,) = self.options
-            (values,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (values,) = (child.evaluate(df, context=context) for child in self.children)
             return Column(
                 plc.round.round(
                     values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP
@@ -192,10 +163,7 @@ class UnaryFunction(Expr):
             ).sorted_like(values)
         elif self.name == "unique":
             (maintain_order,) = self.options
-            (values,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (values,) = (child.evaluate(df, context=context) for child in self.children)
             # Only one column, so keep_any is the same as keep_first
             # for stable distinct
             keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
@@ -225,10 +193,7 @@ class UnaryFunction(Expr):
                 return Column(column).sorted_like(values)
             return Column(column)
         elif self.name == "set_sorted":
-            (column,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (column,) = (child.evaluate(df, context=context) for child in self.children)
             (asc,) = self.options
             order = (
                 plc.types.Order.ASCENDING
@@ -252,34 +217,41 @@ class UnaryFunction(Expr):
                 null_order=null_order,
             )
         elif self.name == "drop_nulls":
-            (column,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (column,) = (child.evaluate(df, context=context) for child in self.children)
+            if column.null_count == 0:
+                return column
             return Column(
                 plc.stream_compaction.drop_nulls(
                     plc.Table([column.obj]), [0], 1
                 ).columns()[0]
             )
         elif self.name == "fill_null":
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
+            if column.null_count == 0:
+                return column
             if isinstance(self.children[1], Literal):
-                arg = plc.interop.from_arrow(self.children[1].value)
+                arg = plc.Scalar.from_py(self.children[1].value, self.children[1].dtype)
             else:
-                evaluated = self.children[1].evaluate(
-                    df, context=context, mapping=mapping
-                )
+                evaluated = self.children[1].evaluate(df, context=context)
                 arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj
+            if (
+                not POLARS_VERSION_LT_128
+                and isinstance(arg, plc.Scalar)
+                and dtypes.can_cast(column.obj.type(), arg.type())
+            ):  # pragma: no cover
+                arg = plc.unary.cast(
+                    plc.Column.from_scalar(arg, 1), column.obj.type()
+                ).to_scalar()
             return Column(plc.replace.replace_nulls(column.obj, arg))
         elif self.name in self._OP_MAPPING:
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
             if column.obj.type().id() != self.dtype.id():
                 arg = plc.unary.cast(column.obj, self.dtype)
             else:
                 arg = column.obj
             return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name]))
         elif self.name in UnaryFunction._supported_cum_aggs:
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
             plc_col = column.obj
             col_type = column.obj.type()
             # cum_sum casts
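The new `fill_null` branch casts a mismatched replacement scalar to the column's type by round-tripping through a one-row column, since `plc.unary.cast` operates on columns rather than bare scalars. A standalone sketch of that trick (types chosen for illustration):

    import pylibcudf as plc

    scalar = plc.Scalar.from_py(42, plc.DataType(plc.TypeId.INT64))
    target = plc.DataType(plc.TypeId.INT32)
    # Broadcast to one row, cast the column, then pull the scalar back out.
    cast_scalar = plc.unary.cast(plc.Column.from_scalar(scalar, 1), target).to_scalar()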
@@ -325,16 +297,3 @@ class UnaryFunction(Expr):
         raise NotImplementedError(
             f"Unimplemented unary function {self.name=}"
         )  # pragma: no cover; init trips first
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs:
-            raise NotImplementedError(f"{self.name} in groupby")
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate, groupby
-            # construction has checked that we don't have nested aggs,
-            # so stop the recursion and return ourselves for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            (child,) = self.children
-            return child.collect_agg(depth=depth)