cudf-polars-cu12 24.12.0__py3-none-any.whl → 25.2.1__py3-none-any.whl
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/__init__.py +1 -1
- cudf_polars/callback.py +28 -3
- cudf_polars/containers/__init__.py +1 -1
- cudf_polars/dsl/expr.py +16 -16
- cudf_polars/dsl/expressions/aggregation.py +21 -4
- cudf_polars/dsl/expressions/base.py +7 -2
- cudf_polars/dsl/expressions/binaryop.py +1 -0
- cudf_polars/dsl/expressions/boolean.py +65 -22
- cudf_polars/dsl/expressions/datetime.py +82 -20
- cudf_polars/dsl/expressions/literal.py +2 -0
- cudf_polars/dsl/expressions/rolling.py +3 -1
- cudf_polars/dsl/expressions/selection.py +3 -1
- cudf_polars/dsl/expressions/sorting.py +2 -0
- cudf_polars/dsl/expressions/string.py +118 -39
- cudf_polars/dsl/expressions/ternary.py +1 -0
- cudf_polars/dsl/expressions/unary.py +11 -1
- cudf_polars/dsl/ir.py +173 -122
- cudf_polars/dsl/to_ast.py +4 -6
- cudf_polars/dsl/translate.py +53 -21
- cudf_polars/dsl/traversal.py +10 -10
- cudf_polars/experimental/base.py +43 -0
- cudf_polars/experimental/dispatch.py +84 -0
- cudf_polars/experimental/io.py +325 -0
- cudf_polars/experimental/parallel.py +253 -0
- cudf_polars/experimental/select.py +36 -0
- cudf_polars/testing/asserts.py +14 -5
- cudf_polars/testing/plugin.py +64 -4
- cudf_polars/typing/__init__.py +5 -5
- cudf_polars/utils/dtypes.py +9 -7
- cudf_polars/utils/versions.py +4 -7
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.1.dist-info}/METADATA +6 -6
- cudf_polars_cu12-25.2.1.dist-info/RECORD +48 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.1.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-24.12.0.dist-info/RECORD +0 -43
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.1.dist-info}/LICENSE +0 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.1.dist-info}/top_level.txt +0 -0
|
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
|
|
|
20
20
|
|
|
21
21
|
from cudf_polars.containers import DataFrame
|
|
22
22
|
|
|
23
|
-
__all__ = ["
|
|
23
|
+
__all__ = ["Filter", "Gather"]
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class Gather(Expr):
|
|
@@ -30,6 +30,7 @@ class Gather(Expr):
|
|
|
30
30
|
def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
|
|
31
31
|
self.dtype = dtype
|
|
32
32
|
self.children = (values, indices)
|
|
33
|
+
self.is_pointwise = False
|
|
33
34
|
|
|
34
35
|
def do_evaluate(
|
|
35
36
|
self,
|
|
@@ -71,6 +72,7 @@ class Filter(Expr):
|
|
|
71
72
|
def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
|
|
72
73
|
self.dtype = dtype
|
|
73
74
|
self.children = (values, indices)
|
|
75
|
+
self.is_pointwise = True
|
|
74
76
|
|
|
75
77
|
def do_evaluate(
|
|
76
78
|
self,
|
|
@@ -32,6 +32,7 @@ class Sort(Expr):
|
|
|
32
32
|
self.dtype = dtype
|
|
33
33
|
self.options = options
|
|
34
34
|
self.children = (column,)
|
|
35
|
+
self.is_pointwise = False
|
|
35
36
|
|
|
36
37
|
def do_evaluate(
|
|
37
38
|
self,
|
|
@@ -71,6 +72,7 @@ class SortBy(Expr):
|
|
|
71
72
|
self.dtype = dtype
|
|
72
73
|
self.options = options
|
|
73
74
|
self.children = (column, *by)
|
|
75
|
+
self.is_pointwise = False
|
|
74
76
|
|
|
75
77
|
def do_evaluate(
|
|
76
78
|
self,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
# TODO: remove need for this
|
|
4
4
|
# ruff: noqa: D101
|
|
@@ -6,13 +6,13 @@
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
from enum import IntEnum, auto
|
|
9
10
|
from typing import TYPE_CHECKING, Any
|
|
10
11
|
|
|
11
12
|
import pyarrow as pa
|
|
12
13
|
import pyarrow.compute as pc
|
|
13
14
|
|
|
14
15
|
from polars.exceptions import InvalidOperationError
|
|
15
|
-
from polars.polars import _expr_nodes as pl_expr
|
|
16
16
|
|
|
17
17
|
import pylibcudf as plc
|
|
18
18
|
|
|
@@ -23,19 +23,83 @@ from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
|
|
|
23
23
|
if TYPE_CHECKING:
|
|
24
24
|
from collections.abc import Mapping
|
|
25
25
|
|
|
26
|
+
from typing_extensions import Self
|
|
27
|
+
|
|
28
|
+
from polars.polars import _expr_nodes as pl_expr
|
|
29
|
+
|
|
26
30
|
from cudf_polars.containers import DataFrame
|
|
27
31
|
|
|
28
32
|
__all__ = ["StringFunction"]
|
|
29
33
|
|
|
30
34
|
|
|
31
35
|
class StringFunction(Expr):
|
|
32
|
-
|
|
36
|
+
class Name(IntEnum):
|
|
37
|
+
"""Internal and picklable representation of polars' `StringFunction`."""
|
|
38
|
+
|
|
39
|
+
Base64Decode = auto()
|
|
40
|
+
Base64Encode = auto()
|
|
41
|
+
ConcatHorizontal = auto()
|
|
42
|
+
ConcatVertical = auto()
|
|
43
|
+
Contains = auto()
|
|
44
|
+
ContainsAny = auto()
|
|
45
|
+
CountMatches = auto()
|
|
46
|
+
EndsWith = auto()
|
|
47
|
+
EscapeRegex = auto()
|
|
48
|
+
Extract = auto()
|
|
49
|
+
ExtractAll = auto()
|
|
50
|
+
ExtractGroups = auto()
|
|
51
|
+
Find = auto()
|
|
52
|
+
Head = auto()
|
|
53
|
+
HexDecode = auto()
|
|
54
|
+
HexEncode = auto()
|
|
55
|
+
JsonDecode = auto()
|
|
56
|
+
JsonPathMatch = auto()
|
|
57
|
+
LenBytes = auto()
|
|
58
|
+
LenChars = auto()
|
|
59
|
+
Lowercase = auto()
|
|
60
|
+
Normalize = auto()
|
|
61
|
+
PadEnd = auto()
|
|
62
|
+
PadStart = auto()
|
|
63
|
+
Replace = auto()
|
|
64
|
+
ReplaceMany = auto()
|
|
65
|
+
Reverse = auto()
|
|
66
|
+
Slice = auto()
|
|
67
|
+
Split = auto()
|
|
68
|
+
SplitExact = auto()
|
|
69
|
+
SplitN = auto()
|
|
70
|
+
StartsWith = auto()
|
|
71
|
+
StripChars = auto()
|
|
72
|
+
StripCharsEnd = auto()
|
|
73
|
+
StripCharsStart = auto()
|
|
74
|
+
StripPrefix = auto()
|
|
75
|
+
StripSuffix = auto()
|
|
76
|
+
Strptime = auto()
|
|
77
|
+
Tail = auto()
|
|
78
|
+
Titlecase = auto()
|
|
79
|
+
ToDecimal = auto()
|
|
80
|
+
ToInteger = auto()
|
|
81
|
+
Uppercase = auto()
|
|
82
|
+
ZFill = auto()
|
|
83
|
+
|
|
84
|
+
@classmethod
|
|
85
|
+
def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
|
|
86
|
+
"""Convert from polars' `StringFunction`."""
|
|
87
|
+
try:
|
|
88
|
+
function, name = str(obj).split(".", maxsplit=1)
|
|
89
|
+
except ValueError:
|
|
90
|
+
# Failed to unpack string
|
|
91
|
+
function = None
|
|
92
|
+
if function != "StringFunction":
|
|
93
|
+
raise ValueError("StringFunction required")
|
|
94
|
+
return getattr(cls, name)
|
|
95
|
+
|
|
96
|
+
__slots__ = ("_regex_program", "name", "options")
|
|
33
97
|
_non_child = ("dtype", "name", "options")
|
|
34
98
|
|
|
35
99
|
def __init__(
|
|
36
100
|
self,
|
|
37
101
|
dtype: plc.DataType,
|
|
38
|
-
name:
|
|
102
|
+
name: StringFunction.Name,
|
|
39
103
|
options: tuple[Any, ...],
|
|
40
104
|
*children: Expr,
|
|
41
105
|
) -> None:
|
|
@@ -43,25 +107,27 @@ class StringFunction(Expr):
|
|
|
43
107
|
self.options = options
|
|
44
108
|
self.name = name
|
|
45
109
|
self.children = children
|
|
110
|
+
self.is_pointwise = True
|
|
46
111
|
self._validate_input()
|
|
47
112
|
|
|
48
113
|
def _validate_input(self):
|
|
49
114
|
if self.name not in (
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
115
|
+
StringFunction.Name.ConcatVertical,
|
|
116
|
+
StringFunction.Name.Contains,
|
|
117
|
+
StringFunction.Name.EndsWith,
|
|
118
|
+
StringFunction.Name.Lowercase,
|
|
119
|
+
StringFunction.Name.Replace,
|
|
120
|
+
StringFunction.Name.ReplaceMany,
|
|
121
|
+
StringFunction.Name.Slice,
|
|
122
|
+
StringFunction.Name.Strptime,
|
|
123
|
+
StringFunction.Name.StartsWith,
|
|
124
|
+
StringFunction.Name.StripChars,
|
|
125
|
+
StringFunction.Name.StripCharsStart,
|
|
126
|
+
StringFunction.Name.StripCharsEnd,
|
|
127
|
+
StringFunction.Name.Uppercase,
|
|
62
128
|
):
|
|
63
|
-
raise NotImplementedError(f"String function {self.name}")
|
|
64
|
-
if self.name
|
|
129
|
+
raise NotImplementedError(f"String function {self.name!r}")
|
|
130
|
+
if self.name is StringFunction.Name.Contains:
|
|
65
131
|
literal, strict = self.options
|
|
66
132
|
if not literal:
|
|
67
133
|
if not strict:
|
|
@@ -82,7 +148,7 @@ class StringFunction(Expr):
|
|
|
82
148
|
raise NotImplementedError(
|
|
83
149
|
f"Unsupported regex {pattern} for GPU engine."
|
|
84
150
|
) from e
|
|
85
|
-
elif self.name
|
|
151
|
+
elif self.name is StringFunction.Name.Replace:
|
|
86
152
|
_, literal = self.options
|
|
87
153
|
if not literal:
|
|
88
154
|
raise NotImplementedError("literal=False is not supported for replace")
|
|
@@ -93,7 +159,7 @@ class StringFunction(Expr):
|
|
|
93
159
|
raise NotImplementedError(
|
|
94
160
|
"libcudf replace does not support empty strings"
|
|
95
161
|
)
|
|
96
|
-
elif self.name
|
|
162
|
+
elif self.name is StringFunction.Name.ReplaceMany:
|
|
97
163
|
(ascii_case_insensitive,) = self.options
|
|
98
164
|
if ascii_case_insensitive:
|
|
99
165
|
raise NotImplementedError(
|
|
@@ -109,12 +175,12 @@ class StringFunction(Expr):
|
|
|
109
175
|
"libcudf replace_many is implemented differently from polars "
|
|
110
176
|
"for empty strings"
|
|
111
177
|
)
|
|
112
|
-
elif self.name
|
|
178
|
+
elif self.name is StringFunction.Name.Slice:
|
|
113
179
|
if not all(isinstance(child, Literal) for child in self.children[1:]):
|
|
114
180
|
raise NotImplementedError(
|
|
115
181
|
"Slice only supports literal start and stop values"
|
|
116
182
|
)
|
|
117
|
-
elif self.name
|
|
183
|
+
elif self.name is StringFunction.Name.Strptime:
|
|
118
184
|
format, _, exact, cache = self.options
|
|
119
185
|
if cache:
|
|
120
186
|
raise NotImplementedError("Strptime cache is a CPU feature")
|
|
@@ -123,9 +189,9 @@ class StringFunction(Expr):
|
|
|
123
189
|
if not exact:
|
|
124
190
|
raise NotImplementedError("Strptime does not support exact=False")
|
|
125
191
|
elif self.name in {
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
192
|
+
StringFunction.Name.StripChars,
|
|
193
|
+
StringFunction.Name.StripCharsStart,
|
|
194
|
+
StringFunction.Name.StripCharsEnd,
|
|
129
195
|
}:
|
|
130
196
|
if not isinstance(self.children[1], Literal):
|
|
131
197
|
raise NotImplementedError(
|
|
@@ -140,7 +206,20 @@ class StringFunction(Expr):
|
|
|
140
206
|
mapping: Mapping[Expr, Column] | None = None,
|
|
141
207
|
) -> Column:
|
|
142
208
|
"""Evaluate this expression given a dataframe for context."""
|
|
143
|
-
if self.name
|
|
209
|
+
if self.name is StringFunction.Name.ConcatVertical:
|
|
210
|
+
(child,) = self.children
|
|
211
|
+
column = child.evaluate(df, context=context, mapping=mapping)
|
|
212
|
+
delimiter, ignore_nulls = self.options
|
|
213
|
+
if column.obj.null_count() > 0 and not ignore_nulls:
|
|
214
|
+
return Column(plc.Column.all_null_like(column.obj, 1))
|
|
215
|
+
return Column(
|
|
216
|
+
plc.strings.combine.join_strings(
|
|
217
|
+
column.obj,
|
|
218
|
+
plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
|
|
219
|
+
plc.interop.from_arrow(pa.scalar(None, type=pa.string())),
|
|
220
|
+
)
|
|
221
|
+
)
|
|
222
|
+
elif self.name is StringFunction.Name.Contains:
|
|
144
223
|
child, arg = self.children
|
|
145
224
|
column = child.evaluate(df, context=context, mapping=mapping)
|
|
146
225
|
|
|
@@ -157,7 +236,7 @@ class StringFunction(Expr):
|
|
|
157
236
|
return Column(
|
|
158
237
|
plc.strings.contains.contains_re(column.obj, self._regex_program)
|
|
159
238
|
)
|
|
160
|
-
elif self.name
|
|
239
|
+
elif self.name is StringFunction.Name.Slice:
|
|
161
240
|
child, expr_offset, expr_length = self.children
|
|
162
241
|
assert isinstance(expr_offset, Literal)
|
|
163
242
|
assert isinstance(expr_length, Literal)
|
|
@@ -188,16 +267,16 @@ class StringFunction(Expr):
|
|
|
188
267
|
)
|
|
189
268
|
)
|
|
190
269
|
elif self.name in {
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
270
|
+
StringFunction.Name.StripChars,
|
|
271
|
+
StringFunction.Name.StripCharsStart,
|
|
272
|
+
StringFunction.Name.StripCharsEnd,
|
|
194
273
|
}:
|
|
195
274
|
column, chars = (
|
|
196
275
|
c.evaluate(df, context=context, mapping=mapping) for c in self.children
|
|
197
276
|
)
|
|
198
|
-
if self.name
|
|
277
|
+
if self.name is StringFunction.Name.StripCharsStart:
|
|
199
278
|
side = plc.strings.SideType.LEFT
|
|
200
|
-
elif self.name
|
|
279
|
+
elif self.name is StringFunction.Name.StripCharsEnd:
|
|
201
280
|
side = plc.strings.SideType.RIGHT
|
|
202
281
|
else:
|
|
203
282
|
side = plc.strings.SideType.BOTH
|
|
@@ -207,13 +286,13 @@ class StringFunction(Expr):
|
|
|
207
286
|
child.evaluate(df, context=context, mapping=mapping)
|
|
208
287
|
for child in self.children
|
|
209
288
|
]
|
|
210
|
-
if self.name
|
|
289
|
+
if self.name is StringFunction.Name.Lowercase:
|
|
211
290
|
(column,) = columns
|
|
212
291
|
return Column(plc.strings.case.to_lower(column.obj))
|
|
213
|
-
elif self.name
|
|
292
|
+
elif self.name is StringFunction.Name.Uppercase:
|
|
214
293
|
(column,) = columns
|
|
215
294
|
return Column(plc.strings.case.to_upper(column.obj))
|
|
216
|
-
elif self.name
|
|
295
|
+
elif self.name is StringFunction.Name.EndsWith:
|
|
217
296
|
column, suffix = columns
|
|
218
297
|
return Column(
|
|
219
298
|
plc.strings.find.ends_with(
|
|
@@ -223,7 +302,7 @@ class StringFunction(Expr):
|
|
|
223
302
|
else suffix.obj,
|
|
224
303
|
)
|
|
225
304
|
)
|
|
226
|
-
elif self.name
|
|
305
|
+
elif self.name is StringFunction.Name.StartsWith:
|
|
227
306
|
column, prefix = columns
|
|
228
307
|
return Column(
|
|
229
308
|
plc.strings.find.starts_with(
|
|
@@ -233,7 +312,7 @@ class StringFunction(Expr):
|
|
|
233
312
|
else prefix.obj,
|
|
234
313
|
)
|
|
235
314
|
)
|
|
236
|
-
elif self.name
|
|
315
|
+
elif self.name is StringFunction.Name.Strptime:
|
|
237
316
|
# TODO: ignores ambiguous
|
|
238
317
|
format, strict, exact, cache = self.options
|
|
239
318
|
col = self.children[0].evaluate(df, context=context, mapping=mapping)
|
|
@@ -265,7 +344,7 @@ class StringFunction(Expr):
|
|
|
265
344
|
res.columns()[0], self.dtype, format
|
|
266
345
|
)
|
|
267
346
|
)
|
|
268
|
-
elif self.name
|
|
347
|
+
elif self.name is StringFunction.Name.Replace:
|
|
269
348
|
column, target, repl = columns
|
|
270
349
|
n, _ = self.options
|
|
271
350
|
return Column(
|
|
@@ -273,7 +352,7 @@ class StringFunction(Expr):
|
|
|
273
352
|
column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n
|
|
274
353
|
)
|
|
275
354
|
)
|
|
276
|
-
elif self.name
|
|
355
|
+
elif self.name is StringFunction.Name.ReplaceMany:
|
|
277
356
|
column, target, repl = columns
|
|
278
357
|
return Column(
|
|
279
358
|
plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj)
|
|
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
|
|
|
21
21
|
|
|
22
22
|
from cudf_polars.containers import DataFrame
|
|
23
23
|
|
|
24
|
-
__all__ = ["Cast", "
|
|
24
|
+
__all__ = ["Cast", "Len", "UnaryFunction"]
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class Cast(Expr):
|
|
@@ -33,6 +33,7 @@ class Cast(Expr):
|
|
|
33
33
|
def __init__(self, dtype: plc.DataType, value: Expr) -> None:
|
|
34
34
|
self.dtype = dtype
|
|
35
35
|
self.children = (value,)
|
|
36
|
+
self.is_pointwise = True
|
|
36
37
|
if not dtypes.can_cast(value.dtype, self.dtype):
|
|
37
38
|
raise NotImplementedError(
|
|
38
39
|
f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
|
|
@@ -63,6 +64,7 @@ class Len(Expr):
|
|
|
63
64
|
def __init__(self, dtype: plc.DataType) -> None:
|
|
64
65
|
self.dtype = dtype
|
|
65
66
|
self.children = ()
|
|
67
|
+
self.is_pointwise = False
|
|
66
68
|
|
|
67
69
|
def do_evaluate(
|
|
68
70
|
self,
|
|
@@ -147,6 +149,14 @@ class UnaryFunction(Expr):
|
|
|
147
149
|
self.name = name
|
|
148
150
|
self.options = options
|
|
149
151
|
self.children = children
|
|
152
|
+
self.is_pointwise = self.name not in (
|
|
153
|
+
"cum_min",
|
|
154
|
+
"cum_max",
|
|
155
|
+
"cum_prod",
|
|
156
|
+
"cum_sum",
|
|
157
|
+
"drop_nulls",
|
|
158
|
+
"unique",
|
|
159
|
+
)
|
|
150
160
|
|
|
151
161
|
if self.name not in UnaryFunction._supported_fns:
|
|
152
162
|
raise NotImplementedError(f"Unary function {name=}")
|