cudf-polars-cu12 24.12.0__py3-none-any.whl → 25.2.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (37)
  1. cudf_polars/VERSION +1 -1
  2. cudf_polars/__init__.py +1 -1
  3. cudf_polars/callback.py +28 -3
  4. cudf_polars/containers/__init__.py +1 -1
  5. cudf_polars/dsl/expr.py +16 -16
  6. cudf_polars/dsl/expressions/aggregation.py +21 -4
  7. cudf_polars/dsl/expressions/base.py +7 -2
  8. cudf_polars/dsl/expressions/binaryop.py +1 -0
  9. cudf_polars/dsl/expressions/boolean.py +65 -22
  10. cudf_polars/dsl/expressions/datetime.py +82 -20
  11. cudf_polars/dsl/expressions/literal.py +2 -0
  12. cudf_polars/dsl/expressions/rolling.py +3 -1
  13. cudf_polars/dsl/expressions/selection.py +3 -1
  14. cudf_polars/dsl/expressions/sorting.py +2 -0
  15. cudf_polars/dsl/expressions/string.py +118 -39
  16. cudf_polars/dsl/expressions/ternary.py +1 -0
  17. cudf_polars/dsl/expressions/unary.py +11 -1
  18. cudf_polars/dsl/ir.py +173 -122
  19. cudf_polars/dsl/to_ast.py +4 -6
  20. cudf_polars/dsl/translate.py +53 -21
  21. cudf_polars/dsl/traversal.py +10 -10
  22. cudf_polars/experimental/base.py +43 -0
  23. cudf_polars/experimental/dispatch.py +84 -0
  24. cudf_polars/experimental/io.py +325 -0
  25. cudf_polars/experimental/parallel.py +253 -0
  26. cudf_polars/experimental/select.py +36 -0
  27. cudf_polars/testing/asserts.py +14 -5
  28. cudf_polars/testing/plugin.py +60 -4
  29. cudf_polars/typing/__init__.py +5 -5
  30. cudf_polars/utils/dtypes.py +9 -7
  31. cudf_polars/utils/versions.py +4 -7
  32. {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +6 -6
  33. cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
  34. {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
  35. cudf_polars_cu12-24.12.0.dist-info/RECORD +0 -43
  36. {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
  37. {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
20
20
 
21
21
  from cudf_polars.containers import DataFrame
22
22
 
23
- __all__ = ["Gather", "Filter"]
23
+ __all__ = ["Filter", "Gather"]
24
24
 
25
25
 
26
26
  class Gather(Expr):
@@ -30,6 +30,7 @@ class Gather(Expr):
30
30
  def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
31
31
  self.dtype = dtype
32
32
  self.children = (values, indices)
33
+ self.is_pointwise = False
33
34
 
34
35
  def do_evaluate(
35
36
  self,
@@ -71,6 +72,7 @@ class Filter(Expr):
71
72
  def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
72
73
  self.dtype = dtype
73
74
  self.children = (values, indices)
75
+ self.is_pointwise = True
74
76
 
75
77
  def do_evaluate(
76
78
  self,
@@ -32,6 +32,7 @@ class Sort(Expr):
32
32
  self.dtype = dtype
33
33
  self.options = options
34
34
  self.children = (column,)
35
+ self.is_pointwise = False
35
36
 
36
37
  def do_evaluate(
37
38
  self,
@@ -71,6 +72,7 @@ class SortBy(Expr):
71
72
  self.dtype = dtype
72
73
  self.options = options
73
74
  self.children = (column, *by)
75
+ self.is_pointwise = False
74
76
 
75
77
  def do_evaluate(
76
78
  self,
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
  # TODO: remove need for this
4
4
  # ruff: noqa: D101
@@ -6,13 +6,13 @@
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
+ from enum import IntEnum, auto
9
10
  from typing import TYPE_CHECKING, Any
10
11
 
11
12
  import pyarrow as pa
12
13
  import pyarrow.compute as pc
13
14
 
14
15
  from polars.exceptions import InvalidOperationError
15
- from polars.polars import _expr_nodes as pl_expr
16
16
 
17
17
  import pylibcudf as plc
18
18
 
@@ -23,19 +23,83 @@ from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
23
23
  if TYPE_CHECKING:
24
24
  from collections.abc import Mapping
25
25
 
26
+ from typing_extensions import Self
27
+
28
+ from polars.polars import _expr_nodes as pl_expr
29
+
26
30
  from cudf_polars.containers import DataFrame
27
31
 
28
32
  __all__ = ["StringFunction"]
29
33
 
30
34
 
31
35
  class StringFunction(Expr):
32
- __slots__ = ("name", "options", "_regex_program")
36
+ class Name(IntEnum):
37
+ """Internal and picklable representation of polars' `StringFunction`."""
38
+
39
+ Base64Decode = auto()
40
+ Base64Encode = auto()
41
+ ConcatHorizontal = auto()
42
+ ConcatVertical = auto()
43
+ Contains = auto()
44
+ ContainsAny = auto()
45
+ CountMatches = auto()
46
+ EndsWith = auto()
47
+ EscapeRegex = auto()
48
+ Extract = auto()
49
+ ExtractAll = auto()
50
+ ExtractGroups = auto()
51
+ Find = auto()
52
+ Head = auto()
53
+ HexDecode = auto()
54
+ HexEncode = auto()
55
+ JsonDecode = auto()
56
+ JsonPathMatch = auto()
57
+ LenBytes = auto()
58
+ LenChars = auto()
59
+ Lowercase = auto()
60
+ Normalize = auto()
61
+ PadEnd = auto()
62
+ PadStart = auto()
63
+ Replace = auto()
64
+ ReplaceMany = auto()
65
+ Reverse = auto()
66
+ Slice = auto()
67
+ Split = auto()
68
+ SplitExact = auto()
69
+ SplitN = auto()
70
+ StartsWith = auto()
71
+ StripChars = auto()
72
+ StripCharsEnd = auto()
73
+ StripCharsStart = auto()
74
+ StripPrefix = auto()
75
+ StripSuffix = auto()
76
+ Strptime = auto()
77
+ Tail = auto()
78
+ Titlecase = auto()
79
+ ToDecimal = auto()
80
+ ToInteger = auto()
81
+ Uppercase = auto()
82
+ ZFill = auto()
83
+
84
+ @classmethod
85
+ def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
86
+ """Convert from polars' `StringFunction`."""
87
+ try:
88
+ function, name = str(obj).split(".", maxsplit=1)
89
+ except ValueError:
90
+ # Failed to unpack string
91
+ function = None
92
+ if function != "StringFunction":
93
+ raise ValueError("StringFunction required")
94
+ return getattr(cls, name)
95
+
96
+ __slots__ = ("_regex_program", "name", "options")
33
97
  _non_child = ("dtype", "name", "options")
34
98
 
35
99
  def __init__(
36
100
  self,
37
101
  dtype: plc.DataType,
38
- name: pl_expr.StringFunction,
102
+ name: StringFunction.Name,
39
103
  options: tuple[Any, ...],
40
104
  *children: Expr,
41
105
  ) -> None:
@@ -43,25 +107,27 @@ class StringFunction(Expr):
43
107
  self.options = options
44
108
  self.name = name
45
109
  self.children = children
110
+ self.is_pointwise = True
46
111
  self._validate_input()
47
112
 
48
113
  def _validate_input(self):
49
114
  if self.name not in (
50
- pl_expr.StringFunction.Contains,
51
- pl_expr.StringFunction.EndsWith,
52
- pl_expr.StringFunction.Lowercase,
53
- pl_expr.StringFunction.Replace,
54
- pl_expr.StringFunction.ReplaceMany,
55
- pl_expr.StringFunction.Slice,
56
- pl_expr.StringFunction.Strptime,
57
- pl_expr.StringFunction.StartsWith,
58
- pl_expr.StringFunction.StripChars,
59
- pl_expr.StringFunction.StripCharsStart,
60
- pl_expr.StringFunction.StripCharsEnd,
61
- pl_expr.StringFunction.Uppercase,
115
+ StringFunction.Name.ConcatVertical,
116
+ StringFunction.Name.Contains,
117
+ StringFunction.Name.EndsWith,
118
+ StringFunction.Name.Lowercase,
119
+ StringFunction.Name.Replace,
120
+ StringFunction.Name.ReplaceMany,
121
+ StringFunction.Name.Slice,
122
+ StringFunction.Name.Strptime,
123
+ StringFunction.Name.StartsWith,
124
+ StringFunction.Name.StripChars,
125
+ StringFunction.Name.StripCharsStart,
126
+ StringFunction.Name.StripCharsEnd,
127
+ StringFunction.Name.Uppercase,
62
128
  ):
63
- raise NotImplementedError(f"String function {self.name}")
64
- if self.name == pl_expr.StringFunction.Contains:
129
+ raise NotImplementedError(f"String function {self.name!r}")
130
+ if self.name is StringFunction.Name.Contains:
65
131
  literal, strict = self.options
66
132
  if not literal:
67
133
  if not strict:
@@ -82,7 +148,7 @@ class StringFunction(Expr):
82
148
  raise NotImplementedError(
83
149
  f"Unsupported regex {pattern} for GPU engine."
84
150
  ) from e
85
- elif self.name == pl_expr.StringFunction.Replace:
151
+ elif self.name is StringFunction.Name.Replace:
86
152
  _, literal = self.options
87
153
  if not literal:
88
154
  raise NotImplementedError("literal=False is not supported for replace")
@@ -93,7 +159,7 @@ class StringFunction(Expr):
93
159
  raise NotImplementedError(
94
160
  "libcudf replace does not support empty strings"
95
161
  )
96
- elif self.name == pl_expr.StringFunction.ReplaceMany:
162
+ elif self.name is StringFunction.Name.ReplaceMany:
97
163
  (ascii_case_insensitive,) = self.options
98
164
  if ascii_case_insensitive:
99
165
  raise NotImplementedError(
@@ -109,12 +175,12 @@ class StringFunction(Expr):
109
175
  "libcudf replace_many is implemented differently from polars "
110
176
  "for empty strings"
111
177
  )
112
- elif self.name == pl_expr.StringFunction.Slice:
178
+ elif self.name is StringFunction.Name.Slice:
113
179
  if not all(isinstance(child, Literal) for child in self.children[1:]):
114
180
  raise NotImplementedError(
115
181
  "Slice only supports literal start and stop values"
116
182
  )
117
- elif self.name == pl_expr.StringFunction.Strptime:
183
+ elif self.name is StringFunction.Name.Strptime:
118
184
  format, _, exact, cache = self.options
119
185
  if cache:
120
186
  raise NotImplementedError("Strptime cache is a CPU feature")
@@ -123,9 +189,9 @@ class StringFunction(Expr):
123
189
  if not exact:
124
190
  raise NotImplementedError("Strptime does not support exact=False")
125
191
  elif self.name in {
126
- pl_expr.StringFunction.StripChars,
127
- pl_expr.StringFunction.StripCharsStart,
128
- pl_expr.StringFunction.StripCharsEnd,
192
+ StringFunction.Name.StripChars,
193
+ StringFunction.Name.StripCharsStart,
194
+ StringFunction.Name.StripCharsEnd,
129
195
  }:
130
196
  if not isinstance(self.children[1], Literal):
131
197
  raise NotImplementedError(
@@ -140,7 +206,20 @@ class StringFunction(Expr):
140
206
  mapping: Mapping[Expr, Column] | None = None,
141
207
  ) -> Column:
142
208
  """Evaluate this expression given a dataframe for context."""
143
- if self.name == pl_expr.StringFunction.Contains:
209
+ if self.name is StringFunction.Name.ConcatVertical:
210
+ (child,) = self.children
211
+ column = child.evaluate(df, context=context, mapping=mapping)
212
+ delimiter, ignore_nulls = self.options
213
+ if column.obj.null_count() > 0 and not ignore_nulls:
214
+ return Column(plc.Column.all_null_like(column.obj, 1))
215
+ return Column(
216
+ plc.strings.combine.join_strings(
217
+ column.obj,
218
+ plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
219
+ plc.interop.from_arrow(pa.scalar(None, type=pa.string())),
220
+ )
221
+ )
222
+ elif self.name is StringFunction.Name.Contains:
144
223
  child, arg = self.children
145
224
  column = child.evaluate(df, context=context, mapping=mapping)
146
225
 
@@ -157,7 +236,7 @@ class StringFunction(Expr):
157
236
  return Column(
158
237
  plc.strings.contains.contains_re(column.obj, self._regex_program)
159
238
  )
160
- elif self.name == pl_expr.StringFunction.Slice:
239
+ elif self.name is StringFunction.Name.Slice:
161
240
  child, expr_offset, expr_length = self.children
162
241
  assert isinstance(expr_offset, Literal)
163
242
  assert isinstance(expr_length, Literal)
@@ -188,16 +267,16 @@ class StringFunction(Expr):
188
267
  )
189
268
  )
190
269
  elif self.name in {
191
- pl_expr.StringFunction.StripChars,
192
- pl_expr.StringFunction.StripCharsStart,
193
- pl_expr.StringFunction.StripCharsEnd,
270
+ StringFunction.Name.StripChars,
271
+ StringFunction.Name.StripCharsStart,
272
+ StringFunction.Name.StripCharsEnd,
194
273
  }:
195
274
  column, chars = (
196
275
  c.evaluate(df, context=context, mapping=mapping) for c in self.children
197
276
  )
198
- if self.name == pl_expr.StringFunction.StripCharsStart:
277
+ if self.name is StringFunction.Name.StripCharsStart:
199
278
  side = plc.strings.SideType.LEFT
200
- elif self.name == pl_expr.StringFunction.StripCharsEnd:
279
+ elif self.name is StringFunction.Name.StripCharsEnd:
201
280
  side = plc.strings.SideType.RIGHT
202
281
  else:
203
282
  side = plc.strings.SideType.BOTH
@@ -207,13 +286,13 @@ class StringFunction(Expr):
207
286
  child.evaluate(df, context=context, mapping=mapping)
208
287
  for child in self.children
209
288
  ]
210
- if self.name == pl_expr.StringFunction.Lowercase:
289
+ if self.name is StringFunction.Name.Lowercase:
211
290
  (column,) = columns
212
291
  return Column(plc.strings.case.to_lower(column.obj))
213
- elif self.name == pl_expr.StringFunction.Uppercase:
292
+ elif self.name is StringFunction.Name.Uppercase:
214
293
  (column,) = columns
215
294
  return Column(plc.strings.case.to_upper(column.obj))
216
- elif self.name == pl_expr.StringFunction.EndsWith:
295
+ elif self.name is StringFunction.Name.EndsWith:
217
296
  column, suffix = columns
218
297
  return Column(
219
298
  plc.strings.find.ends_with(
@@ -223,7 +302,7 @@ class StringFunction(Expr):
223
302
  else suffix.obj,
224
303
  )
225
304
  )
226
- elif self.name == pl_expr.StringFunction.StartsWith:
305
+ elif self.name is StringFunction.Name.StartsWith:
227
306
  column, prefix = columns
228
307
  return Column(
229
308
  plc.strings.find.starts_with(
@@ -233,7 +312,7 @@ class StringFunction(Expr):
233
312
  else prefix.obj,
234
313
  )
235
314
  )
236
- elif self.name == pl_expr.StringFunction.Strptime:
315
+ elif self.name is StringFunction.Name.Strptime:
237
316
  # TODO: ignores ambiguous
238
317
  format, strict, exact, cache = self.options
239
318
  col = self.children[0].evaluate(df, context=context, mapping=mapping)
@@ -265,7 +344,7 @@ class StringFunction(Expr):
265
344
  res.columns()[0], self.dtype, format
266
345
  )
267
346
  )
268
- elif self.name == pl_expr.StringFunction.Replace:
347
+ elif self.name is StringFunction.Name.Replace:
269
348
  column, target, repl = columns
270
349
  n, _ = self.options
271
350
  return Column(
@@ -273,7 +352,7 @@ class StringFunction(Expr):
273
352
  column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n
274
353
  )
275
354
  )
276
- elif self.name == pl_expr.StringFunction.ReplaceMany:
355
+ elif self.name is StringFunction.Name.ReplaceMany:
277
356
  column, target, repl = columns
278
357
  return Column(
279
358
  plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj)
@@ -34,6 +34,7 @@ class Ternary(Expr):
34
34
  ) -> None:
35
35
  self.dtype = dtype
36
36
  self.children = (when, then, otherwise)
37
+ self.is_pointwise = True
37
38
 
38
39
  def do_evaluate(
39
40
  self,
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
21
21
 
22
22
  from cudf_polars.containers import DataFrame
23
23
 
24
- __all__ = ["Cast", "UnaryFunction", "Len"]
24
+ __all__ = ["Cast", "Len", "UnaryFunction"]
25
25
 
26
26
 
27
27
  class Cast(Expr):
@@ -33,6 +33,7 @@ class Cast(Expr):
33
33
  def __init__(self, dtype: plc.DataType, value: Expr) -> None:
34
34
  self.dtype = dtype
35
35
  self.children = (value,)
36
+ self.is_pointwise = True
36
37
  if not dtypes.can_cast(value.dtype, self.dtype):
37
38
  raise NotImplementedError(
38
39
  f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
@@ -63,6 +64,7 @@ class Len(Expr):
63
64
  def __init__(self, dtype: plc.DataType) -> None:
64
65
  self.dtype = dtype
65
66
  self.children = ()
67
+ self.is_pointwise = False
66
68
 
67
69
  def do_evaluate(
68
70
  self,
@@ -147,6 +149,14 @@ class UnaryFunction(Expr):
147
149
  self.name = name
148
150
  self.options = options
149
151
  self.children = children
152
+ self.is_pointwise = self.name not in (
153
+ "cum_min",
154
+ "cum_max",
155
+ "cum_prod",
156
+ "cum_sum",
157
+ "drop_nulls",
158
+ "unique",
159
+ )
150
160
 
151
161
  if self.name not in UnaryFunction._supported_fns:
152
162
  raise NotImplementedError(f"Unary function {name=}")