cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT CHANGED
@@ -1 +1 @@
- 580975be72b3516c2c18da149b62de557b28fb67
+ 9782a269e689140d2b00b5172a93056bdf19e8c2
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
- 25.12.00
+ 26.02.00
cudf_polars/callback.py CHANGED
@@ -11,6 +11,7 @@ import textwrap
  import time
  import warnings
  from functools import cache, partial
+ from threading import Lock
  from typing import TYPE_CHECKING, Literal, overload

  import nvtx
@@ -162,6 +163,11 @@ def set_memory_resource(
          rmm.mr.set_current_device_resource(previous)


+ # libcudf doesn't support executing on multiple devices from within the same process.
+ SEEN_DEVICE = None
+ SEEN_DEVICE_LOCK = Lock()
+
+
  @contextlib.contextmanager
  def set_device(device: int | None) -> Generator[int, None, None]:
      """
@@ -180,13 +186,28 @@ def set_device(device: int | None) -> Generator[int, None, None]:
      -----
      At exit, the device is restored to whatever was current at entry.
      """
-     previous: int = gpu.getDevice()
-     if device is not None:
-         gpu.setDevice(device)
-     try:
-         yield previous
-     finally:
-         gpu.setDevice(previous)
+     global SEEN_DEVICE  # noqa: PLW0603
+     current: int = gpu.getDevice()
+     to_use = device if device is not None else current
+     with SEEN_DEVICE_LOCK:
+         if (
+             SEEN_DEVICE is not None and to_use != SEEN_DEVICE
+         ):  # pragma: no cover; requires multiple GPUs in CI
+             raise RuntimeError(
+                 "cudf-polars does not support running queries on "
+                 "multiple devices in the same process. "
+                 f"A previous query used device-{SEEN_DEVICE}, "
+                 f"the current query is using device-{to_use}."
+             )
+         SEEN_DEVICE = to_use
+     if to_use != current:
+         gpu.setDevice(to_use)
+         try:
+             yield to_use
+         finally:
+             gpu.setDevice(current)
+     else:
+         yield to_use


  @overload
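In practice this means the first query run in a process pins libcudf to a single device, and any later query that targets a different device fails fast instead of silently misbehaving. A minimal sketch of the guarded behaviour, using the internal helper shown above (exercising the error path requires a machine with at least two GPUs):

    from cudf_polars.callback import set_device

    with set_device(0):
        ...  # first query: device 0 is recorded as the process-wide device

    # A later attempt to target another device in the same process now raises
    # RuntimeError ("cudf-polars does not support running queries on multiple
    # devices in the same process").
    with set_device(1):
        ...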
cudf_polars/containers/column.py CHANGED
@@ -16,7 +16,6 @@ from pylibcudf.strings.convert.convert_integers import (
      is_integer,
      to_integers,
  )
- from pylibcudf.traits import is_floating_point

  from cudf_polars.containers import DataType
  from cudf_polars.containers.datatype import _dtype_from_header, _dtype_to_header
@@ -24,6 +23,8 @@ from cudf_polars.utils import conversion
  from cudf_polars.utils.dtypes import is_order_preserving_cast

  if TYPE_CHECKING:
+     from collections.abc import Callable
+
      from typing_extensions import Self

      from polars import Series as pl_Series
@@ -264,7 +265,7 @@ class Column:
              return True
          return False

-     def astype(self, dtype: DataType, stream: Stream) -> Column:
+     def astype(self, dtype: DataType, stream: Stream, *, strict: bool = True) -> Column:
          """
          Cast the column to the requested dtype.

@@ -275,6 +276,9 @@ class Column:
          stream
              CUDA stream used for device memory operations and kernel launches
              on this Column. The data in ``self.obj`` must be valid on this stream.
+         strict
+             If True, raise an error if the cast is unsupported.
+             If False, return nulls for unsupported casts.

          Returns
          -------
@@ -299,7 +303,8 @@ class Column:
              or self.obj.type().id() == plc.TypeId.STRING
          ):
              return Column(
-                 self._handle_string_cast(plc_dtype, stream=stream), dtype=dtype
+                 self._handle_string_cast(plc_dtype, stream=stream, strict=strict),
+                 dtype=dtype,
              )
          elif plc.traits.is_integral_not_bool(
              self.obj.type()
@@ -340,33 +345,53 @@ class Column:
              return result.sorted_like(self)
          return result

-     def _handle_string_cast(self, dtype: plc.DataType, stream: Stream) -> plc.Column:
+     def _handle_string_cast(
+         self, dtype: plc.DataType, stream: Stream, *, strict: bool
+     ) -> plc.Column:
          if dtype.id() == plc.TypeId.STRING:
-             if is_floating_point(self.obj.type()):
+             if plc.traits.is_floating_point(self.obj.type()):
                  return from_floats(self.obj, stream=stream)
-             else:
+             elif plc.traits.is_integral_not_bool(self.obj.type()):
                  return from_integers(self.obj, stream=stream)
+             else:
+                 raise InvalidOperationError(
+                     f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
+                 )
+
+         type_checker: Callable[[plc.Column, Stream], plc.Column]
+         type_caster: Callable[[plc.Column, plc.DataType, Stream], plc.Column]
+         if plc.traits.is_floating_point(dtype):
+             type_checker = is_float
+             type_caster = to_floats
+         elif plc.traits.is_integral_not_bool(dtype):
+             # is_integer has a second optional int_type: plc.DataType | None = None
+             # argument we do not use
+             # unused-ignore for if RMM is missing
+             type_checker = is_integer  # type: ignore[assignment,unused-ignore]
+             type_caster = to_integers
          else:
-             if is_floating_point(dtype):
-                 floats = is_float(self.obj, stream=stream)
-                 if not plc.reduce.reduce(
-                     floats,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=stream,
-                 ).to_py():
-                     raise InvalidOperationError("Conversion from `str` failed.")
-                 return to_floats(self.obj, dtype)
+             raise InvalidOperationError(
+                 f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
+             )
+
+         castable = type_checker(self.obj, stream=stream)  # type: ignore[call-arg]
+         if not plc.reduce.reduce(
+             castable,
+             plc.aggregation.all(),
+             plc.DataType(plc.TypeId.BOOL8),
+             stream=stream,
+         ).to_py(stream=stream):
+             if strict:
+                 raise InvalidOperationError(
+                     f"Conversion from {self.dtype.id()} to {dtype.id()} failed."
+                 )
              else:
-                 integers = is_integer(self.obj, stream=stream)
-                 if not plc.reduce.reduce(
-                     integers,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=stream,
-                 ).to_py():
-                     raise InvalidOperationError("Conversion from `str` failed.")
-                 return to_integers(self.obj, dtype, stream=stream)
+                 values = self.obj.with_mask(
+                     *plc.transform.bools_to_mask(castable, stream=stream)
+                 )
+         else:
+             values = self.obj
+         return type_caster(values, dtype, stream=stream)

      def copy_metadata(self, from_: pl_Series, /) -> Self:
          """
@@ -487,7 +512,7 @@ class Column:
                  plc.aggregation.sum(),
                  plc.types.SIZE_TYPE,
                  stream=stream,
-             ).to_py()
+             ).to_py(stream=stream)
          else:
              result = 0
          return result
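At the Polars API level, this `strict` plumbing is what backs non-strict casts on the GPU engine: rows that fail validation are masked to null (via `bools_to_mask`/`with_mask`) before conversion instead of aborting the query. A minimal sketch, assuming a CUDA-capable environment with this wheel installed:

    import polars as pl

    lf = pl.LazyFrame({"s": ["1", "2", "not-a-number"]})

    # strict=True (the default) raises on "not-a-number";
    # strict=False masks the unparseable row to null before converting.
    out = lf.with_columns(pl.col("s").cast(pl.Int64, strict=False)).collect(engine="gpu")
    print(out)  # the third row comes back null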
cudf_polars/dsl/expressions/binaryop.py CHANGED
@@ -104,7 +104,7 @@ class BinOp(Expr):
          }:
              if (
                  right.obj.size() == 1
-                 and right.obj.to_scalar(stream=df.stream).to_py() == 0
+                 and right.obj.to_scalar(stream=df.stream).to_py(stream=df.stream) == 0
              ):
                  return Column(
                      plc.Column.all_null_like(
cudf_polars/dsl/expressions/boolean.py CHANGED
@@ -220,7 +220,7 @@ class BooleanFunction(Expr):
          #
          # If the input null count was non-zero, we must
          # post-process the result to insert the correct value.
-         h_result = scalar_result.to_py()
+         h_result = scalar_result.to_py(stream=df.stream)
          if (is_any and not h_result) or (not is_any and h_result):
              # Any                     All
              # False || Null => Null   True && Null => Null
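The comment table encodes Kleene semantics for `any`/`all` over columns containing nulls, which the host-side post-processing preserves. A small sketch of the behaviour being maintained (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"b": [False, None]})
    # Kleene logic: False || Null => Null, so any() over [False, None]
    # is null unless nulls are ignored.
    print(lf.select(pl.col("b").any(ignore_nulls=False)).collect(engine="gpu"))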
cudf_polars/dsl/expressions/selection.py CHANGED
@@ -37,7 +37,7 @@ class Gather(Expr):
          )
          n = values.size
          lo, hi = plc.reduce.minmax(indices.obj, stream=df.stream)
-         if hi.to_py() >= n or lo.to_py() < -n:  # type: ignore[operator]
+         if hi.to_py(stream=df.stream) >= n or lo.to_py(stream=df.stream) < -n:  # type: ignore[operator]
              raise ValueError("gather indices are out of bounds")
          if indices.null_count:
              bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
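The `minmax` reduction is the cheap device-side bounds check behind gathers: only the two extremes are copied to the host, now explicitly on the frame's stream. For instance (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"a": [10, 20, 30]})
    print(lf.select(pl.col("a").gather([0, 2])).collect(engine="gpu"))  # 10, 30
    # An index >= 3 (or < -3) fails the minmax check above:
    # lf.select(pl.col("a").gather([0, 5])).collect(engine="gpu")
    # -> gather indices are out of bounds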
cudf_polars/dsl/expressions/string.py CHANGED
@@ -390,7 +390,7 @@ class StringFunction(Expr):
                  plc.aggregation.all(),
                  plc.DataType(plc.TypeId.BOOL8),
                  stream=df.stream,
-             ).to_py():
+             ).to_py(stream=df.stream):
                  raise InvalidOperationError(
                      "zfill only supports ascii strings with no unicode characters"
                  )
@@ -427,15 +427,12 @@ class StringFunction(Expr):
                  stream=df.stream,
              )

-             if (
-                 POLARS_VERSION_LT_132
-                 and not plc.reduce.reduce(
-                     all_gt_0,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=df.stream,
-                 ).to_py()
-             ):  # pragma: no cover
+             if POLARS_VERSION_LT_132 and not plc.reduce.reduce(
+                 all_gt_0,
+                 plc.aggregation.all(),
+                 plc.DataType(plc.TypeId.BOOL8),
+                 stream=df.stream,
+             ).to_py(stream=df.stream):  # pragma: no cover
                  raise InvalidOperationError("fill conversion failed.")

              return Column(
@@ -887,11 +884,11 @@ class StringFunction(Expr):
                  filtered = table.columns()[0]
                  first_valid_data = plc.copying.get_element(
                      filtered, 0, stream=df.stream
-                 ).to_py()
+                 ).to_py(stream=df.stream)
              else:
                  first_valid_data = plc.copying.get_element(
                      plc_col, 0, stream=df.stream
-                 ).to_py()
+                 ).to_py(stream=df.stream)

              # See https://github.com/rapidsai/cudf/issues/20202 for why we type ignore
              format = _infer_datetime_format(first_valid_data)  # type: ignore[arg-type]
@@ -909,7 +906,7 @@ class StringFunction(Expr):
                  plc.aggregation.all(),
                  plc.DataType(plc.TypeId.BOOL8),
                  stream=df.stream,
-             ).to_py():
+             ).to_py(stream=df.stream):
                  raise InvalidOperationError("conversion from `str` failed.")
          else:
              not_timestamps = plc.unary.unary_operation(
@@ -950,18 +947,24 @@ class StringFunction(Expr):
          elif self.name is StringFunction.Name.PadStart:
              if POLARS_VERSION_LT_132:  # pragma: no cover
                  (column,) = columns
-                 width, char = self.options
+                 width_arg, char = self.options
+                 pad_width = cast(int, width_arg)
              else:
                  (column, width_col) = columns
                  (char,) = self.options
                  # TODO: Maybe accept a string scalar in
                  # cudf::strings::pad to avoid DtoH transfer
-                 # See https://github.com/rapidsai/cudf/issues/20202 for why we type ignore
-                 width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
+                 # See https://github.com/rapidsai/cudf/issues/20202
+                 width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                     stream=df.stream
+                 )
+                 assert width_py is not None
+                 pad_width = int(width_py)
+
              return Column(
                  plc.strings.padding.pad(
                      column.obj,
-                     width,  # type: ignore[arg-type]
+                     pad_width,
                      plc.strings.SideType.LEFT,
                      char,
                      stream=df.stream,
@@ -971,17 +974,23 @@ class StringFunction(Expr):
          elif self.name is StringFunction.Name.PadEnd:
              if POLARS_VERSION_LT_132:  # pragma: no cover
                  (column,) = columns
-                 width, char = self.options
+                 width_arg, char = self.options
+                 pad_width = cast(int, width_arg)
              else:
                  (column, width_col) = columns
                  (char,) = self.options
                  # TODO: Maybe accept a string scalar in
                  # cudf::strings::pad to avoid DtoH transfer
-                 width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
+                 width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                     stream=df.stream
+                 )
+                 assert width_py is not None
+                 pad_width = int(width_py)
+
              return Column(
                  plc.strings.padding.pad(
                      column.obj,
-                     width,  # type: ignore[arg-type]
+                     pad_width,
                      plc.strings.SideType.RIGHT,
                      char,
                      stream=df.stream,
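The PadStart/PadEnd rework only changes how the pad width reaches the host (newer Polars passes it as a column rather than baking it into the options); user-visible behaviour is unchanged, and the width is still brought device-to-host once per call, per the TODO above. A small sketch (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"s": ["7", "42"]})
    # Lowers to cudf::strings::pad with SideType.LEFT and pad_width=5.
    print(lf.select(pl.col("s").str.pad_start(5, "0")).collect(engine="gpu"))
    # -> ["00007", "00042"]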
cudf_polars/dsl/expressions/ternary.py CHANGED
@@ -15,6 +15,7 @@ from cudf_polars.dsl.expressions.base import (
      ExecutionContext,
      Expr,
  )
+ from cudf_polars.dsl.utils.reshape import broadcast

  if TYPE_CHECKING:
      from cudf_polars.containers import DataFrame, DataType
@@ -41,15 +42,38 @@ class Ternary(Expr):
          when, then, otherwise = (
              child.evaluate(df, context=context) for child in self.children
          )
+
+         if when.is_scalar:
+             # For scalar predicates: lowering to copy_if_else would require
+             # materializing an all true/false mask column. Instead, just pick
+             # the correct branch.
+             when_predicate = when.obj_scalar(stream=df.stream).to_py(stream=df.stream)
+             pick, other = (then, otherwise) if when_predicate else (otherwise, then)
+
+             pick_col = (
+                 broadcast(
+                     pick,
+                     target_length=1 if other.is_scalar else other.size,
+                     stream=df.stream,
+                 )[0]
+                 if pick.is_scalar
+                 else pick
+             )
+             return Column(pick_col.obj, dtype=self.dtype)
+
          then_obj = then.obj_scalar(stream=df.stream) if then.is_scalar else then.obj
          otherwise_obj = (
              otherwise.obj_scalar(stream=df.stream)
              if otherwise.is_scalar
              else otherwise.obj
          )
+
          return Column(
              plc.copying.copy_if_else(
-                 then_obj, otherwise_obj, when.obj, stream=df.stream
+                 then_obj,
+                 otherwise_obj,
+                 when.obj,
+                 stream=df.stream,
              ),
              dtype=self.dtype,
          )
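The new fast path triggers whenever the `when` predicate evaluates to a single scalar, for example a comparison against an aggregate: the chosen branch is returned (broadcast if needed) without building an all-true/all-false mask for `copy_if_else`. A sketch with hypothetical data (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2, 3]})
    # The predicate is scalar (sum() > 5), so the whole `then` branch is
    # picked directly instead of going through plc.copying.copy_if_else.
    out = lf.select(
        pl.when(pl.col("a").sum() > 5).then(pl.col("a")).otherwise(-pl.col("a"))
    ).collect(engine="gpu")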
cudf_polars/dsl/expressions/unary.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  """DSL nodes for unary operations."""
@@ -25,11 +25,12 @@ __all__ = ["Cast", "Len", "UnaryFunction"]
  class Cast(Expr):
      """Class representing a cast of an expression."""

-     __slots__ = ()
-     _non_child = ("dtype",)
+     __slots__ = ("strict",)
+     _non_child = ("dtype", "strict")

-     def __init__(self, dtype: DataType, value: Expr) -> None:
+     def __init__(self, dtype: DataType, strict: bool, value: Expr) -> None:  # noqa: FBT001
          self.dtype = dtype
+         self.strict = strict
          self.children = (value,)
          self.is_pointwise = True
          if not dtypes.can_cast(value.dtype.plc_type, self.dtype.plc_type):
@@ -43,7 +44,7 @@ class Cast(Expr):
          """Evaluate this expression given a dataframe for context."""
          (child,) = self.children
          column = child.evaluate(df, context=context)
-         return column.astype(self.dtype, stream=df.stream)
+         return column.astype(self.dtype, stream=df.stream, strict=self.strict)


  class Len(Expr):
@@ -240,7 +241,9 @@ class UnaryFunction(Expr):
              if maintain_order:
                  column = column.sorted_like(values)
              return column
-         elif self.name == "set_sorted":
+         elif self.name == "set_sorted":  # pragma: no cover
+             # TODO: LazyFrame.set_sorted is a proper IR concept (i.e. FunctionIR::Hint)
+             # and is currently not implemented. We should reimplement it as a MapFunction.
              (column,) = (child.evaluate(df, context=context) for child in self.children)
              (asc,) = self.options
              order = (
@@ -253,10 +256,10 @@ class UnaryFunction(Expr):
              # PERF: This invokes four stream synchronisations!
              has_nulls_first = not plc.copying.get_element(
                  column.obj, 0, stream=df.stream
-             ).is_valid()
+             ).is_valid(df.stream)
              has_nulls_last = not plc.copying.get_element(
                  column.obj, n - 1, stream=df.stream
-             ).is_valid()
+             ).is_valid(df.stream)
              if (order == plc.types.Order.DESCENDING and has_nulls_first) or (
                  order == plc.types.Order.ASCENDING and has_nulls_last
              ):