cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/GIT_COMMIT CHANGED
@@ -1 +1 @@
- 580975be72b3516c2c18da149b62de557b28fb67
+ 9782a269e689140d2b00b5172a93056bdf19e8c2
cudf_polars/VERSION CHANGED
@@ -1 +1 @@
- 25.12.00
+ 26.02.00
cudf_polars/callback.py CHANGED
@@ -11,6 +11,7 @@ import textwrap
  import time
  import warnings
  from functools import cache, partial
+ from threading import Lock
  from typing import TYPE_CHECKING, Literal, overload

  import nvtx
@@ -162,6 +163,11 @@ def set_memory_resource(
          rmm.mr.set_current_device_resource(previous)


+ # libcudf doesn't support executing on multiple devices from within the same process.
+ SEEN_DEVICE = None
+ SEEN_DEVICE_LOCK = Lock()
+
+
  @contextlib.contextmanager
  def set_device(device: int | None) -> Generator[int, None, None]:
      """
@@ -180,13 +186,28 @@ def set_device(device: int | None) -> Generator[int, None, None]:
      -----
      At exit, the device is restored to whatever was current at entry.
      """
-     previous: int = gpu.getDevice()
-     if device is not None:
-         gpu.setDevice(device)
-     try:
-         yield previous
-     finally:
-         gpu.setDevice(previous)
+     global SEEN_DEVICE  # noqa: PLW0603
+     current: int = gpu.getDevice()
+     to_use = device if device is not None else current
+     with SEEN_DEVICE_LOCK:
+         if (
+             SEEN_DEVICE is not None and to_use != SEEN_DEVICE
+         ):  # pragma: no cover; requires multiple GPUs in CI
+             raise RuntimeError(
+                 "cudf-polars does not support running queries on "
+                 "multiple devices in the same process. "
+                 f"A previous query used device-{SEEN_DEVICE}, "
+                 f"the current query is using device-{to_use}."
+             )
+         SEEN_DEVICE = to_use
+     if to_use != current:
+         gpu.setDevice(to_use)
+         try:
+             yield to_use
+         finally:
+             gpu.setDevice(current)
+     else:
+         yield to_use


  @overload
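In practice this means the first query run in a process pins libcudf to a single device, and any later query that targets a different device fails fast instead of silently misbehaving. A minimal sketch of the guarded behaviour, using the internal helper shown above (exercising the error path requires a machine with at least two GPUs):

    from cudf_polars.callback import set_device

    with set_device(0):
        ...  # first query: device 0 is recorded as the process-wide device

    # A later attempt to target another device in the same process now raises
    # RuntimeError ("cudf-polars does not support running queries on multiple
    # devices in the same process").
    with set_device(1):
        ...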
cudf_polars/containers/column.py CHANGED
@@ -16,7 +16,6 @@ from pylibcudf.strings.convert.convert_integers import (
      is_integer,
      to_integers,
  )
- from pylibcudf.traits import is_floating_point

  from cudf_polars.containers import DataType
  from cudf_polars.containers.datatype import _dtype_from_header, _dtype_to_header
@@ -24,6 +23,8 @@ from cudf_polars.utils import conversion
  from cudf_polars.utils.dtypes import is_order_preserving_cast

  if TYPE_CHECKING:
+     from collections.abc import Callable
+
      from typing_extensions import Self

      from polars import Series as pl_Series
@@ -264,7 +265,7 @@ class Column:
              return True
          return False

-     def astype(self, dtype: DataType, stream: Stream) -> Column:
+     def astype(self, dtype: DataType, stream: Stream, *, strict: bool = True) -> Column:
          """
          Cast the column to the requested dtype.

@@ -275,6 +276,9 @@ class Column:
          stream
              CUDA stream used for device memory operations and kernel launches
              on this Column. The data in ``self.obj`` must be valid on this stream.
+         strict
+             If True, raise an error if the cast is unsupported.
+             If False, return nulls for unsupported casts.

          Returns
          -------
@@ -299,7 +303,8 @@ class Column:
              or self.obj.type().id() == plc.TypeId.STRING
          ):
              return Column(
-                 self._handle_string_cast(plc_dtype, stream=stream), dtype=dtype
+                 self._handle_string_cast(plc_dtype, stream=stream, strict=strict),
+                 dtype=dtype,
              )
          elif plc.traits.is_integral_not_bool(
              self.obj.type()
@@ -340,33 +345,53 @@ class Column:
              return result.sorted_like(self)
          return result

-     def _handle_string_cast(self, dtype: plc.DataType, stream: Stream) -> plc.Column:
+     def _handle_string_cast(
+         self, dtype: plc.DataType, stream: Stream, *, strict: bool
+     ) -> plc.Column:
          if dtype.id() == plc.TypeId.STRING:
-             if is_floating_point(self.obj.type()):
+             if plc.traits.is_floating_point(self.obj.type()):
                  return from_floats(self.obj, stream=stream)
-             else:
+             elif plc.traits.is_integral_not_bool(self.obj.type()):
                  return from_integers(self.obj, stream=stream)
+             else:
+                 raise InvalidOperationError(
+                     f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
+                 )
+
+         type_checker: Callable[[plc.Column, Stream], plc.Column]
+         type_caster: Callable[[plc.Column, plc.DataType, Stream], plc.Column]
+         if plc.traits.is_floating_point(dtype):
+             type_checker = is_float
+             type_caster = to_floats
+         elif plc.traits.is_integral_not_bool(dtype):
+             # is_integer has a second optional int_type: plc.DataType | None = None
+             # argument we do not use
+             # unused-ignore for if RMM is missing
+             type_checker = is_integer  # type: ignore[assignment,unused-ignore]
+             type_caster = to_integers
          else:
-             if is_floating_point(dtype):
-                 floats = is_float(self.obj, stream=stream)
-                 if not plc.reduce.reduce(
-                     floats,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=stream,
-                 ).to_py():
-                     raise InvalidOperationError("Conversion from `str` failed.")
-                 return to_floats(self.obj, dtype)
+             raise InvalidOperationError(
+                 f"Unsupported casting from {self.dtype.id()} to {dtype.id()}."
+             )
+
+         castable = type_checker(self.obj, stream=stream)  # type: ignore[call-arg]
+         if not plc.reduce.reduce(
+             castable,
+             plc.aggregation.all(),
+             plc.DataType(plc.TypeId.BOOL8),
+             stream=stream,
+         ).to_py(stream=stream):
+             if strict:
+                 raise InvalidOperationError(
+                     f"Conversion from {self.dtype.id()} to {dtype.id()} failed."
+                 )
              else:
-                 integers = is_integer(self.obj, stream=stream)
-                 if not plc.reduce.reduce(
-                     integers,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=stream,
-                 ).to_py():
-                     raise InvalidOperationError("Conversion from `str` failed.")
-                 return to_integers(self.obj, dtype, stream=stream)
+                 values = self.obj.with_mask(
+                     *plc.transform.bools_to_mask(castable, stream=stream)
+                 )
+         else:
+             values = self.obj
+         return type_caster(values, dtype, stream=stream)

      def copy_metadata(self, from_: pl_Series, /) -> Self:
          """
@@ -487,7 +512,7 @@ class Column:
                  plc.aggregation.sum(),
                  plc.types.SIZE_TYPE,
                  stream=stream,
-             ).to_py()
+             ).to_py(stream=stream)
          else:
              result = 0
          return result
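At the Polars API level, this `strict` plumbing is what backs non-strict casts on the GPU engine: rows that fail validation are masked to null (via `bools_to_mask`/`with_mask`) before conversion instead of aborting the query. A minimal sketch, assuming a CUDA-capable environment with this wheel installed:

    import polars as pl

    lf = pl.LazyFrame({"s": ["1", "2", "not-a-number"]})

    # strict=True (the default) raises on "not-a-number";
    # strict=False masks the unparseable row to null before converting.
    out = lf.with_columns(pl.col("s").cast(pl.Int64, strict=False)).collect(engine="gpu")
    print(out)  # the third row comes back null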
cudf_polars/dsl/expressions/binaryop.py CHANGED
@@ -104,7 +104,7 @@ class BinOp(Expr):
          }:
              if (
                  right.obj.size() == 1
-                 and right.obj.to_scalar(stream=df.stream).to_py() == 0
+                 and right.obj.to_scalar(stream=df.stream).to_py(stream=df.stream) == 0
              ):
                  return Column(
                      plc.Column.all_null_like(
cudf_polars/dsl/expressions/boolean.py CHANGED
@@ -220,7 +220,7 @@ class BooleanFunction(Expr):
          #
          # If the input null count was non-zero, we must
          # post-process the result to insert the correct value.
-         h_result = scalar_result.to_py()
+         h_result = scalar_result.to_py(stream=df.stream)
          if (is_any and not h_result) or (not is_any and h_result):
              # Any                     All
              # False || Null => Null   True && Null => Null
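The comment table encodes Kleene semantics for `any`/`all` over columns containing nulls, which the host-side post-processing preserves. A small sketch of the behaviour being maintained (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"b": [False, None]})
    # Kleene logic: False || Null => Null, so any() over [False, None]
    # is null unless nulls are ignored.
    print(lf.select(pl.col("b").any(ignore_nulls=False)).collect(engine="gpu"))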
cudf_polars/dsl/expressions/selection.py CHANGED
@@ -37,7 +37,7 @@ class Gather(Expr):
          )
          n = values.size
          lo, hi = plc.reduce.minmax(indices.obj, stream=df.stream)
-         if hi.to_py() >= n or lo.to_py() < -n:  # type: ignore[operator]
+         if hi.to_py(stream=df.stream) >= n or lo.to_py(stream=df.stream) < -n:  # type: ignore[operator]
              raise ValueError("gather indices are out of bounds")
          if indices.null_count:
              bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
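The `minmax` reduction is the cheap device-side bounds check behind gathers: only the two extremes are copied to the host, now explicitly on the frame's stream. For instance (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"a": [10, 20, 30]})
    print(lf.select(pl.col("a").gather([0, 2])).collect(engine="gpu"))  # 10, 30
    # An index >= 3 (or < -3) fails the minmax check above:
    # lf.select(pl.col("a").gather([0, 5])).collect(engine="gpu")
    # -> gather indices are out of bounds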
cudf_polars/dsl/expressions/string.py CHANGED
@@ -390,7 +390,7 @@ class StringFunction(Expr):
                  plc.aggregation.all(),
                  plc.DataType(plc.TypeId.BOOL8),
                  stream=df.stream,
-             ).to_py():
+             ).to_py(stream=df.stream):
                  raise InvalidOperationError(
                      "zfill only supports ascii strings with no unicode characters"
                  )
@@ -427,15 +427,12 @@ class StringFunction(Expr):
                  stream=df.stream,
              )

-             if (
-                 POLARS_VERSION_LT_132
-                 and not plc.reduce.reduce(
-                     all_gt_0,
-                     plc.aggregation.all(),
-                     plc.DataType(plc.TypeId.BOOL8),
-                     stream=df.stream,
-                 ).to_py()
-             ):  # pragma: no cover
+             if POLARS_VERSION_LT_132 and not plc.reduce.reduce(
+                 all_gt_0,
+                 plc.aggregation.all(),
+                 plc.DataType(plc.TypeId.BOOL8),
+                 stream=df.stream,
+             ).to_py(stream=df.stream):  # pragma: no cover
                  raise InvalidOperationError("fill conversion failed.")

              return Column(
@@ -887,11 +884,11 @@ class StringFunction(Expr):
                  filtered = table.columns()[0]
                  first_valid_data = plc.copying.get_element(
                      filtered, 0, stream=df.stream
-                 ).to_py()
+                 ).to_py(stream=df.stream)
              else:
                  first_valid_data = plc.copying.get_element(
                      plc_col, 0, stream=df.stream
-                 ).to_py()
+                 ).to_py(stream=df.stream)

              # See https://github.com/rapidsai/cudf/issues/20202 for why we type ignore
              format = _infer_datetime_format(first_valid_data)  # type: ignore[arg-type]
@@ -909,7 +906,7 @@ class StringFunction(Expr):
                  plc.aggregation.all(),
                  plc.DataType(plc.TypeId.BOOL8),
                  stream=df.stream,
-             ).to_py():
+             ).to_py(stream=df.stream):
                  raise InvalidOperationError("conversion from `str` failed.")
          else:
              not_timestamps = plc.unary.unary_operation(
@@ -950,18 +947,24 @@ class StringFunction(Expr):
          elif self.name is StringFunction.Name.PadStart:
              if POLARS_VERSION_LT_132:  # pragma: no cover
                  (column,) = columns
-                 width, char = self.options
+                 width_arg, char = self.options
+                 pad_width = cast(int, width_arg)
              else:
                  (column, width_col) = columns
                  (char,) = self.options
                  # TODO: Maybe accept a string scalar in
                  # cudf::strings::pad to avoid DtoH transfer
-                 # See https://github.com/rapidsai/cudf/issues/20202 for why we type ignore
-                 width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
+                 # See https://github.com/rapidsai/cudf/issues/20202
+                 width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                     stream=df.stream
+                 )
+                 assert width_py is not None
+                 pad_width = int(width_py)
+
              return Column(
                  plc.strings.padding.pad(
                      column.obj,
-                     width,  # type: ignore[arg-type]
+                     pad_width,
                      plc.strings.SideType.LEFT,
                      char,
                      stream=df.stream,
@@ -971,17 +974,23 @@ class StringFunction(Expr):
          elif self.name is StringFunction.Name.PadEnd:
              if POLARS_VERSION_LT_132:  # pragma: no cover
                  (column,) = columns
-                 width, char = self.options
+                 width_arg, char = self.options
+                 pad_width = cast(int, width_arg)
              else:
                  (column, width_col) = columns
                  (char,) = self.options
                  # TODO: Maybe accept a string scalar in
                  # cudf::strings::pad to avoid DtoH transfer
-                 width: int = width_col.obj.to_scalar(stream=df.stream).to_py()  # type: ignore[no-redef]
+                 width_py = width_col.obj.to_scalar(stream=df.stream).to_py(
+                     stream=df.stream
+                 )
+                 assert width_py is not None
+                 pad_width = int(width_py)
+
              return Column(
                  plc.strings.padding.pad(
                      column.obj,
-                     width,  # type: ignore[arg-type]
+                     pad_width,
                      plc.strings.SideType.RIGHT,
                      char,
                      stream=df.stream,
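The PadStart/PadEnd rework only changes how the pad width reaches the host (newer Polars passes it as a column rather than baking it into the options); user-visible behaviour is unchanged, and the width is still brought device-to-host once per call, per the TODO above. A small sketch (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"s": ["7", "42"]})
    # Lowers to cudf::strings::pad with SideType.LEFT and pad_width=5.
    print(lf.select(pl.col("s").str.pad_start(5, "0")).collect(engine="gpu"))
    # -> ["00007", "00042"]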
cudf_polars/dsl/expressions/ternary.py CHANGED
@@ -15,6 +15,7 @@ from cudf_polars.dsl.expressions.base import (
      ExecutionContext,
      Expr,
  )
+ from cudf_polars.dsl.utils.reshape import broadcast

  if TYPE_CHECKING:
      from cudf_polars.containers import DataFrame, DataType
@@ -41,15 +42,38 @@ class Ternary(Expr):
          when, then, otherwise = (
              child.evaluate(df, context=context) for child in self.children
          )
+
+         if when.is_scalar:
+             # For scalar predicates: lowering to copy_if_else would require
+             # materializing an all true/false mask column. Instead, just pick
+             # the correct branch.
+             when_predicate = when.obj_scalar(stream=df.stream).to_py(stream=df.stream)
+             pick, other = (then, otherwise) if when_predicate else (otherwise, then)
+
+             pick_col = (
+                 broadcast(
+                     pick,
+                     target_length=1 if other.is_scalar else other.size,
+                     stream=df.stream,
+                 )[0]
+                 if pick.is_scalar
+                 else pick
+             )
+             return Column(pick_col.obj, dtype=self.dtype)
+
          then_obj = then.obj_scalar(stream=df.stream) if then.is_scalar else then.obj
          otherwise_obj = (
              otherwise.obj_scalar(stream=df.stream)
              if otherwise.is_scalar
              else otherwise.obj
          )
+
          return Column(
              plc.copying.copy_if_else(
-                 then_obj, otherwise_obj, when.obj, stream=df.stream
+                 then_obj,
+                 otherwise_obj,
+                 when.obj,
+                 stream=df.stream,
              ),
              dtype=self.dtype,
          )
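The new fast path triggers whenever the `when` predicate evaluates to a single scalar, for example a comparison against an aggregate: the chosen branch is returned (broadcast if needed) without building an all-true/all-false mask for `copy_if_else`. A sketch with hypothetical data (GPU engine assumed):

    import polars as pl

    lf = pl.LazyFrame({"a": [1, 2, 3]})
    # The predicate is scalar (sum() > 5), so the whole `then` branch is
    # picked directly instead of going through plc.copying.copy_if_else.
    out = lf.select(
        pl.when(pl.col("a").sum() > 5).then(pl.col("a")).otherwise(-pl.col("a"))
    ).collect(engine="gpu")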
cudf_polars/dsl/expressions/unary.py CHANGED
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  # TODO: remove need for this
  """DSL nodes for unary operations."""
@@ -25,11 +25,12 @@ __all__ = ["Cast", "Len", "UnaryFunction"]
  class Cast(Expr):
      """Class representing a cast of an expression."""

-     __slots__ = ()
-     _non_child = ("dtype",)
+     __slots__ = ("strict",)
+     _non_child = ("dtype", "strict")

-     def __init__(self, dtype: DataType, value: Expr) -> None:
+     def __init__(self, dtype: DataType, strict: bool, value: Expr) -> None:  # noqa: FBT001
          self.dtype = dtype
+         self.strict = strict
          self.children = (value,)
          self.is_pointwise = True
          if not dtypes.can_cast(value.dtype.plc_type, self.dtype.plc_type):
@@ -43,7 +44,7 @@ class Cast(Expr):
          """Evaluate this expression given a dataframe for context."""
          (child,) = self.children
          column = child.evaluate(df, context=context)
-         return column.astype(self.dtype, stream=df.stream)
+         return column.astype(self.dtype, stream=df.stream, strict=self.strict)


  class Len(Expr):
@@ -240,7 +241,9 @@ class UnaryFunction(Expr):
              if maintain_order:
                  column = column.sorted_like(values)
              return column
-         elif self.name == "set_sorted":
+         elif self.name == "set_sorted":  # pragma: no cover
+             # TODO: LazyFrame.set_sorted is a proper IR concept (i.e. FunctionIR::Hint)
+             # and is currently not implemented. We should reimplement it as a MapFunction.
              (column,) = (child.evaluate(df, context=context) for child in self.children)
              (asc,) = self.options
              order = (
@@ -253,10 +256,10 @@ class UnaryFunction(Expr):
              # PERF: This invokes four stream synchronisations!
              has_nulls_first = not plc.copying.get_element(
                  column.obj, 0, stream=df.stream
-             ).is_valid()
+             ).is_valid(df.stream)
              has_nulls_last = not plc.copying.get_element(
                  column.obj, n - 1, stream=df.stream
-             ).is_valid()
+             ).is_valid(df.stream)
              if (order == plc.types.Order.DESCENDING and has_nulls_first) or (
                  order == plc.types.Order.ASCENDING and has_nulls_last
              ):