PyPI - cudf-polars-cu12 - Versions diffs - 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl - Mend

cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

cudf_polars/VERSION +1 -1
cudf_polars/__init__.py +9 -3
cudf_polars/callback.py +258 -23
cudf_polars/containers/__init__.py +2 -2
cudf_polars/containers/column.py +167 -66
cudf_polars/containers/dataframe.py +157 -58
cudf_polars/dsl/expr.py +37 -1397
cudf_polars/dsl/expressions/__init__.py +8 -0
cudf_polars/dsl/expressions/aggregation.py +246 -0
cudf_polars/dsl/expressions/base.py +300 -0
cudf_polars/dsl/expressions/binaryop.py +135 -0
cudf_polars/dsl/expressions/boolean.py +312 -0
cudf_polars/dsl/expressions/datetime.py +196 -0
cudf_polars/dsl/expressions/literal.py +91 -0
cudf_polars/dsl/expressions/rolling.py +40 -0
cudf_polars/dsl/expressions/selection.py +92 -0
cudf_polars/dsl/expressions/sorting.py +97 -0
cudf_polars/dsl/expressions/string.py +362 -0
cudf_polars/dsl/expressions/ternary.py +53 -0
cudf_polars/dsl/expressions/unary.py +339 -0
cudf_polars/dsl/ir.py +1202 -427
cudf_polars/dsl/nodebase.py +150 -0
cudf_polars/dsl/to_ast.py +318 -0
cudf_polars/dsl/translate.py +398 -181
cudf_polars/dsl/traversal.py +175 -0
cudf_polars/experimental/__init__.py +8 -0
cudf_polars/experimental/base.py +43 -0
cudf_polars/experimental/dask_serialize.py +59 -0
cudf_polars/experimental/dispatch.py +84 -0
cudf_polars/experimental/io.py +325 -0
cudf_polars/experimental/parallel.py +253 -0
cudf_polars/experimental/select.py +36 -0
cudf_polars/testing/asserts.py +139 -19
cudf_polars/testing/plugin.py +242 -0
cudf_polars/typing/__init__.py +51 -10
cudf_polars/utils/dtypes.py +88 -39
cudf_polars/utils/sorting.py +2 -2
cudf_polars/utils/versions.py +22 -0
{cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +15 -12
cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
{cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
cudf_polars_cu12-24.8.0a281.dist-info/RECORD +0 -23
{cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
{cudf_polars_cu12-24.8.0a281.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0

cudf_polars/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 24.08.00a281
1	+ 25.02.00

cudf_polars/__init__.py CHANGED Viewed

@@ -12,11 +12,17 @@ from __future__ import annotations
 from cudf_polars._version import __git_commit__, __version__
 from cudf_polars.callback import execute_with_cudf
-from cudf_polars.dsl.translate import translate_ir
+from cudf_polars.dsl.translate import Translator
+# Check we have a supported polars version
+from cudf_polars.utils.versions import _ensure_polars_version
+_ensure_polars_version()
+del _ensure_polars_version
 __all__: list[str] = [
-    "execute_with_cudf",
-    "translate_ir",
+    "Translator",
     "__git_commit__",
     "__version__",
+    "execute_with_cudf",
 ]

cudf_polars/callback.py CHANGED Viewed

@@ -5,15 +5,27 @@
 from __future__ import annotations
-from functools import partial
-from typing import TYPE_CHECKING
+import contextlib
+import os
+import warnings
+from functools import cache, partial
+from typing import TYPE_CHECKING, Literal
 import nvtx
-from cudf_polars.dsl.translate import translate_ir
+from polars.exceptions import ComputeError, PerformanceWarning
+import pylibcudf
+import rmm
+from rmm._cuda import gpu
+from cudf_polars.dsl.translate import Translator
 if TYPE_CHECKING:
+    from collections.abc import Generator
     import polars as pl
+    from polars import GPUEngine
     from cudf_polars.dsl.ir import IR
     from cudf_polars.typing import NodeTraverser
@@ -21,25 +33,214 @@ if TYPE_CHECKING:
 __all__: list[str] = ["execute_with_cudf"]
+_SUPPORTED_PREFETCHES = {
+    "column_view::get_data",
+    "mutable_column_view::get_data",
+    "gather",
+    "hash_join",
+}
+def _env_get_int(name, default):
+    try:
+        return int(os.getenv(name, default))
+    except (ValueError, TypeError):  # pragma: no cover
+        return default  # pragma: no cover
+@cache
+def default_memory_resource(
+    device: int,
+    cuda_managed_memory: bool,  # noqa: FBT001
+) -> rmm.mr.DeviceMemoryResource:
+    """
+    Return the default memory resource for cudf-polars.
+    Parameters
+    ----------
+    device
+        Disambiguating device id when selecting the device. Must be
+        the active device when this function is called.
+    cuda_managed_memory
+        Whether to use managed memory or not.
+    Returns
+    -------
+    rmm.mr.DeviceMemoryResource
+        The default memory resource that cudf-polars uses. Currently
+        a managed memory resource, if `cuda_managed_memory` is `True`.
+        else, an async pool resource is returned.
+    """
+    try:
+        if (
+            cuda_managed_memory
+            and pylibcudf.utils._is_concurrent_managed_access_supported()
+        ):
+            # Allocating 80% of the available memory for the pool.
+            # Leaving a 20% headroom to avoid OOM errors.
+            free_memory, _ = rmm.mr.available_device_memory()
+            free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+            for key in _SUPPORTED_PREFETCHES:
+                pylibcudf.experimental.enable_prefetching(key)
+            mr = rmm.mr.PrefetchResourceAdaptor(
+                rmm.mr.PoolMemoryResource(
+                    rmm.mr.ManagedMemoryResource(),
+                    initial_pool_size=free_memory,
+                )
+            )
+        else:
+            mr = rmm.mr.CudaAsyncMemoryResource()
+    except RuntimeError as e:  # pragma: no cover
+        msg, *_ = e.args
+        if (
+            msg.startswith("RMM failure")
+            and msg.find("not supported with this CUDA driver/runtime version") > -1
+        ):
+            raise ComputeError(
+                "GPU engine requested, but incorrect cudf-polars package installed. "
+                "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` "
+                "and install `cudf-polars-cu11`"
+            ) from None
+        else:
+            raise
+    else:
+        return mr
+@contextlib.contextmanager
+def set_memory_resource(
+    mr: rmm.mr.DeviceMemoryResource | None,
+) -> Generator[rmm.mr.DeviceMemoryResource, None, None]:
+    """
+    Set the current memory resource for an execution block.
+    Parameters
+    ----------
+    mr
+        Memory resource to use. If `None`, calls :func:`default_memory_resource`
+        to obtain an mr on the currently active device.
+    Returns
+    -------
+    Memory resource used.
+    Notes
+    -----
+    At exit, the memory resource is restored to whatever was current
+    at entry. If a memory resource is provided, it must be valid to
+    use with the currently active device.
+    """
+    previous = rmm.mr.get_current_device_resource()
+    if mr is None:
+        device: int = gpu.getDevice()
+        mr = default_memory_resource(
+            device=device,
+            cuda_managed_memory=bool(
+                _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
+            ),
+        )
+    rmm.mr.set_current_device_resource(mr)
+    try:
+        yield mr
+    finally:
+        rmm.mr.set_current_device_resource(previous)
+@contextlib.contextmanager
+def set_device(device: int | None) -> Generator[int, None, None]:
+    """
+    Set the device the query is executed on.
+    Parameters
+    ----------
+    device
+        Device to use. If `None`, uses the current device.
+    Returns
+    -------
+    Device active for the execution of the block.
+    Notes
+    -----
+    At exit, the device is restored to whatever was current at entry.
+    """
+    previous: int = gpu.getDevice()
+    if device is not None:
+        gpu.setDevice(device)
+    try:
+        yield previous
+    finally:
+        gpu.setDevice(previous)
 def _callback(
     ir: IR,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
+    *,
+    device: int | None,
+    memory_resource: int | None,
+    executor: Literal["pylibcudf", "dask-experimental"] | None,
 ) -> pl.DataFrame:
     assert with_columns is None
     assert pyarrow_predicate is None
     assert n_rows is None
-    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
-        return ir.evaluate(cache={}).to_polars()
+    with (
+        nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
+        # Device must be set before memory resource is obtained.
+        set_device(device),
+        set_memory_resource(memory_resource),
+    ):
+        if executor is None or executor == "pylibcudf":
+            return ir.evaluate(cache={}).to_polars()
+        elif executor == "dask-experimental":
+            from cudf_polars.experimental.parallel import evaluate_dask
+            return evaluate_dask(ir).to_polars()
+        else:
+            raise ValueError(f"Unknown executor '{executor}'")
-def execute_with_cudf(
-    nt: NodeTraverser,
-    *,
-    raise_on_fail: bool = False,
-    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
-) -> None:
+def validate_config_options(config: dict) -> None:
+    """
+    Validate the configuration options for the GPU engine.
+    Parameters
+    ----------
+    config
+        Configuration options to validate.
+    Raises
+    ------
+    ValueError
+        If the configuration contains unsupported options.
+    """
+    if unsupported := (
+        config.keys()
+        - {"raise_on_fail", "parquet_options", "executor", "executor_options"}
+    ):
+        raise ValueError(
+            f"Engine configuration contains unsupported settings: {unsupported}"
+        )
+    assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
+        config.get("parquet_options", {})
+    )
+    # Validate executor_options
+    executor = config.get("executor", "pylibcudf")
+    if executor == "dask-experimental":
+        unsupported = config.get("executor_options", {}).keys() - {
+            "max_rows_per_partition",
+            "parquet_blocksize",
+        }
+    else:
+        unsupported = config.get("executor_options", {}).keys()
+    if unsupported:
+        raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}")
+def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
@@ -48,19 +249,53 @@ def execute_with_cudf(
     nt
         NodeTraverser
-    raise_on_fail
-        Should conversion raise an exception rather than continuing
-        without setting a callback.
+    config
+        GPUEngine configuration object
-    exception
-        Optional exception, or tuple of exceptions, to catch during
-        translation. Defaults to ``Exception``.
+    Raises
+    ------
+    ValueError
+        If the config contains unsupported keys.
+    NotImplementedError
+        If translation of the plan is unsupported.
+    Notes
+    -----
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
-    try:
-        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
-            nt.set_udf(partial(_callback, translate_ir(nt)))
-    except exception:
-        if raise_on_fail:
-            raise
+    device = config.device
+    memory_resource = config.memory_resource
+    raise_on_fail = config.config.get("raise_on_fail", False)
+    executor = config.config.get("executor", None)
+    validate_config_options(config.config)
+    with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+        translator = Translator(nt, config)
+        ir = translator.translate_ir()
+        ir_translation_errors = translator.errors
+        if len(ir_translation_errors):
+            # TODO: Display these errors in user-friendly way.
+            # tracked in https://github.com/rapidsai/cudf/issues/17051
+            unique_errors = sorted(set(ir_translation_errors), key=str)
+            formatted_errors = "\n".join(
+                f"- {e.__class__.__name__}: {e}" for e in unique_errors
+            )
+            error_message = (
+                "Query execution with GPU not possible: unsupported operations."
+                f"\nThe errors were:\n{formatted_errors}"
+            )
+            exception = NotImplementedError(error_message, unique_errors)
+            if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
+                warnings.warn(error_message, PerformanceWarning, stacklevel=2)
+            if raise_on_fail:
+                raise exception
+        else:
+            nt.set_udf(
+                partial(
+                    _callback,
+                    ir,
+                    device=device,
+                    memory_resource=memory_resource,
+                    executor=executor,
+                )
+            )

cudf_polars/containers/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@
 from __future__ import annotations
-__all__: list[str] = ["DataFrame", "Column", "NamedColumn"]
+__all__: list[str] = ["Column", "DataFrame"]
-from cudf_polars.containers.column import Column, NamedColumn
+from cudf_polars.containers.column import Column
 from cudf_polars.containers.dataframe import DataFrame

cudf_polars/containers/column.py CHANGED Viewed

@@ -8,12 +8,25 @@ from __future__ import annotations
 import functools
 from typing import TYPE_CHECKING
-import cudf._lib.pylibcudf as plc
+from polars.exceptions import InvalidOperationError
+import pylibcudf as plc
+from pylibcudf.strings.convert.convert_floats import from_floats, is_float, to_floats
+from pylibcudf.strings.convert.convert_integers import (
+    from_integers,
+    is_integer,
+    to_integers,
+)
+from pylibcudf.traits import is_floating_point
+from cudf_polars.utils.dtypes import is_order_preserving_cast
 if TYPE_CHECKING:
     from typing_extensions import Self
-__all__: list[str] = ["Column", "NamedColumn"]
+    import polars as pl
+__all__: list[str] = ["Column"]
 class Column:
@@ -24,6 +37,9 @@ class Column:
     order: plc.types.Order
     null_order: plc.types.NullOrder
     is_scalar: bool
+    # Optional name, only ever set by evaluation of NamedExpr nodes
+    # The internal evaluation should not care about the name.
+    name: str | None
     def __init__(
         self,
@@ -32,14 +48,12 @@ class Column:
         is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
         order: plc.types.Order = plc.types.Order.ASCENDING,
         null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+        name: str | None = None,
     ):
         self.obj = column
         self.is_scalar = self.obj.size() == 1
-        if self.obj.size() <= 1:
-            is_sorted = plc.types.Sorted.YES
-        self.is_sorted = is_sorted
-        self.order = order
-        self.null_order = null_order
+        self.name = name
+        self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
     @functools.cached_property
     def obj_scalar(self) -> plc.Scalar:
@@ -61,9 +75,26 @@ class Column:
             )
         return plc.copying.get_element(self.obj, 0)
+    def rename(self, name: str | None, /) -> Self:
+        """
+        Return a shallow copy with a new name.
+        Parameters
+        ----------
+        name
+            New name
+        Returns
+        -------
+        Shallow copy of self with new name set.
+        """
+        new = self.copy()
+        new.name = name
+        return new
     def sorted_like(self, like: Column, /) -> Self:
         """
-        Copy sortedness properties from a column onto self.
+        Return a shallow copy with sortedness from like.
         Parameters
         ----------
@@ -72,16 +103,122 @@ class Column:
         Returns
         -------
-        Self with metadata set.
+        Shallow copy of self with metadata set.
         See Also
         --------
-        set_sorted
+        set_sorted, copy_metadata
         """
-        return self.set_sorted(
-            is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
+        return type(self)(
+            self.obj,
+            name=self.name,
+            is_sorted=like.is_sorted,
+            order=like.order,
+            null_order=like.null_order,
         )
+    def astype(self, dtype: plc.DataType) -> Column:
+        """
+        Cast the column to as the requested dtype.
+        Parameters
+        ----------
+        dtype
+            Datatype to cast to.
+        Returns
+        -------
+        Column of requested type.
+        Raises
+        ------
+        RuntimeError
+            If the cast is unsupported.
+        Notes
+        -----
+        This only produces a copy if the requested dtype doesn't match
+        the current one.
+        """
+        if self.obj.type() == dtype:
+            return self
+        if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING:
+            return Column(self._handle_string_cast(dtype))
+        else:
+            result = Column(plc.unary.cast(self.obj, dtype))
+            if is_order_preserving_cast(self.obj.type(), dtype):
+                return result.sorted_like(self)
+            return result
+    def _handle_string_cast(self, dtype: plc.DataType) -> plc.Column:
+        if dtype.id() == plc.TypeId.STRING:
+            if is_floating_point(self.obj.type()):
+                return from_floats(self.obj)
+            else:
+                return from_integers(self.obj)
+        else:
+            if is_floating_point(dtype):
+                floats = is_float(self.obj)
+                if not plc.interop.to_arrow(
+                    plc.reduce.reduce(
+                        floats,
+                        plc.aggregation.all(),
+                        plc.DataType(plc.TypeId.BOOL8),
+                    )
+                ).as_py():
+                    raise InvalidOperationError("Conversion from `str` failed.")
+                return to_floats(self.obj, dtype)
+            else:
+                integers = is_integer(self.obj)
+                if not plc.interop.to_arrow(
+                    plc.reduce.reduce(
+                        integers,
+                        plc.aggregation.all(),
+                        plc.DataType(plc.TypeId.BOOL8),
+                    )
+                ).as_py():
+                    raise InvalidOperationError("Conversion from `str` failed.")
+                return to_integers(self.obj, dtype)
+    def copy_metadata(self, from_: pl.Series, /) -> Self:
+        """
+        Copy metadata from a host series onto self.
+        Parameters
+        ----------
+        from_
+            Polars series to copy metadata from
+        Returns
+        -------
+        Self with metadata set.
+        See Also
+        --------
+        set_sorted, sorted_like
+        """
+        self.name = from_.name
+        if len(from_) <= 1:
+            return self
+        ascending = from_.flags["SORTED_ASC"]
+        descending = from_.flags["SORTED_DESC"]
+        if ascending or descending:
+            has_null_first = from_.item(0) is None
+            has_null_last = from_.item(-1) is None
+            order = (
+                plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
+            )
+            null_order = plc.types.NullOrder.BEFORE
+            if (descending and has_null_first) or (ascending and has_null_last):
+                null_order = plc.types.NullOrder.AFTER
+            return self.set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=order,
+                null_order=null_order,
+            )
+        return self
     def set_sorted(
         self,
         *,
@@ -125,65 +262,29 @@ class Column:
             is_sorted=self.is_sorted,
             order=self.order,
             null_order=self.null_order,
+            name=self.name,
         )
     def mask_nans(self) -> Self:
-        """Return a copy of self with nans masked out."""
-        if self.nan_count > 0:
-            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
+        """Return a shallow copy of self with nans masked out."""
+        if plc.traits.is_floating_point(self.obj.type()):
+            old_count = self.obj.null_count()
+            mask, new_count = plc.transform.nans_to_nulls(self.obj)
+            result = type(self)(self.obj.with_mask(mask, new_count))
+            if old_count == new_count:
+                return result.sorted_like(self)
+            return result
         return self.copy()
     @functools.cached_property
     def nan_count(self) -> int:
         """Return the number of NaN values in the column."""
-        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-            return 0
-        return plc.interop.to_arrow(
-            plc.reduce.reduce(
-                plc.unary.is_nan(self.obj),
-                plc.aggregation.sum(),
-                # TODO: pylibcudf needs to have a SizeType DataType singleton
-                plc.DataType(plc.TypeId.INT32),
-            )
-        ).as_py()
-class NamedColumn(Column):
-    """A column with a name."""
-    name: str
-    def __init__(
-        self,
-        column: plc.Column,
-        name: str,
-        *,
-        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
-        order: plc.types.Order = plc.types.Order.ASCENDING,
-        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
-    ) -> None:
-        super().__init__(
-            column, is_sorted=is_sorted, order=order, null_order=null_order
-        )
-        self.name = name
-    def copy(self, *, new_name: str | None = None) -> Self:
-        """
-        A shallow copy of the column.
-        Parameters
-        ----------
-        new_name
-            Optional new name for the copied column.
-        Returns
-        -------
-        New column sharing data with self.
-        """
-        return type(self)(
-            self.obj,
-            self.name if new_name is None else new_name,
-            is_sorted=self.is_sorted,
-            order=self.order,
-            null_order=self.null_order,
-        )
+        if plc.traits.is_floating_point(self.obj.type()):
+            return plc.interop.to_arrow(
+                plc.reduce.reduce(
+                    plc.unary.is_nan(self.obj),
+                    plc.aggregation.sum(),
+                    plc.types.SIZE_TYPE,
+                )
+            ).as_py()
+        return 0

cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl

cudf-polars-cu12 24.8.0a281__py3-none-any.whl → 25.2.0__py3-none-any.whl