cudf_polars_cu12-25.2.2-py3-none-any.whl → cudf_polars_cu12-25.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/dask_registers.py
@@ -0,0 +1,196 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Dask function registrations such as serializers and dispatch implementations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, overload
+
+from dask.sizeof import sizeof as sizeof_dispatch
+from distributed.protocol import dask_deserialize, dask_serialize
+from distributed.protocol.cuda import cuda_deserialize, cuda_serialize
+from distributed.utils import log_errors
+
+import pylibcudf as plc
+import rmm
+
+from cudf_polars.containers import Column, DataFrame
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from distributed import Client
+
+    from rmm.pylibrmm.memory_resource import DeviceMemoryResource
+    from rmm.pylibrmm.stream import Stream
+
+    from cudf_polars.typing import ColumnHeader, ColumnOptions, DataFrameHeader
+
+
+__all__ = ["DaskRegisterManager", "register"]
+
+
+class DaskRegisterManager:  # pragma: no cover; Only used with Distributed scheduler
+    """Manager to ensure the serializer is only registered once."""
+
+    _registered: bool = False
+    _client_run_executed: ClassVar[set[str]] = set()
+
+    @classmethod
+    def register_once(cls) -> None:
+        """Register Dask/cudf-polars serializers in the calling process."""
+        if not cls._registered:
+            from cudf_polars.experimental.dask_registers import register
+
+            register()
+            cls._registered = True
+
+    @classmethod
+    def run_on_cluster(cls, client: Client) -> None:
+        """Run register on the workers and scheduler once."""
+        if client.id not in cls._client_run_executed:
+            client.run(cls.register_once)
+            client.run_on_scheduler(cls.register_once)
+            cls._client_run_executed.add(client.id)
+
+
+def register() -> None:
+    """Register dask serialization and dispatch functions."""
+
+    @overload
+    def serialize_column_or_frame(
+        x: DataFrame,
+    ) -> tuple[DataFrameHeader, list[memoryview]]: ...
+
+    @overload
+    def serialize_column_or_frame(
+        x: Column,
+    ) -> tuple[ColumnHeader, list[memoryview]]: ...
+
+    @cuda_serialize.register((Column, DataFrame))
+    def serialize_column_or_frame(
+        x: DataFrame | Column,
+    ) -> tuple[DataFrameHeader | ColumnHeader, list[memoryview]]:
+        with log_errors():
+            header, frames = x.serialize()
+            return header, list(frames)  # Dask expects a list of frames
+
+    @cuda_deserialize.register(DataFrame)
+    def _(
+        header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
+    ) -> DataFrame:
+        with log_errors():
+            metadata, gpudata = frames  # TODO: check if this is a length-2 list...
+            return DataFrame.deserialize(header, (metadata, plc.gpumemoryview(gpudata)))
+
+    @cuda_deserialize.register(Column)
+    def _(header: ColumnHeader, frames: tuple[memoryview, plc.gpumemoryview]) -> Column:
+        with log_errors():
+            metadata, gpudata = frames
+            return Column.deserialize(header, (metadata, plc.gpumemoryview(gpudata)))
+
+    @overload
+    def dask_serialize_column_or_frame(
+        x: DataFrame,
+    ) -> tuple[DataFrameHeader, tuple[memoryview, memoryview]]: ...
+
+    @overload
+    def dask_serialize_column_or_frame(
+        x: Column,
+    ) -> tuple[ColumnHeader, tuple[memoryview, memoryview]]: ...
+
+    @dask_serialize.register(Column)
+    def dask_serialize_column_or_frame(
+        x: DataFrame | Column,
+    ) -> tuple[DataFrameHeader | ColumnHeader, tuple[memoryview, memoryview]]:
+        with log_errors():
+            header, (metadata, gpudata) = x.serialize()
+
+            # For robustness, we check that the gpu data is contiguous
+            cai = gpudata.__cuda_array_interface__
+            assert len(cai["shape"]) == 1
+            assert cai["strides"] is None or cai["strides"] == (1,)
+            assert cai["typestr"] == "|u1"
+            nbytes = cai["shape"][0]
+
+            # Copy the gpudata to host memory
+            gpudata_on_host = memoryview(
+                rmm.DeviceBuffer(ptr=gpudata.ptr, size=nbytes).copy_to_host()
+            )
+            return header, (metadata, gpudata_on_host)
+
+    @dask_deserialize.register(Column)
+    def _(header: ColumnHeader, frames: tuple[memoryview, memoryview]) -> Column:
+        with log_errors():
+            assert len(frames) == 2
+            # Copy the second frame (the gpudata in host memory) back to the gpu
+            frames = frames[0], plc.gpumemoryview(rmm.DeviceBuffer.to_device(frames[1]))
+            return Column.deserialize(header, frames)
+
+    @dask_serialize.register(DataFrame)
+    def _(
+        x: DataFrame, context: Mapping[str, Any] | None = None
+    ) -> tuple[DataFrameHeader, tuple[memoryview, memoryview]]:
+        # Do regular serialization if no staging buffer is provided.
+        if context is None or "staging_device_buffer" not in context:
+            return dask_serialize_column_or_frame(x)
+
+        # If a staging buffer is provided, we use `ChunkedPack` to
+        # serialize the dataframe using the provided staging buffer.
+        with log_errors():
+            # Keyword arguments for `Column.__init__`.
+            columns_kwargs: list[ColumnOptions] = [
+                {
+                    "is_sorted": col.is_sorted,
+                    "order": col.order,
+                    "null_order": col.null_order,
+                    "name": col.name,
+                }
+                for col in x.columns
+            ]
+            header: DataFrameHeader = {
+                "columns_kwargs": columns_kwargs,
+                "frame_count": 2,
+            }
+            if "stream" not in context:
+                raise ValueError(
+                    "context: stream must be given when staging_device_buffer is used"
+                )
+            if "device_mr" not in context:
+                raise ValueError(
+                    "context: device_mr must be given when staging_device_buffer is used"
+                )
+            stream: Stream = context["stream"]
+            device_mr: DeviceMemoryResource = context["device_mr"]
+            buf: rmm.DeviceBuffer = context["staging_device_buffer"]
+            frame = plc.contiguous_split.ChunkedPack.create(
+                x.table, buf.nbytes, stream, device_mr
+            ).pack_to_host(buf)
+            return header, frame
+
+    @dask_deserialize.register(DataFrame)
+    def _(header: DataFrameHeader, frames: tuple[memoryview, memoryview]) -> DataFrame:
+        with log_errors():
+            assert len(frames) == 2
+            # Copy the second frame (the gpudata in host memory) back to the gpu
+            frames = frames[0], plc.gpumemoryview(rmm.DeviceBuffer.to_device(frames[1]))
+            return DataFrame.deserialize(header, frames)
+
+    @sizeof_dispatch.register(Column)
+    def _(x: Column) -> int:
+        """The total size of the device buffers used by the Column."""
+        return x.obj.device_buffer_size()
+
+    @sizeof_dispatch.register(DataFrame)
+    def _(x: DataFrame) -> int:
+        """The total size of the device buffers used by the DataFrame."""
+        return sum(c.obj.device_buffer_size() for c in x.columns)
+
+    # Register rapidsmpf serializer if it's installed.
+    try:
+        from rapidsmpf.integrations.dask.spilling import register_dask_serialize
+
+        register_dask_serialize()  # pragma: no cover; rapidsmpf dependency not included yet
+    except ImportError:
+        pass
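For orientation, the intended call pattern for these registrations looks roughly like the sketch below. This is a hedged example, not part of the diff: the bare `Client()` is a placeholder for whatever Distributed or dask-cuda cluster is actually in use.

    # Sketch only: driving the registrations above from a client process.
    from distributed import Client

    from cudf_polars.experimental.dask_registers import DaskRegisterManager

    client = Client()  # placeholder; e.g. a dask-cuda cluster in practice

    # Register the cudf-polars serializers in this process, then once on
    # every worker and on the scheduler. Both calls are idempotent.
    DaskRegisterManager.register_once()
    DaskRegisterManager.run_on_cluster(client)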
cudf_polars/experimental/distinct.py
@@ -0,0 +1,174 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition Distinct logic."""
+
+from __future__ import annotations
+
+import math
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.expressions.base import Col, NamedExpr
+from cudf_polars.dsl.ir import Distinct
+from cudf_polars.experimental.base import PartitionInfo
+from cudf_polars.experimental.dispatch import lower_ir_node
+from cudf_polars.experimental.utils import _fallback_inform, _lower_ir_fallback
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.utils.config import ConfigOptions
+
+
+def lower_distinct(
+    ir: Distinct,
+    child: IR,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    config_options: ConfigOptions,
+    *,
+    cardinality: float | None = None,
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    """
+    Lower a Distinct IR into partition-wise stages.
+
+    Parameters
+    ----------
+    ir
+        The Distinct IR node to lower.
+    child
+        The reconstructed child of ``ir``. May differ
+        from ``ir.children[0]``.
+    partition_info
+        A mapping from all unique IR nodes to the
+        associated partitioning information.
+    config_options
+        GPUEngine configuration options.
+    cardinality
+        Cardinality factor to use for algorithm selection.
+
+    Returns
+    -------
+    new_node
+        The lowered Distinct node.
+    partition_info
+        A mapping from unique nodes in the new graph to associated
+        partitioning information.
+    """
+    from cudf_polars.experimental.repartition import Repartition
+    from cudf_polars.experimental.shuffle import Shuffle
+
+    # Extract child partitioning
+    child_count = partition_info[child].count
+
+    # Assume shuffle is not stable for now. Therefore, we
+    # require a tree reduction if row order matters.
+    require_tree_reduction = ir.stable or ir.keep in (
+        plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+        plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+    )
+
+    subset: frozenset = ir.subset or frozenset(ir.schema)
+    shuffle_keys = tuple(NamedExpr(name, Col(ir.schema[name], name)) for name in subset)
+    shuffled = partition_info[child].partitioned_on == shuffle_keys
+    if ir.keep == plc.stream_compaction.DuplicateKeepOption.KEEP_NONE:
+        # Need to shuffle the original data for keep == "none"
+        if require_tree_reduction:
+            # TODO: We cannot drop all duplicates without
+            # shuffling the data up front, and we assume
+            # shuffling is unstable for now. Note that the
+            # task-based shuffle should be stable, but
+            # its performance is very poor.
+            raise NotImplementedError(
+                "Unsupported unique options for multiple partitions."
+            )
+        if not shuffled:
+            child = Shuffle(child.schema, shuffle_keys, config_options, child)
+            partition_info[child] = PartitionInfo(
+                count=child_count,
+                partitioned_on=shuffle_keys,
+            )
+            shuffled = True
+
+    output_count = 1
+    n_ary = 32  # Arbitrary default (for now)
+    if ir.zlice is not None:
+        # Head/tail slice operation has been pushed into Distinct
+        if ir.zlice[0] < 1 and ir.zlice[1] is not None:
+            # Use rough 1m-row heuristic to set n_ary
+            n_ary = max(int(1_000_000 / ir.zlice[1]), 2)
+        else:  # pragma: no cover
+            # TODO: General slicing is not supported for multiple
+            # partitions. For now, we raise an error to fall back
+            # to one partition.
+            raise NotImplementedError("Unsupported slice for multiple partitions.")
+    elif cardinality is not None:
+        # Use cardinality to determine partitioning
+        n_ary = min(max(int(1.0 / cardinality), 2), child_count)
+        output_count = max(int(cardinality * child_count), 1)
+
+    if output_count > 1 and require_tree_reduction:
+        # Need to reduce down to a single partition even
+        # if the cardinality is large.
+        output_count = 1
+        _fallback_inform(
+            "Unsupported unique options for multiple partitions.",
+            config_options,
+        )
+
+    # Partition-wise unique
+    count = child_count
+    new_node: IR = ir.reconstruct([child])
+    partition_info[new_node] = PartitionInfo(count=count)
+
+    if shuffled or output_count == 1:
+        # Tree reduction
+        while count > output_count:
+            new_node = Repartition(new_node.schema, new_node)
+            count = max(math.ceil(count / n_ary), output_count)
+            partition_info[new_node] = PartitionInfo(count=count)
+            new_node = ir.reconstruct([new_node])
+            partition_info[new_node] = PartitionInfo(count=count)
+    else:
+        # Shuffle
+        new_node = Shuffle(new_node.schema, shuffle_keys, config_options, new_node)
+        partition_info[new_node] = PartitionInfo(count=output_count)
+        new_node = ir.reconstruct([new_node])
+        partition_info[new_node] = PartitionInfo(
+            count=output_count,
+            partitioned_on=shuffle_keys,
+        )
+
+    return new_node, partition_info
+
+
+@lower_ir_node.register(Distinct)
+def _(
+    ir: Distinct, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Extract child partitioning
+    child, partition_info = rec(ir.children[0])
+    config_options = rec.state["config_options"]
+    assert config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'lower_ir_node'"
+    )
+
+    subset: frozenset = ir.subset or frozenset(ir.schema)
+    cardinality_factor = {
+        c: max(min(f, 1.0), 0.00001)
+        for c, f in config_options.executor.cardinality_factor.items()
+        if c in subset
+    }
+    cardinality = max(cardinality_factor.values()) if cardinality_factor else None
+    try:
+        return lower_distinct(
+            ir,
+            child,
+            partition_info,
+            config_options,
+            cardinality=cardinality,
+        )
+    except NotImplementedError as err:
+        return _lower_ir_fallback(ir, rec, msg=str(err))
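The fan-in heuristic in `lower_distinct` is easier to follow with numbers: with 100 input partitions and a cardinality factor of 0.05 (roughly 5% of rows unique), each reduction step combines up to 20 partitions and the output settles at 5 partitions. A standalone arithmetic sketch of that logic (values are illustrative, mirroring the `n_ary`/`output_count` computation above):

    import math

    # Illustrative inputs: 100 partitions, ~5% unique rows.
    child_count = 100
    cardinality = 0.05

    # Same arithmetic as lower_distinct above.
    n_ary = min(max(int(1.0 / cardinality), 2), child_count)  # 20
    output_count = max(int(cardinality * child_count), 1)     # 5

    # Tree reduction: each step shrinks the partition count by n_ary.
    count, steps = child_count, []
    while count > output_count:
        count = max(math.ceil(count / n_ary), output_count)
        steps.append(count)

    print(n_ary, output_count, steps)  # 20 5 [5] -- converges in one step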
cudf_polars/experimental/explain.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Explain logical and physical plans."""
+
+from __future__ import annotations
+
+import functools
+from itertools import groupby
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.ir import (
+    GroupBy,
+    Join,
+    Scan,
+    Sort,
+)
+from cudf_polars.dsl.translate import Translator
+from cudf_polars.experimental.parallel import lower_ir_graph
+from cudf_polars.utils.config import ConfigOptions
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    import polars as pl
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.base import PartitionInfo
+
+
+def explain_query(
+    q: pl.LazyFrame, engine: pl.GPUEngine, *, physical: bool = True
+) -> str:
+    """
+    Return a formatted string representation of the IR plan.
+
+    Parameters
+    ----------
+    q : pl.LazyFrame
+        The LazyFrame to explain.
+    engine : pl.GPUEngine
+        The configured GPU engine to use.
+    physical : bool, default True
+        If True, show the physical (lowered) plan.
+        If False, show the logical (pre-lowering) plan.
+
+    Returns
+    -------
+    str
+        A string representation of the IR plan.
+    """
+    config = ConfigOptions.from_polars_engine(engine)
+    ir = Translator(q._ldf.visit(), engine).translate_ir()
+
+    if physical:
+        lowered_ir, partition_info = lower_ir_graph(ir, config)
+        return _repr_ir_tree(lowered_ir, partition_info)
+    else:
+        return _repr_ir_tree(ir)
+
+
+def _repr_ir_tree(
+    ir: IR,
+    partition_info: MutableMapping[IR, PartitionInfo] | None = None,
+    *,
+    offset: str = "",
+) -> str:
+    header = _repr_ir(ir, offset=offset)
+    count = partition_info[ir].count if partition_info else None
+    if count is not None:
+        header = header.rstrip("\n") + f" [{count}]\n"
+
+    children_strs = [
+        _repr_ir_tree(child, partition_info, offset=offset + "  ")
+        for child in ir.children
+    ]
+
+    return header + "".join(
+        f"{line}{offset} (repeated {count} times)\n"
+        if (count := sum(1 for _ in group)) > 1
+        else line
+        for line, group in groupby(children_strs)
+    )
+
+
+def _repr_schema(schema: tuple | None) -> str:
+    if schema is None:
+        return ""  # pragma: no cover; no test yet
+    names = tuple(schema)
+    if len(names) > 6:
+        names = names[:3] + ("...",) + names[-2:]
+    return f" {names}"
+
+
+def _repr_header(offset: str, label: str, schema: tuple | dict | None) -> str:
+    return f"{offset}{label}{_repr_schema(tuple(schema) if schema is not None else None)}\n"
+
+
+@functools.singledispatch
+def _repr_ir(ir: IR, *, offset: str = "") -> str:
+    return _repr_header(offset, type(ir).__name__.upper(), ir.schema)
+
+
+@_repr_ir.register
+def _(ir: GroupBy, *, offset: str = "") -> str:
+    keys = tuple(ne.name for ne in ir.keys)
+    return _repr_header(offset, f"GROUPBY {keys}", ir.schema)
+
+
+@_repr_ir.register
+def _(ir: Join, *, offset: str = "") -> str:
+    left_on = tuple(ne.name for ne in ir.left_on)
+    right_on = tuple(ne.name for ne in ir.right_on)
+    return _repr_header(offset, f"JOIN {ir.options[0]} {left_on} {right_on}", ir.schema)
+
+
+@_repr_ir.register
+def _(ir: Sort, *, offset: str = "") -> str:
+    by = tuple(ne.name for ne in ir.by)
+    return _repr_header(offset, f"SORT {by}", ir.schema)
+
+
+@_repr_ir.register
+def _(ir: Scan, *, offset: str = "") -> str:
+    label = f"SCAN {ir.typ.upper()}"
+    return _repr_header(offset, label, ir.schema)