PyPI - cudf-polars-cu13 - Versions diffs - 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl - Mend

cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

cudf_polars/GIT_COMMIT +1 -1
cudf_polars/VERSION +1 -1
cudf_polars/callback.py +28 -7
cudf_polars/containers/column.py +51 -26
cudf_polars/dsl/expressions/binaryop.py +1 -1
cudf_polars/dsl/expressions/boolean.py +1 -1
cudf_polars/dsl/expressions/selection.py +1 -1
cudf_polars/dsl/expressions/string.py +29 -20
cudf_polars/dsl/expressions/ternary.py +25 -1
cudf_polars/dsl/expressions/unary.py +11 -8
cudf_polars/dsl/ir.py +351 -281
cudf_polars/dsl/translate.py +18 -15
cudf_polars/dsl/utils/aggregations.py +10 -5
cudf_polars/experimental/base.py +10 -0
cudf_polars/experimental/benchmarks/pdsh.py +1 -1
cudf_polars/experimental/benchmarks/utils.py +83 -2
cudf_polars/experimental/distinct.py +2 -0
cudf_polars/experimental/explain.py +1 -1
cudf_polars/experimental/expressions.py +8 -5
cudf_polars/experimental/groupby.py +2 -0
cudf_polars/experimental/io.py +64 -42
cudf_polars/experimental/join.py +15 -2
cudf_polars/experimental/parallel.py +10 -7
cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
cudf_polars/experimental/rapidsmpf/core.py +194 -67
cudf_polars/experimental/rapidsmpf/dask.py +172 -0
cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
cudf_polars/experimental/rapidsmpf/io.py +162 -70
cudf_polars/experimental/rapidsmpf/join.py +162 -77
cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
cudf_polars/experimental/rapidsmpf/union.py +24 -5
cudf_polars/experimental/rapidsmpf/utils.py +228 -16
cudf_polars/experimental/shuffle.py +18 -4
cudf_polars/experimental/sort.py +13 -6
cudf_polars/experimental/spilling.py +1 -1
cudf_polars/testing/plugin.py +6 -3
cudf_polars/utils/config.py +67 -0
cudf_polars/utils/versions.py +3 -3
{cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
{cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
{cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
{cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
{cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0

cudf_polars/experimental/rapidsmpf/repartition.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Re-chunking logic for the RapidsMPF streaming runtime."""
@@ -7,14 +7,21 @@ from __future__ import annotations
 import math
 from typing import TYPE_CHECKING, Any
+from rapidsmpf.memory.buffer import MemoryType
 from rapidsmpf.streaming.core.message import Message
 from rapidsmpf.streaming.core.node import define_py_node
 from rapidsmpf.streaming.cudf.table_chunk import TableChunk
 from cudf_polars.containers import DataFrame
+from cudf_polars.experimental.rapidsmpf.collectives.allgather import AllGatherManager
 from cudf_polars.experimental.rapidsmpf.dispatch import generate_ir_sub_network
 from cudf_polars.experimental.rapidsmpf.nodes import shutdown_on_error
-from cudf_polars.experimental.rapidsmpf.utils import ChannelManager
+from cudf_polars.experimental.rapidsmpf.utils import (
+    ChannelManager,
+    Metadata,
+    empty_table_chunk,
+    opaque_reservation,
+)
 from cudf_polars.experimental.repartition import Repartition
 from cudf_polars.experimental.utils import _concat
@@ -34,7 +41,8 @@ async def concatenate_node(
     ch_out: ChannelPair,
     ch_in: ChannelPair,
     *,
-    max_chunks: int | None,
+    output_count: int,
+    collective_id: int,
 ) -> None:
     """
     Concatenate node for rapidsmpf.
@@ -51,66 +59,122 @@ async def concatenate_node(
         The output ChannelPair.
     ch_in
         The input ChannelPair.
-    max_chunks
-        The maximum number of chunks to concatenate at once.
-        If `None`, concatenate all input chunks.
+    output_count
+        The expected number of output chunks.
+    collective_id
+        Pre-allocated collective ID for this operation.
     """
-    # TODO: Use multiple streams
-    max_chunks = max(2, max_chunks) if max_chunks else None
-    async with shutdown_on_error(context, ch_in.data, ch_out.data):
-        seq_num = 0
-        while True:
-            chunks: list[TableChunk] = []
-            msg: TableChunk | None = None
-            # Collect chunks up to max_chunks or until end of stream
-            while len(chunks) < (max_chunks or float("inf")):
-                msg = await ch_in.data.recv(context)
-                if msg is None:
-                    break
-                chunks.append(
-                    TableChunk.from_message(msg).make_available_and_spill(
-                        context.br(), allow_overbooking=True
-                    )
+    async with shutdown_on_error(
+        context, ch_in.metadata, ch_in.data, ch_out.metadata, ch_out.data
+    ):
+        # Receive metadata.
+        input_metadata = await ch_in.recv_metadata(context)
+        metadata = Metadata(output_count)
+        # max_chunks corresponds to the number of chunks we can
+        # concatenate together. If None, we must concatenate everything.
+        # Since a single-partition operation gets "special treatment",
+        # we must make sure `output_count == 1` is always satisfied.
+        max_chunks: int | None = None
+        if output_count > 1:
+            # Make sure max_chunks is at least 2.
+            max_chunks = max(2, math.ceil(input_metadata.count / output_count))
+        # Check if we need global communication.
+        need_global_repartition = (
+            # Avoid allgather of already-duplicated data
+            context.comm().nranks > 1
+            and not input_metadata.duplicated
+            and output_count == 1
+        )
+        chunks: list[TableChunk]
+        msg: TableChunk | None
+        if need_global_repartition:
+            # Assume this means "global repartitioning" for now
+            # Send metadata.
+            metadata.duplicated = True
+            await ch_out.send_metadata(context, metadata)
+            allgather = AllGatherManager(context, collective_id)
+            stream = context.get_stream_from_pool()
+            seq_num = 0
+            while (msg := await ch_in.data.recv(context)) is not None:
+                allgather.insert(seq_num, TableChunk.from_message(msg))
+                seq_num += 1
+                del msg
+            allgather.insert_finished()
+            # Extract concatenated result
+            result_table = await allgather.extract_concatenated(stream)
+            # If no chunks were gathered, result_table has 0 columns.
+            # We need to create an empty table with the correct schema.
+            if result_table.num_columns() == 0 and len(ir.schema) > 0:
+                output_chunk = empty_table_chunk(ir, context, stream)
+            else:
+                output_chunk = TableChunk.from_pylibcudf_table(
+                    result_table, stream, exclusive_view=True
                 )
-            # Process collected chunks
-            if chunks:
-                df = (
-                    DataFrame.from_table(
-                        chunks[0].table_view(),
-                        list(ir.schema.keys()),
-                        list(ir.schema.values()),
-                        chunks[0].stream,
+            await ch_out.data.send(context, Message(0, output_chunk))
+        else:
+            # Send metadata.
+            metadata.duplicated = input_metadata.duplicated
+            await ch_out.send_metadata(context, metadata)
+            # Local repartitioning
+            seq_num = 0
+            while True:
+                chunks = []
+                done_receiving = False
+                # Collect chunks up to max_chunks or until end of stream
+                while len(chunks) < (max_chunks or float("inf")):
+                    msg = await ch_in.data.recv(context)
+                    if msg is None:
+                        done_receiving = True
+                        break
+                    chunks.append(
+                        TableChunk.from_message(msg).make_available_and_spill(
+                            context.br(), allow_overbooking=True
+                        )
                     )
-                    if len(chunks) == 1
-                    else _concat(
-                        *(
-                            DataFrame.from_table(
-                                chunk.table_view(),
-                                list(ir.schema.keys()),
-                                list(ir.schema.values()),
-                                chunk.stream,
-                            )
-                            for chunk in chunks
-                        ),
-                        context=ir_context,
-                    )
-                )
-                await ch_out.data.send(
-                    context,
-                    Message(
-                        seq_num,
-                        TableChunk.from_pylibcudf_table(
-                            df.table, df.stream, exclusive_view=True
-                        ),
-                    ),
-                )
-                seq_num += 1
+                    del msg
-            # Break if we reached end of stream
-            if msg is None:
-                break
+                if chunks:
+                    input_bytes = sum(
+                        chunk.data_alloc_size(MemoryType.DEVICE) for chunk in chunks
+                    )
+                    with opaque_reservation(context, input_bytes):
+                        df = _concat(
+                            *(
+                                DataFrame.from_table(
+                                    chunk.table_view(),
+                                    list(ir.schema.keys()),
+                                    list(ir.schema.values()),
+                                    chunk.stream,
+                                )
+                                for chunk in chunks
+                            ),
+                            context=ir_context,
+                        )
+                        await ch_out.data.send(
+                            context,
+                            Message(
+                                seq_num,
+                                TableChunk.from_pylibcudf_table(
+                                    df.table, df.stream, exclusive_view=True
+                                ),
+                            ),
+                        )
+                        seq_num += 1
+                        del df, chunks
+                # Break if we reached end of stream
+                if done_receiving:
+                    break
         await ch_out.data.drain(context)
@@ -118,18 +182,15 @@ async def concatenate_node(
 @generate_ir_sub_network.register(Repartition)
 def _(
     ir: Repartition, rec: SubNetGenerator
-) -> tuple[list[Any], dict[IR, ChannelManager]]:
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     # Repartition node.
     partition_info = rec.state["partition_info"]
-    max_chunks: int | None = None
     if partition_info[ir].count > 1:
         count_output = partition_info[ir].count
         count_input = partition_info[ir.children[0]].count
         if count_input < count_output:
             raise ValueError("Repartitioning to more chunks is not supported.")
-        # Make sure max_chunks is at least 2
-        max_chunks = max(2, math.ceil(count_input / count_output))
     # Process children
     nodes, channels = rec(ir.children[0])
@@ -137,15 +198,19 @@ def _(
     # Create output ChannelManager
     channels[ir] = ChannelManager(rec.state["context"])
+    # Look up the reserved shuffle ID for this operation
+    collective_id = rec.state["collective_id_map"][ir]
     # Add python node
-    nodes.append(
+    nodes[ir] = [
         concatenate_node(
             rec.state["context"],
             ir,
             rec.state["ir_context"],
             channels[ir].reserve_input_slot(),
             channels[ir.children[0]].reserve_output_slot(),
-            max_chunks=max_chunks,
+            output_count=partition_info[ir].count,
+            collective_id=collective_id,
         )
-    )
+    ]
     return nodes, channels

cudf_polars/experimental/rapidsmpf/union.py CHANGED Viewed

@@ -16,6 +16,7 @@ from cudf_polars.experimental.rapidsmpf.dispatch import (
 from cudf_polars.experimental.rapidsmpf.nodes import define_py_node, shutdown_on_error
 from cudf_polars.experimental.rapidsmpf.utils import (
     ChannelManager,
+    Metadata,
     process_children,
 )
@@ -51,8 +52,24 @@ async def union_node(
     chs_in
         The input ChannelPairs.
     """
-    # TODO: Use multiple streams
-    async with shutdown_on_error(context, *[ch.data for ch in chs_in], ch_out.data):
+    async with shutdown_on_error(
+        context,
+        *[ch.metadata for ch in chs_in],
+        *[ch.data for ch in chs_in],
+        ch_out.metadata,
+        ch_out.data,
+    ):
+        # Merge and forward metadata.
+        total_count = 0
+        duplicated = True
+        for ch_in in chs_in:
+            metadata = await ch_in.recv_metadata(context)
+            total_count += metadata.count
+            duplicated = duplicated and metadata.duplicated
+        await ch_out.send_metadata(
+            context, Metadata(total_count, duplicated=duplicated)
+        )
         seq_num_offset = 0
         for ch_in in chs_in:
             num_ch_chunks = 0
@@ -73,7 +90,9 @@ async def union_node(
 @generate_ir_sub_network.register(Union)
-def _(ir: Union, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Union, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     # Union operation.
     # Pass-through all child chunks in channel order.
@@ -84,7 +103,7 @@ def _(ir: Union, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManag
     channels[ir] = ChannelManager(rec.state["context"])
     # Add simple python node
-    nodes.append(
+    nodes[ir] = [
         union_node(
             rec.state["context"],
             ir,
@@ -92,5 +111,5 @@ def _(ir: Union, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManag
             channels[ir].reserve_input_slot(),
             *[channels[c].reserve_output_slot() for c in ir.children],
         )
-    )
+    ]
     return nodes, channels

cudf_polars/experimental/rapidsmpf/utils.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Utility functions and classes for the RapidsMPF streaming runtime."""
@@ -6,26 +6,33 @@ from __future__ import annotations
 import asyncio
 import operator
-from contextlib import asynccontextmanager
+from contextlib import asynccontextmanager, contextmanager
 from dataclasses import dataclass
 from functools import reduce
-from typing import TYPE_CHECKING, Any, TypeAlias
+from typing import TYPE_CHECKING, Any
+from rapidsmpf.streaming.chunks.arbitrary import ArbitraryChunk
+from rapidsmpf.streaming.core.message import Message
+from rapidsmpf.streaming.cudf.table_chunk import TableChunk
+import pylibcudf as plc
+from cudf_polars.containers import DataFrame
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Callable, Iterator
+    from rapidsmpf.memory.memory_reservation import MemoryReservation
     from rapidsmpf.streaming.core.channel import Channel
     from rapidsmpf.streaming.core.context import Context
-    from rapidsmpf.streaming.cudf.table_chunk import TableChunk
+    from rapidsmpf.streaming.core.spillable_messages import SpillableMessages
+    from rmm.pylibrmm.stream import Stream
     from cudf_polars.dsl.ir import IR
     from cudf_polars.experimental.rapidsmpf.dispatch import SubNetGenerator
-# Type alias for metadata payloads (placeholder - not used yet)
-MetadataPayload: TypeAlias = Any
 @asynccontextmanager
 async def shutdown_on_error(
     context: Context, *channels: Channel[Any]
@@ -48,6 +55,29 @@ async def shutdown_on_error(
         raise
+class Metadata:
+    """Metadata payload for an individual ChannelPair."""
+    __slots__ = ("count", "duplicated", "partitioned_on")
+    count: int
+    """Chunk-count estimate."""
+    partitioned_on: tuple[str, ...]
+    """Partitioned-on columns."""
+    duplicated: bool
+    """Whether the data is duplicated on all workers."""
+    def __init__(
+        self,
+        count: int,
+        *,
+        partitioned_on: tuple[str, ...] = (),
+        duplicated: bool = False,
+    ):
+        self.count = count
+        self.partitioned_on = partitioned_on
+        self.duplicated = duplicated
 @dataclass
 class ChannelPair:
     """
@@ -70,7 +100,7 @@ class ChannelPair:
     in follow-up work.
     """
-    metadata: Channel[MetadataPayload]
+    metadata: Channel[ArbitraryChunk]
     data: Channel[TableChunk]
     @classmethod
@@ -81,6 +111,39 @@ class ChannelPair:
             data=context.create_channel(),
         )
+    async def send_metadata(self, ctx: Context, metadata: Metadata) -> None:
+        """
+        Send metadata and drain the metadata channel.
+        Parameters
+        ----------
+        ctx :
+            The streaming context.
+        metadata :
+            The metadata to send.
+        """
+        msg = Message(0, ArbitraryChunk(metadata))
+        await self.metadata.send(ctx, msg)
+        await self.metadata.drain(ctx)
+    async def recv_metadata(self, ctx: Context) -> Metadata:
+        """
+        Receive metadata from the metadata channel.
+        Parameters
+        ----------
+        ctx :
+            The streaming context.
+        Returns
+        -------
+        ChunkMetadata
+            The metadata, or None if channel is drained.
+        """
+        msg = await self.metadata.recv(ctx)
+        assert msg is not None, f"Expected Metadata message, got {msg}."
+        return ArbitraryChunk.from_message(msg).release()
 class ChannelManager:
     """A utility class for managing ChannelPair objects."""
@@ -131,13 +194,13 @@ class ChannelManager:
 def process_children(
     ir: IR, rec: SubNetGenerator
-) -> tuple[list[Any], dict[IR, ChannelManager]]:
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     """
     Process children IR nodes and aggregate their nodes and channels.
     This helper function recursively processes all children of an IR node,
-    collects their streaming network nodes into a flat list, and merges
-    their channel dictionaries.
+    collects their streaming network nodes into a dictionary mapping IR nodes
+    to their associated nodes, and merges their channel dictionaries.
     Parameters
     ----------
@@ -149,14 +212,163 @@ def process_children(
     Returns
     -------
     nodes
-        Flat list of all streaming network nodes from all children.
+        Dictionary mapping each IR node to its list of streaming network nodes.
     channels
         Dictionary mapping each child IR node to its ChannelManager.
     """
     if not ir.children:
-        return [], {}
+        return {}, {}
     _nodes_list, _channels_list = zip(*(rec(c) for c in ir.children), strict=True)
-    nodes: list[Any] = list(reduce(operator.add, _nodes_list, []))
+    nodes: dict[IR, list[Any]] = reduce(operator.or_, _nodes_list)
     channels: dict[IR, ChannelManager] = reduce(operator.or_, _channels_list)
     return nodes, channels
+def empty_table_chunk(ir: IR, context: Context, stream: Stream) -> TableChunk:
+    """
+    Make an empty table chunk.
+    Parameters
+    ----------
+    ir
+        The IR node to use for the schema.
+    context
+        The rapidsmpf context.
+    stream
+        The stream to use for the table chunk.
+    Returns
+    -------
+    The empty table chunk.
+    """
+    # Create an empty table with the correct schema
+    # Use dtype.plc_type to get the full DataType (preserves precision/scale for Decimals)
+    empty_columns = [
+        plc.column_factories.make_empty_column(dtype.plc_type, stream=stream)
+        for dtype in ir.schema.values()
+    ]
+    empty_table = plc.Table(empty_columns)
+    return TableChunk.from_pylibcudf_table(
+        empty_table,
+        stream,
+        exclusive_view=True,
+    )
+def chunk_to_frame(chunk: TableChunk, ir: IR) -> DataFrame:
+    """
+    Convert a TableChunk to a DataFrame.
+    Parameters
+    ----------
+    chunk
+        The TableChunk to convert.
+    ir
+        The IR node to use for the schema.
+    Returns
+    -------
+    A DataFrame.
+    """
+    return DataFrame.from_table(
+        chunk.table_view(),
+        list(ir.schema.keys()),
+        list(ir.schema.values()),
+        chunk.stream,
+    )
+def make_spill_function(
+    spillable_messages_list: list[SpillableMessages],
+    context: Context,
+) -> Callable[[int], int]:
+    """
+    Create a spill function for a list of SpillableMessages containers.
+    This utility creates a spill function that can be registered with a
+    SpillManager. The spill function uses a smart spilling strategy that
+    prioritizes:
+    1. Longest queues first (slow consumers that won't need data soon)
+    2. Newest messages first (just arrived, won't be consumed soon)
+    This strategy keeps "hot" data (about to be consumed) in fast memory
+    while spilling "cold" data (won't be needed for a while) to slower tiers.
+    Parameters
+    ----------
+    spillable_messages_list
+        List of SpillableMessages containers to create a spill function for.
+    context
+        The RapidsMPF context to use for accessing the BufferResource.
+    Returns
+    -------
+    A spill function that takes an amount (in bytes) and returns the
+    actual amount spilled (in bytes).
+    Notes
+    -----
+    The spilling strategy is particularly effective for fanout scenarios
+    where different consumers may process messages at different rates. By
+    prioritizing longest queues and newest messages, we maximize the time
+    data can remain in slower memory before it's needed.
+    """
+    def spill_func(amount: int) -> int:
+        """Spill messages from the buffers to free device/host memory."""
+        spilled = 0
+        # Collect all messages with metadata for smart spilling
+        # Format: (message_id, container_idx, queue_length, sm)
+        all_messages: list[tuple[int, int, int, SpillableMessages]] = []
+        for container_idx, sm in enumerate(spillable_messages_list):
+            content_descriptions = sm.get_content_descriptions()
+            queue_length = len(content_descriptions)
+            all_messages.extend(
+                (message_id, container_idx, queue_length, sm)
+                for message_id in content_descriptions
+            )
+        # Spill newest messages first from the longest queues
+        # Sort by: (1) queue length descending, (2) message_id descending
+        # This prioritizes:
+        # - Longest queues (slow consumers that won't need data soon)
+        # - Newest messages (just arrived, won't be consumed soon)
+        all_messages.sort(key=lambda x: (-x[2], -x[0]))
+        # Spill messages until we've freed enough memory
+        for message_id, _, _, sm in all_messages:
+            if spilled >= amount:
+                break
+            # Try to spill this message
+            spilled += sm.spill(mid=message_id, br=context.br())
+        return spilled
+    return spill_func
+@contextmanager
+def opaque_reservation(
+    context: Context,
+    estimated_bytes: int,
+) -> Iterator[MemoryReservation]:
+    """
+    Reserve memory for opaque allocations.
+    Parameters
+    ----------
+    context
+        The RapidsMPF context.
+    estimated_bytes
+        The estimated number of bytes to reserve.
+    Yields
+    ------
+    The memory reservation.
+    """
+    yield context.br().reserve_device_memory_and_spill(
+        estimated_bytes, allow_overbooking=True
+    )

cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl