cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +28 -7
- cudf_polars/containers/column.py +51 -26
- cudf_polars/dsl/expressions/binaryop.py +1 -1
- cudf_polars/dsl/expressions/boolean.py +1 -1
- cudf_polars/dsl/expressions/selection.py +1 -1
- cudf_polars/dsl/expressions/string.py +29 -20
- cudf_polars/dsl/expressions/ternary.py +25 -1
- cudf_polars/dsl/expressions/unary.py +11 -8
- cudf_polars/dsl/ir.py +351 -281
- cudf_polars/dsl/translate.py +18 -15
- cudf_polars/dsl/utils/aggregations.py +10 -5
- cudf_polars/experimental/base.py +10 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1 -1
- cudf_polars/experimental/benchmarks/utils.py +83 -2
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +1 -1
- cudf_polars/experimental/expressions.py +8 -5
- cudf_polars/experimental/groupby.py +2 -0
- cudf_polars/experimental/io.py +64 -42
- cudf_polars/experimental/join.py +15 -2
- cudf_polars/experimental/parallel.py +10 -7
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
- cudf_polars/experimental/rapidsmpf/core.py +194 -67
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
- cudf_polars/experimental/rapidsmpf/io.py +162 -70
- cudf_polars/experimental/rapidsmpf/join.py +162 -77
- cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
- cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
- cudf_polars/experimental/rapidsmpf/union.py +24 -5
- cudf_polars/experimental/rapidsmpf/utils.py +228 -16
- cudf_polars/experimental/shuffle.py +18 -4
- cudf_polars/experimental/sort.py +13 -6
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/testing/plugin.py +6 -3
- cudf_polars/utils/config.py +67 -0
- cudf_polars/utils/versions.py +3 -3
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/join.py
CHANGED
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
     from cudf_polars.dsl.expr import NamedExpr
     from cudf_polars.dsl.ir import IR, IRExecutionContext
     from cudf_polars.experimental.parallel import LowerIRTransformer
-    from cudf_polars.utils.config import ShuffleMethod
+    from cudf_polars.utils.config import ShuffleMethod, ShufflerInsertionMethod
 
 
 def _maybe_shuffle_frame(
@@ -30,6 +30,8 @@ def _maybe_shuffle_frame(
     partition_info: MutableMapping[IR, PartitionInfo],
     shuffle_method: ShuffleMethod,
     output_count: int,
+    *,
+    shuffler_insertion_method: ShufflerInsertionMethod,
 ) -> IR:
     # Shuffle `frame` if it isn't already shuffled.
     if (
@@ -44,6 +46,7 @@ def _maybe_shuffle_frame(
             frame.schema,
             on,
             shuffle_method,
+            shuffler_insertion_method,
             frame,
         )
         partition_info[frame] = PartitionInfo(
@@ -60,6 +63,8 @@ def _make_hash_join(
     left: IR,
     right: IR,
     shuffle_method: ShuffleMethod,
+    *,
+    shuffler_insertion_method: ShufflerInsertionMethod,
 ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
     # Shuffle left and right dataframes (if necessary)
     new_left = _maybe_shuffle_frame(
@@ -68,6 +73,7 @@ def _make_hash_join(
         partition_info,
         shuffle_method,
         output_count,
+        shuffler_insertion_method=shuffler_insertion_method,
     )
     new_right = _maybe_shuffle_frame(
         right,
@@ -75,6 +81,7 @@ def _make_hash_join(
         partition_info,
         shuffle_method,
         output_count,
+        shuffler_insertion_method=shuffler_insertion_method,
     )
     if left != new_left or right != new_right:
         ir = ir.reconstruct([new_left, new_right])
@@ -144,7 +151,9 @@ def _make_bcast_join(
     left: IR,
     right: IR,
     shuffle_method: ShuffleMethod,
+    *,
     streaming_runtime: str,
+    shuffler_insertion_method: ShufflerInsertionMethod,
 ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
     if ir.options[0] != "Inner":
         left_count = partition_info[left].count
@@ -171,6 +180,7 @@ def _make_bcast_join(
             partition_info,
             shuffle_method,
             right_count,
+            shuffler_insertion_method=shuffler_insertion_method,
         )
     else:
         left = _maybe_shuffle_frame(
@@ -179,6 +189,7 @@ def _make_bcast_join(
             partition_info,
             shuffle_method,
             left_count,
+            shuffler_insertion_method=shuffler_insertion_method,
         )
 
     new_node = ir.reconstruct([left, right])
@@ -290,7 +301,8 @@ def _(
             left,
             right,
             config_options.executor.shuffle_method,
-            config_options.executor.runtime,
+            streaming_runtime=config_options.executor.runtime,
+            shuffler_insertion_method=config_options.executor.shuffler_insertion_method,
         )
     else:
         # Create a hash join
@@ -301,6 +313,7 @@ def _(
             left,
             right,
             config_options.executor.shuffle_method,
+            shuffler_insertion_method=config_options.executor.shuffler_insertion_method,
        )
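The join lowering above threads the new `shuffler_insertion_method` option through every helper as a keyword-only argument (note the bare `*` added to each signature). A minimal, self-contained sketch of that pattern, with a hypothetical `_shuffle` helper and plain strings standing in for the real `ShuffleMethod`/`ShufflerInsertionMethod` config values:

```python
def _shuffle(
    frame: str,
    shuffle_method: str,
    output_count: int,
    *,  # everything after the bare `*` must be passed by keyword
    shuffler_insertion_method: str,
) -> str:
    # Hypothetical helper; not the cudf-polars implementation.
    return f"{frame}/{shuffle_method}/{output_count}/{shuffler_insertion_method}"


# Existing positional call sites cannot silently bind the new option;
# it always has to be spelled out, exactly as in the diff above.
print(_shuffle("lineitem", "tasks", 8, shuffler_insertion_method="blocking"))
```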
cudf_polars/experimental/parallel.py
CHANGED

@@ -45,6 +45,7 @@ if TYPE_CHECKING:
 
     import polars as pl
 
+    from cudf_polars.experimental.base import StatsCollector
     from cudf_polars.experimental.dispatch import LowerIRTransformer, State
     from cudf_polars.utils.config import ConfigOptions
 
@@ -61,7 +62,7 @@ def _(
 
 def lower_ir_graph(
     ir: IR, config_options: ConfigOptions
-) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+) -> tuple[IR, MutableMapping[IR, PartitionInfo], StatsCollector]:
     """
     Rewrite an IR graph and extract partitioning information.
 
@@ -74,9 +75,10 @@ def lower_ir_graph(
 
     Returns
     -------
-    new_ir, partition_info
-        The rewritten graph, and a mapping from unique nodes
-        in the new graph to associated partitioning information.
+    new_ir, partition_info, stats
+        The rewritten graph, a mapping from unique nodes
+        in the new graph to associated partitioning information,
+        and the statistics collector.
 
     Notes
     -----
@@ -92,7 +94,7 @@ def lower_ir_graph(
         "stats": collect_statistics(ir, config_options),
     }
     mapper: LowerIRTransformer = CachingVisitor(lower_ir_node, state=state)
-    return mapper(ir)
+    return *mapper(ir), state["stats"]
 
 
 def task_graph(
@@ -245,7 +247,8 @@ def evaluate_rapidsmpf(
     """
     from cudf_polars.experimental.rapidsmpf.core import evaluate_logical_plan
 
-    return evaluate_logical_plan(ir, config_options)
+    result, _ = evaluate_logical_plan(ir, config_options, collect_metadata=False)
+    return result
 
 
 def evaluate_streaming(
@@ -277,7 +280,7 @@ def evaluate_streaming(
         return evaluate_rapidsmpf(ir, config_options)
     else:
         # Using the default task engine.
-        ir, partition_info = lower_ir_graph(ir, config_options)
+        ir, partition_info, _ = lower_ir_graph(ir, config_options)
 
         graph, key = task_graph(ir, partition_info, config_options)
 
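The `return *mapper(ir), state["stats"]` form above splices the visitor's two-element result and appends the collector, producing a flat three-tuple; callers that do not need the statistics discard the third element, as `evaluate_streaming` now does. A small generic illustration (stand-in names, not the cudf-polars API):

```python
def mapper(ir: str) -> tuple[str, dict]:
    # Stand-in for the CachingVisitor call: returns (new_ir, partition_info).
    return ir + "-lowered", {"count": 4}


def lower(ir: str) -> tuple[str, dict, dict]:
    stats = {"rows": 100}  # stand-in for the StatsCollector
    # `*mapper(ir)` splices the 2-tuple in place, yielding a flat 3-tuple.
    return *mapper(ir), stats


# Callers that do not need the statistics discard the third element:
new_ir, partition_info, _ = lower("scan")
```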
cudf_polars/experimental/rapidsmpf/collectives/__init__.py
ADDED

@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Collective operations for the RapidsMPF streaming runtime."""
+
+from __future__ import annotations
+
+from cudf_polars.experimental.rapidsmpf.collectives.common import ReserveOpIDs
+
+__all__ = ["ReserveOpIDs"]
cudf_polars/experimental/rapidsmpf/collectives/allgather.py
ADDED

@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""AllGather logic for the RapidsMPF streaming runtime."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING
+
+from rapidsmpf.integrations.cudf.partition import unpack_and_concat
+from rapidsmpf.memory.packed_data import PackedData
+from rapidsmpf.streaming.coll.allgather import AllGather
+
+from pylibcudf.contiguous_split import pack
+
+if TYPE_CHECKING:
+    from rapidsmpf.streaming.core.context import Context
+    from rapidsmpf.streaming.cudf.table_chunk import TableChunk
+
+    import pylibcudf as plc
+    from rmm.pylibrmm.stream import Stream
+
+
+class AllGatherManager:
+    """
+    AllGather manager.
+
+    Parameters
+    ----------
+    context: Context
+        The streaming context.
+    op_id: int
+        Pre-allocated operation ID for this operation.
+    """
+
+    def __init__(self, context: Context, op_id: int):
+        self.context = context
+        self.allgather = AllGather(self.context, op_id)
+
+    def insert(self, sequence_number: int, chunk: TableChunk) -> None:
+        """
+        Insert a chunk into the AllGatherContext.
+
+        Parameters
+        ----------
+        sequence_number: int
+            The sequence number of the chunk to insert.
+        chunk: TableChunk
+            The table chunk to insert.
+        """
+        self.allgather.insert(
+            sequence_number,
+            PackedData.from_cudf_packed_columns(
+                pack(
+                    chunk.table_view(),
+                    chunk.stream,
+                ),
+                chunk.stream,
+                self.context.br(),
+            ),
+        )
+        del chunk
+
+    def insert_finished(self) -> None:
+        """Insert finished into the AllGatherManager."""
+        self.allgather.insert_finished()
+
+    async def extract_concatenated(
+        self, stream: Stream, *, ordered: bool = True
+    ) -> plc.Table:
+        """
+        Extract the concatenated result.
+
+        Parameters
+        ----------
+        stream: Stream
+            The stream to use for chunk extraction.
+        ordered: bool
+            Whether to extract the data in ordered or unordered fashion.
+
+        Returns
+        -------
+        The concatenated AllGather result.
+        """
+        return await asyncio.to_thread(
+            unpack_and_concat,
+            partitions=await self.allgather.extract_all(self.context, ordered=ordered),
+            stream=stream,
+            br=self.context.br(),
+        )
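A usage sketch of the manager above, assuming a rapidsmpf streaming `context`, a pre-reserved `op_id`, an iterable of `TableChunk`s, and a CUDA `stream` supplied by the surrounding pipeline; this is illustrative, not code from the package:

```python
async def gather_all(context, op_id, chunks, stream):
    """Gather every rank's chunks into one concatenated table (sketch)."""
    manager = AllGatherManager(context, op_id)
    for sequence_number, chunk in enumerate(chunks):
        # Each rank contributes its local chunks, tagged by sequence number.
        manager.insert(sequence_number, chunk)
    # Signal that this rank has nothing more to contribute.
    manager.insert_finished()
    # Wait for all ranks' data, then unpack and concatenate it off the
    # event loop (extract_concatenated uses asyncio.to_thread internally).
    return await manager.extract_concatenated(stream, ordered=True)
```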
cudf_polars/experimental/rapidsmpf/collectives/common.py
ADDED

@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Common utilities for collective operations."""
+
+from __future__ import annotations
+
+import threading
+from typing import TYPE_CHECKING, Literal
+
+from rapidsmpf.shuffler import Shuffler
+
+from cudf_polars.dsl.traversal import traversal
+from cudf_polars.experimental.join import Join
+from cudf_polars.experimental.repartition import Repartition
+from cudf_polars.experimental.shuffle import Shuffle
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from cudf_polars.dsl.ir import IR
+
+
+# Set of available collective IDs
+_collective_id_vacancy: set[int] = set(range(Shuffler.max_concurrent_shuffles))
+_collective_id_vacancy_lock: threading.Lock = threading.Lock()
+
+
+def _get_new_collective_id() -> int:
+    with _collective_id_vacancy_lock:
+        if not _collective_id_vacancy:
+            raise ValueError(
+                f"Cannot shuffle more than {Shuffler.max_concurrent_shuffles} "
+                "times in a single query."
+            )
+
+        return _collective_id_vacancy.pop()
+
+
+def _release_collective_id(collective_id: int) -> None:
+    """Release a collective ID back to the vacancy set."""
+    with _collective_id_vacancy_lock:
+        _collective_id_vacancy.add(collective_id)
+
+
+class ReserveOpIDs:
+    """
+    Context manager to reserve collective IDs for pipeline execution.
+
+    Parameters
+    ----------
+    ir : IR
+        The root IR node of the pipeline.
+
+    Notes
+    -----
+    This context manager:
+    1. Identifies all Shuffle nodes in the IR
+    2. Reserves collective IDs from the vacancy pool
+    3. Creates a mapping from IR nodes to their reserved IDs
+    4. Releases all IDs back to the pool on __exit__
+    """
+
+    def __init__(self, ir: IR):
+        # Find all collective IR nodes.
+        self.collective_nodes: list[IR] = [
+            node
+            for node in traversal([ir])
+            if isinstance(node, (Shuffle, Join, Repartition))
+        ]
+        self.collective_id_map: dict[IR, int] = {}
+
+    def __enter__(self) -> dict[IR, int]:
+        """
+        Reserve collective IDs and return the mapping.
+
+        Returns
+        -------
+        collective_id_map : dict[IR, int]
+            Mapping from IR nodes to their reserved collective IDs.
+        """
+        # Reserve IDs and map nodes directly to their IDs
+        for node in self.collective_nodes:
+            self.collective_id_map[node] = _get_new_collective_id()
+
+        return self.collective_id_map
+
+    def __exit__(
+        self,
+        exc_type: type | None,
+        exc_val: Exception | None,
+        exc_tb: TracebackType | None,
+    ) -> Literal[False]:
+        """Release all reserved collective IDs back to the vacancy pool."""
+        for collective_id in self.collective_id_map.values():
+            _release_collective_id(collective_id)
+        return False
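The intended usage is to bracket pipeline execution so that every collective node holds a distinct operation ID for exactly the lifetime of the run. A sketch, where `build_and_run_network` is a hypothetical stand-in for the network driver:

```python
from cudf_polars.experimental.rapidsmpf.collectives import ReserveOpIDs


def build_and_run_network(ir, collective_id_map):
    # Hypothetical stand-in for the rapidsmpf network construction/execution.
    print(f"running with {len(collective_id_map)} reserved collective IDs")


def run(ir):
    # IDs are reserved on __enter__ and always released on __exit__, even if
    # the body raises, so a failed query cannot starve later queries of the
    # bounded ID pool.
    with ReserveOpIDs(ir) as collective_id_map:
        build_and_run_network(ir, collective_id_map)
```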
cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py}
RENAMED

@@ -1,20 +1,16 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Shuffle logic for the RapidsMPF streaming runtime."""
 
 from __future__ import annotations
 
-import threading
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any
 
-from rapidsmpf.communicator.single import new_communicator
-from rapidsmpf.config import Options, get_environment_variables
 from rapidsmpf.integrations.cudf.partition import (
     partition_and_pack as py_partition_and_pack,
     unpack_and_concat as py_unpack_and_concat,
 )
-from rapidsmpf.progress_thread import ProgressThread
-from rapidsmpf.shuffler import Shuffler
+from rapidsmpf.streaming.coll.shuffler import ShufflerAsync
 from rapidsmpf.streaming.core.message import Message
 from rapidsmpf.streaming.core.node import define_py_node
 from rapidsmpf.streaming.cudf.table_chunk import TableChunk
@@ -24,12 +20,13 @@ from cudf_polars.experimental.rapidsmpf.dispatch import (
     generate_ir_sub_network,
 )
 from cudf_polars.experimental.rapidsmpf.nodes import shutdown_on_error
-from cudf_polars.experimental.rapidsmpf.utils import ChannelManager
+from cudf_polars.experimental.rapidsmpf.utils import (
+    ChannelManager,
+    Metadata,
+)
 from cudf_polars.experimental.shuffle import Shuffle
 
 if TYPE_CHECKING:
-    from types import TracebackType
-
     from rapidsmpf.streaming.core.context import Context
 
     import pylibcudf as plc
@@ -40,36 +37,9 @@ if TYPE_CHECKING:
     from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
 
 
-
-# Multi-GPU support will require a distinct GlobalShuffle
-# context manager, and updated _shuffle_id_vacancy logic.
-
-
-# Set of available shuffle IDs
-_shuffle_id_vacancy: set[int] = set(range(Shuffler.max_concurrent_shuffles))
-_shuffle_id_vacancy_lock: threading.Lock = threading.Lock()
-
-
-def _get_new_shuffle_id() -> int:
-    with _shuffle_id_vacancy_lock:
-        if not _shuffle_id_vacancy:
-            raise ValueError(
-                f"Cannot shuffle more than {Shuffler.max_concurrent_shuffles} "
-                "times in a single query."
-            )
-
-        return _shuffle_id_vacancy.pop()
-
-
-def _release_shuffle_id(op_id: int) -> None:
-    """Release a shuffle ID back to the vacancy set."""
-    with _shuffle_id_vacancy_lock:
-        _shuffle_id_vacancy.add(op_id)
-
-
-class LocalShuffle:
+class ShuffleManager:
     """
-    Local shuffle manager.
+    ShufflerAsync manager.
 
     Parameters
     ----------
@@ -79,6 +49,8 @@ class LocalShuffle:
         The number of partitions to shuffle into.
     columns_to_hash: tuple[int, ...]
         The columns to hash.
+    collective_id: int
+        The collective ID.
     """
 
     def __init__(
@@ -86,43 +58,20 @@ class LocalShuffle:
         context: Context,
         num_partitions: int,
         columns_to_hash: tuple[int, ...],
+        collective_id: int,
     ):
         self.context = context
-        self.br = context.br()
         self.num_partitions = num_partitions
         self.columns_to_hash = columns_to_hash
-        self.shuffler: Shuffler | None = None
-
-    def __enter__(self) -> LocalShuffle:
-        """Enter the local shuffle instance context manager."""
-        self.op_id = _get_new_shuffle_id()
-        statistics = self.context.statistics()
-        comm = new_communicator(Options(get_environment_variables()))
-        progress_thread = ProgressThread(comm, statistics)
-        self.shuffler = Shuffler(
-            comm=comm,
-            progress_thread=progress_thread,
-            op_id=self.op_id,
-            total_num_partitions=self.num_partitions,
-            br=self.br,
-            statistics=statistics,
+        self.shuffler = ShufflerAsync(
+            context,
+            collective_id,
+            num_partitions,
         )
-        return self
-
-    def __exit__(
-        self,
-        exc_type: type | None,
-        exc_val: Exception | None,
-        exc_tb: TracebackType | None,
-    ) -> Literal[False]:
-        """Exit the local shuffle instance context manager."""
-        self.shuffler.shutdown()
-        _release_shuffle_id(self.op_id)
-        return False
 
     def insert_chunk(self, chunk: TableChunk) -> None:
         """
-        Insert a chunk into the LocalShuffle.
+        Insert a chunk into the ShuffleContext.
 
         Parameters
         ----------
@@ -135,15 +84,19 @@ class LocalShuffle:
             columns_to_hash=self.columns_to_hash,
             num_partitions=self.num_partitions,
             stream=chunk.stream,
-            br=self.br,
+            br=self.context.br(),
         )
 
         # Insert into shuffler
-        self.shuffler.insert_chunks(partitioned_chunks)
+        self.shuffler.insert(partitioned_chunks)
+
+    async def insert_finished(self) -> None:
+        """Insert finished into the ShuffleManager."""
+        await self.shuffler.insert_finished(self.context)
 
-    def extract_chunk(self, sequence_number: int, stream: Stream) -> plc.Table:
+    async def extract_chunk(self, sequence_number: int, stream: Stream) -> plc.Table:
         """
-        Extract a chunk from the LocalShuffle.
+        Extract a chunk from the ShuffleManager.
 
         Parameters
         ----------
@@ -156,21 +109,18 @@ class LocalShuffle:
        -------
        The extracted table.
        """
-        # Tell the shuffler that we are done inserting
-        self.shuffler.insert_finished()
-
-        # Wait until the requested partition is ready
-        self.shuffler.wait_on(sequence_number)
-        partition_chunks = self.shuffler.extract(sequence_number)
+        partition_chunks = await self.shuffler.extract_async(
+            self.context, sequence_number
+        )
         return py_unpack_and_concat(
             partitions=partition_chunks,
             stream=stream,
-            br=self.br,
+            br=self.context.br(),
         )
 
 
 @define_py_node()
-async def local_shuffle_node(
+async def shuffle_node(
     context: Context,
     ir: Shuffle,
     ir_context: IRExecutionContext,
@@ -178,6 +128,7 @@ async def local_shuffle_node(
     ch_out: ChannelPair,
     columns_to_hash: tuple[int, ...],
     num_partitions: int,
+    collective_id: int,
 ) -> None:
     """
     Execute a local shuffle pipeline in a single node.
@@ -202,46 +153,68 @@ async def local_shuffle_node(
         Tuple of column indices to use for hashing.
     num_partitions
         Number of partitions to shuffle into.
+    collective_id
+        The collective ID.
     """
     async with shutdown_on_error(
         context, ch_in.metadata, ch_in.data, ch_out.metadata, ch_out.data
     ):
-        # Forward metadata from the input to the output channel
-        metadata = await ch_in.recv_metadata(context)
-        await ch_out.send_metadata(context, metadata)
-
-        # Create LocalShuffle instance (manages the shuffle ID)
-        with LocalShuffle(context, num_partitions, columns_to_hash) as local_shuffle:
-
-            # Process input chunks
-            while (msg := await ch_in.data.recv(context)) is not None:
-                # Extract TableChunk from message
-                chunk = TableChunk.from_message(msg).make_available_and_spill(
-                    context.br(), allow_overbooking=True
-                )
+        # Receive and send updated metadata.
+        _ = await ch_in.recv_metadata(context)
+        column_names = list(ir.schema.keys())
+        partitioned_on = tuple(column_names[i] for i in columns_to_hash)
+        output_metadata = Metadata(
+            max(1, num_partitions // context.comm().nranks),
+            partitioned_on=partitioned_on,
+        )
+        await ch_out.send_metadata(context, output_metadata)
 
-                # Insert into shuffler
-                local_shuffle.insert_chunk(chunk)
-                del msg, chunk
-
-            # LocalShuffle.extract_chunk handles insert_finished, wait, extract, and unpack
-            stream = ir_context.get_cuda_stream()
-            for partition_id in range(num_partitions):
-                # Create a new TableChunk with the result
-                output_chunk = TableChunk.from_pylibcudf_table(
-                    table=local_shuffle.extract_chunk(partition_id, stream),
-                    stream=stream,
-                    exclusive_view=True,
-                )
+        # Create ShuffleManager instance
+        shuffle = ShuffleManager(
+            context, num_partitions, columns_to_hash, collective_id
+        )
 
-                # Send the output chunk
-                await ch_out.data.send(context, Message(partition_id, output_chunk))
+        # Process input chunks
+        while (msg := await ch_in.data.recv(context)) is not None:
+            # Extract TableChunk from message and insert into shuffler
+            shuffle.insert_chunk(
+                TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
+            )
+            del msg
+
+        # Insert finished
+        await shuffle.insert_finished()
+
+        # Extract shuffled partitions and send them out
+        stream = ir_context.get_cuda_stream()
+        for partition_id in range(
+            # Round-robin partition assignment
+            context.comm().rank,
+            num_partitions,
+            context.comm().nranks,
+        ):
+            # Extract and send the output chunk
+            await ch_out.data.send(
+                context,
+                Message(
+                    partition_id,
+                    TableChunk.from_pylibcudf_table(
+                        table=await shuffle.extract_chunk(partition_id, stream),
+                        stream=stream,
+                        exclusive_view=True,
+                    ),
+                ),
+            )
 
         await ch_out.data.drain(context)
 
 
 @generate_ir_sub_network.register(Shuffle)
-def _(ir: Shuffle, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Shuffle, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     # Local shuffle operation.
 
     # Process children
@@ -257,13 +230,15 @@ def _(ir: Shuffle, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
     columns_to_hash = tuple(column_names.index(k.name) for k in keys)
     num_partitions = rec.state["partition_info"][ir].count
 
+    # Look up the reserved collective ID for this operation
+    collective_id = rec.state["collective_id_map"][ir]
+
     # Create output ChannelManager
     channels[ir] = ChannelManager(rec.state["context"])
 
-    # Complete shuffle operation
-
-    nodes.append(
-        local_shuffle_node(
+    # Complete shuffle node
+    nodes[ir] = [
+        shuffle_node(
             context,
             ir,
             rec.state["ir_context"],
@@ -271,7 +246,8 @@ def _(ir: Shuffle, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
             ch_out=channels[ir].reserve_input_slot(),
             columns_to_hash=columns_to_hash,
             num_partitions=num_partitions,
+            collective_id=collective_id,
         )
-    )
+    ]
 
     return nodes, channels