cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/join.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Join logic for the RapidsMPF streaming runtime."""
 
@@ -7,11 +7,13 @@ from __future__ import annotations
 import asyncio
 from typing import TYPE_CHECKING, Any, Literal
 
+from rapidsmpf.memory.buffer import MemoryType
 from rapidsmpf.streaming.core.message import Message
 from rapidsmpf.streaming.cudf.table_chunk import TableChunk
 
 from cudf_polars.containers import DataFrame
 from cudf_polars.dsl.ir import IR, Join
+from cudf_polars.experimental.rapidsmpf.collectives.allgather import AllGatherManager
 from cudf_polars.experimental.rapidsmpf.dispatch import (
     generate_ir_sub_network,
 )
@@ -22,6 +24,10 @@ from cudf_polars.experimental.rapidsmpf.nodes import (
 )
 from cudf_polars.experimental.rapidsmpf.utils import (
     ChannelManager,
+    Metadata,
+    chunk_to_frame,
+    empty_table_chunk,
+    opaque_reservation,
     process_children,
 )
 from cudf_polars.experimental.utils import _concat
@@ -34,48 +40,6 @@ if TYPE_CHECKING:
     from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
 
 
-async def get_small_table(
-    context: Context,
-    small_child: IR,
-    ch_small: ChannelPair,
-) -> list[DataFrame]:
-    """
-    Get the small-table DataFrame partitions from the small-table ChannelPair.
-
-    Parameters
-    ----------
-    context
-        The rapidsmpf context.
-    small_child
-        The small-table child IR node.
-    ch_small
-        The small-table ChannelPair.
-
-    Returns
-    -------
-    list[DataFrame]
-        The small-table DataFrame partitions.
-    """
-    small_chunks = []
-    while (msg := await ch_small.data.recv(context)) is not None:
-        small_chunks.append(
-            TableChunk.from_message(msg).make_available_and_spill(
-                context.br(), allow_overbooking=True
-            )
-        )
-    assert small_chunks, "Empty small side"
-
-    return [
-        DataFrame.from_table(
-            small_chunk.table_view(),
-            list(small_child.schema.keys()),
-            list(small_child.schema.values()),
-            small_chunk.stream,
-        )
-        for small_chunk in small_chunks
-    ]
-
-
 @define_py_node()
 async def broadcast_join_node(
     context: Context,
@@ -85,6 +49,8 @@ async def broadcast_join_node(
     ch_left: ChannelPair,
     ch_right: ChannelPair,
     broadcast_side: Literal["left", "right"],
+    collective_id: int,
+    target_partition_size: int,
 ) -> None:
     """
     Join node for rapidsmpf.
@@ -105,33 +71,132 @@ async def broadcast_join_node(
         The right input ChannelPair.
     broadcast_side
         The side to broadcast.
+    collective_id
+        Pre-allocated collective ID for this operation.
+    target_partition_size
+        The target partition size in bytes.
     """
-    async with shutdown_on_error(context, ch_left.data, ch_right.data, ch_out.data):
+    async with shutdown_on_error(
+        context,
+        ch_left.metadata,
+        ch_left.data,
+        ch_right.metadata,
+        ch_right.data,
+        ch_out.metadata,
+        ch_out.data,
+    ):
+        # Receive metadata.
+        left_metadata, right_metadata = await asyncio.gather(
+            ch_left.recv_metadata(context),
+            ch_right.recv_metadata(context),
+        )
+
+        partitioned_on: tuple[str, ...] = ()
         if broadcast_side == "right":
             # Broadcast right, stream left
             small_ch = ch_right
             large_ch = ch_left
             small_child = ir.children[1]
             large_child = ir.children[0]
+            chunk_count = left_metadata.count
+            partitioned_on = left_metadata.partitioned_on
+            small_duplicated = right_metadata.duplicated
         else:
             # Broadcast left, stream right
             small_ch = ch_left
             large_ch = ch_right
             small_child = ir.children[0]
             large_child = ir.children[1]
+            chunk_count = right_metadata.count
+            small_duplicated = left_metadata.duplicated
+            if ir.options[0] == "Right":
+                partitioned_on = right_metadata.partitioned_on
 
-        # Collect small-side chunks
-        small_dfs = await get_small_table(context, small_child, small_ch)
-        if ir.options[0] != "Inner":
-            # TODO: Use local repartitioning for non-inner joins
-            small_dfs = [_concat(*small_dfs, context=ir_context)]
+        # Send metadata.
+        output_metadata = Metadata(
+            chunk_count,
+            partitioned_on=partitioned_on,
+            duplicated=left_metadata.duplicated and right_metadata.duplicated,
+        )
+        await ch_out.send_metadata(context, output_metadata)
 
-        # Stream through large side, joining with the small-side
-        while (msg := await large_ch.data.recv(context)) is not None:
-            large_chunk = TableChunk.from_message(msg).make_available_and_spill(
-                context.br(), allow_overbooking=True
+        # Collect small-side (may be empty if no data received)
+        small_chunks: list[TableChunk] = []
+        small_size = 0
+        while (msg := await small_ch.data.recv(context)) is not None:
+            small_chunks.append(
+                TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
             )
-            seq_num = msg.sequence_number
+            del msg
+            small_size += small_chunks[-1].data_alloc_size(MemoryType.DEVICE)
+
+        # Allgather is a collective - all ranks must participate even with no local data
+        need_allgather = context.comm().nranks > 1 and not small_duplicated
+        if need_allgather:
+            allgather = AllGatherManager(context, collective_id)
+            for s_id in range(len(small_chunks)):
+                allgather.insert(s_id, small_chunks.pop(0))
+            allgather.insert_finished()
+            stream = ir_context.get_cuda_stream()
+            # extract_concatenated returns a plc.Table, not a TableChunk
+            small_dfs = [
+                DataFrame.from_table(
+                    await allgather.extract_concatenated(stream),
+                    list(small_child.schema.keys()),
+                    list(small_child.schema.values()),
+                    stream,
+                )
+            ]
+        elif len(small_chunks) > 1 and (
+            ir.options[0] != "Inner" or small_size < target_partition_size
+        ):
+            # Pre-concat for non-inner joins, otherwise
+            # we need a local shuffle, and face additional
+            # memory pressure anyway.
+            small_dfs = [
+                _concat(
+                    *[chunk_to_frame(chunk, small_child) for chunk in small_chunks],
+                    context=ir_context,
+                )
+            ]
+            small_chunks.clear()  # small_dfs is not a view of small_chunks anymore
+        else:
+            small_dfs = [
+                chunk_to_frame(small_chunk, small_child) for small_chunk in small_chunks
+            ]
+
+        # Stream through large side, joining with the small-side
+        seq_num = 0
+        large_chunk_processed = False
+        receiving_large_chunks = True
+        while receiving_large_chunks:
+            msg = await large_ch.data.recv(context)
+            if msg is None:
+                receiving_large_chunks = False
+                if large_chunk_processed:
+                    # Normal exit - We've processed all large-table data
+                    break
+                elif small_dfs:
+                    # We received small-table data, but no large-table data.
+                    # This may never happen, but we can handle it by generating
+                    # an empty large-table chunk
+                    stream = ir_context.get_cuda_stream()
+                    large_chunk = empty_table_chunk(large_child, context, stream)
+                else:
+                    # We received no data for either the small or large table.
+                    # Drain the output channel and return
+                    await ch_out.data.drain(context)
+                    return
+            else:
+                large_chunk_processed = True
+                large_chunk = TableChunk.from_message(msg).make_available_and_spill(
+                    context.br(), allow_overbooking=True
+                )
+                seq_num = msg.sequence_number
+                del msg
+
             large_df = DataFrame.from_table(
                 large_chunk.table_view(),
                 list(large_child.schema.keys()),
@@ -139,10 +204,17 @@ async def broadcast_join_node(
                 large_chunk.stream,
             )
 
-            # Perform the join
-            df = _concat(
-                *[
-                    (
+            # Lazily create empty small table if small_dfs is empty
+            if not small_dfs:
+                stream = ir_context.get_cuda_stream()
+                empty_small_chunk = empty_table_chunk(small_child, context, stream)
+                small_dfs = [chunk_to_frame(empty_small_chunk, small_child)]
+
+            large_chunk_size = large_chunk.data_alloc_size(MemoryType.DEVICE)
+            input_bytes = large_chunk_size + small_size
+            with opaque_reservation(context, input_bytes):
+                df = _concat(
+                    *[
                         await asyncio.to_thread(
                             ir.do_evaluate,
                             *ir._non_child_args,
@@ -153,28 +225,31 @@ async def broadcast_join_node(
                             ),
                             context=ir_context,
                         )
-                    )
-                    for small_df in small_dfs
-                ],
-                context=ir_context,
-            )
+                        for small_df in small_dfs
+                    ],
+                    context=ir_context,
+                )
 
-            # Send output chunk
-            await ch_out.data.send(
-                context,
-                Message(
-                    seq_num,
-                    TableChunk.from_pylibcudf_table(
-                        df.table, df.stream, exclusive_view=True
+                # Send output chunk
+                await ch_out.data.send(
+                    context,
+                    Message(
+                        seq_num,
+                        TableChunk.from_pylibcudf_table(
+                            df.table, df.stream, exclusive_view=True
+                        ),
                     ),
-                ),
-            )
+                )
+                del df, large_df, large_chunk
 
+        del small_dfs, small_chunks
         await ch_out.data.drain(context)
 
 
 @generate_ir_sub_network.register(Join)
-def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Join, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     # Join operation.
     left, right = ir.children
     partition_info = rec.state["partition_info"]
@@ -200,7 +275,8 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
 
     if pwise_join:
         # Partition-wise join (use default_node_multi)
-        nodes.append(
+        partitioning_index = 1 if ir.options[0] == "Right" else 0
+        nodes[ir] = [
             default_node_multi(
                 rec.state["context"],
                 ir,
@@ -210,8 +286,9 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
                    channels[left].reserve_output_slot(),
                    channels[right].reserve_output_slot(),
                 ),
+                partitioning_index=partitioning_index,
             )
-        )
+        ]
        return nodes, channels
 
     else:
@@ -223,7 +300,13 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
     else:
        broadcast_side = "left"
 
-    nodes.append(
+    # Get target partition size
+    config_options = rec.state["config_options"]
+    executor = config_options.executor
+    assert executor.name == "streaming", "Join node requires streaming executor"
+    target_partition_size = executor.target_partition_size
+
+    nodes[ir] = [
        broadcast_join_node(
            rec.state["context"],
            ir,
@@ -232,6 +315,8 @@ def _(ir: Join, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManage
            channels[left].reserve_output_slot(),
            channels[right].reserve_output_slot(),
            broadcast_side=broadcast_side,
+            collective_id=rec.state["collective_id_map"][ir],
+            target_partition_size=target_partition_size,
        )
-    )
+    ]
    return nodes, channels
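
For orientation, the reworked broadcast_join_node above follows a collect-then-stream pattern: drain the broadcast (small) side first, allgather it across ranks when it is not already duplicated, optionally pre-concatenate it, then stream the large side and join every chunk against the collected small side. The sketch below illustrates only that control flow with plain asyncio queues; join_pair, concat, pre_concat, and the queue objects are hypothetical stand-ins for ir.do_evaluate, _concat, the size/join-type checks, and the rapidsmpf channels, not part of the package API.

import asyncio
from typing import Any, Callable

END = object()  # end-of-stream sentinel, playing the role of a None channel message


async def broadcast_join_sketch(
    small_in: asyncio.Queue,
    large_in: asyncio.Queue,
    out: asyncio.Queue,
    join_pair: Callable[[Any, Any], Any],  # stand-in for ir.do_evaluate on one pair
    concat: Callable[[list[Any]], Any],  # stand-in for _concat
    pre_concat: bool,  # e.g. non-inner join, or small side under the size target
) -> None:
    """Toy version of the streaming broadcast join: collect one side, stream the other."""
    # 1. Drain the broadcast (small) side completely. In the real node this is also
    #    where the allgather collective runs on multi-rank clusters.
    small: list[Any] = []
    while (chunk := await small_in.get()) is not END:
        small.append(chunk)
    if pre_concat and len(small) > 1:
        small = [concat(small)]

    # 2. Stream the large side, joining each chunk against every small partition
    #    and concatenating the partial results into one output chunk.
    while (chunk := await large_in.get()) is not END:
        await out.put(concat([join_pair(chunk, s) for s in small]))

    # 3. Signal end-of-stream downstream (the real node drains its output channel).
    await out.put(END)

The corner cases handled explicitly in the diff (no small-side data, or small-side data with an empty large side) are omitted from the sketch; the real node substitutes empty table chunks via empty_table_chunk rather than skipping the join, and wraps each join in an opaque_reservation sized to the combined input bytes.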