cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/nodes.py
@@ -1,4 +1,4 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  """Core node definitions for the RapidsMPF streaming runtime."""
 
@@ -7,17 +7,23 @@ from __future__ import annotations
  import asyncio
  from typing import TYPE_CHECKING, Any, cast
 
+ from rapidsmpf.memory.buffer import MemoryType
  from rapidsmpf.streaming.core.message import Message
  from rapidsmpf.streaming.core.node import define_py_node
+ from rapidsmpf.streaming.core.spillable_messages import SpillableMessages
  from rapidsmpf.streaming.cudf.table_chunk import TableChunk
 
  from cudf_polars.containers import DataFrame
- from cudf_polars.dsl.ir import IR, Empty
+ from cudf_polars.dsl.ir import IR, Cache, Empty, Filter, Projection
  from cudf_polars.experimental.rapidsmpf.dispatch import (
      generate_ir_sub_network,
  )
  from cudf_polars.experimental.rapidsmpf.utils import (
      ChannelManager,
+     Metadata,
+     empty_table_chunk,
+     make_spill_function,
+     opaque_reservation,
      process_children,
      shutdown_on_error,
  )
@@ -37,6 +43,8 @@ async def default_node_single(
      ir_context: IRExecutionContext,
      ch_out: ChannelPair,
      ch_in: ChannelPair,
+     *,
+     preserve_partitioning: bool = False,
  ) -> None:
      """
      Single-channel default node for rapidsmpf.
@@ -53,32 +61,71 @@ async def default_node_single(
          The output ChannelPair.
      ch_in
          The input ChannelPair.
+     preserve_partitioning
+         Whether to preserve the partitioning metadata of the input chunks.
 
      Notes
      -----
      Chunks are processed in the order they are received.
      """
-     async with shutdown_on_error(context, ch_in.data, ch_out.data):
-         while (msg := await ch_in.data.recv(context)) is not None:
-             chunk = TableChunk.from_message(msg).make_available_and_spill(
-                 context.br(), allow_overbooking=True
-             )
-             seq_num = msg.sequence_number
-             df = await asyncio.to_thread(
-                 ir.do_evaluate,
-                 *ir._non_child_args,
-                 DataFrame.from_table(
-                     chunk.table_view(),
-                     list(ir.children[0].schema.keys()),
-                     list(ir.children[0].schema.values()),
-                     chunk.stream,
-                 ),
-                 context=ir_context,
-             )
-             chunk = TableChunk.from_pylibcudf_table(
-                 df.table, chunk.stream, exclusive_view=True
-             )
-             await ch_out.data.send(context, Message(seq_num, chunk))
+     async with shutdown_on_error(
+         context, ch_in.metadata, ch_in.data, ch_out.metadata, ch_out.data
+     ):
+         # Recv/send metadata.
+         metadata_in = await ch_in.recv_metadata(context)
+         metadata_out = Metadata(
+             metadata_in.count,
+             partitioned_on=metadata_in.partitioned_on if preserve_partitioning else (),
+             duplicated=metadata_in.duplicated,
+         )
+         await ch_out.send_metadata(context, metadata_out)
+
+         # Recv/send data.
+         seq_num = 0
+         receiving = True
+         received_any = False
+         while receiving:
+             msg = await ch_in.data.recv(context)
+             if msg is None:
+                 receiving = False
+                 if received_any:
+                     break
+                 else:
+                     # Make sure we have an empty chunk in case do_evaluate
+                     # always produces rows (e.g. aggregation)
+                     stream = ir_context.get_cuda_stream()
+                     chunk = empty_table_chunk(ir.children[0], context, stream)
+             else:
+                 received_any = True
+                 chunk = TableChunk.from_message(msg).make_available_and_spill(
+                     context.br(), allow_overbooking=True
+                 )
+                 seq_num = msg.sequence_number
+                 del msg
+
+             input_bytes = chunk.data_alloc_size(MemoryType.DEVICE)
+             with opaque_reservation(context, input_bytes):
+                 df = await asyncio.to_thread(
+                     ir.do_evaluate,
+                     *ir._non_child_args,
+                     DataFrame.from_table(
+                         chunk.table_view(),
+                         list(ir.children[0].schema.keys()),
+                         list(ir.children[0].schema.values()),
+                         chunk.stream,
+                     ),
+                     context=ir_context,
+                 )
+                 await ch_out.data.send(
+                     context,
+                     Message(
+                         seq_num,
+                         TableChunk.from_pylibcudf_table(
+                             df.table, chunk.stream, exclusive_view=True
+                         ),
+                     ),
+                 )
+             del df, chunk
 
          await ch_out.data.drain(context)
 
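[Reviewer note, not part of the diff] The metadata handshake introduced above relies on a Metadata container from cudf_polars/experimental/rapidsmpf/utils.py, whose definition is not shown in this excerpt. A minimal sketch consistent with how nodes.py uses it (field names come from the call sites; the defaults are assumptions, not confirmed):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class Metadata:
        """Sketch inferred from usage in this diff, not the real class."""

        count: int  # expected number of data chunks on the channel
        partitioned_on: tuple[Any, ...] = ()  # keys the chunks are partitioned on
        # Assumed default: True acts as the identity for the AND-reduction in
        # default_node_multi below. The actual default is not visible here.
        duplicated: bool = True

    def forward_metadata(md_in: Metadata, *, preserve_partitioning: bool) -> Metadata:
        # Hypothetical helper restating the Metadata(...) construction in
        # default_node_single: partitioning info survives only for pointwise
        # IR nodes that never move rows between partitions.
        return Metadata(
            md_in.count,
            partitioned_on=md_in.partitioned_on if preserve_partitioning else (),
            duplicated=md_in.duplicated,
        )
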
@@ -90,6 +137,8 @@ async def default_node_multi(
      ir_context: IRExecutionContext,
      ch_out: ChannelPair,
      chs_in: tuple[ChannelPair, ...],
+     *,
+     partitioning_index: int | None = None,
  ) -> None:
      """
      Pointwise node for rapidsmpf.
@@ -103,17 +152,30 @@ async def default_node_multi(
      ir_context
          The execution context for the IR node.
      ch_out
-         The output ChannelPair (metadata already sent).
+         The output ChannelPair.
      chs_in
-         Tuple of input ChannelPairs (metadata already received).
-
-     Notes
-     -----
-     Input chunks must be aligned for evaluation. Messages from each input
-     channel are assumed to arrive in sequence number order, so we only need
-     to hold one chunk per channel at a time.
+         Tuple of input ChannelPairs.
+     partitioning_index
+         Index of the input channel to preserve partitioning information for.
+         If None, no partitioning information is preserved.
      """
-     async with shutdown_on_error(context, *[ch.data for ch in chs_in], ch_out.data):
+     async with shutdown_on_error(
+         context,
+         *[ch.metadata for ch in chs_in],
+         ch_out.metadata,
+         *[ch.data for ch in chs_in],
+         ch_out.data,
+     ):
+         # Merge and forward basic metadata.
+         metadata = Metadata(1)
+         for idx, ch_in in enumerate(chs_in):
+             md_child = await ch_in.recv_metadata(context)
+             metadata.count = max(md_child.count, metadata.count)
+             metadata.duplicated = metadata.duplicated and md_child.duplicated
+             if idx == partitioning_index:
+                 metadata.partitioned_on = md_child.partitioned_on
+         await ch_out.send_metadata(context, metadata)
+
          seq_num = 0
          n_children = len(chs_in)
          finished_channels: set[int] = set()
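[Reviewer note, not part of the diff] The merge loop above reduces child metadata with order-independent rules: counts combine by max (inputs are consumed pointwise, so the longest stream sets the output length), duplicated combines by logical AND (the output is replicated on every rank only if all inputs are), and partitioning keys are taken from at most one designated child. Extracted as a standalone function over the hypothetical Metadata sketch from the earlier note:

    def merge_child_metadata(
        children: list[Metadata], partitioning_index: int | None = None
    ) -> Metadata:
        # Same reduction as the loop in default_node_multi (sketch only).
        out = Metadata(1)
        for idx, md in enumerate(children):
            out.count = max(md.count, out.count)               # longest input wins
            out.duplicated = out.duplicated and md.duplicated  # AND across inputs
            if idx == partitioning_index:
                out.partitioned_on = md.partitioned_on         # keep one child's keys
        return out
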
@@ -122,11 +184,10 @@
          ready_chunks: list[TableChunk | None] = [None] * n_children
          chunk_count: list[int] = [0] * n_children
 
+         # Recv/send data.
          while True:
              # Receive from all non-finished channels
-             for ch_idx, (ch_in, _child) in enumerate(
-                 zip(chs_in, ir.children, strict=True)
-             ):
+             for ch_idx, ch_in in enumerate(chs_in):
                  if ch_idx in finished_channels:
                      continue  # This channel already finished, reuse its data
 
@@ -138,19 +199,20 @@
                  # Store the new chunk (replacing previous if any)
                  ready_chunks[ch_idx] = TableChunk.from_message(msg)
                  chunk_count[ch_idx] += 1
-                 assert ready_chunks[ch_idx] is not None, (
-                     f"Channel {ch_idx} has no data after receive loop."
-                 )
+                 del msg
 
              # If all channels finished, we're done
              if len(finished_channels) == n_children:
                  break
 
-             # Convert chunks to DataFrames right before evaluation
-             # All chunks are guaranteed to be non-None by the assertion above
-             assert all(chunk is not None for chunk in ready_chunks), (
-                 "All chunks must be non-None"
-             )
+             # Check if any channel drained without providing data.
+             # If so, create an empty chunk for that channel.
+             for ch_idx, child in enumerate(ir.children):
+                 if ready_chunks[ch_idx] is None:
+                     # Channel drained without data - create empty chunk
+                     stream = ir_context.get_cuda_stream()
+                     ready_chunks[ch_idx] = empty_table_chunk(child, context, stream)
+
              # Ensure all table chunks are unspilled and available.
              ready_chunks = [
                  chunk.make_available_and_spill(context.br(), allow_overbooking=True)
@@ -166,27 +228,33 @@
                  for chunk, child in zip(ready_chunks, ir.children, strict=True)
              ]
 
-             # Evaluate the IR node with current chunks
-             df = await asyncio.to_thread(
-                 ir.do_evaluate,
-                 *ir._non_child_args,
-                 *dfs,
-                 context=ir_context,
+             input_bytes = sum(
+                 chunk.data_alloc_size(MemoryType.DEVICE)
+                 for chunk in cast(list[TableChunk], ready_chunks)
              )
-             await ch_out.data.send(
-                 context,
-                 Message(
-                     seq_num,
-                     TableChunk.from_pylibcudf_table(
-                         df.table,
-                         df.stream,
-                         exclusive_view=True,
+             with opaque_reservation(context, input_bytes):
+                 df = await asyncio.to_thread(
+                     ir.do_evaluate,
+                     *ir._non_child_args,
+                     *dfs,
+                     context=ir_context,
+                 )
+                 await ch_out.data.send(
+                     context,
+                     Message(
+                         seq_num,
+                         TableChunk.from_pylibcudf_table(
+                             df.table,
+                             df.stream,
+                             exclusive_view=True,
+                         ),
                      ),
-                 ),
-             )
-             seq_num += 1
+                 )
+                 seq_num += 1
+             del df, dfs
 
          # Drain the output channel
+         del ready_chunks
          await ch_out.data.drain(context)
 
 
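[Reviewer note, not part of the diff] opaque_reservation is a new helper in cudf_polars/experimental/rapidsmpf/utils.py whose body is not shown here. From its call sites, it reserves device memory sized to the inputs before ir.do_evaluate runs, so rapidsmpf's buffer resource can account for (and spill around) libcudf allocations it cannot observe directly. A hypothetical sketch of such a context manager, built only on the reserve_device_memory_and_spill call that appears later in this diff:

    from contextlib import contextmanager

    @contextmanager
    def opaque_reservation(context, nbytes: int):
        # Hypothetical sketch, not the real implementation. Reserve roughly
        # the device memory an "opaque" evaluation may need, spilling other
        # buffers first if the device is full; overbooking keeps inputs
        # larger than free memory from failing outright.
        reservation = context.br().reserve_device_memory_and_spill(
            nbytes, allow_overbooking=True
        )
        try:
            yield reservation
        finally:
            # Dropping the reservation returns the headroom to the pool.
            del reservation
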
@@ -213,12 +281,23 @@ async def fanout_node_bounded(
      """
      # TODO: Use rapidsmpf fanout node once available.
      # See: https://github.com/rapidsai/rapidsmpf/issues/560
-     async with shutdown_on_error(context, ch_in.data, *[ch.data for ch in chs_out]):
+     async with shutdown_on_error(
+         context,
+         ch_in.metadata,
+         ch_in.data,
+         *[ch.metadata for ch in chs_out],
+         *[ch.data for ch in chs_out],
+     ):
+         # Forward metadata to all outputs.
+         metadata = await ch_in.recv_metadata(context)
+         await asyncio.gather(*(ch.send_metadata(context, metadata) for ch in chs_out))
+
          while (msg := await ch_in.data.recv(context)) is not None:
              table_chunk = TableChunk.from_message(msg).make_available_and_spill(
                  context.br(), allow_overbooking=True
              )
              seq_num = msg.sequence_number
+             del msg
              for ch_out in chs_out:
                  await ch_out.data.send(
                      context,
@@ -231,6 +310,7 @@
                          ),
                      ),
                  )
+             del table_chunk
 
          await asyncio.gather(*(ch.data.drain(context) for ch in chs_out))
 
@@ -242,14 +322,15 @@
      *chs_out: ChannelPair,
  ) -> None:
      """
-     Unbounded fanout node for rapidsmpf.
+     Unbounded fanout node for rapidsmpf with spilling support.
 
      Broadcasts chunks from input to all output channels. This is called
      "unbounded" because it handles the case where one channel may consume
      all data before another channel consumes any data.
 
-     The implementation uses adaptive sending:
-     - Maintains a FIFO buffer for each output channel
+     The implementation uses adaptive sending with spillable buffers:
+     - Maintains a spillable FIFO buffer for each output channel
+     - Messages are buffered in host memory (spillable to disk)
      - Sends to all channels concurrently
      - Receives next chunk as soon as any channel makes progress
      - Efficient for both balanced and imbalanced consumption patterns
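[Reviewer note, not part of the diff] The buffering this docstring describes pairs one SpillableMessages store per output with a plain list of message ids preserving FIFO order, since the store hands back an id per inserted message rather than maintaining order itself. A rough illustration using only the insert/extract calls visible in the next hunk (num_outputs assumed in scope):

    # Sketch of the per-output buffering used below (illustration only).
    buffers = [SpillableMessages() for _ in range(num_outputs)]
    fifo_ids: list[list[int]] = [[] for _ in range(num_outputs)]

    def enqueue(idx: int, msg) -> None:
        # The store owns the payload, which the spill manager may move to
        # host memory or disk; the id list keeps per-output FIFO order.
        fifo_ids[idx].append(buffers[idx].insert(msg))

    def dequeue(idx: int):
        # Extract removes the oldest message from the store for sending.
        return buffers[idx].extract(mid=fifo_ids[idx].pop(0))
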
@@ -265,107 +346,182 @@ async def fanout_node_unbounded(
      """
      # TODO: Use rapidsmpf fanout node once available.
      # See: https://github.com/rapidsai/rapidsmpf/issues/560
-     async with shutdown_on_error(context, ch_in.data, *[ch.data for ch in chs_out]):
-         # FIFO buffer for each output channel
-         output_buffers: list[list[Message]] = [[] for _ in chs_out]
-
-         # Track active send/drain tasks for each output
-         active_tasks: dict[int, asyncio.Task] = {}
-
-         # Track which outputs need to be drained (set when no more input)
-         needs_drain: set[int] = set()
-
-         # Receive task
-         recv_task: asyncio.Task | None = asyncio.create_task(ch_in.data.recv(context))
-
-         # Flag to indicate we should start a new receive (for backpressure)
-         can_receive: bool = True
-
-         async def send_one_from_buffer(idx: int) -> None:
-             """Send one buffered message for output idx."""
-             if output_buffers[idx]:
-                 msg = output_buffers[idx].pop(0)
-                 await chs_out[idx].data.send(context, msg)
-
-         async def drain_output(idx: int) -> None:
-             """Drain output channel idx."""
-             await chs_out[idx].data.drain(context)
-
-         # Main loop: coordinate receiving, sending, and draining
-         while (
-             recv_task is not None or active_tasks or any(output_buffers) or needs_drain
-         ):
-             # Collect all currently active tasks
-             tasks_to_wait = list(active_tasks.values())
-             # Only include recv_task if we're allowed to receive
-             if recv_task is not None and can_receive:
-                 tasks_to_wait.append(recv_task)
-
-             # Start new tasks for outputs with work to do
-             for idx in range(len(chs_out)):
-                 if idx not in active_tasks:
-                     if output_buffers[idx]:
-                         # Send next buffered message
-                         task = asyncio.create_task(send_one_from_buffer(idx))
-                         active_tasks[idx] = task
-                         tasks_to_wait.append(task)
-                     elif idx in needs_drain:
-                         # Buffer empty and no more input - drain this output
-                         task = asyncio.create_task(drain_output(idx))
-                         active_tasks[idx] = task
-                         tasks_to_wait.append(task)
-                         needs_drain.discard(idx)
-
-             # If nothing to wait for, we're done
-             if not tasks_to_wait:
-                 break
+     async with shutdown_on_error(
+         context,
+         ch_in.metadata,
+         ch_in.data,
+         *[ch.metadata for ch in chs_out],
+         *[ch.data for ch in chs_out],
+     ):
+         # Forward metadata to all outputs.
+         metadata = await ch_in.recv_metadata(context)
+         await asyncio.gather(*(ch.send_metadata(context, metadata) for ch in chs_out))
+
+         # Spillable FIFO buffer for each output channel
+         output_buffers: list[SpillableMessages] = [SpillableMessages() for _ in chs_out]
+         num_outputs = len(chs_out)
+
+         # Track message IDs in FIFO order for each output buffer
+         buffer_ids: list[list[int]] = [[] for _ in chs_out]
+
+         # Register a single spill function for all buffers
+         # This ensures global FIFO ordering when spilling across all outputs
+         spill_func_id = context.br().spill_manager.add_spill_function(
+             make_spill_function(output_buffers, context), priority=0
+         )
+
+         try:
+             # Track active send/drain tasks for each output
+             active_tasks: dict[int, asyncio.Task] = {}
+
+             # Track which outputs need to be drained (set when no more input)
+             needs_drain: set[int] = set()
 
-             # Wait for ANY task to complete
-             done, _ = await asyncio.wait(
-                 tasks_to_wait, return_when=asyncio.FIRST_COMPLETED
+             # Receive task
+             recv_task: asyncio.Task | None = asyncio.create_task(
+                 ch_in.data.recv(context)
              )
 
-             # Process completed tasks
-             for task in done:
-                 if task is recv_task:
-                     # Receive completed
-                     msg = task.result()
-                     if msg is None:
-                         # End of input - mark all outputs as needing drain
-                         recv_task = None
-                         needs_drain.update(range(len(chs_out)))
-                     else:
-                         # Add message to all output buffers
-                         chunk = TableChunk.from_message(msg).make_available_and_spill(
-                             context.br(), allow_overbooking=True
-                         )
-                         seq_num = msg.sequence_number
-                         for buffer in output_buffers:
-                             message = Message(
-                                 seq_num,
-                                 TableChunk.from_pylibcudf_table(
-                                     chunk.table_view(),
-                                     chunk.stream,
-                                     exclusive_view=False,
-                                 ),
+             # Flag to indicate we should start a new receive (for backpressure)
+             can_receive: bool = True
+
+             async def send_one_from_buffer(idx: int) -> None:
+                 """
+                 Send one buffered message for output idx.
+
+                 The message remains in host memory (spillable) through the channel.
+                 The downstream consumer will call make_available() when needed.
+                 """
+                 if buffer_ids[idx]:
+                     mid = buffer_ids[idx].pop(0)
+                     msg = output_buffers[idx].extract(mid=mid)
+                     await chs_out[idx].data.send(context, msg)
+
+             async def drain_output(idx: int) -> None:
+                 """Drain output channel idx."""
+                 await chs_out[idx].data.drain(context)
+
+             # Main loop: coordinate receiving, sending, and draining
+             while (
+                 recv_task is not None or active_tasks or any(buffer_ids) or needs_drain
+             ):
+                 # Collect all currently active tasks
+                 tasks_to_wait = list(active_tasks.values())
+                 # Only include recv_task if we're allowed to receive
+                 if recv_task is not None and can_receive:
+                     tasks_to_wait.append(recv_task)
+
+                 # Start new tasks for outputs with work to do
+                 for idx in range(len(chs_out)):
+                     if idx not in active_tasks:
+                         if buffer_ids[idx]:
+                             # Send next buffered message
+                             task = asyncio.create_task(send_one_from_buffer(idx))
+                             active_tasks[idx] = task
+                             tasks_to_wait.append(task)
+                         elif idx in needs_drain:
+                             # Buffer empty and no more input - drain this output
+                             task = asyncio.create_task(drain_output(idx))
+                             active_tasks[idx] = task
+                             tasks_to_wait.append(task)
+                             needs_drain.discard(idx)
+
+                 # If nothing to wait for, we're done
+                 if not tasks_to_wait:
+                     break
+
+                 # Wait for ANY task to complete
+                 done, _ = await asyncio.wait(
+                     tasks_to_wait, return_when=asyncio.FIRST_COMPLETED
+                 )
+
+                 # Process completed tasks
+                 for task in done:
+                     if task is recv_task:
+                         # Receive completed
+                         msg = task.result()
+                         if msg is None:
+                             # End of input - mark all outputs as needing drain
+                             recv_task = None
+                             needs_drain.update(range(len(chs_out)))
+                         else:
+                             # Determine where to copy based on:
+                             # 1. Current message location (avoid unnecessary transfers)
+                             # 2. Available memory (avoid OOM)
+                             content_desc = msg.get_content_description()
+                             device_size = content_desc.content_sizes.get(
+                                 MemoryType.DEVICE, 0
                              )
-                             buffer.append(message)
+                             copy_cost = msg.copy_cost()
 
-                         # Don't receive next chunk until at least one send completes
-                         can_receive = False
-                         recv_task = asyncio.create_task(ch_in.data.recv(context))
-                 else:
-                     # Must be a send or drain task - find which output and remove it
-                     for idx, at in list(active_tasks.items()):
-                         if at is task:
-                             del active_tasks[idx]
-                             # A send completed - allow receiving again
-                             can_receive = True
-                             break
+                             # Check if we have enough device memory for all copies
+                             # We need (num_outputs - 1) copies since last one reuses original
+                             num_copies = num_outputs - 1
+                             total_copy_cost = copy_cost * num_copies
+                             available_device_mem = context.br().memory_available(
+                                 MemoryType.DEVICE
+                             )
+
+                             # Decide target memory:
+                             # Use device ONLY if message is in device AND we have sufficient headroom.
+                             # TODO: Use further information about the downstream operations to make
+                             # a more informed decision.
+                             required_headroom = total_copy_cost * 2
+                             if (
+                                 device_size > 0
+                                 and available_device_mem >= required_headroom
+                             ):
+                                 # Use reserve_device_memory_and_spill to automatically trigger spilling
+                                 # if needed to make room for the copy
+                                 memory_reservation = (
+                                     context.br().reserve_device_memory_and_spill(
+                                         total_copy_cost,
+                                         allow_overbooking=True,
+                                     )
+                                 )
+                             else:
+                                 # Use host memory for buffering - much safer
+                                 # Downstream consumers will make_available() when they need device memory
+                                 memory_reservation, _ = context.br().reserve(
+                                     MemoryType.HOST,
+                                     total_copy_cost,
+                                     allow_overbooking=True,
+                                 )
+
+                             # Copy message for each output buffer
+                             # Copies are spillable and allow downstream consumers
+                             # to control device memory allocation
+                             for idx, sm in enumerate(output_buffers):
+                                 if idx < num_outputs - 1:
+                                     # Copy to target memory and insert into spillable buffer
+                                     mid = sm.insert(msg.copy(memory_reservation))
+                                 else:
+                                     # Optimization: reuse the original message for last output
+                                     # (no copy needed)
+                                     mid = sm.insert(msg)
+                                 buffer_ids[idx].append(mid)
+
+                             # Don't receive next chunk until at least one send completes
+                             can_receive = False
+                             recv_task = asyncio.create_task(ch_in.data.recv(context))
+                     else:
+                         # Must be a send or drain task - find which output and remove it
+                         for idx, at in list(active_tasks.items()):
+                             if at is task:
+                                 del active_tasks[idx]
+                                 # A send completed - allow receiving again
+                                 can_receive = True
+                                 break
+
+         finally:
+             # Clean up spill function registration
+             context.br().spill_manager.remove_spill_function(spill_func_id)
 
 
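[Reviewer note, not part of the diff] The target-memory decision above is a small standalone policy: with N outputs only N - 1 copies are needed (the last output reuses the original message), and the device path is taken only when the payload already lives on device and at least twice the total copy cost is free. A condensed restatement using only calls that appear in this hunk (MemoryType comes from the module imports):

    def choose_copy_target(msg, num_outputs: int, br) -> MemoryType:
        # Condensed sketch of the policy in fanout_node_unbounded.
        desc = msg.get_content_description()
        device_size = desc.content_sizes.get(MemoryType.DEVICE, 0)
        total_copy_cost = msg.copy_cost() * (num_outputs - 1)  # last output reuses msg
        # 2x headroom so concurrent nodes are not starved of device memory.
        if device_size > 0 and br.memory_available(MemoryType.DEVICE) >= 2 * total_copy_cost:
            return MemoryType.DEVICE  # data already on device, room to spare
        return MemoryType.HOST  # safer default: buffer spillably in host memory
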
  @generate_ir_sub_network.register(IR)
- def _(ir: IR, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+ def _(
+     ir: IR, rec: SubNetGenerator
+ ) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
      # Default generate_ir_sub_network logic.
      # Use simple pointwise node.
 
@@ -377,18 +533,27 @@ def _(ir: IR, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]
 
      if len(ir.children) == 1:
          # Single-channel default node
-         nodes.append(
+         preserve_partitioning = isinstance(
+             # TODO: We don't need to worry about
+             # non-pointwise Filter operations here,
+             # because the lowering stage would have
+             # collapsed to one partition anyway.
+             ir,
+             (Cache, Projection, Filter),
+         )
+         nodes[ir] = [
              default_node_single(
                  rec.state["context"],
                  ir,
                  rec.state["ir_context"],
                  channels[ir].reserve_input_slot(),
                  channels[ir.children[0]].reserve_output_slot(),
+                 preserve_partitioning=preserve_partitioning,
              )
-         )
+         ]
      else:
          # Multi-channel default node
-         nodes.append(
+         nodes[ir] = [
              default_node_multi(
                  rec.state["context"],
                  ir,
@@ -396,7 +561,7 @@ def _(ir: IR, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]
                  channels[ir].reserve_input_slot(),
                  tuple(channels[c].reserve_output_slot() for c in ir.children),
              )
-         )
+         ]
 
      return nodes, channels
 
@@ -422,7 +587,10 @@ async def empty_node(
      ch_out
          The output ChannelPair.
      """
-     async with shutdown_on_error(context, ch_out.data):
+     async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
+         # Send metadata indicating a single empty chunk
+         await ch_out.send_metadata(context, Metadata(1, duplicated=True))
+
          # Evaluate the IR node to create an empty DataFrame
          df: DataFrame = ir.do_evaluate(*ir._non_child_args, context=ir_context)
 
@@ -436,20 +604,22 @@
 
 
  @generate_ir_sub_network.register(Empty)
- def _(ir: Empty, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+ def _(
+     ir: Empty, rec: SubNetGenerator
+ ) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
      """Generate network for Empty node - produces one empty chunk."""
      context = rec.state["context"]
      ir_context = rec.state["ir_context"]
      channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
-     nodes: list[Any] = [
-         empty_node(context, ir, ir_context, channels[ir].reserve_input_slot())
-     ]
+     nodes: dict[IR, list[Any]] = {
+         ir: [empty_node(context, ir, ir_context, channels[ir].reserve_input_slot())]
+     }
      return nodes, channels
 
 
  def generate_ir_sub_network_wrapper(
      ir: IR, rec: SubNetGenerator
- ) -> tuple[list[Any], dict[IR, ChannelManager]]:
+ ) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
      """
      Generate a sub-network for the RapidsMPF streaming runtime.
 
@@ -463,7 +633,7 @@ def generate_ir_sub_network_wrapper(
      Returns
      -------
      nodes
-         List of streaming-network node(s) for the subgraph.
+         Dictionary mapping each IR node to its list of streaming-network node(s).
      channels
          Dictionary mapping between each IR node and its
          corresponding streaming-network output ChannelManager.
@@ -474,21 +644,92 @@ def generate_ir_sub_network_wrapper(
      if (fanout_info := rec.state["fanout_nodes"].get(ir)) is not None:
          count = fanout_info.num_consumers
          manager = ChannelManager(rec.state["context"], count=count)
+         fanout_node: Any
          if fanout_info.unbounded:
-             nodes.append(
-                 fanout_node_unbounded(
-                     rec.state["context"],
-                     channels[ir].reserve_output_slot(),
-                     *[manager.reserve_input_slot() for _ in range(count)],
-                 )
+             fanout_node = fanout_node_unbounded(
+                 rec.state["context"],
+                 channels[ir].reserve_output_slot(),
+                 *[manager.reserve_input_slot() for _ in range(count)],
              )
          else:  # "bounded"
-             nodes.append(
-                 fanout_node_bounded(
-                     rec.state["context"],
-                     channels[ir].reserve_output_slot(),
-                     *[manager.reserve_input_slot() for _ in range(count)],
-                 )
+             fanout_node = fanout_node_bounded(
+                 rec.state["context"],
+                 channels[ir].reserve_output_slot(),
+                 *[manager.reserve_input_slot() for _ in range(count)],
              )
+         nodes[ir].append(fanout_node)
          channels[ir] = manager
      return nodes, channels
+
+
+ @define_py_node()
+ async def metadata_feeder_node(
+     context: Context,
+     channel: ChannelPair,
+     metadata: Metadata,
+ ) -> None:
+     """
+     Feed metadata to a channel pair.
+
+     Parameters
+     ----------
+     context
+         The rapidsmpf context.
+     channel
+         The channel pair.
+     metadata
+         The metadata to feed.
+     """
+     async with shutdown_on_error(context, channel.metadata, channel.data):
+         await channel.send_metadata(context, metadata)
+
+
+ @define_py_node()
+ async def metadata_drain_node(
+     context: Context,
+     ir: IR,
+     ir_context: IRExecutionContext,
+     ch_in: ChannelPair,
+     ch_out: Any,
+     metadata_collector: list[Metadata] | None,
+ ) -> None:
+     """
+     Drain metadata and forward data to a single channel.
+
+     Parameters
+     ----------
+     context
+         The rapidsmpf context.
+     ir
+         The IR node.
+     ir_context
+         The execution context for the IR node.
+     ch_in
+         The input ChannelPair (with metadata and data channels).
+     ch_out
+         The output data channel.
+     metadata_collector
+         The list to collect the final metadata.
+         This list will be mutated when the network is executed.
+         If None, metadata will not be collected.
+     """
+     async with shutdown_on_error(context, ch_in.metadata, ch_in.data, ch_out):
+         # Drain metadata channel (we don't need it after this point)
+         metadata = await ch_in.recv_metadata(context)
+         send_empty = metadata.duplicated and context.comm().rank != 0
+         if metadata_collector is not None:
+             metadata_collector.append(metadata)
+
+         # Forward non-duplicated data messages
+         while (msg := await ch_in.data.recv(context)) is not None:
+             if not send_empty:
+                 await ch_out.send(context, msg)
+
+         # Send empty data if needed
+         if send_empty:
+             stream = ir_context.get_cuda_stream()
+             await ch_out.send(
+                 context, Message(0, empty_table_chunk(ir, context, stream))
+             )
+
+         await ch_out.drain(context)
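
[Reviewer note, not part of the diff] duplicated=True means every rank holds an identical copy of the result (compare empty_node above, which sends Metadata(1, duplicated=True)), so forwarding from all ranks would multiply the output. The drain rule keeps exactly one copy cluster-wide; worked through for a three-rank run:

    # send_empty = metadata.duplicated and context.comm().rank != 0
    #   rank 0: True and (0 != 0) -> False -> forwards its real chunks
    #   rank 1: True and (1 != 0) -> True  -> drops its duplicate chunks and
    #           emits one empty chunk so the data channel still terminates
    #   rank 2: same as rank 1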