cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -1
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +28 -7
- cudf_polars/containers/column.py +51 -26
- cudf_polars/dsl/expressions/binaryop.py +1 -1
- cudf_polars/dsl/expressions/boolean.py +1 -1
- cudf_polars/dsl/expressions/selection.py +1 -1
- cudf_polars/dsl/expressions/string.py +29 -20
- cudf_polars/dsl/expressions/ternary.py +25 -1
- cudf_polars/dsl/expressions/unary.py +11 -8
- cudf_polars/dsl/ir.py +351 -281
- cudf_polars/dsl/translate.py +18 -15
- cudf_polars/dsl/utils/aggregations.py +10 -5
- cudf_polars/experimental/base.py +10 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1 -1
- cudf_polars/experimental/benchmarks/utils.py +83 -2
- cudf_polars/experimental/distinct.py +2 -0
- cudf_polars/experimental/explain.py +1 -1
- cudf_polars/experimental/expressions.py +8 -5
- cudf_polars/experimental/groupby.py +2 -0
- cudf_polars/experimental/io.py +64 -42
- cudf_polars/experimental/join.py +15 -2
- cudf_polars/experimental/parallel.py +10 -7
- cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
- cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
- cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
- cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
- cudf_polars/experimental/rapidsmpf/core.py +194 -67
- cudf_polars/experimental/rapidsmpf/dask.py +172 -0
- cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
- cudf_polars/experimental/rapidsmpf/io.py +162 -70
- cudf_polars/experimental/rapidsmpf/join.py +162 -77
- cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
- cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
- cudf_polars/experimental/rapidsmpf/union.py +24 -5
- cudf_polars/experimental/rapidsmpf/utils.py +228 -16
- cudf_polars/experimental/shuffle.py +18 -4
- cudf_polars/experimental/sort.py +13 -6
- cudf_polars/experimental/spilling.py +1 -1
- cudf_polars/testing/plugin.py +6 -3
- cudf_polars/utils/config.py +67 -0
- cudf_polars/utils/versions.py +3 -3
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
- {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/core.py (+194 -67)

@@ -8,10 +8,10 @@ from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING, Any

-from rapidsmpf.buffer.buffer import MemoryType
-from rapidsmpf.buffer.resource import BufferResource, LimitAvailableMemory
 from rapidsmpf.communicator.single import new_communicator
 from rapidsmpf.config import Options, get_environment_variables
+from rapidsmpf.memory.buffer import MemoryType
+from rapidsmpf.memory.buffer_resource import BufferResource, LimitAvailableMemory
 from rapidsmpf.rmm_resource_adaptor import RmmResourceAdaptor
 from rapidsmpf.streaming.core.context import Context
 from rapidsmpf.streaming.core.leaf_node import pull_from_channel
@@ -22,17 +22,22 @@ from rapidsmpf.streaming.cudf.table_chunk import TableChunk

 import rmm

+import cudf_polars.experimental.rapidsmpf.collectives.shuffle
 import cudf_polars.experimental.rapidsmpf.io
 import cudf_polars.experimental.rapidsmpf.join
 import cudf_polars.experimental.rapidsmpf.lower
 import cudf_polars.experimental.rapidsmpf.repartition
-import cudf_polars.experimental.rapidsmpf.shuffle
 import cudf_polars.experimental.rapidsmpf.union # noqa: F401
 from cudf_polars.containers import DataFrame
 from cudf_polars.dsl.ir import DataFrameScan, IRExecutionContext, Join, Scan, Union
 from cudf_polars.dsl.traversal import CachingVisitor, traversal
+from cudf_polars.experimental.rapidsmpf.collectives import ReserveOpIDs
 from cudf_polars.experimental.rapidsmpf.dispatch import FanoutInfo, lower_ir_node
-from cudf_polars.experimental.rapidsmpf.nodes import
+from cudf_polars.experimental.rapidsmpf.nodes import (
+    generate_ir_sub_network_wrapper,
+    metadata_drain_node,
+)
+from cudf_polars.experimental.rapidsmpf.utils import empty_table_chunk
 from cudf_polars.experimental.statistics import collect_statistics
 from cudf_polars.experimental.utils import _concat
 from cudf_polars.utils.config import CUDAStreamPoolConfig
@@ -40,10 +45,13 @@ from cudf_polars.utils.config import CUDAStreamPoolConfig
 if TYPE_CHECKING:
     from collections.abc import MutableMapping

+    from rapidsmpf.streaming.core.channel import Channel
     from rapidsmpf.streaming.core.leaf_node import DeferredMessages

     import polars as pl

+    from rmm.pylibrmm.cuda_stream_pool import CudaStreamPool
+
     from cudf_polars.dsl.ir import IR
     from cudf_polars.experimental.base import PartitionInfo, StatsCollector
     from cudf_polars.experimental.parallel import ConfigOptions
@@ -53,12 +61,15 @@ if TYPE_CHECKING:
         LowerState,
         SubNetGenerator,
     )
+    from cudf_polars.experimental.rapidsmpf.utils import Metadata


 def evaluate_logical_plan(
     ir: IR,
     config_options: ConfigOptions,
-
+    *,
+    collect_metadata: bool = False,
+) -> tuple[pl.DataFrame, list[Metadata] | None]:
     """
     Evaluate a logical plan with the RapidsMPF streaming runtime.

@@ -68,59 +79,136 @@ def evaluate_logical_plan(
         The IR node.
     config_options
         The configuration options.
+    collect_metadata
+        Whether to collect runtime metadata.

     Returns
     -------
-    The output DataFrame.
+    The output DataFrame and metadata collector.
     """
     assert config_options.executor.name == "streaming", "Executor must be streaming"
     assert config_options.executor.runtime == "rapidsmpf", "Runtime must be rapidsmpf"

-    if (
-        config_options.executor.scheduler == "distributed"
-    ): # pragma: no cover; Requires distributed
-        # TODO: Add distributed-execution support
-        raise NotImplementedError(
-            "The rapidsmpf engine does not support distributed execution yet."
-        )
-
     # Lower the IR graph on the client process (for now).
     ir, partition_info, stats = lower_ir_graph(ir, config_options)

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Reserve shuffle IDs for the entire pipeline execution
+    with ReserveOpIDs(ir) as shuffle_id_map:
+        # Build and execute the streaming pipeline.
+        # This must be done on all worker processes
+        # for cluster == "distributed".
+        if (
+            config_options.executor.cluster == "distributed"
+        ): # pragma: no cover; block depends on executor type and Distributed cluster
+            # Distributed execution: Use client.run
+
+            # NOTE: Distributed execution requires Dask for now
+            from cudf_polars.experimental.rapidsmpf.dask import evaluate_pipeline_dask
+
+            result, metadata_collector = evaluate_pipeline_dask(
+                evaluate_pipeline,
+                ir,
+                partition_info,
+                config_options,
+                stats,
+                shuffle_id_map,
+                collect_metadata=collect_metadata,
+            )
+        else:
+            # Single-process execution: Run locally
+            result, metadata_collector = evaluate_pipeline(
+                ir,
+                partition_info,
+                config_options,
+                stats,
+                shuffle_id_map,
+                collect_metadata=collect_metadata,
            )
-        }

-
-    # 1: we want to use the same stream pool for cudf-polars and rapidsmpf
-    # 2: rapidsmpf uses its own pool and cudf-polars uses the default stream
-    if isinstance(config_options.cuda_stream_policy, CUDAStreamPoolConfig):
-        stream_pool = config_options.cuda_stream_policy.build()
-    else:
-        stream_pool = None
+    return result, metadata_collector

-    br = BufferResource(mr, memory_available=memory_available, stream_pool=stream_pool)
-    rmpf_context = Context(comm, br, options)

-
+def evaluate_pipeline(
+    ir: IR,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    config_options: ConfigOptions,
+    stats: StatsCollector,
+    collective_id_map: dict[IR, int],
+    rmpf_context: Context | None = None,
+    *,
+    collect_metadata: bool = False,
+) -> tuple[pl.DataFrame, list[Metadata] | None]:
+    """
+    Build and evaluate a RapidsMPF streaming pipeline.
+
+    Parameters
+    ----------
+    ir
+        The IR node.
+    partition_info
+        The partition information.
+    config_options
+        The configuration options.
+    stats
+        The statistics collector.
+    collective_id_map
+        The mapping of IR nodes to collective IDs.
+    rmpf_context
+        The RapidsMPF context.
+    collect_metadata
+        Whether to collect runtime metadata.

-
-
-
+    Returns
+    -------
+    The output DataFrame and metadata collector.
+    """
+    assert config_options.executor.name == "streaming", "Executor must be streaming"
+    assert config_options.executor.runtime == "rapidsmpf", "Runtime must be rapidsmpf"
+
+    _initial_mr: Any = None
+    stream_pool: CudaStreamPool | bool = False
+    if rmpf_context is not None:
+        # Using "distributed" mode.
+        # Always use the RapidsMPF stream pool for now.
+        br = rmpf_context.br()
+        stream_pool = True
+    else:
+        # Using "single" mode.
+        # Create a new local RapidsMPF context.
+        _original_mr = rmm.mr.get_current_device_resource()
+        mr = RmmResourceAdaptor(_original_mr)
+        rmm.mr.set_current_device_resource(mr)
+        memory_available: MutableMapping[MemoryType, LimitAvailableMemory] | None = None
+        single_spill_device = config_options.executor.client_device_threshold
+        if single_spill_device > 0.0 and single_spill_device < 1.0:
+            total_memory = rmm.mr.available_device_memory()[1]
+            memory_available = {
+                MemoryType.DEVICE: LimitAvailableMemory(
+                    mr, limit=int(total_memory * single_spill_device)
+                )
+            }
+
+        options = Options(
+            {
+                # By default, set the number of streaming threads to the max
+                # number of IO threads. The user may override this with an
+                # environment variable (i.e. RAPIDSMPF_NUM_STREAMING_THREADS)
+                "num_streaming_threads": str(
+                    max(config_options.executor.max_io_threads, 1)
+                )
+            }
+            | get_environment_variables()
+        )
+        if isinstance(config_options.cuda_stream_policy, CUDAStreamPoolConfig):
+            stream_pool = config_options.cuda_stream_policy.build()
+        local_comm = new_communicator(options)
+        br = BufferResource(
+            mr, memory_available=memory_available, stream_pool=stream_pool
+        )
+        rmpf_context = Context(local_comm, br, options)
+
+    # Create the IR execution context
+    if stream_pool:
         ir_context = IRExecutionContext(
             get_cuda_stream=rmpf_context.get_stream_from_pool
         )
@@ -128,6 +216,8 @@ def evaluate_logical_plan(
         ir_context = IRExecutionContext.from_config_options(config_options)

     # Generate network nodes
+    assert rmpf_context is not None, "RapidsMPF context must defined."
+    metadata_collector: list[Metadata] | None = [] if collect_metadata else None
     nodes, output = generate_network(
         rmpf_context,
         ir,
@@ -135,9 +225,12 @@ def evaluate_logical_plan(
         config_options,
         stats,
         ir_context=ir_context,
+        collective_id_map=collective_id_map,
+        metadata_collector=metadata_collector,
     )

     # Run the network
+    executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="cpse")
     run_streaming_pipeline(nodes=nodes, py_executor=executor)

     # Extract/return the concatenated result.
@@ -150,16 +243,29 @@ def evaluate_logical_plan(
         )
         for msg in messages
     ]
-    dfs = [
-
+    dfs: list[DataFrame] = []
+    if chunks:
+        dfs = [
+            DataFrame.from_table(
+                chunk.table_view(),
+                list(ir.schema.keys()),
+                list(ir.schema.values()),
+                chunk.stream,
+            )
+            for chunk in chunks
+        ]
+        df = _concat(*dfs, context=ir_context)
+    else:
+        # No chunks received - create an empty DataFrame with correct schema
+        stream = ir_context.get_cuda_stream()
+        chunk = empty_table_chunk(ir, rmpf_context, stream)
+        df = DataFrame.from_table(
             chunk.table_view(),
             list(ir.schema.keys()),
             list(ir.schema.values()),
-
+            stream,
         )
-
-    ]
-    df = _concat(*dfs, context=ir_context)
+
     # We need to materialize the polars dataframe before we drop the rapidsmpf
     # context, which keeps the CUDA streams alive.
     stream = df.stream
@@ -170,7 +276,11 @@ def evaluate_logical_plan(
     # before the Context, which ultimately contains the rmm MR, goes out of scope.
     del nodes, output, messages, chunks, dfs, df

-
+    # Restore the initial RMM memory resource
+    if _initial_mr is not None:
+        rmm.mr.set_current_device_resource(_original_mr)
+
+    return result, metadata_collector


 def lower_ir_graph(
@@ -186,12 +296,15 @@ def lower_ir_graph(
         Root of the graph to rewrite.
     config_options
         GPUEngine configuration options.
+    stats
+        The statistics collector.

     Returns
     -------
     new_ir, partition_info, stats
-        The rewritten graph,
-        in the new graph to associated partitioning information
+        The rewritten graph, a mapping from unique nodes
+        in the new graph to associated partitioning information,
+        and the statistics collector.

     Notes
     -----
@@ -287,6 +400,8 @@ def generate_network(
     stats: StatsCollector,
     *,
     ir_context: IRExecutionContext,
+    collective_id_map: dict[IR, int],
+    metadata_collector: list[Metadata] | None,
 ) -> tuple[list[Any], DeferredMessages]:
     """
     Translate the IR graph to a RapidsMPF streaming network.
@@ -305,6 +420,12 @@ def generate_network(
         Statistics collector.
     ir_context
         The execution context for the IR node.
+    collective_id_map
+        The mapping of IR nodes to collective IDs.
+    metadata_collector
+        The list to collect the final metadata.
+        This list will be mutated when the network is executed.
+        If None, metadata will not be collected.

     Returns
     -------
@@ -322,8 +443,9 @@ def generate_network(
     # Determine which nodes need fanout
     fanout_nodes = determine_fanout_nodes(ir, partition_info, ir_dep_count)

-    #
-
+    # Get max_io_threads from config (default: 2)
+    assert config_options.executor.name == "streaming", "Executor must be streaming"
+    max_io_threads_global = config_options.executor.max_io_threads
     max_io_threads_local = max(1, max_io_threads_global // max(1, num_io_nodes))

     # Generate the network
@@ -335,27 +457,32 @@ def generate_network(
         "ir_context": ir_context,
         "max_io_threads": max_io_threads_local,
         "stats": stats,
+        "collective_id_map": collective_id_map,
     }
     mapper: SubNetGenerator = CachingVisitor(
         generate_ir_sub_network_wrapper, state=state
     )
-
-
-    # Deduplicate nodes.
-    # TODO: Remove after https://github.com/rapidsai/cudf/pull/20586
-    nodes = list(set(nodes))
-
+    nodes_dict, channels = mapper(ir)
     ch_out = channels[ir].reserve_output_slot()

-    #
-    #
-
-
+    # Add node to drain metadata channel before pull_from_channel
+    # (since pull_from_channel doesn't accept a ChannelPair)
+    ch_final_data: Channel[TableChunk] = context.create_channel()
+    drain_node = metadata_drain_node(
+        context,
+        ir,
+        ir_context,
+        ch_out,
+        ch_final_data,
+        metadata_collector,
+    )

     # Add final node to pull from the output data channel
-
-
-    nodes
+    output_node, output = pull_from_channel(context, ch_in=ch_final_data)
+
+    # Flatten the nodes dictionary into a list for run_streaming_pipeline
+    nodes: list[Any] = [node for node_list in nodes_dict.values() for node in node_list]
+    nodes.extend([drain_node, output_node])

     # Return network and output hook
     return nodes, output
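
A note on the Options({...} | get_environment_variables()) construction in the evaluate_pipeline hunk above: the hard-coded num_streaming_threads default sits on the left of a dict union, so any environment-derived settings on the right take precedence, which is what the inline comment about RAPIDSMPF_NUM_STREAMING_THREADS relies on. Below is a self-contained toy of just that merge rule; the from_environment dict is a stand-in for whatever get_environment_variables() actually returns, not its real output.

    # Toy sketch of the option-merge precedence used in evaluate_pipeline above.
    # With a dict union, the right-hand operand wins on key collisions, so
    # environment-derived settings override the hard-coded default.
    defaults = {"num_streaming_threads": "4"}
    from_environment = {"num_streaming_threads": "8"}  # stand-in for get_environment_variables()
    merged = defaults | from_environment
    assert merged["num_streaming_threads"] == "8"
    print(merged)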

cudf_polars/experimental/rapidsmpf/dask.py (new file, +172 -0)

@@ -0,0 +1,172 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Dask-based execution with the streaming RapidsMPF runtime."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Protocol
+
+from distributed import get_client
+from rapidsmpf.config import Options, get_environment_variables
+from rapidsmpf.integrations.dask import get_worker_context
+from rapidsmpf.streaming.core.context import Context
+
+import polars as pl
+
+from cudf_polars.experimental.dask_registers import DaskRegisterManager
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from distributed import Client
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.base import PartitionInfo, StatsCollector
+    from cudf_polars.experimental.parallel import ConfigOptions
+    from cudf_polars.experimental.rapidsmpf.utils import Metadata
+
+
+class EvaluatePipelineCallback(Protocol):
+    """Protocol for the evaluate_pipeline callback."""
+
+    def __call__(
+        self,
+        ir: IR,
+        partition_info: MutableMapping[IR, PartitionInfo],
+        config_options: ConfigOptions,
+        stats: StatsCollector,
+        collective_id_map: dict[IR, int],
+        rmpf_context: Context | None = None,
+        *,
+        collect_metadata: bool = False,
+    ) -> tuple[pl.DataFrame, list[Metadata] | None]:
+        """Evaluate a pipeline and return the result DataFrame and metadata."""
+        ...
+
+
+def get_dask_client() -> Client:
+    """Get a distributed Dask client."""
+    client = get_client()
+    DaskRegisterManager.register_once()
+    DaskRegisterManager.run_on_cluster(client)
+    return client
+
+
+def evaluate_pipeline_dask(
+    callback: EvaluatePipelineCallback,
+    ir: IR,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    config_options: ConfigOptions,
+    stats: StatsCollector,
+    shuffle_id_map: dict[IR, int],
+    *,
+    collect_metadata: bool = False,
+) -> tuple[pl.DataFrame, list[Metadata] | None]:
+    """
+    Evaluate a RapidsMPF streaming pipeline on a Dask cluster.
+
+    Parameters
+    ----------
+    callback
+        The callback function to evaluate the pipeline.
+    ir
+        The IR node.
+    partition_info
+        The partition information.
+    config_options
+        The configuration options.
+    stats
+        The statistics collector.
+    shuffle_id_map
+        Mapping from Shuffle/Repartition/Join IR nodes to reserved shuffle IDs.
+    collect_metadata
+        Whether to collect metadata.
+
+    Returns
+    -------
+    The output DataFrame and metadata collector.
+    """
+    client = get_dask_client()
+    result = client.run(
+        _evaluate_pipeline_dask,
+        callback,
+        ir,
+        partition_info,
+        config_options,
+        stats,
+        shuffle_id_map,
+        collect_metadata=collect_metadata,
+    )
+    dfs: list[pl.DataFrame] = []
+    metadata_collector: list[Metadata] = []
+    for df, md in result.values():
+        dfs.append(df)
+        if md is not None:
+            metadata_collector.extend(md)
+
+    return pl.concat(dfs), metadata_collector or None
+
+
+def _evaluate_pipeline_dask(
+    callback: EvaluatePipelineCallback,
+    ir: IR,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    config_options: ConfigOptions,
+    stats: StatsCollector,
+    shuffle_id_map: dict[IR, int],
+    dask_worker: Any = None,
+    *,
+    collect_metadata: bool = False,
+) -> tuple[pl.DataFrame, list[Metadata] | None]:
+    """
+    Build and evaluate a RapidsMPF streaming pipeline.
+
+    Parameters
+    ----------
+    callback
+        The callback function to evaluate the pipeline.
+    ir
+        The IR node.
+    partition_info
+        The partition information.
+    config_options
+        The configuration options.
+    stats
+        The statistics collector.
+    shuffle_id_map
+        Mapping from Shuffle/Repartition/Join IR nodes to reserved shuffle IDs.
+    dask_worker
+        Dask worker reference.
+        This kwarg is automatically populated by Dask
+        when evaluate_pipeline is called with `client.run`.
+    collect_metadata
+        Whether to collect metadata.
+
+    Returns
+    -------
+    The output DataFrame and metadata collector.
+    """
+    assert dask_worker is not None, "Dask worker must be provided"
+    assert config_options.executor.name == "streaming", "Executor must be streaming"
+
+    # NOTE: The Dask-CUDA cluster must be bootstrapped
+    # ahead of time using bootstrap_dask_cluster
+    # (rapidsmpf.integrations.dask.bootstrap_dask_cluster).
+    # TODO: Automatically bootstrap the cluster if necessary.
+    options = Options(
+        {"num_streaming_threads": str(max(config_options.executor.max_io_threads, 1))}
+        | get_environment_variables()
+    )
+    dask_context = get_worker_context(dask_worker)
+    rmpf_context = Context(dask_context.comm, dask_context.br, options)
+
+    # IDs are already reserved by the caller, just pass them through
+    return callback(
+        ir,
+        partition_info,
+        config_options,
+        stats,
+        shuffle_id_map,
+        rmpf_context,
+        collect_metadata=collect_metadata,
+    )
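
The client-side half of evaluate_pipeline_dask above fans in one (DataFrame, metadata) pair per worker from client.run and merges them. Below is a self-contained sketch of that fan-in pattern; plain dicts stand in for the real Metadata records, and a hard-coded dict stands in for the mapping client.run returns.

    # Toy sketch of the per-worker fan-in used by evaluate_pipeline_dask above:
    # concatenate the partial DataFrames and merge the optional metadata lists.
    import polars as pl

    worker_results = {  # shaped like the mapping returned by client.run
        "worker-0": (pl.DataFrame({"x": [1, 2]}), [{"rows": 2}]),
        "worker-1": (pl.DataFrame({"x": [3]}), None),
    }

    dfs: list[pl.DataFrame] = []
    metadata: list[dict] = []
    for df, md in worker_results.values():
        dfs.append(df)
        if md is not None:
            metadata.extend(md)

    print(pl.concat(dfs))
    print(metadata or None)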

cudf_polars/experimental/rapidsmpf/dispatch.py (+6 -3)

@@ -75,6 +75,8 @@ class GenState(TypedDict):
         a single IO node.
     stats
         Statistics collector.
+    collective_id_map
+        The mapping of IR nodes to collective IDs.
     """

     context: Context
@@ -84,10 +86,11 @@ class GenState(TypedDict):
     ir_context: IRExecutionContext
     max_io_threads: int
     stats: StatsCollector
+    collective_id_map: dict[IR, int]


 SubNetGenerator: TypeAlias = GenericTransformer[
-    "IR", "tuple[list[Any], dict[IR, ChannelManager]]", GenState
+    "IR", "tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]", GenState
 ]
 """Protocol for Generating a streaming sub-network."""

@@ -128,7 +131,7 @@ def lower_ir_node(
 @singledispatch
 def generate_ir_sub_network(
     ir: IR, rec: SubNetGenerator
-) -> tuple[list[Any], dict[IR, ChannelManager]]:
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     """
     Generate a sub-network for the RapidsMPF streaming runtime.

@@ -142,7 +145,7 @@ def generate_ir_sub_network(
     Returns
     -------
     nodes
-
+        Dictionary mapping each IR node to its list of streaming-network node(s).
     channels
         Dictionary mapping between each IR node and its
         corresponding output ChannelManager object.