cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,322 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Join logic for the RapidsMPF streaming runtime."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ from typing import TYPE_CHECKING, Any, Literal
9
+
10
+ from rapidsmpf.memory.buffer import MemoryType
11
+ from rapidsmpf.streaming.core.message import Message
12
+ from rapidsmpf.streaming.cudf.table_chunk import TableChunk
13
+
14
+ from cudf_polars.containers import DataFrame
15
+ from cudf_polars.dsl.ir import IR, Join
16
+ from cudf_polars.experimental.rapidsmpf.collectives.allgather import AllGatherManager
17
+ from cudf_polars.experimental.rapidsmpf.dispatch import (
18
+ generate_ir_sub_network,
19
+ )
20
+ from cudf_polars.experimental.rapidsmpf.nodes import (
21
+ default_node_multi,
22
+ define_py_node,
23
+ shutdown_on_error,
24
+ )
25
+ from cudf_polars.experimental.rapidsmpf.utils import (
26
+ ChannelManager,
27
+ Metadata,
28
+ chunk_to_frame,
29
+ empty_table_chunk,
30
+ opaque_reservation,
31
+ process_children,
32
+ )
33
+ from cudf_polars.experimental.utils import _concat
34
+
35
+ if TYPE_CHECKING:
36
+ from rapidsmpf.streaming.core.context import Context
37
+
38
+ from cudf_polars.dsl.ir import IR, IRExecutionContext
39
+ from cudf_polars.experimental.rapidsmpf.core import SubNetGenerator
40
+ from cudf_polars.experimental.rapidsmpf.utils import ChannelPair
41
+
42
+
43
@define_py_node()
async def broadcast_join_node(
    context: Context,
    ir: Join,
    ir_context: IRExecutionContext,
    ch_out: ChannelPair,
    ch_left: ChannelPair,
    ch_right: ChannelPair,
    broadcast_side: Literal["left", "right"],
    collective_id: int,
    target_partition_size: int,
) -> None:
    """
    Join node for rapidsmpf.

    Parameters
    ----------
    context
        The rapidsmpf context.
    ir
        The Join IR node.
    ir_context
        The execution context for the IR node.
    ch_out
        The output ChannelPair.
    ch_left
        The left input ChannelPair.
    ch_right
        The right input ChannelPair.
    broadcast_side
        The side to broadcast.
    collective_id
        Pre-allocated collective ID for this operation.
    target_partition_size
        The target partition size in bytes.
    """
    async with shutdown_on_error(
        context,
        ch_left.metadata,
        ch_left.data,
        ch_right.metadata,
        ch_right.data,
        ch_out.metadata,
        ch_out.data,
    ):
        # Pull metadata from both inputs concurrently.
        left_metadata, right_metadata = await asyncio.gather(
            ch_left.recv_metadata(context),
            ch_right.recv_metadata(context),
        )

        # Decide which input is collected ("small") and which is streamed
        # ("large"), and work out the output partitioning info.
        partitioned_on: tuple[str, ...] = ()
        if broadcast_side == "right":
            # Broadcast right, stream left.
            small_ch, large_ch = ch_right, ch_left
            small_child, large_child = ir.children[1], ir.children[0]
            chunk_count = left_metadata.count
            partitioned_on = left_metadata.partitioned_on
            small_duplicated = right_metadata.duplicated
        else:
            # Broadcast left, stream right.
            small_ch, large_ch = ch_left, ch_right
            small_child, large_child = ir.children[0], ir.children[1]
            chunk_count = right_metadata.count
            small_duplicated = left_metadata.duplicated
            if ir.options[0] == "Right":
                partitioned_on = right_metadata.partitioned_on

        # Publish output metadata before any data flows.
        await ch_out.send_metadata(
            context,
            Metadata(
                chunk_count,
                partitioned_on=partitioned_on,
                duplicated=left_metadata.duplicated and right_metadata.duplicated,
            ),
        )

        # Collect the small side locally (may be empty if no data arrives).
        small_chunks: list[TableChunk] = []
        small_size = 0
        while (message := await small_ch.data.recv(context)) is not None:
            chunk = TableChunk.from_message(message).make_available_and_spill(
                context.br(), allow_overbooking=True
            )
            del message
            small_chunks.append(chunk)
            small_size += chunk.data_alloc_size(MemoryType.DEVICE)
            del chunk

        # Allgather is a collective: every rank must take part even when it
        # holds no local data.
        if context.comm().nranks > 1 and not small_duplicated:
            allgather = AllGatherManager(context, collective_id)
            insert_id = 0
            # Pop as we insert so chunk references are released promptly.
            while small_chunks:
                allgather.insert(insert_id, small_chunks.pop(0))
                insert_id += 1
            allgather.insert_finished()
            stream = ir_context.get_cuda_stream()
            # extract_concatenated yields a plc.Table, not a TableChunk.
            small_dfs = [
                DataFrame.from_table(
                    await allgather.extract_concatenated(stream),
                    list(small_child.schema.keys()),
                    list(small_child.schema.values()),
                    stream,
                )
            ]
        elif len(small_chunks) > 1 and (
            ir.options[0] != "Inner" or small_size < target_partition_size
        ):
            # Pre-concatenate for non-inner joins; otherwise a local shuffle
            # is needed and we face the extra memory pressure anyway.
            small_dfs = [
                _concat(
                    *[chunk_to_frame(c, small_child) for c in small_chunks],
                    context=ir_context,
                )
            ]
            # small_dfs no longer views small_chunks; drop the originals.
            small_chunks.clear()
        else:
            small_dfs = [chunk_to_frame(c, small_child) for c in small_chunks]

        # Stream the large side, joining each chunk against the small side.
        seq_num = 0
        processed_any_large = False
        streaming = True
        while streaming:
            message = await large_ch.data.recv(context)
            if message is None:
                streaming = False
                if processed_any_large:
                    # Normal exit - all large-table data has been processed.
                    break
                if small_dfs:
                    # Small-side data arrived but no large-side data. This
                    # may never happen, but is handled by synthesizing an
                    # empty large-table chunk.
                    stream = ir_context.get_cuda_stream()
                    large_chunk = empty_table_chunk(large_child, context, stream)
                else:
                    # Nothing on either side: close the output and stop.
                    await ch_out.data.drain(context)
                    return
            else:
                processed_any_large = True
                large_chunk = TableChunk.from_message(
                    message
                ).make_available_and_spill(context.br(), allow_overbooking=True)
                seq_num = message.sequence_number
                del message

            large_df = DataFrame.from_table(
                large_chunk.table_view(),
                list(large_child.schema.keys()),
                list(large_child.schema.values()),
                large_chunk.stream,
            )

            # Lazily create an empty small table if none was received.
            if not small_dfs:
                stream = ir_context.get_cuda_stream()
                empty_small_chunk = empty_table_chunk(small_child, context, stream)
                small_dfs = [chunk_to_frame(empty_small_chunk, small_child)]

            input_bytes = (
                large_chunk.data_alloc_size(MemoryType.DEVICE) + small_size
            )
            with opaque_reservation(context, input_bytes):
                df = _concat(
                    *[
                        await asyncio.to_thread(
                            ir.do_evaluate,
                            *ir._non_child_args,
                            *(
                                [large_df, small_df]
                                if broadcast_side == "right"
                                else [small_df, large_df]
                            ),
                            context=ir_context,
                        )
                        for small_df in small_dfs
                    ],
                    context=ir_context,
                )

            # Forward the joined chunk downstream.
            await ch_out.data.send(
                context,
                Message(
                    seq_num,
                    TableChunk.from_pylibcudf_table(
                        df.table, df.stream, exclusive_view=True
                    ),
                ),
            )
            del df, large_df, large_chunk

        del small_dfs, small_chunks
        await ch_out.data.drain(context)
247
+
248
+
249
@generate_ir_sub_network.register(Join)
def _(
    ir: Join, rec: SubNetGenerator
) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
    """Build the sub-network for a Join operation."""
    left, right = ir.children
    partition_info = rec.state["partition_info"]
    output_count = partition_info[ir].count
    left_count = partition_info[left].count
    right_count = partition_info[right].count

    # A partition-wise join is possible when there is a single output
    # partition, or when both sides are already partitioned on the join
    # keys with matching partition counts.
    left_aligned = (
        partition_info[left].partitioned_on == ir.left_on
        and left_count == output_count
    )
    right_aligned = (
        partition_info[right].partitioned_on == ir.right_on
        and right_count == output_count
    )
    pwise_join = output_count == 1 or (left_aligned and right_aligned)

    # Process children, then register the output ChannelManager.
    nodes, channels = process_children(ir, rec)
    channels[ir] = ChannelManager(rec.state["context"])

    if pwise_join:
        # Partition-wise join (use default_node_multi).
        nodes[ir] = [
            default_node_multi(
                rec.state["context"],
                ir,
                rec.state["ir_context"],
                channels[ir].reserve_input_slot(),
                (
                    channels[left].reserve_output_slot(),
                    channels[right].reserve_output_slot(),
                ),
                partitioning_index=1 if ir.options[0] == "Right" else 0,
            )
        ]
        return nodes, channels

    # Broadcast join: collect whichever side has fewer partitions and
    # stream the other through broadcast_join_node.
    broadcast_side: Literal["left", "right"] = (
        "right" if left_count >= right_count else "left"
    )

    # The target partition size comes from the streaming executor config.
    executor = rec.state["config_options"].executor
    assert executor.name == "streaming", "Join node requires streaming executor"

    nodes[ir] = [
        broadcast_join_node(
            rec.state["context"],
            ir,
            rec.state["ir_context"],
            channels[ir].reserve_input_slot(),
            channels[left].reserve_output_slot(),
            channels[right].reserve_output_slot(),
            broadcast_side=broadcast_side,
            collective_id=rec.state["collective_id_map"][ir],
            target_partition_size=executor.target_partition_size,
        )
    ]
    return nodes, channels
@@ -0,0 +1,74 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Core lowering logic for the RapidsMPF streaming runtime."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ import cudf_polars.experimental.rapidsmpf.io # noqa: F401
10
+ from cudf_polars.dsl.ir import IR, Sort
11
+ from cudf_polars.experimental.base import PartitionInfo
12
+ from cudf_polars.experimental.io import StreamingSink
13
+ from cudf_polars.experimental.parallel import _lower_ir_pwise
14
+ from cudf_polars.experimental.rapidsmpf.dispatch import (
15
+ lower_ir_node,
16
+ )
17
+ from cudf_polars.experimental.repartition import Repartition
18
+ from cudf_polars.experimental.sort import ShuffleSorted
19
+ from cudf_polars.experimental.utils import _lower_ir_fallback
20
+
21
+ if TYPE_CHECKING:
22
+ from collections.abc import MutableMapping
23
+
24
+ from cudf_polars.experimental.rapidsmpf.dispatch import LowerIRTransformer
25
+
26
+
27
@lower_ir_node.register(IR)
def _lower_ir_node_task_engine(
    ir: IR, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """Fall through to the task-engine lowering logic for generic IR nodes."""
    # Imported locally to avoid a circular import at module load time.
    from cudf_polars.experimental.dispatch import lower_ir_node as base_lower_ir_node

    return base_lower_ir_node(ir, rec)
35
+
36
+
37
@lower_ir_node.register(ShuffleSorted)
@lower_ir_node.register(StreamingSink)
def _unsupported(
    ir: IR, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """Lower operations the streaming runtime cannot split across partitions."""
    # Collapse to a single partition/chunk and warn via the fallback path.
    message = f"Class {type(ir)} does not support multiple partitions."
    return _lower_ir_fallback(ir, rec, msg=message)
46
+
47
+
48
@lower_ir_node.register(Sort)
def _(
    ir: Sort, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """Lower a Sort node, with top-/bottom-k support when sliced."""
    if ir.zlice is not None:
        # Top- or bottom-k support: only usable when the slice starts at
        # the head (or a tail slice that stays within the tail).
        offset = ir.zlice[0]
        length = ir.zlice[1]
        has_offset = offset > 0 or (
            offset < 0 and length is not None and offset + length < 0
        )
        if not has_offset:
            # Sort each input partition independently first.
            new_node, partition_info = _lower_ir_pwise(ir, rec)
            if partition_info[new_node].count > 1:
                # Collapse down to a single partition ...
                inter = Repartition(new_node.schema, new_node)
                partition_info[inter] = PartitionInfo(count=1)
                # ... then sort the reduced partition.
                new_node = ir.reconstruct([inter])
                partition_info[new_node] = PartitionInfo(count=1)
            return new_node, partition_info

    # TODO: Add general multi-partition Sort support
    return _lower_ir_fallback(
        ir, rec, msg=f"Class {type(ir)} does not support multiple partitions."
    )