cudf-polars-cu13 25.12.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +28 -7
  4. cudf_polars/containers/column.py +51 -26
  5. cudf_polars/dsl/expressions/binaryop.py +1 -1
  6. cudf_polars/dsl/expressions/boolean.py +1 -1
  7. cudf_polars/dsl/expressions/selection.py +1 -1
  8. cudf_polars/dsl/expressions/string.py +29 -20
  9. cudf_polars/dsl/expressions/ternary.py +25 -1
  10. cudf_polars/dsl/expressions/unary.py +11 -8
  11. cudf_polars/dsl/ir.py +351 -281
  12. cudf_polars/dsl/translate.py +18 -15
  13. cudf_polars/dsl/utils/aggregations.py +10 -5
  14. cudf_polars/experimental/base.py +10 -0
  15. cudf_polars/experimental/benchmarks/pdsh.py +1 -1
  16. cudf_polars/experimental/benchmarks/utils.py +83 -2
  17. cudf_polars/experimental/distinct.py +2 -0
  18. cudf_polars/experimental/explain.py +1 -1
  19. cudf_polars/experimental/expressions.py +8 -5
  20. cudf_polars/experimental/groupby.py +2 -0
  21. cudf_polars/experimental/io.py +64 -42
  22. cudf_polars/experimental/join.py +15 -2
  23. cudf_polars/experimental/parallel.py +10 -7
  24. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  25. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  26. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  27. cudf_polars/experimental/rapidsmpf/{shuffle.py → collectives/shuffle.py} +90 -114
  28. cudf_polars/experimental/rapidsmpf/core.py +194 -67
  29. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  30. cudf_polars/experimental/rapidsmpf/dispatch.py +6 -3
  31. cudf_polars/experimental/rapidsmpf/io.py +162 -70
  32. cudf_polars/experimental/rapidsmpf/join.py +162 -77
  33. cudf_polars/experimental/rapidsmpf/nodes.py +421 -180
  34. cudf_polars/experimental/rapidsmpf/repartition.py +130 -65
  35. cudf_polars/experimental/rapidsmpf/union.py +24 -5
  36. cudf_polars/experimental/rapidsmpf/utils.py +228 -16
  37. cudf_polars/experimental/shuffle.py +18 -4
  38. cudf_polars/experimental/sort.py +13 -6
  39. cudf_polars/experimental/spilling.py +1 -1
  40. cudf_polars/testing/plugin.py +6 -3
  41. cudf_polars/utils/config.py +67 -0
  42. cudf_polars/utils/versions.py +3 -3
  43. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/METADATA +9 -10
  44. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/RECORD +47 -43
  45. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  46. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  47. {cudf_polars_cu13-25.12.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/rapidsmpf/io.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """IO logic for the RapidsMPF streaming runtime."""
 
@@ -34,9 +34,14 @@ from cudf_polars.experimental.rapidsmpf.dispatch import (
 )
 from cudf_polars.experimental.rapidsmpf.nodes import (
     define_py_node,
+    metadata_feeder_node,
     shutdown_on_error,
 )
-from cudf_polars.experimental.rapidsmpf.utils import ChannelManager
+from cudf_polars.experimental.rapidsmpf.utils import (
+    ChannelManager,
+    Metadata,
+    opaque_reservation,
+)
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -103,7 +108,7 @@ class Lineariser:
 
         # Forward any remaining buffered messages
         for seq in sorted(buffer.keys()):
-            await self.ch_out.send(self.context, buffer[seq])
+            await self.ch_out.send(self.context, buffer.pop(seq))
 
         await self.ch_out.drain(self.context)
 
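The only functional change in this Lineariser hunk is `buffer[seq]` becoming `buffer.pop(seq)`, so each buffered message is released as soon as it has been forwarded instead of being kept alive until the whole flush finishes. A minimal standalone sketch of that ordered-flush pattern (the plain dict and list stand in for the real rapidsmpf buffer and output channel, which are not shown in this diff):

def flush_in_order(buffer: dict[int, object], out: list[object]) -> None:
    # sorted() snapshots the keys, so popping inside the loop is safe.
    for seq in sorted(buffer.keys()):
        # pop() drops the buffered reference the moment the message is forwarded.
        out.append(buffer.pop(seq))


pending = {2: "chunk-2", 0: "chunk-0", 1: "chunk-1"}
sent: list[object] = []
flush_in_order(pending, sent)
assert sent == ["chunk-0", "chunk-1", "chunk-2"]
assert not pending  # the buffer empties as it drains
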
@@ -138,6 +143,7 @@ async def dataframescan_node(
     *,
     num_producers: int,
     rows_per_partition: int,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     DataFrameScan node for rapidsmpf.
@@ -156,19 +162,26 @@ async def dataframescan_node(
         The number of producers to use for the DataFrameScan node.
     rows_per_partition
         The number of rows per partition.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    nrows = max(ir.df.shape()[0], 1)
-    global_count = math.ceil(nrows / rows_per_partition)
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
+        # Find local partition count.
+        nrows = ir.df.shape()[0]
+        global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0
+
+        # For single rank, simplify the logic
+        if context.comm().nranks == 1:
+            local_count = global_count
+            local_offset = 0
+        else:
+            local_count = math.ceil(global_count / context.comm().nranks)
+            local_offset = local_count * context.comm().rank
 
-    # For single rank, simplify the logic
-    if context.comm().nranks == 1:
-        local_count = global_count
-        local_offset = 0
-    else:
-        local_count = math.ceil(global_count / context.comm().nranks)
-        local_offset = local_count * context.comm().rank
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, local_count)))
 
-    async with shutdown_on_error(context, ch_out.data):
         # Build list of IR slices to read
         ir_slices = []
         for seq_num in range(local_count):
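For reference, the rank-local partition arithmetic introduced above can be exercised on its own; the helper below is a sketch written for this diff (it is not part of the package) and the numbers are purely illustrative:

import math


def local_partitions(
    nrows: int, rows_per_partition: int, rank: int, nranks: int
) -> tuple[int, int]:
    # Mirrors the local_count / local_offset logic in dataframescan_node above.
    global_count = math.ceil(nrows / rows_per_partition) if nrows > 0 else 0
    if nranks == 1:
        return global_count, 0
    local_count = math.ceil(global_count / nranks)
    return local_count, local_count * rank


# 950 rows at <= 100 rows per partition -> 10 global partitions.
assert local_partitions(950, 100, rank=0, nranks=4) == (3, 0)
assert local_partitions(950, 100, rank=3, nranks=4) == (3, 9)
# An empty frame now yields zero partitions instead of one.
assert local_partitions(0, 100, rank=0, nranks=1) == (0, 0)
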
@@ -183,6 +196,26 @@ async def dataframescan_node(
                 )
             )
 
+        # If there are no slices, drain the channel and return
+        if len(ir_slices) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one ir_slices or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(ir_slices) == 1 or num_producers == 1:
+            for seq_num, ir_slice in enumerate(ir_slices):
+                await read_chunk(
+                    context,
+                    ir_slice,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
+                )
+            await ch_out.data.drain(context)
+            return
+
         # Use Lineariser to ensure ordered delivery
         num_producers = min(num_producers, len(ir_slices))
         lineariser = Lineariser(context, ch_out.data, num_producers)
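The fast path added above bypasses the Lineariser whenever chunk order is already guaranteed. The predicate below merely restates that decision as standalone code (a sketch for this diff, not a function from the package):

def needs_lineariser(num_items: int, num_producers: int) -> bool:
    # Reordering is only needed when several producers read several chunks
    # concurrently and may therefore finish out of sequence.
    return num_items > 1 and num_producers > 1


assert not needs_lineariser(1, 8)   # a single slice is trivially ordered
assert not needs_lineariser(16, 1)  # a single producer reads sequentially
assert needs_lineariser(16, 8)      # concurrent producers can complete out of order
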
@@ -203,6 +236,7 @@ async def dataframescan_node(
                 task_idx,
                 ch_out,
                 ir_context,
+                estimated_chunk_bytes,
             )
             await ch_out.drain(context)
 
@@ -216,27 +250,32 @@
 @generate_ir_sub_network.register(DataFrameScan)
 def _(
     ir: DataFrameScan, rec: SubNetGenerator
-) -> tuple[list[Any], dict[IR, ChannelManager]]:
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     config_options = rec.state["config_options"]
     assert config_options.executor.name == "streaming", (
         "'in-memory' executor not supported in 'generate_ir_sub_network'"
     )
     rows_per_partition = config_options.executor.max_rows_per_partition
     num_producers = rec.state["max_io_threads"]
+    # Use target_partition_size as the estimated chunk size
+    estimated_chunk_bytes = config_options.executor.target_partition_size
 
     context = rec.state["context"]
     ir_context = rec.state["ir_context"]
     channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
-    nodes: list[Any] = [
-        dataframescan_node(
-            context,
-            ir,
-            ir_context,
-            channels[ir].reserve_input_slot(),
-            num_producers=num_producers,
-            rows_per_partition=rows_per_partition,
-        )
-    ]
+    nodes: dict[IR, list[Any]] = {
+        ir: [
+            dataframescan_node(
+                context,
+                ir,
+                ir_context,
+                channels[ir].reserve_input_slot(),
+                num_producers=num_producers,
+                rows_per_partition=rows_per_partition,
+                estimated_chunk_bytes=estimated_chunk_bytes,
+            )
+        ]
+    }
 
     return nodes, channels
 
@@ -278,6 +317,7 @@ async def read_chunk(
     seq_num: int,
     ch_out: Channel[TableChunk],
     ir_context: IRExecutionContext,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Read a chunk from disk and send it to the output channel.
@@ -294,24 +334,27 @@ async def read_chunk(
         The output channel.
     ir_context
         The execution context for the IR node.
+    estimated_chunk_bytes
+        Estimated size of the chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    # Evaluate and send the Scan-node result
-    df = await asyncio.to_thread(
-        scan.do_evaluate,
-        *scan._non_child_args,
-        context=ir_context,
-    )
-    await ch_out.send(
-        context,
-        Message(
-            seq_num,
-            TableChunk.from_pylibcudf_table(
-                df.table,
-                df.stream,
-                exclusive_view=True,
+    with opaque_reservation(context, estimated_chunk_bytes):
+        df = await asyncio.to_thread(
+            scan.do_evaluate,
+            *scan._non_child_args,
+            context=ir_context,
+        )
+        await ch_out.send(
+            context,
+            Message(
+                seq_num,
+                TableChunk.from_pylibcudf_table(
+                    df.table,
+                    df.stream,
+                    exclusive_view=True,
+                ),
             ),
-        ),
-    )
+        )
 
 
 @define_py_node()
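The `opaque_reservation(context, estimated_chunk_bytes)` context manager wrapped around the read above reserves an estimated amount of memory before the chunk is materialised, which is what the new `estimated_chunk_bytes` parameter feeds. Its real implementation lives in cudf_polars.experimental.rapidsmpf.utils and is not shown in this diff; the toy budget below only sketches the general reserve-then-release pattern under that assumption:

from collections.abc import Iterator
from contextlib import contextmanager


class MemoryBudget:
    """Toy tracker: reserve an estimate before a read, release it afterwards."""

    def __init__(self, limit: int) -> None:
        self.limit = limit
        self.reserved = 0

    @contextmanager
    def reserve(self, nbytes: int) -> Iterator[None]:
        if self.reserved + nbytes > self.limit:
            raise MemoryError("estimate exceeds budget; spill or wait first")
        self.reserved += nbytes
        try:
            yield
        finally:
            self.reserved -= nbytes


budget = MemoryBudget(limit=2 * 1024**3)
with budget.reserve(256 * 1024**2):  # estimated_chunk_bytes for one chunk
    chunk = b"..."  # stand-in for the decoded table chunk
assert budget.reserved == 0
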
@@ -324,6 +367,7 @@ async def scan_node(
     num_producers: int,
     plan: IOPartitionPlan,
     parquet_options: ParquetOptions,
+    estimated_chunk_bytes: int,
 ) -> None:
     """
     Scan node for rapidsmpf.
@@ -344,8 +388,11 @@ async def scan_node(
         The partitioning plan.
     parquet_options
         The Parquet options.
+    estimated_chunk_bytes
+        Estimated size of each chunk in bytes. Used for memory reservation
+        with block spilling to avoid thrashing.
     """
-    async with shutdown_on_error(context, ch_out.data):
+    async with shutdown_on_error(context, ch_out.metadata, ch_out.data):
         # Build a list of local Scan operations
         scans: list[Scan | SplitScan] = []
         if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
@@ -353,9 +400,11 @@ async def scan_node(
             local_count = math.ceil(count / context.comm().nranks)
             local_offset = local_count * context.comm().rank
             path_offset = local_offset // plan.factor
-            path_count = math.ceil(local_count / plan.factor)
+            path_end = math.ceil((local_offset + local_count) / plan.factor)
+            path_count = path_end - path_offset
             local_paths = ir.paths[path_offset : path_offset + path_count]
             sindex = local_offset % plan.factor
+            splits_created = 0
             for path in local_paths:
                 base_scan = Scan(
                     ir.schema,
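The `path_count` change above matters when a rank's first split does not fall on a file boundary: rounding `local_count / plan.factor` up is not enough once `local_offset` lands partway through a file, so the end index has to be computed before subtracting. A worked example with illustrative numbers (not taken from the package):

import math

factor = 3        # each file is split into 3 pieces
local_offset = 2  # this rank owns splits 2 and 3: the last piece of file 0
local_count = 2   # and the first piece of file 1

path_offset = local_offset // factor                         # 0
old_path_count = math.ceil(local_count / factor)             # 1 -> misses file 1
path_end = math.ceil((local_offset + local_count) / factor)  # 2
new_path_count = path_end - path_offset                      # 2 -> covers both files

assert old_path_count == 1
assert new_path_count == 2
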
@@ -371,7 +420,7 @@ async def scan_node(
                     ir.predicate,
                     parquet_options,
                 )
-                while sindex < plan.factor:
+                while sindex < plan.factor and splits_created < local_count:
                     scans.append(
                         SplitScan(
                             ir.schema,
@@ -382,6 +431,7 @@ async def scan_node(
                         )
                     )
                     sindex += 1
+                    splits_created += 1
                 sindex = 0
 
         else:
@@ -392,22 +442,46 @@ async def scan_node(
             paths_offset_end = paths_offset_start + plan.factor * local_count
             for offset in range(paths_offset_start, paths_offset_end, plan.factor):
                 local_paths = ir.paths[offset : offset + plan.factor]
-                scans.append(
-                    Scan(
-                        ir.schema,
-                        ir.typ,
-                        ir.reader_options,
-                        ir.cloud_options,
-                        local_paths,
-                        ir.with_columns,
-                        ir.skip_rows,
-                        ir.n_rows,
-                        ir.row_index,
-                        ir.include_file_paths,
-                        ir.predicate,
-                        parquet_options,
+                if len(local_paths) > 0:  # Only add scan if there are paths
+                    scans.append(
+                        Scan(
+                            ir.schema,
+                            ir.typ,
+                            ir.reader_options,
+                            ir.cloud_options,
+                            local_paths,
+                            ir.with_columns,
+                            ir.skip_rows,
+                            ir.n_rows,
+                            ir.row_index,
+                            ir.include_file_paths,
+                            ir.predicate,
+                            parquet_options,
+                        )
                     )
+
+        # Send basic metadata
+        await ch_out.send_metadata(context, Metadata(max(1, len(scans))))
+
+        # If there is nothing to scan, drain the channel and return
+        if len(scans) == 0:
+            await ch_out.data.drain(context)
+            return
+
+        # If there is only one scan or one producer, we can
+        # skip the lineariser and read the chunks directly
+        if len(scans) == 1 or num_producers == 1:
+            for seq_num, scan in enumerate(scans):
+                await read_chunk(
+                    context,
+                    scan,
+                    seq_num,
+                    ch_out.data,
+                    ir_context,
+                    estimated_chunk_bytes,
                 )
+            await ch_out.data.drain(context)
+            return
 
         # Use Lineariser to ensure ordered delivery
         num_producers = min(num_producers, len(scans))
@@ -429,6 +503,7 @@ async def scan_node(
                 task_idx,
                 ch_out,
                 ir_context,
+                estimated_chunk_bytes,
             )
             await ch_out.drain(context)
 
@@ -548,9 +623,12 @@ def make_rapidsmpf_read_parquet_node(
 
 
 @generate_ir_sub_network.register(Scan)
-def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
+def _(
+    ir: Scan, rec: SubNetGenerator
+) -> tuple[dict[IR, list[Any]], dict[IR, ChannelManager]]:
     config_options = rec.state["config_options"]
-    assert config_options.executor.name == "streaming", (
+    executor = rec.state["config_options"].executor
+    assert executor.name == "streaming", (
         "'in-memory' executor not supported in 'generate_ir_sub_network'"
     )
     parquet_options = config_options.parquet_options
@@ -558,17 +636,28 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
     num_producers = rec.state["max_io_threads"]
     channels: dict[IR, ChannelManager] = {ir: ChannelManager(rec.state["context"])}
 
-    # Use rapidsmpf native read_parquet for multi-partition Parquet scans.
+    assert partition_info.io_plan is not None, "Scan node must have a partition plan"
+    plan: IOPartitionPlan = partition_info.io_plan
+
+    # Native node cannot split large files in distributed mode yet
+    distributed_split_files = (
+        plan.flavor == IOPartitionFlavor.SPLIT_FILES
+        and rec.state["context"].comm().nranks > 1
+    )
+
+    # Use rapidsmpf native read_parquet node if possible
     ch_pair = channels[ir].reserve_input_slot()
-    nodes: list[Any]
+    nodes: dict[IR, list[Any]] = {}
     native_node: Any = None
     if (
-        partition_info.count > 1
+        parquet_options.use_rapidsmpf_native
+        and partition_info.count > 1
         and ir.typ == "parquet"
        and ir.row_index is None
         and ir.include_file_paths is None
         and ir.n_rows == -1
         and ir.skip_rows == 0
+        and not distributed_split_files
     ):
         native_node = make_rapidsmpf_read_parquet_node(
             rec.state["context"],
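Rewriting the eligibility test above as a standalone predicate makes the gating easier to read in one place; the `ScanRequest` dataclass here is a simplification invented for this example, not a type from the package:

from dataclasses import dataclass


@dataclass
class ScanRequest:
    typ: str
    row_index: object | None
    include_file_paths: str | None
    n_rows: int
    skip_rows: int


def can_use_native_parquet(
    req: ScanRequest,
    *,
    use_rapidsmpf_native: bool,
    partition_count: int,
    distributed_split_files: bool,
) -> bool:
    # Mirrors the if-condition guarding make_rapidsmpf_read_parquet_node above.
    return (
        use_rapidsmpf_native
        and partition_count > 1
        and req.typ == "parquet"
        and req.row_index is None
        and req.include_file_paths is None
        and req.n_rows == -1
        and req.skip_rows == 0
        and not distributed_split_files
    )


req = ScanRequest("parquet", None, None, n_rows=-1, skip_rows=0)
assert can_use_native_parquet(
    req,
    use_rapidsmpf_native=True,
    partition_count=8,
    distributed_split_files=False,
)
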
@@ -580,17 +669,19 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
         )
 
     if native_node is not None:
-        nodes = [native_node]
+        # Need metadata node, because the native read_parquet
+        # node does not send metadata.
+        metadata_node = metadata_feeder_node(
+            rec.state["context"],
+            ch_pair,
+            Metadata(partition_info.count),
+        )
+        nodes[ir] = [metadata_node, native_node]
     else:
         # Fall back to scan_node (predicate not convertible, or other constraint)
-        assert partition_info.io_plan is not None, (
-            "Scan node must have a partition plan"
-        )
-        plan: IOPartitionPlan = partition_info.io_plan
-        if plan.flavor == IOPartitionFlavor.SPLIT_FILES:
-            parquet_options = dataclasses.replace(parquet_options, chunked=False)
+        parquet_options = dataclasses.replace(parquet_options, chunked=False)
 
-        nodes = [
+        nodes[ir] = [
             scan_node(
                 rec.state["context"],
                 ir,
@@ -599,6 +690,7 @@ def _(ir: Scan, rec: SubNetGenerator) -> tuple[list[Any], dict[IR, ChannelManager]]:
                 num_producers=num_producers,
                 plan=plan,
                 parquet_options=parquet_options,
+                estimated_chunk_bytes=executor.target_partition_size,
             )
         ]
     return nodes, channels