cudf-polars-cu13 25.10.0-py3-none-any.whl → 26.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/experimental/repartition.py

@@ -5,6 +5,7 @@
 from __future__ import annotations

 import itertools
+from functools import partial
 from typing import TYPE_CHECKING, Any

 from cudf_polars.dsl.ir import IR
@@ -15,6 +16,7 @@ from cudf_polars.experimental.utils import _concat
 if TYPE_CHECKING:
     from collections.abc import MutableMapping

+    from cudf_polars.dsl.ir import IRExecutionContext
     from cudf_polars.experimental.parallel import PartitionInfo
     from cudf_polars.typing import Schema

@@ -44,7 +46,9 @@ class Repartition(IR):

 @generate_ir_tasks.register(Repartition)
 def _(
-    ir: Repartition, partition_info: MutableMapping[IR, PartitionInfo]
+    ir: Repartition,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    context: IRExecutionContext,
 ) -> MutableMapping[Any, Any]:
     # Repartition an IR node.
     # Only supports rapartitioning to fewer (for now).
@@ -64,6 +68,9 @@ def _(
     offsets = [0, *itertools.accumulate(n + (i < remainder) for i in range(count_out))]
     child_keys = tuple(partition_info[child].keys(child))
     return {
-        (key_name, i): (_concat, *child_keys[offsets[i] : offsets[i + 1]])
+        (key_name, i): (
+            partial(_concat, context=context),
+            *child_keys[offsets[i] : offsets[i + 1]],
+        )
         for i in range(count_out)
     }
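In the hunk above, each repartition task now carries the new IRExecutionContext by partially applying _concat, while the child partition keys stay positional so the task tuples keep the usual (callable, *keys) shape. A minimal sketch of that convention, assuming a toy Dask-style graph; the _concat below is a stand-in and context is a plain dict, not the cudf-polars types:

from functools import partial


def _concat(*frames, context):
    # Stand-in for cudf_polars.experimental.utils._concat; the real helper
    # uses the execution context for stream/resource handling.
    return [row for frame in frames for row in frame]


context = {"stream": "dask-cuda-stream"}  # hypothetical execution context
child_keys = [("child", 0), ("child", 1), ("child", 2)]

# A task is (callable, *child_keys); the scheduler resolves each key to the
# materialized partition and passes it positionally.
graph = {
    ("repartition", 0): (partial(_concat, context=context), *child_keys[:2]),
    ("repartition", 1): (partial(_concat, context=context), *child_keys[2:]),
}

# Toy "scheduler": resolve dependencies and execute each task.
store = {("child", 0): [1, 2], ("child", 1): [3], ("child", 2): [4, 5]}
for key, (func, *deps) in graph.items():
    store[key] = func(*(store[d] for d in deps))

print(store[("repartition", 0)], store[("repartition", 1)])  # [1, 2, 3] [4, 5]

Baking context in with partial leaves the graph format the scheduler consumes unchanged; only the callable differs.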
cudf_polars/experimental/select.py

@@ -4,6 +4,7 @@

 from __future__ import annotations

+from collections import defaultdict
 from typing import TYPE_CHECKING

 import polars as pl
@@ -12,20 +13,23 @@ from cudf_polars.dsl import expr
 from cudf_polars.dsl.expr import Col, Len
 from cudf_polars.dsl.ir import Empty, HConcat, Scan, Select, Union
 from cudf_polars.dsl.traversal import traversal
+from cudf_polars.dsl.utils.naming import unique_names
 from cudf_polars.experimental.base import ColumnStat, PartitionInfo
 from cudf_polars.experimental.dispatch import lower_ir_node
 from cudf_polars.experimental.expressions import decompose_expr_graph
+from cudf_polars.experimental.repartition import Repartition
 from cudf_polars.experimental.utils import (
     _contains_unsupported_fill_strategy,
     _lower_ir_fallback,
 )

 if TYPE_CHECKING:
-    from collections.abc import MutableMapping
+    from collections.abc import MutableMapping, Sequence

     from cudf_polars.dsl.ir import IR
     from cudf_polars.experimental.parallel import LowerIRTransformer
     from cudf_polars.experimental.statistics import StatsCollector
+    from cudf_polars.typing import Schema
     from cudf_polars.utils.config import ConfigOptions


@@ -74,7 +78,10 @@ def decompose_select(
     decompose_expr_graph
     """
     # Collect partial selections
-    selections = []
+    selections: list[Select] = []
+    name_generator = unique_names(
+        (*(ne.name for ne in select_ir.exprs), *input_ir.schema.keys())
+    )
     for ne in select_ir.exprs:
         # Decompose this partial expression
         new_ne, partial_input_ir, _partition_info = decompose_expr_graph(
@@ -84,6 +91,7 @@ def decompose_select(
             config_options,
             stats.row_count.get(select_ir.children[0], ColumnStat[int](None)),
             stats.column_stats.get(select_ir.children[0], {}),
+            name_generator,
         )
         pi = _partition_info[partial_input_ir]
         partial_input_ir = Select(
@@ -97,7 +105,11 @@ def decompose_select(
         selections.append(partial_input_ir)

     # Concatenate partial selections
-    new_ir: HConcat | Select
+    new_ir: Select | HConcat
+    selections, partition_info = _fuse_simple_reductions(
+        selections,
+        partition_info,
+    )
     if len(selections) > 1:
         new_ir = HConcat(
             select_ir.schema,
@@ -113,6 +125,151 @@ def decompose_select(
     return new_ir, partition_info


+def _fuse_simple_reductions(
+    decomposed_select_irs: Sequence[Select],
+    pi: MutableMapping[IR, PartitionInfo],
+) -> tuple[list[Select], MutableMapping[IR, PartitionInfo]]:
+    """
+    Fuse simple reductions that are part of the same Select node.
+
+    Parameters
+    ----------
+    decomposed_select_irs
+        The decomposed Select nodes.
+    pi
+        Partition information.
+
+    Returns
+    -------
+    fused_select_irs, pi
+        The new Select nodes, and the updated partition information.
+    """
+    # After a Select node is decomposed, it will be broken into
+    # one or more Select nodes that each target a different
+    # named expression. In some cases, one or more of these
+    # decomposed select nodes will be simple reductions that
+    # *should* be performed at the same time. Each "simple"
+    # reduction will have the following pattern:
+    #
+    #     # Partition-wise column selection (select_c)
+    #     Select(
+    #         # Outer Agg selection (select_b)
+    #         Select(
+    #             # Repartition to 1 (repartition)
+    #             Repartition(
+    #                 # Inner Agg selection (select_a)
+    #                 Select(
+    #                     ...
+    #                 )
+    #             )
+    #         )
+    #     )
+    #
+    # We need to fuse these simple reductions together to
+    # avoid unnecessary memory pressure.
+
+    # If there is only one decomposed_select_ir, return it
+    if len(decomposed_select_irs) == 1:
+        return list(decomposed_select_irs), pi
+
+    fused_select_c_exprs = []
+    fused_select_c_schema: Schema = {}
+
+    # Find reduction groups
+    reduction_groups: defaultdict[IR, list[Select]] = defaultdict(list)
+    for select_c in decomposed_select_irs:
+        # Final expressions and schema must be included in
+        # the fused select_c node even if this specific
+        # selection is not a simple reduction.
+        fused_select_c_exprs.extend(list(select_c.exprs))
+        fused_select_c_schema |= select_c.schema
+
+        if (
+            isinstance((select_b := select_c.children[0]), Select)
+            and pi[select_b].count == 1
+            and isinstance(repartition := select_b.children[0], Repartition)
+            and pi[repartition].count == 1
+            and isinstance(select_a := repartition.children[0], Select)
+        ):
+            # We have a simple reduction that may be
+            # fused with other simple reductions
+            # sharing the same root.
+            reduction_root = select_a.children[0]
+            reduction_groups[reduction_root].append(select_c)
+        else:
+            # Not a simple reduction.
+            # This selection becomes it own "group".
+            reduction_groups[select_c].append(select_c)
+
+    new_decomposed_select_irs: list[IR] = []
+    for root_ir, group in reduction_groups.items():
+        if len(group) > 1:
+            # Fuse simple-aggregation group
+            fused_select_b_exprs = []
+            fused_select_a_exprs = []
+            fused_select_b_schema: Schema = {}
+            fused_select_a_schema: Schema = {}
+            for select_c in group:
+                select_b = select_c.children[0]
+                assert isinstance(select_b, Select), (
+                    f"Expected Select, got {type(select_b)}"
+                )
+                fused_select_b_exprs.extend(list(select_b.exprs))
+                fused_select_b_schema |= select_b.schema
+                select_a = select_b.children[0].children[0]
+                assert isinstance(select_a, Select), (
+                    f"Expected Select, got {type(select_a)}"
+                )
+                fused_select_a_exprs.extend(list(select_a.exprs))
+                fused_select_a_schema |= select_a.schema
+            fused_select_a = Select(
+                fused_select_a_schema,
+                fused_select_a_exprs,
+                True,  # noqa: FBT003
+                root_ir,
+            )
+            pi[fused_select_a] = PartitionInfo(count=pi[root_ir].count)
+            fused_repartition = Repartition(fused_select_a_schema, fused_select_a)
+            pi[fused_repartition] = PartitionInfo(count=1)
+            fused_select_b = Select(
+                fused_select_b_schema,
+                fused_select_b_exprs,
+                True,  # noqa: FBT003
+                fused_repartition,
+            )
+            pi[fused_select_b] = PartitionInfo(count=1)
+            new_decomposed_select_irs.append(fused_select_b)
+        else:
+            # Nothing to fuse for this group
+            new_decomposed_select_irs.append(group[0])
+
+    # If any aggregations were fused, we must concatenate
+    # the results and apply the final (fused) "c" selection,
+    # otherwise we may mess up the ordering of the columns.
+    if len(new_decomposed_select_irs) < len(decomposed_select_irs):
+        # Compute schema from actual children (intermediate columns)
+        hconcat_schema: Schema = {}
+        for ir in new_decomposed_select_irs:
+            hconcat_schema |= ir.schema
+        new_hconcat = HConcat(
+            hconcat_schema,
+            True,  # noqa: FBT003
+            *new_decomposed_select_irs,
+        )
+        count = max(pi[c].count for c in new_decomposed_select_irs)
+        pi[new_hconcat] = PartitionInfo(count=count)
+        fused_select_c = Select(
+            fused_select_c_schema,
+            fused_select_c_exprs,
+            True,  # noqa: FBT003
+            new_hconcat,
+        )
+        pi[fused_select_c] = PartitionInfo(count=count)
+        return [fused_select_c], pi
+
+    return list(decomposed_select_irs), pi
+
+
 @lower_ir_node.register(Select)
 def _(
     ir: Select, rec: LowerIRTransformer
@@ -130,21 +287,27 @@
                 "for multiple partitions; falling back to in-memory evaluation."
             ),
         )
-    if (
-        pi.count == 1
-        and Select._is_len_expr(ir.exprs)
-        and isinstance(child, Union)
-        and len(child.children) == 1
-        and isinstance(child.children[0], Scan)
-        and child.children[0].predicate is None
-    ):
+
+    scan_child: Scan | None = None
+    if pi.count == 1 and Select._is_len_expr(ir.exprs):
+        if (
+            isinstance(child, Union)
+            and len(child.children) == 1
+            and isinstance(child.children[0], Scan)
+        ):
+            # Task engine case
+            scan_child = child.children[0]
+        elif isinstance(child, Scan):  # pragma: no cover; Requires rapidsmpf runtime
+            # RapidsMPF case
+            scan_child = child
+
+    if scan_child and scan_child.predicate is None:
         # Special Case: Fast count.
-        scan = child.children[0]
-        count = scan.fast_count()
+        count = scan_child.fast_count()
         dtype = ir.exprs[0].value.dtype

         lit_expr = expr.LiteralColumn(
-            dtype, pl.Series(values=[count], dtype=dtype.polars)
+            dtype, pl.Series(values=[count], dtype=dtype.polars_type)
         )
         named_expr = expr.NamedExpr(ir.exprs[0].name or "len", lit_expr)

cudf_polars/experimental/shuffle.py

@@ -5,6 +5,7 @@
 from __future__ import annotations

 import operator
+from functools import partial
 from typing import TYPE_CHECKING, Any, Concatenate, Literal, TypeVar, TypedDict

 import pylibcudf as plc
@@ -13,16 +14,19 @@ from rmm.pylibrmm.stream import DEFAULT_STREAM
 from cudf_polars.containers import DataFrame
 from cudf_polars.dsl.expr import Col
 from cudf_polars.dsl.ir import IR
-from cudf_polars.dsl.tracing import nvtx_annotate_cudf_polars
+from cudf_polars.dsl.tracing import log_do_evaluate, nvtx_annotate_cudf_polars
 from cudf_polars.experimental.base import get_key_name
 from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node
 from cudf_polars.experimental.utils import _concat
+from cudf_polars.utils.config import ShufflerInsertionMethod
+from cudf_polars.utils.cuda_stream import get_dask_cuda_stream

 if TYPE_CHECKING:
     from collections.abc import Callable, MutableMapping, Sequence

     from cudf_polars.containers import DataType
     from cudf_polars.dsl.expr import NamedExpr
+    from cudf_polars.dsl.ir import IRExecutionContext
     from cudf_polars.experimental.dispatch import LowerIRTransformer
     from cudf_polars.experimental.parallel import PartitionInfo
     from cudf_polars.typing import Schema
@@ -40,6 +44,7 @@ class ShuffleOptions(TypedDict):
     column_names: Sequence[str]
     dtypes: Sequence[DataType]
     cluster_kind: Literal["dask", "single"]
+    shuffler_insertion_method: ShufflerInsertionMethod


 # Experimental rapidsmpf shuffler integration
@@ -77,7 +82,14 @@ class RMPFIntegration: # pragma: no cover
             br=context.br,
             stream=DEFAULT_STREAM,
         )
-        shuffler.insert_chunks(packed_inputs)
+
+        if (
+            options["shuffler_insertion_method"]
+            == ShufflerInsertionMethod.CONCAT_INSERT
+        ):
+            shuffler.concat_insert(packed_inputs)
+        else:
+            shuffler.insert_chunks(packed_inputs)

     @staticmethod
     @nvtx_annotate_cudf_polars(message="RMPFIntegration.extract_partition")
@@ -116,6 +128,7 @@ class RMPFIntegration: # pragma: no cover
             ),
             column_names,
             dtypes,
+            get_dask_cuda_stream(),
         )


@@ -129,33 +142,44 @@ class Shuffle(IR):
     `ShuffleSorted` for sorting-based shuffling.
     """

-    __slots__ = ("keys", "shuffle_method")
-    _non_child = ("schema", "keys", "shuffle_method")
+    __slots__ = ("keys", "shuffle_method", "shuffler_insertion_method")
+    _non_child = ("schema", "keys", "shuffle_method", "shuffler_insertion_method")
     keys: tuple[NamedExpr, ...]
     """Keys to shuffle on."""
     shuffle_method: ShuffleMethod
     """Shuffle method to use."""
+    shuffler_insertion_method: ShufflerInsertionMethod
+    """Insertion method for rapidsmpf shuffler."""

     def __init__(
         self,
         schema: Schema,
         keys: tuple[NamedExpr, ...],
         shuffle_method: ShuffleMethod,
+        shuffler_insertion_method: ShufflerInsertionMethod,
         df: IR,
     ):
         self.schema = schema
         self.keys = keys
         self.shuffle_method = shuffle_method
-        self._non_child_args = (schema, keys, shuffle_method)
+        self.shuffler_insertion_method = shuffler_insertion_method
+        self._non_child_args = (schema, keys, shuffle_method, shuffler_insertion_method)
         self.children = (df,)

-    @classmethod
+    # the type-ignore is for
+    # Argument 1 to "log_do_evaluate" has incompatible type "Callable[[type[Shuffle], <snip>]"
+    # expected Callable[[type[IR], <snip>]
+    # But Shuffle is a subclass of IR, so this is fine.
+    @classmethod  # type: ignore[arg-type]
+    @log_do_evaluate
     def do_evaluate(
         cls,
         schema: Schema,
         keys: tuple[NamedExpr, ...],
         shuffle_method: ShuffleMethod,
         df: DataFrame,
+        *,
+        context: IRExecutionContext,
     ) -> DataFrame:  # pragma: no cover
         """Evaluate and return a dataframe."""
         # Single-partition Shuffle evaluation is a no-op
@@ -201,11 +225,15 @@ def _hash_partition_dataframe(
     # partition for each row
     partition_map = plc.binaryop.binary_operation(
         plc.hashing.murmurhash3_x86_32(
-            DataFrame([expr.evaluate(df) for expr in on]).table
+            DataFrame([expr.evaluate(df) for expr in on], stream=df.stream).table,
+            stream=df.stream,
+        ),
+        plc.Scalar.from_py(
+            partition_count, plc.DataType(plc.TypeId.UINT32), stream=df.stream
         ),
-        plc.Scalar.from_py(partition_count, plc.DataType(plc.TypeId.UINT32)),
         plc.binaryop.BinaryOperator.PYMOD,
         plc.types.DataType(plc.types.TypeId.UINT32),
+        stream=df.stream,
     )

     # Apply partitioning
@@ -213,6 +241,7 @@ def _hash_partition_dataframe(
         df.table,
         partition_map,
         partition_count,
+        stream=df.stream,
    )
     splits = offsets[1:-1]

@@ -222,8 +251,9 @@ def _hash_partition_dataframe(
             split,
             df.column_names,
             df.dtypes,
+            df.stream,
         )
-        for i, split in enumerate(plc.copying.split(t, splits))
+        for i, split in enumerate(plc.copying.split(t, splits, stream=df.stream))
     }


@@ -242,6 +272,7 @@ def _simple_shuffle_graph(
     ],
     options: OPT_T,
     *other: Any,
+    context: IRExecutionContext,
 ) -> MutableMapping[Any, Any]:
     """Make a simple all-to-all shuffle graph."""
     split_name = f"split-{name_out}"
@@ -265,7 +296,7 @@ def _simple_shuffle_graph(
                 (split_name, part_in),
                 part_out,
             )
-        graph[(name_out, part_out)] = (_concat, *_concat_list)
+        graph[(name_out, part_out)] = (partial(_concat, context=context), *_concat_list)
     return graph


@@ -296,7 +327,9 @@ def _(

 @generate_ir_tasks.register(Shuffle)
 def _(
-    ir: Shuffle, partition_info: MutableMapping[IR, PartitionInfo]
+    ir: Shuffle,
+    partition_info: MutableMapping[IR, PartitionInfo],
+    context: IRExecutionContext,
 ) -> MutableMapping[Any, Any]:
     # Extract "shuffle_method" configuration
     shuffle_method = ir.shuffle_method
@@ -331,6 +364,7 @@ def _(
                 "column_names": list(ir.schema.keys()),
                 "dtypes": list(ir.schema.values()),
                 "cluster_kind": cluster_kind,
+                "shuffler_insertion_method": ir.shuffler_insertion_method,
             },
         )
     except ValueError as err:
@@ -343,7 +377,7 @@ def _(
         ) from err

     # Simple task-based fall-back
-    return _simple_shuffle_graph(
+    return partial(_simple_shuffle_graph, context=context)(
         get_key_name(ir.children[0]),
         get_key_name(ir),
         partition_info[ir.children[0]].count,
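For orientation on the two helpers touched above: _hash_partition_dataframe splits one input partition by murmurhash(key) mod partition_count, and _simple_shuffle_graph builds output partition j by concatenating piece j from every input (now through partial(_concat, context=context)). A pure-Python sketch of that structure, with hypothetical helper names and eager execution; the real code does the hashing and splitting on the GPU via pylibcudf and passes df.stream to every call:

from collections import defaultdict


def hash_partition(rows, key, partition_count):
    # Analogue of _hash_partition_dataframe: bucket each row by
    # hash(key) mod partition_count and return one piece per bucket.
    pieces = defaultdict(list)
    for row in rows:
        pieces[hash(row[key]) % partition_count].append(row)
    return pieces


def simple_shuffle(chunks, key, partition_count):
    # Analogue of _simple_shuffle_graph, collapsed into eager code: split
    # every input chunk, then concatenate piece j from every input into
    # output partition j.
    splits = [hash_partition(chunk, key, partition_count) for chunk in chunks]
    return [
        [row for split in splits for row in split.get(j, [])]
        for j in range(partition_count)
    ]


chunks = [
    [{"id": 1, "x": 10}, {"id": 2, "x": 20}],
    [{"id": 1, "x": 30}, {"id": 3, "x": 40}],
]
out = simple_shuffle(chunks, key="id", partition_count=2)
print(out)  # all rows with the same "id" end up in the same output partition

In the task-graph version each split and each concatenation is a separate task keyed by (name, partition index), so the scheduler decides where and when each piece runs.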