cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
```diff
@@ -0,0 +1,288 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Parallel GroupBy Logic."""
+
+from __future__ import annotations
+
+import itertools
+import math
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.expr import Agg, BinOp, Col, Len, NamedExpr
+from cudf_polars.dsl.ir import GroupBy, Select
+from cudf_polars.dsl.traversal import traversal
+from cudf_polars.dsl.utils.naming import unique_names
+from cudf_polars.experimental.base import PartitionInfo
+from cudf_polars.experimental.dispatch import lower_ir_node
+from cudf_polars.experimental.repartition import Repartition
+from cudf_polars.experimental.shuffle import Shuffle
+from cudf_polars.experimental.utils import _lower_ir_fallback
+
+if TYPE_CHECKING:
+    from collections.abc import Generator, MutableMapping
+
+    from cudf_polars.dsl.expr import Expr
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.parallel import LowerIRTransformer
+
+
+# Supported multi-partition aggregations
+_GB_AGG_SUPPORTED = ("sum", "count", "mean", "min", "max", "n_unique")
+
+
+def combine(
+    *decompositions: tuple[NamedExpr, list[NamedExpr], list[NamedExpr]],
+) -> tuple[list[NamedExpr], list[NamedExpr], list[NamedExpr]]:
+    """
+    Combine multiple groupby-aggregation decompositions.
+
+    Parameters
+    ----------
+    decompositions
+        Packed sequence of `decompose` results.
+
+    Returns
+    -------
+    Unified groupby-aggregation decomposition.
+    """
+    if len(decompositions) == 0:
+        return [], [], []
+    selections, aggregations, reductions = zip(*decompositions, strict=True)
+    assert all(isinstance(ne, NamedExpr) for ne in selections)
+    return (
+        list(selections),
+        list(itertools.chain.from_iterable(aggregations)),
+        list(itertools.chain.from_iterable(reductions)),
+    )
+
+
+def decompose(
+    name: str, expr: Expr, *, names: Generator[str, None, None]
+) -> tuple[NamedExpr, list[NamedExpr], list[NamedExpr]]:
+    """
+    Decompose a groupby-aggregation expression.
+
+    Parameters
+    ----------
+    name
+        Output schema name.
+    expr
+        The aggregation expression for a single column.
+    names
+        Generator of unique names for temporaries.
+
+    Returns
+    -------
+    NamedExpr
+        The expression selecting the *output* column or columns.
+    list[NamedExpr]
+        The initial aggregation expressions.
+    list[NamedExpr]
+        The reduction expressions.
+    """
+    dtype = expr.dtype
+
+    if isinstance(expr, Len):
+        selection = NamedExpr(name, Col(dtype, name))
+        aggregation = [NamedExpr(name, expr)]
+        reduction = [NamedExpr(name, Agg(dtype, "sum", None, Col(dtype, name)))]
+        return selection, aggregation, reduction
+    if isinstance(expr, Agg):
+        if expr.name in ("sum", "count", "min", "max", "n_unique"):
+            if expr.name in ("sum", "count", "n_unique"):
+                aggfunc = "sum"
+            else:
+                aggfunc = expr.name
+            selection = NamedExpr(name, Col(dtype, name))
+            aggregation = [NamedExpr(name, expr)]
+            reduction = [NamedExpr(name, Agg(dtype, aggfunc, None, Col(dtype, name)))]
+            return selection, aggregation, reduction
+        elif expr.name == "mean":
+            (child,) = expr.children
+            (sum, count), aggregations, reductions = combine(
+                decompose(
+                    f"{next(names)}__mean_sum",
+                    Agg(dtype, "sum", None, child),
+                    names=names,
+                ),
+                decompose(f"{next(names)}__mean_count", Len(dtype), names=names),
+            )
+            selection = NamedExpr(
+                name,
+                BinOp(dtype, plc.binaryop.BinaryOperator.DIV, sum.value, count.value),
+            )
+            return selection, aggregations, reductions
+        else:
+            raise NotImplementedError(
+                "group_by does not support multiple partitions "
+                f"for this aggregation type:\n{type(expr)}\n"
+                f"Only {_GB_AGG_SUPPORTED} are supported."
+            )
+    else:  # pragma: no cover
+        # Unsupported expression
+        raise NotImplementedError(
+            f"GroupBy does not support multiple partitions for this expression:\n{expr}"
+        )
+
+
+@lower_ir_node.register(GroupBy)
+def _(
+    ir: GroupBy, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Extract child partitioning
+    child, partition_info = rec(ir.children[0])
+
+    # Handle single-partition case
+    if partition_info[child].count == 1:
+        single_part_node = ir.reconstruct([child])
+        partition_info[single_part_node] = partition_info[child]
+        return single_part_node, partition_info
+
+    # Check group-by keys
+    if not all(
+        expr.is_pointwise for expr in traversal([e.value for e in ir.keys])
+    ):  # pragma: no cover
+        return _lower_ir_fallback(
+            ir,
+            rec,
+            msg="group_by does not support multiple partitions for non-pointwise keys.",
+        )
+
+    # Check if we are dealing with any high-cardinality columns
+    post_aggregation_count = 1  # Default tree reduction
+    groupby_key_columns = [ne.name for ne in ir.keys]
+    shuffled = partition_info[child].partitioned_on == ir.keys
+
+    assert ir.config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_tasks'"
+    )
+
+    cardinality_factor = {
+        c: min(f, 1.0)
+        for c, f in ir.config_options.executor.cardinality_factor.items()
+        if c in groupby_key_columns
+    }
+    if cardinality_factor:
+        # The `cardinality_factor` dictionary can be used
+        # to specify a mapping between column names and
+        # cardinality "factors". Each factor estimates the
+        # fractional number of unique values in the column.
+        # Each value should be in the range (0, 1].
+        child_count = partition_info[child].count
+        post_aggregation_count = max(
+            int(max(cardinality_factor.values()) * child_count),
+            1,
+        )
+
+    new_node: IR
+    name_generator = unique_names(ir.schema.keys())
+    # Decompose the aggregation requests into three distinct phases
+    try:
+        selection_exprs, piecewise_exprs, reduction_exprs = combine(
+            *(
+                decompose(agg.name, agg.value, names=name_generator)
+                for agg in ir.agg_requests
+            )
+        )
+    except NotImplementedError:
+        if shuffled:  # pragma: no cover
+            # Don't fallback if we are already shuffled.
+            # We can just preserve the child's partitioning
+            new_node = ir.reconstruct([child])
+            partition_info[new_node] = partition_info[child]
+            return new_node, partition_info
+        return _lower_ir_fallback(
+            ir, rec, msg="Failed to decompose groupby aggs for multiple partitions."
+        )
+
+    # Partition-wise groupby operation
+    pwise_schema = {k.name: k.value.dtype for k in ir.keys} | {
+        k.name: k.value.dtype for k in piecewise_exprs
+    }
+    gb_pwise = GroupBy(
+        pwise_schema,
+        ir.keys,
+        piecewise_exprs,
+        ir.maintain_order,
+        None,
+        ir.config_options,
+        child,
+    )
+    child_count = partition_info[child].count
+    partition_info[gb_pwise] = PartitionInfo(count=child_count)
+
+    # Reduction
+    gb_inter: GroupBy | Repartition | Shuffle
+    reduction_schema = {k.name: k.value.dtype for k in ir.keys} | {
+        k.name: k.value.dtype for k in reduction_exprs
+    }
+    if not shuffled and post_aggregation_count > 1:
+        # Shuffle reduction
+        if ir.maintain_order:  # pragma: no cover
+            return _lower_ir_fallback(
+                ir,
+                rec,
+                msg="maintain_order not supported for multiple output partitions.",
+            )
+
+        gb_inter = Shuffle(
+            gb_pwise.schema,
+            ir.keys,
+            ir.config_options,
+            gb_pwise,
+        )
+        partition_info[gb_inter] = PartitionInfo(count=post_aggregation_count)
+    else:
+        # N-ary tree reduction
+        assert ir.config_options.executor.name == "streaming", (
+            "'in-memory' executor not supported in 'generate_ir_tasks'"
+        )
+
+        n_ary = ir.config_options.executor.groupby_n_ary
+        count = child_count
+        gb_inter = gb_pwise
+        while count > post_aggregation_count:
+            gb_inter = Repartition(gb_inter.schema, gb_inter)
+            count = max(math.ceil(count / n_ary), post_aggregation_count)
+            partition_info[gb_inter] = PartitionInfo(count=count)
+            if count > post_aggregation_count:
+                gb_inter = GroupBy(
+                    reduction_schema,
+                    ir.keys,
+                    reduction_exprs,
+                    ir.maintain_order,
+                    None,
+                    ir.config_options,
+                    gb_inter,
+                )
+                partition_info[gb_inter] = PartitionInfo(count=count)
+
+    # Final aggregation
+    gb_reduce = GroupBy(
+        reduction_schema,
+        ir.keys,
+        reduction_exprs,
+        ir.maintain_order,
+        ir.zlice,
+        ir.config_options,
+        gb_inter,
+    )
+    partition_info[gb_reduce] = PartitionInfo(count=post_aggregation_count)
+
+    # Final Select phase
+    new_node = Select(
+        ir.schema,
+        [
+            *(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in ir.keys),
+            *selection_exprs,
+        ],
+        False,  # noqa: FBT003
+        gb_reduce,
+    )
+    partition_info[new_node] = PartitionInfo(
+        count=post_aggregation_count,
+        partitioned_on=ir.keys,
+    )
+    return new_node, partition_info
```
cudf_polars/experimental/io.py
CHANGED
```diff
@@ -4,11 +4,12 @@
 
 from __future__ import annotations
 
+import dataclasses
 import enum
 import math
 import random
 from enum import IntEnum
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import pylibcudf as plc
 
@@ -19,19 +20,28 @@ from cudf_polars.experimental.dispatch import lower_ir_node
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
 
+    import numpy as np
+    import numpy.typing as npt
+
+    from cudf_polars.containers import DataFrame
     from cudf_polars.dsl.expr import NamedExpr
     from cudf_polars.experimental.dispatch import LowerIRTransformer
     from cudf_polars.typing import Schema
+    from cudf_polars.utils.config import ConfigOptions
+
+    T = TypeVar("T", bound=npt.NBitBase)
 
 
 @lower_ir_node.register(DataFrameScan)
 def _(
     ir: DataFrameScan, rec: LowerIRTransformer
 ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
-
-    "
+    assert ir.config_options.executor.name == "streaming", (
+        "'in-memory' executor not supported in 'generate_ir_tasks'"
     )
 
+    rows_per_partition = ir.config_options.executor.max_rows_per_partition
+
     nrows = max(ir.df.shape()[0], 1)
     count = math.ceil(nrows / rows_per_partition)
 
@@ -91,10 +101,18 @@ class ScanPartitionPlan:
         """Extract the partitioning plan of a Scan operation."""
         if ir.typ == "parquet":
             # TODO: Use system info to set default blocksize
-
-
-
-
+            assert ir.config_options.executor.name == "streaming", (
+                "'in-memory' executor not supported in 'generate_ir_tasks'"
+            )
+
+            blocksize: int = ir.config_options.executor.target_partition_size
+            # _sample_pq_statistics is generic over the bit-width of the array
+            # We don't care about that here, so we ignore it.
+            stats = _sample_pq_statistics(ir)  # type: ignore[var-annotated]
+            # Some columns (e.g., "include_file_paths") may be present in the schema
+            # but not in the Parquet statistics dict. We use stats.get(column, 0)
+            # to safely fall back to 0 in those cases.
+            file_size = sum(float(stats.get(column, 0)) for column in ir.schema)
             if file_size > 0:
                 if file_size > blocksize:
                     # Split large files
@@ -168,14 +186,15 @@ class SplitScan(IR):
         schema: Schema,
         typ: str,
         reader_options: dict[str, Any],
-        config_options:
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
         n_rows: int,
         row_index: tuple[str, int] | None,
+        include_file_paths: str | None,
         predicate: NamedExpr | None,
-    ):
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if typ not in ("parquet",):  # pragma: no cover
             raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}")
@@ -233,30 +252,37 @@
             skip_rows,
             n_rows,
             row_index,
+            include_file_paths,
             predicate,
         )
 
 
-def _sample_pq_statistics(ir: Scan) -> dict[str,
+def _sample_pq_statistics(ir: Scan) -> dict[str, np.floating[T]]:
+    import itertools
+
     import numpy as np
-    import pyarrow.dataset as pa_ds
 
     # Use average total_uncompressed_size of three files
-
-
+    n_sample = min(3, len(ir.paths))
+    metadata = plc.io.parquet_metadata.read_parquet_metadata(
+        plc.io.SourceInfo(random.sample(ir.paths, n_sample))
+    )
     column_sizes = {}
-
-
-
-        for rg in range(md.num_row_groups):
-            row_group = md.row_group(rg)
-            for col in range(row_group.num_columns):
-                column = row_group.column(col)
-                name = column.path_in_schema
-                if name not in column_sizes:
-                    column_sizes[name] = np.zeros(n_sample, dtype="int64")
-                column_sizes[name][i] += column.total_uncompressed_size
+    rowgroup_offsets_per_file = np.insert(
+        np.cumsum(metadata.num_rowgroups_per_file()), 0, 0
+    )
 
+    # For each column, calculate the `total_uncompressed_size` for each file
+    for name, uncompressed_sizes in metadata.columnchunk_metadata().items():
+        column_sizes[name] = np.array(
+            [
+                np.sum(uncompressed_sizes[start:end])
+                for (start, end) in itertools.pairwise(rowgroup_offsets_per_file)
+            ],
+            dtype="int64",
+        )
+
+    # Return the mean per-file `total_uncompressed_size` for each column
     return {name: np.mean(sizes) for name, sizes in column_sizes.items()}
 
 
@@ -270,11 +296,12 @@ def _(
     paths = list(ir.paths)
     if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
         # Disable chunked reader when splitting files
-        config_options =
-
-
-
-
+        config_options = dataclasses.replace(
+            ir.config_options,
+            parquet_options=dataclasses.replace(
+                ir.config_options.parquet_options, chunked=False
+            ),
+        )
 
         slices: list[SplitScan] = []
         for path in paths:
@@ -289,6 +316,7 @@ def _(
                 ir.skip_rows,
                 ir.n_rows,
                 ir.row_index,
+                ir.include_file_paths,
                 ir.predicate,
             )
             slices.extend(
@@ -312,6 +340,7 @@ def _(
             ir.skip_rows,
             ir.n_rows,
             ir.row_index,
+            ir.include_file_paths,
             ir.predicate,
         )
         for i in range(0, len(paths), plan.factor)
```