PyPI - cudf-polars-cu12 - Versions diffs - 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl - Mend

cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63) hide show

cudf_polars/VERSION +1 -1
cudf_polars/callback.py +82 -65
cudf_polars/containers/column.py +138 -7
cudf_polars/containers/dataframe.py +26 -39
cudf_polars/dsl/expr.py +3 -1
cudf_polars/dsl/expressions/aggregation.py +27 -63
cudf_polars/dsl/expressions/base.py +40 -72
cudf_polars/dsl/expressions/binaryop.py +5 -41
cudf_polars/dsl/expressions/boolean.py +25 -53
cudf_polars/dsl/expressions/datetime.py +97 -17
cudf_polars/dsl/expressions/literal.py +27 -33
cudf_polars/dsl/expressions/rolling.py +110 -9
cudf_polars/dsl/expressions/selection.py +8 -26
cudf_polars/dsl/expressions/slicing.py +47 -0
cudf_polars/dsl/expressions/sorting.py +5 -18
cudf_polars/dsl/expressions/string.py +33 -36
cudf_polars/dsl/expressions/ternary.py +3 -10
cudf_polars/dsl/expressions/unary.py +35 -75
cudf_polars/dsl/ir.py +749 -212
cudf_polars/dsl/nodebase.py +8 -1
cudf_polars/dsl/to_ast.py +5 -3
cudf_polars/dsl/translate.py +319 -171
cudf_polars/dsl/utils/__init__.py +8 -0
cudf_polars/dsl/utils/aggregations.py +292 -0
cudf_polars/dsl/utils/groupby.py +97 -0
cudf_polars/dsl/utils/naming.py +34 -0
cudf_polars/dsl/utils/replace.py +46 -0
cudf_polars/dsl/utils/rolling.py +113 -0
cudf_polars/dsl/utils/windows.py +186 -0
cudf_polars/experimental/base.py +17 -19
cudf_polars/experimental/benchmarks/__init__.py +4 -0
cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
cudf_polars/experimental/dask_registers.py +196 -0
cudf_polars/experimental/distinct.py +174 -0
cudf_polars/experimental/explain.py +127 -0
cudf_polars/experimental/expressions.py +521 -0
cudf_polars/experimental/groupby.py +288 -0
cudf_polars/experimental/io.py +58 -29
cudf_polars/experimental/join.py +353 -0
cudf_polars/experimental/parallel.py +166 -93
cudf_polars/experimental/repartition.py +69 -0
cudf_polars/experimental/scheduler.py +155 -0
cudf_polars/experimental/select.py +92 -7
cudf_polars/experimental/shuffle.py +294 -0
cudf_polars/experimental/sort.py +45 -0
cudf_polars/experimental/spilling.py +151 -0
cudf_polars/experimental/utils.py +100 -0
cudf_polars/testing/asserts.py +146 -6
cudf_polars/testing/io.py +72 -0
cudf_polars/testing/plugin.py +78 -76
cudf_polars/typing/__init__.py +59 -6
cudf_polars/utils/config.py +353 -0
cudf_polars/utils/conversion.py +40 -0
cudf_polars/utils/dtypes.py +22 -5
cudf_polars/utils/timer.py +39 -0
cudf_polars/utils/versions.py +5 -4
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
cudf_polars/experimental/dask_serialize.py +0 -59
cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0

cudf_polars/dsl/utils/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""DSL utilities."""
+from __future__ import annotations
+__all__: list[str] = []

cudf_polars/dsl/utils/aggregations.py ADDED Viewed

@@ -0,0 +1,292 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for rewriting aggregations."""
+from __future__ import annotations
+import itertools
+from functools import partial
+from typing import TYPE_CHECKING, Any
+import pylibcudf as plc
+from cudf_polars.dsl import expr, ir
+if TYPE_CHECKING:
+    from collections.abc import Callable, Generator, Iterable, Sequence
+    from cudf_polars.typing import Schema
+__all__ = ["apply_pre_evaluation", "decompose_aggs", "decompose_single_agg"]
+def replace_nulls(col: expr.Expr, value: Any, *, is_top: bool) -> expr.Expr:
+    """
+    Wrap an expression in a ``fill_null`` call when it is top-level.
+    Parameters
+    ----------
+    col
+        Expression whose nulls should be filled.
+    value
+        Scalar fill value; it is wrapped in a literal of ``col.dtype``.
+    is_top
+        Whether the expression is top-level. When false the input is
+        returned untouched.
+    Returns
+    -------
+    Either ``col`` itself, or a ``fill_null`` unary function around it.
+    """
+    if is_top:
+        fill_value = expr.Literal(col.dtype, value)
+        return expr.UnaryFunction(col.dtype, "fill_null", (), col, fill_value)
+    return col
+def decompose_single_agg(
+    named_expr: expr.NamedExpr,
+    name_generator: Generator[str, None, None],
+    *,
+    is_top: bool,
+) -> tuple[list[tuple[expr.NamedExpr, bool]], expr.NamedExpr]:
+    """
+    Decompose a single named aggregation.
+    Parameters
+    ----------
+    named_expr
+        The named aggregation to decompose
+    name_generator
+        Generator of unique names for temporaries introduced during decomposition.
+    is_top
+        Is this the top of an aggregation expression?
+    Returns
+    -------
+    aggregations
+        Pairs of expressions to apply as grouped aggregations (whose children
+        may be evaluated pointwise) and a flag per expression. The flag is
+        ``True`` when the entry is itself an aggregation (``Agg`` or ``Len``)
+        and ``False`` when it is a plain column or pointwise expression to
+        be evaluated before aggregating; callers use it to detect nested
+        aggregations.
+    post_aggregate
+        Single expression to apply to post-process the grouped
+        aggregations.
+    Raises
+    ------
+    NotImplementedError
+        If the expression contains nested aggregations or unsupported
+        operations in a grouped aggregation context.
+    """
+    agg = named_expr.value
+    name = named_expr.name
+    if isinstance(agg, expr.Col):
+        # A bare column is requested as-is (flag False: not an aggregation)
+        # and read back afterwards by the output name.
+        # TODO: collect_list produces null for empty group in libcudf, empty list in polars.
+        # But we need the nested value type, so need to track proper dtypes in our DSL.
+        return [(named_expr, False)], named_expr.reconstruct(expr.Col(agg.dtype, name))
+    if is_top and isinstance(agg, expr.Cast) and isinstance(agg.children[0], expr.Len):
+        # Special case to fill nulls with zeros for empty group length calculations
+        (child,) = agg.children
+        # The Len child is decomposed under a fresh temporary name; the cast
+        # is then rebuilt around its post-expression and wrapped in fill_null(0).
+        child_agg, post = decompose_single_agg(
+            expr.NamedExpr(next(name_generator), child), name_generator, is_top=True
+        )
+        return child_agg, named_expr.reconstruct(
+            replace_nulls(
+                agg.reconstruct([post.value]),
+                0,
+                is_top=True,
+            )
+        )
+    if isinstance(agg, expr.Len):
+        # Len is requested directly and flagged True (it is an aggregation).
+        return [(named_expr, True)], named_expr.reconstruct(expr.Col(agg.dtype, name))
+    if isinstance(agg, (expr.Literal, expr.LiteralColumn)):
+        # Literals need no aggregation request; the expression itself is
+        # the post-processing step.
+        return [], named_expr
+    if isinstance(agg, expr.Agg):
+        if agg.name == "quantile":
+            # The second child is the requested quantile (which is asserted
+            # to be a literal on construction), so only the first child
+            # is the aggregated input.
+            child = agg.children[0]
+        else:
+            (child,) = agg.children
+        # min/max over floating point input is performed on a NaN-masked
+        # child (see the "mask_nans" rewrite below).
+        needs_masking = agg.name in {"min", "max"} and plc.traits.is_floating_point(
+            child.dtype
+        )
+        if needs_masking and agg.options:
+            # pl.col("a").nan_max or nan_min
+            raise NotImplementedError("Nan propagation in groupby for min/max")
+        # Recurse only to inspect the child: the post-expression is discarded
+        # and the flags reveal whether the child contains an aggregation.
+        aggs, _ = decompose_single_agg(
+            expr.NamedExpr(next(name_generator), child), name_generator, is_top=False
+        )
+        if any(has_agg for _, has_agg in aggs):
+            raise NotImplementedError("Nested aggs in groupby not supported")
+        if needs_masking:
+            child = expr.UnaryFunction(child.dtype, "mask_nans", (), child)
+            # The aggregation is just reconstructed with the new
+            # (potentially masked) child. This is safe because we recursed
+            # to ensure there are no nested aggregations.
+            return (
+                [(named_expr.reconstruct(agg.reconstruct([child])), True)],
+                named_expr.reconstruct(expr.Col(agg.dtype, name)),
+            )
+        elif agg.name == "sum":
+            # For integral sums whose dtype is not INT64, the aggregated
+            # column is read back as INT64 and cast to the requested dtype.
+            # NOTE(review): presumably because the grouped sum of integers
+            # is produced as INT64 by libcudf -- confirm.
+            col = (
+                expr.Cast(agg.dtype, expr.Col(plc.DataType(plc.TypeId.INT64), name))
+                if (
+                    plc.traits.is_integral(agg.dtype)
+                    and agg.dtype.id() != plc.TypeId.INT64
+                )
+                else expr.Col(agg.dtype, name)
+            )
+            return [(named_expr, True)], expr.NamedExpr(
+                name,
+                # In polars sum(empty_group) => 0, but in libcudf
+                # sum(empty_group) => null So must post-process by
+                # replacing nulls, but only if we're a "top-level"
+                # agg.
+                replace_nulls(col, 0, is_top=is_top),
+            )
+        else:
+            # Every other aggregation is requested unchanged and read back
+            # by name.
+            return [(named_expr, True)], named_expr.reconstruct(
+                expr.Col(agg.dtype, name)
+            )
+    if isinstance(agg, expr.Ternary):
+        raise NotImplementedError("Ternary inside groupby")
+    if agg.is_pointwise:
+        # Decompose each child under a fresh temporary name.
+        aggs, posts = _decompose_aggs(
+            (expr.NamedExpr(next(name_generator), child) for child in agg.children),
+            name_generator,
+            is_top=False,
+        )
+        if any(has_agg for _, has_agg in aggs):
+            # Once one child aggregates, every request must be either an
+            # aggregation or a literal; mixing in non-aggregated columns
+            # is rejected. The generator's ``agg`` is its own loop variable
+            # (a NamedExpr) and does not rebind the outer ``agg``.
+            if not all(
+                has_agg or isinstance(agg.value, expr.Literal) for agg, has_agg in aggs
+            ):
+                raise NotImplementedError(
+                    "Broadcasting aggregated expressions in groupby/rolling"
+                )
+            # A pointwise expression over aggregated children is handled by
+            # post-evaluation: it is rebuilt on top of the children's
+            # post-expressions.
+            return (
+                aggs,
+                named_expr.reconstruct(agg.reconstruct([p.value for p in posts])),
+            )
+        else:
+            # Otherwise (no child aggregates) it is handled by pre-evaluation:
+            # the whole expression is requested (flag False) and read back
+            # by name.
+            return (
+                [(named_expr, False)],
+                named_expr.reconstruct(expr.Col(agg.dtype, name)),
+            )
+    raise NotImplementedError(f"No support for {type(agg)} in groupby/rolling")
+def _decompose_aggs(
+    aggs: Iterable[expr.NamedExpr],
+    name_generator: Generator[str, None, None],
+    *,
+    is_top: bool,
+) -> tuple[list[tuple[expr.NamedExpr, bool]], Sequence[expr.NamedExpr]]:
+    # Decompose every aggregation, then transpose the (requests, post)
+    # pairs into a tuple of request lists and a tuple of post-expressions.
+    # Unpacking the transposed result raises ValueError for empty input.
+    decomposed = [
+        decompose_single_agg(named, name_generator, is_top=is_top) for named in aggs
+    ]
+    request_groups, post = zip(*decomposed, strict=True)
+    flattened = [request for group in request_groups for request in group]
+    return flattened, post
+def decompose_aggs(
+    aggs: Iterable[expr.NamedExpr], name_generator: Generator[str, None, None]
+) -> tuple[list[expr.NamedExpr], Sequence[expr.NamedExpr]]:
+    """
+    Split arbitrary aggregations into grouped aggregations plus post-processing.
+    Parameters
+    ----------
+    aggs
+        Aggregation expressions to decompose.
+    name_generator
+        Generator of unique names for temporaries introduced during decomposition.
+    Returns
+    -------
+    aggregations
+        Aggregations to apply in the groupby node.
+    post_aggregations
+        Expressions to apply after aggregating (as a ``Select``).
+    Notes
+    -----
+    Each returned aggregation is either an expression that can be
+    evaluated pointwise before the groupby operation, or an
+    aggregation of such an expression.
+    Raises
+    ------
+    NotImplementedError
+        For unsupported aggregation combinations.
+    """
+    flagged, post = _decompose_aggs(aggs, name_generator, is_top=True)
+    # The per-request "is an aggregation" flags are only needed while
+    # decomposing; strip them for the caller.
+    requests = [named for named, _ in flagged]
+    return requests, post
+def apply_pre_evaluation(
+    output_schema: Schema,
+    keys: Sequence[expr.NamedExpr],
+    original_aggs: Sequence[expr.NamedExpr],
+    name_generator: Generator[str, None, None],
+    *extra_columns: expr.NamedExpr,
+) -> tuple[Sequence[expr.NamedExpr], Schema, Callable[[ir.IR], ir.IR]]:
+    """
+    Rewrite aggregations for a grouped or rolling node via pre-evaluation.
+    Parameters
+    ----------
+    output_schema
+        Schema of the plan node being rewritten.
+    keys
+        Grouping keys (may be empty).
+    original_aggs
+        Aggregation expressions to rewrite.
+    name_generator
+        Generator of unique names for temporaries introduced during decomposition.
+    extra_columns
+        Additional columns to include in the output (only relevant
+        for rolling aggregations). Columns appear in the order
+        `keys, extra_columns, original_aggs`.
+    Returns
+    -------
+    aggregations
+        The required aggregations, de-duplicated in first-seen order.
+    schema
+        The new schema of the aggregation node
+    post_process
+        Function to apply to the aggregation node to perform any
+        post-processing; the identity when none is needed.
+    Raises
+    ------
+    NotImplementedError
+        If the aggregations are somehow unsupported.
+    """
+    requests, post = decompose_aggs(original_aggs, name_generator)
+    assert len(post) == len(original_aggs), (
+        f"Unexpected number of post-aggs {len(post)=} {len(original_aggs)=}"
+    )
+    # dict keys keep insertion order, so this drops duplicates stably.
+    requests = list(dict.fromkeys(requests))
+    if all(isinstance(e.value, expr.Col) for e in post):
+        # Every post-expression is a plain column reference, so no
+        # follow-up Select is built and the node is passed through.
+        return requests, output_schema, lambda inp: inp
+    key_columns = [
+        key.reconstruct(expr.Col(key.value.dtype, key.name)) for key in keys
+    ]
+    selection = [*key_columns, *extra_columns, *post]
+    # Schema of the intermediate aggregation node that feeds the Select.
+    inter_schema = {}
+    for e in itertools.chain(keys, extra_columns, requests):
+        inter_schema[e.name] = e.value.dtype
+    post_process = partial(ir.Select, output_schema, selection, True)  # noqa: FBT003
+    return requests, inter_schema, post_process

cudf_polars/dsl/utils/groupby.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for grouped aggregations."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import pylibcudf as plc
+from cudf_polars.dsl import ir
+from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation
+from cudf_polars.dsl.utils.naming import unique_names
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from typing import Any
+    from cudf_polars.dsl import expr
+    from cudf_polars.utils import config
+__all__ = ["rewrite_groupby"]
+def rewrite_groupby(
+    node: Any,
+    schema: dict[str, plc.DataType],
+    keys: Sequence[expr.NamedExpr],
+    aggs: Sequence[expr.NamedExpr],
+    config_options: config.ConfigOptions,
+    inp: ir.IR,
+) -> ir.IR:
+    """
+    Turn a polars groupby plan node into plan nodes we can execute.
+    Parameters
+    ----------
+    node
+        The polars groupby plan node.
+    schema
+        Schema of the groupby plan node.
+    keys
+        Grouping keys.
+    aggs
+        Originally requested aggregations.
+    config_options
+        Configuration options.
+    inp
+        Input plan node to the groupby.
+    Returns
+    -------
+    New plan node representing the grouped aggregations.
+    Raises
+    ------
+    NotImplementedError
+        If any of the requested aggregations are unsupported.
+    Notes
+    -----
+    Since libcudf can only perform grouped aggregations on columns
+    (not arbitrary expressions), the approach is to split each
+    aggregation into a pre-selection phase (evaluating expressions
+    that live within an aggregation), the aggregation phase (now
+    acting on columns only), and a post-selection phase (evaluating
+    expressions of aggregated results).
+    This scheme does not permit nested aggregations, so those are
+    unsupported.
+    """
+    if not aggs:
+        # No aggregations requested: select the keys and keep one row
+        # per distinct key combination.
+        keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
+        zlice = node.options.slice
+        stable = node.maintain_order
+        keys_only = ir.Select(schema, keys, True, inp)  # noqa: FBT003
+        return ir.Distinct(schema, keep, None, zlice, stable, keys_only)
+    name_generator = unique_names(schema.keys())
+    requests, group_schema, apply_post_evaluation = apply_pre_evaluation(
+        schema, keys, aggs, name_generator
+    )
+    # TODO: use Distinct when the partitioned executor supports it if
+    # the requested aggregations are empty
+    # NOTE(review): the empty-aggregation case above already returns a
+    # Distinct node; confirm whether this TODO is stale.
+    grouped = ir.GroupBy(
+        group_schema,
+        keys,
+        requests,
+        node.maintain_order,
+        node.options.slice,
+        config_options,
+        inp,
+    )
+    return apply_post_evaluation(grouped)

cudf_polars/dsl/utils/naming.py ADDED Viewed

@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Name generation utilities."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+__all__ = ["unique_names"]
+def unique_names(names: Iterable[str]) -> Generator[str, None, None]:
+    """
+    Generate unique names relative to some known names.
+    Parameters
+    ----------
+    names
+        Names we should be unique with respect to. May be empty, in
+        which case the generated names carry no prefix.
+    Yields
+    ------
+    Unique names: a run of underscores as long as the longest known
+    name, followed by a sequence number. Every generated name is
+    therefore longer than every known name and cannot equal one.
+    """
+    # ``default=0`` keeps an empty ``names`` from raising ValueError on
+    # the first ``next()`` call; with nothing to avoid, no prefix is needed.
+    prefix = "_" * max(map(len, names), default=0)
+    i = 0
+    while True:
+        yield f"{prefix}{i}"
+        i += 1

cudf_polars/dsl/utils/replace.py ADDED Viewed

@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for replacing nodes in a DAG."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+    from cudf_polars.typing import GenericTransformer, NodeT
+__all__ = ["replace"]
+def _replace(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT:
+    # Visitor callback: a node present in the replacement table is
+    # swapped for its replacement; any other node is handed to
+    # reuse_if_unchanged together with the transformer.
+    try:
+        substitute = fn.state["replacements"][node]
+    except KeyError:
+        return reuse_if_unchanged(node, fn)
+    return substitute
+def replace(nodes: Sequence[NodeT], replacements: Mapping[NodeT, NodeT]) -> list[NodeT]:
+    """
+    Substitute nodes inside a sequence of expressions.
+    Parameters
+    ----------
+    nodes
+        Sequence of nodes in which to perform the replacements.
+    replacements
+        Mapping from each node to be replaced to its replacement.
+    Returns
+    -------
+    list
+        The input nodes, in order, with replacements performed.
+    """
+    # A single visitor (and hence a single state) is shared by all nodes.
+    state = {"replacements": replacements}
+    visitor: GenericTransformer[NodeT, NodeT] = CachingVisitor(_replace, state=state)
+    return list(map(visitor, nodes))

cudf_polars/dsl/utils/rolling.py ADDED Viewed

@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for rolling window aggregations."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import pylibcudf as plc
+from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation
+from cudf_polars.dsl.utils.naming import unique_names
+from cudf_polars.dsl.utils.windows import offsets_to_windows
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from typing import Any
+    from cudf_polars.typing import Schema
+    from cudf_polars.utils import config
+__all__ = ["rewrite_rolling"]
+def rewrite_rolling(
+    options: Any,
+    schema: Schema,
+    keys: Sequence[expr.NamedExpr],
+    aggs: Sequence[expr.NamedExpr],
+    config_options: config.ConfigOptions,
+    inp: ir.IR,
+) -> ir.IR:
+    """
+    Rewrite a rolling plan node into something we can handle.
+    Parameters
+    ----------
+    options
+        Rolling-specific group options.
+    schema
+        Schema of the rolling plan node.
+    keys
+        Grouping keys for the rolling node (may be empty).
+    aggs
+        Originally requested rolling aggregations.
+    config_options
+        Configuration options (currently unused).
+    inp
+        Input plan node to the rolling aggregation.
+    Returns
+    -------
+    New plan node representing the rolling aggregations
+    Raises
+    ------
+    NotImplementedError
+        If any of the requested aggregations are unsupported.
+    Notes
+    -----
+    Since libcudf can only perform rolling aggregations on columns
+    (not arbitrary expressions), the approach is to split each
+    aggregation into a pre-selection phase (evaluating expressions
+    that live within an aggregation), the aggregation phase (now
+    acting on columns only), and a post-selection phase (evaluating
+    expressions of aggregated results).
+    This scheme does not permit nested aggregations, so those are
+    unsupported.
+    """
+    index_name = options.rolling.index_column
+    index_dtype = schema[index_name]
+    index_col = expr.Col(index_dtype, index_name)
+    if plc.traits.is_integral(index_dtype) and index_dtype.id() != plc.TypeId.INT64:
+        # Only the dtype handed to offsets_to_windows is widened to INT64;
+        # the index column expression above keeps its original dtype.
+        index_dtype = plc.DataType(plc.TypeId.INT64)
+    index = expr.NamedExpr(index_name, index_col)
+    if len(aggs) > 0:
+        # unique_names expects the names to avoid, not a prefix: handing it
+        # a prefix string would iterate single characters and always yield
+        # a one-underscore prefix, which can collide with schema columns.
+        aggs, rolling_schema, apply_post_evaluation = apply_pre_evaluation(
+            schema, keys, aggs, unique_names(schema.keys()), index
+        )
+    else:
+        rolling_schema = schema
+        apply_post_evaluation = lambda inp: inp  # noqa: E731
+    preceding, following = offsets_to_windows(
+        index_dtype, options.rolling.offset, options.rolling.period
+    )
+    if (n := len(keys)) > 0:
+        # Grouped rolling in polars sorts the output by the groups.
+        inp = ir.Sort(
+            inp.schema,
+            keys,
+            [plc.types.Order.ASCENDING] * n,
+            [plc.types.NullOrder.BEFORE] * n,
+            True,  # noqa: FBT003
+            None,
+            inp,
+        )
+    return apply_post_evaluation(
+        ir.Rolling(
+            rolling_schema,
+            index,
+            preceding,
+            following,
+            options.rolling.closed_window,
+            keys,
+            aggs,
+            options.slice,
+            inp,
+        )
+    )

cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl