cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""Utilities for rolling window aggregations."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
|
|
12
|
+
import pylibcudf as plc
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from cudf_polars.typing import ClosedInterval, Duration
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"duration_to_int",
|
|
20
|
+
"duration_to_scalar",
|
|
21
|
+
"offsets_to_windows",
|
|
22
|
+
"range_window_bounds",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def duration_to_int(
    dtype: plc.DataType,
    months: int,
    weeks: int,
    days: int,
    nanoseconds: int,
    parsed_int: bool,  # noqa: FBT001
    negative: bool,  # noqa: FBT001
) -> int:
    """
    Convert a polars duration value to an integer.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    months
        Number of months
    weeks
        Number of weeks
    days
        Number of days
    nanoseconds
        Number of nanoseconds
    parsed_int
        Is this actually a representation of an integer, not a duration?
    negative
        Is this a negative duration?

    Returns
    -------
    int
        The total number of nanoseconds represented by this duration,
        or just an integer if `parsed_int` was true.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    if months != 0:
        raise NotImplementedError("Month durations in rolling windows")
    is_int64 = dtype.id() == plc.TypeId.INT64
    if parsed_int:
        # A parsed integer must carry no calendar components and must
        # target an INT64 column.
        if weeks != 0 or days != 0 or not is_int64:
            raise NotImplementedError(
                "Invalid duration for parsed_int"
            )  # pragma: no cover; polars raises first
    elif is_int64:
        # INT64 orderby columns only make sense with integer offsets.
        raise pl.exceptions.InvalidOperationError("Duration must be a parsed integer")
    # Collapse weeks/days into nanoseconds (86_400 s per day, 1e9 ns per s).
    total = (7 * weeks + days) * 86_400 * 1_000_000_000 + nanoseconds
    return -total if negative else total
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def duration_to_scalar(dtype: plc.DataType, value: int) -> plc.Scalar:
    """
    Convert a raw polars duration value to a pylibcudf scalar.

    Parameters
    ----------
    dtype
        The type of the column being added to.
    value
        The raw value as an integer. If `dtype` represents a timestamp
        type, this should be in nanoseconds.

    Returns
    -------
    pylibcudf.Scalar
        With datatype matching the provided dtype.

    Raises
    ------
    NotImplementedError
        For unsupported durations or datatypes.
    """
    tid = dtype.id()
    if tid == plc.TypeId.INT64:
        return plc.Scalar.from_py(value, dtype)
    # Timestamp resolution -> (divisor from nanoseconds, matching duration type).
    conversions = {
        plc.TypeId.TIMESTAMP_NANOSECONDS: (1, plc.TypeId.DURATION_NANOSECONDS),
        plc.TypeId.TIMESTAMP_MICROSECONDS: (1_000, plc.TypeId.DURATION_MICROSECONDS),
        plc.TypeId.TIMESTAMP_MILLISECONDS: (1_000_000, plc.TypeId.DURATION_MILLISECONDS),
    }
    if tid not in conversions:
        raise NotImplementedError("Unsupported data type in rolling window offset")
    divisor, duration_tid = conversions[tid]
    # NOTE(review): floor division rounds negative offsets toward -inf
    # (e.g. -1500ns // 1000 == -2us) — confirm this matches polars' rounding.
    return plc.Scalar.from_py(value // divisor, plc.DataType(duration_tid))
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def offsets_to_windows(
    dtype: plc.DataType,
    offset: Duration,
    period: Duration,
) -> tuple[plc.Scalar, plc.Scalar]:
    """
    Convert polars offset/period pair to preceding/following windows.

    Parameters
    ----------
    dtype
        Datatype of column defining windows
    offset
        Offset duration
    period
        Period of window

    Returns
    -------
    tuple of preceding and following windows as pyarrow scalars.
    """
    start = duration_to_int(dtype, *offset)
    length = duration_to_int(dtype, *period)
    # Polars windows span current_row + offset, ..., current_row + offset + period;
    # libcudf expects current_row - preceding, ..., current_row + following.
    preceding = duration_to_scalar(dtype, -start)
    following = duration_to_scalar(dtype, start + length)
    return preceding, following
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def range_window_bounds(
    preceding: plc.Scalar, following: plc.Scalar, closed_window: ClosedInterval
) -> tuple[plc.rolling.RangeWindowType, plc.rolling.RangeWindowType]:
    """
    Convert preceding and following scalars to range window specs.

    Parameters
    ----------
    preceding
        The preceding window scalar.
    following
        The following window scalar.
    closed_window
        How the window interval endpoints are treated.

    Returns
    -------
    tuple
        Of preceding and following range window types.
    """

    def as_bound(scalar: plc.Scalar, closed: bool) -> plc.rolling.RangeWindowType:  # noqa: FBT001
        # Closed endpoints include the boundary row; open endpoints exclude it.
        if closed:
            return plc.rolling.BoundedClosed(scalar)
        return plc.rolling.BoundedOpen(scalar)

    # "both" closes both ends, "left"/"right" close one end, anything
    # else (i.e. "none") leaves both ends open.
    left_closed = closed_window in ("both", "left")
    right_closed = closed_window in ("both", "right")
    return as_bound(preceding, left_closed), as_bound(following, right_closed)
|
cudf_polars/experimental/base.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
"""Multi-partition base classes."""
|
|
4
4
|
|
|
@@ -6,26 +6,29 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from cudf_polars.dsl.ir import Union
|
|
10
|
-
|
|
11
9
|
if TYPE_CHECKING:
|
|
12
|
-
from collections.abc import Iterator
|
|
10
|
+
from collections.abc import Iterator
|
|
13
11
|
|
|
14
|
-
from cudf_polars.
|
|
12
|
+
from cudf_polars.dsl.expr import NamedExpr
|
|
15
13
|
from cudf_polars.dsl.nodebase import Node
|
|
16
14
|
|
|
17
15
|
|
|
18
16
|
class PartitionInfo:
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
17
|
+
"""Partitioning information."""
|
|
18
|
+
|
|
19
|
+
__slots__ = ("count", "partitioned_on")
|
|
20
|
+
count: int
|
|
21
|
+
"""Partition count."""
|
|
22
|
+
partitioned_on: tuple[NamedExpr, ...]
|
|
23
|
+
"""Columns the data is hash-partitioned on."""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
count: int,
|
|
28
|
+
partitioned_on: tuple[NamedExpr, ...] = (),
|
|
29
|
+
):
|
|
28
30
|
self.count = count
|
|
31
|
+
self.partitioned_on = partitioned_on
|
|
29
32
|
|
|
30
33
|
def keys(self, node: Node) -> Iterator[tuple[str, int]]:
|
|
31
34
|
"""Return the partitioned keys for a given node."""
|
|
@@ -36,8 +39,3 @@ class PartitionInfo:
|
|
|
36
39
|
def get_key_name(node: Node) -> str:
|
|
37
40
|
"""Generate the key name for a Node."""
|
|
38
41
|
return f"{type(node).__name__.lower()}-{hash(node)}"
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
|
|
42
|
-
# Concatenate a sequence of DataFrames vertically
|
|
43
|
-
return Union.do_evaluate(None, *dfs)
|