cudf_polars_cu13-25.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/GIT_COMMIT +1 -0
- cudf_polars/VERSION +1 -0
- cudf_polars/__init__.py +28 -0
- cudf_polars/_version.py +21 -0
- cudf_polars/callback.py +318 -0
- cudf_polars/containers/__init__.py +13 -0
- cudf_polars/containers/column.py +495 -0
- cudf_polars/containers/dataframe.py +361 -0
- cudf_polars/containers/datatype.py +137 -0
- cudf_polars/dsl/__init__.py +8 -0
- cudf_polars/dsl/expr.py +66 -0
- cudf_polars/dsl/expressions/__init__.py +8 -0
- cudf_polars/dsl/expressions/aggregation.py +226 -0
- cudf_polars/dsl/expressions/base.py +272 -0
- cudf_polars/dsl/expressions/binaryop.py +120 -0
- cudf_polars/dsl/expressions/boolean.py +326 -0
- cudf_polars/dsl/expressions/datetime.py +271 -0
- cudf_polars/dsl/expressions/literal.py +97 -0
- cudf_polars/dsl/expressions/rolling.py +643 -0
- cudf_polars/dsl/expressions/selection.py +74 -0
- cudf_polars/dsl/expressions/slicing.py +46 -0
- cudf_polars/dsl/expressions/sorting.py +85 -0
- cudf_polars/dsl/expressions/string.py +1002 -0
- cudf_polars/dsl/expressions/struct.py +137 -0
- cudf_polars/dsl/expressions/ternary.py +49 -0
- cudf_polars/dsl/expressions/unary.py +517 -0
- cudf_polars/dsl/ir.py +2607 -0
- cudf_polars/dsl/nodebase.py +164 -0
- cudf_polars/dsl/to_ast.py +359 -0
- cudf_polars/dsl/tracing.py +16 -0
- cudf_polars/dsl/translate.py +939 -0
- cudf_polars/dsl/traversal.py +224 -0
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +481 -0
- cudf_polars/dsl/utils/groupby.py +98 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +61 -0
- cudf_polars/dsl/utils/reshape.py +74 -0
- cudf_polars/dsl/utils/rolling.py +121 -0
- cudf_polars/dsl/utils/windows.py +192 -0
- cudf_polars/experimental/__init__.py +8 -0
- cudf_polars/experimental/base.py +386 -0
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds.py +220 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
- cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
- cudf_polars/experimental/benchmarks/pdsh.py +814 -0
- cudf_polars/experimental/benchmarks/utils.py +832 -0
- cudf_polars/experimental/dask_registers.py +200 -0
- cudf_polars/experimental/dispatch.py +156 -0
- cudf_polars/experimental/distinct.py +197 -0
- cudf_polars/experimental/explain.py +157 -0
- cudf_polars/experimental/expressions.py +590 -0
- cudf_polars/experimental/groupby.py +327 -0
- cudf_polars/experimental/io.py +943 -0
- cudf_polars/experimental/join.py +391 -0
- cudf_polars/experimental/parallel.py +423 -0
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +188 -0
- cudf_polars/experimental/shuffle.py +354 -0
- cudf_polars/experimental/sort.py +609 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/statistics.py +795 -0
- cudf_polars/experimental/utils.py +169 -0
- cudf_polars/py.typed +0 -0
- cudf_polars/testing/__init__.py +8 -0
- cudf_polars/testing/asserts.py +448 -0
- cudf_polars/testing/io.py +122 -0
- cudf_polars/testing/plugin.py +236 -0
- cudf_polars/typing/__init__.py +219 -0
- cudf_polars/utils/__init__.py +8 -0
- cudf_polars/utils/config.py +741 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +118 -0
- cudf_polars/utils/sorting.py +53 -0
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +27 -0
- cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
- cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
- cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
- cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
- cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/dsl/utils/groupby.py

```diff
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for grouped aggregations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+from cudf_polars.dsl import ir
+from cudf_polars.dsl.expressions.base import ExecutionContext
+from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation
+from cudf_polars.dsl.utils.naming import unique_names
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from typing import Any
+
+    from cudf_polars.dsl import expr
+    from cudf_polars.typing import Schema
+
+__all__ = ["rewrite_groupby"]
+
+
+def rewrite_groupby(
+    node: Any,
+    schema: Schema,
+    keys: Sequence[expr.NamedExpr],
+    aggs: Sequence[expr.NamedExpr],
+    inp: ir.IR,
+) -> ir.IR:
+    """
+    Rewrite a groupby plan node into something we can handle.
+
+    Parameters
+    ----------
+    node
+        The polars groupby plan node.
+    schema
+        Schema of the groupby plan node.
+    keys
+        Grouping keys.
+    aggs
+        Originally requested aggregations.
+    inp
+        Input plan node to the groupby.
+
+    Returns
+    -------
+    New plan node representing the grouped aggregations.
+
+    Raises
+    ------
+    NotImplementedError
+        If any of the requested aggregations are unsupported.
+
+    Notes
+    -----
+    Since libcudf can only perform grouped aggregations on columns
+    (not arbitrary expressions), the approach is to split each
+    aggregation into a pre-selection phase (evaluating expressions
+    that live within an aggregation), the aggregation phase (now
+    acting on columns only), and a post-selection phase (evaluating
+    expressions of aggregated results).
+
+    This scheme does not permit nested aggregations, so those are
+    unsupported.
+    """
+    if len(aggs) == 0:
+        return ir.Distinct(
+            schema,
+            plc.stream_compaction.DuplicateKeepOption.KEEP_ANY,
+            None,
+            node.options.slice,
+            node.maintain_order,
+            ir.Select(schema, keys, True, inp),  # noqa: FBT003
+        )
+
+    aggs, group_schema, apply_post_evaluation = apply_pre_evaluation(
+        schema,
+        keys,
+        aggs,
+        unique_names(schema.keys()),
+        ExecutionContext.GROUPBY,
+    )
+    # TODO: use Distinct when the partitioned executor supports it if
+    # the requested aggregations are empty
+    inp = ir.GroupBy(
+        group_schema,
+        keys,
+        aggs,
+        node.maintain_order,
+        node.options.slice,
+        inp,
+    )
+    return apply_post_evaluation(inp)
```
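The Notes in `rewrite_groupby` describe a three-phase decomposition. Below is a conceptual sketch of that split, written against the public polars API rather than the internal IR; the temporary name `_tmp` is hypothetical, standing in for a name produced by `unique_names`.

```python
import polars as pl

df = pl.DataFrame({"g": [1, 1, 2], "a": [1, 2, 3], "b": [4, 5, 6]})

# The single expression the rewrite must decompose: (a * b).sum() + 1
direct = df.group_by("g").agg(
    ((pl.col("a") * pl.col("b")).sum() + 1).alias("out")
)

# Equivalent three phases: pre-evaluation of the inner expression,
# a column-only aggregation, then post-evaluation on the result.
phased = (
    df.with_columns((pl.col("a") * pl.col("b")).alias("_tmp"))  # pre-selection
    .group_by("g")
    .agg(pl.col("_tmp").sum())                                  # columns only
    .with_columns((pl.col("_tmp") + 1).alias("out"))            # post-selection
    .select("g", "out")
)
assert direct.sort("g").equals(phased.sort("g"))
```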
cudf_polars/dsl/utils/naming.py

```diff
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Name generation utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+
+
+__all__ = ["unique_names"]
+
+
+def unique_names(names: Iterable[str]) -> Generator[str, None, None]:
+    """
+    Generate unique names relative to some known names.
+
+    Parameters
+    ----------
+    names
+        Names we should be unique with respect to.
+
+    Yields
+    ------
+    Unique names (just using sequence numbers)
+    """
+    prefix = "_" * max(map(len, names))
+    i = 0
+    while True:
+        yield f"{prefix}{i}"
+        i += 1
```
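Uniqueness here follows from length alone: the underscore prefix is as long as the longest existing name, so every generated name is strictly longer than any name it must avoid. A quick usage sketch, assuming the installed package:

```python
from cudf_polars.dsl.utils.naming import unique_names

gen = unique_names(["a", "total"])  # longest existing name: 5 characters
print(next(gen))  # _____0  (six characters, cannot collide)
print(next(gen))  # _____1
```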
cudf_polars/dsl/utils/replace.py

```diff
@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for replacing nodes in a DAG."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Generic
+
+from cudf_polars.dsl.traversal import CachingVisitor, reuse_if_unchanged
+from cudf_polars.typing import NodeT, TypedDict
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
+    from cudf_polars.typing import GenericTransformer
+
+__all__ = ["replace"]
+
+
+class State(Generic[NodeT], TypedDict):
+    """
+    State used when replacing nodes in expressions.
+
+    Parameters
+    ----------
+    replacements
+        Mapping from nodes to be replaced to their replacements.
+        This state is generic over the type of these nodes.
+    """
+
+    replacements: Mapping[NodeT, NodeT]
+
+
+def _replace(node: NodeT, fn: GenericTransformer[NodeT, NodeT, State]) -> NodeT:
+    try:
+        return fn.state["replacements"][node]
+    except KeyError:
+        return reuse_if_unchanged(node, fn)
+
+
+def replace(nodes: Sequence[NodeT], replacements: Mapping[NodeT, NodeT]) -> list[NodeT]:
+    """
+    Replace nodes in expressions.
+
+    Parameters
+    ----------
+    nodes
+        Sequence of nodes to perform replacements in.
+    replacements
+        Mapping from nodes to be replaced to their replacements.
+
+    Returns
+    -------
+    list
+        Of nodes with replacements performed.
+    """
+    mapper: GenericTransformer[NodeT, NodeT, State] = CachingVisitor(
+        _replace, state={"replacements": replacements}
+    )
+    return [mapper(node) for node in nodes]
```
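`_replace` implements a substitute-or-recurse pattern: exact matches are swapped for their replacements, everything else is rebuilt only if a descendant changed. A toy illustration on a minimal immutable tree (not the actual cudf_polars node classes):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Node:
    name: str
    children: tuple["Node", ...] = ()


def substitute(node: Node, replacements: dict[Node, Node]) -> Node:
    if node in replacements:
        return replacements[node]
    children = tuple(substitute(c, replacements) for c in node.children)
    # Mirror reuse_if_unchanged: keep the original object when nothing changed.
    return node if children == node.children else Node(node.name, children)


a, b = Node("a"), Node("b")
tree = Node("add", (a, Node("mul", (a, b))))
print(substitute(tree, {a: Node("c")}))
# Both occurrences of `a` are rewritten; the CachingVisitor in the shipped
# code additionally memoizes, so shared subtrees are visited only once.
```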
cudf_polars/dsl/utils/reshape.py

```diff
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Utilities for reshaping Columns."""
+
+from __future__ import annotations
+
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+
+
+def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]:
+    """
+    Broadcast a sequence of columns to a common length.
+
+    Parameters
+    ----------
+    columns
+        Columns to broadcast.
+    target_length
+        Optional length to broadcast to. If not provided, uses the
+        non-unit length of existing columns.
+
+    Returns
+    -------
+    List of broadcasted columns all of the same length.
+
+    Raises
+    ------
+    RuntimeError
+        If broadcasting is not possible.
+
+    Notes
+    -----
+    In evaluation of a set of expressions, polars type-puns length-1
+    columns with scalars. When we insert these into a DataFrame
+    object, we need to ensure they are of equal length. This function
+    takes some columns, some of which may be length-1 and ensures that
+    all length-1 columns are broadcast to the length of the others.
+
+    Broadcasting is only possible if the set of lengths of the input
+    columns is a subset of ``{1, n}`` for some (fixed) ``n``. If
+    ``target_length`` is provided and not all columns are length-1
+    (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
+    """
+    if len(columns) == 0:
+        return []
+    lengths: set[int] = {column.size for column in columns}
+    if lengths == {1}:
+        if target_length is None:
+            return list(columns)
+        nrows = target_length
+    else:
+        try:
+            (nrows,) = lengths.difference([1])
+        except ValueError as e:
+            raise RuntimeError("Mismatching column lengths") from e
+        if target_length is not None and nrows != target_length:
+            raise RuntimeError(
+                f"Cannot broadcast columns of length {nrows=} to {target_length=}"
+            )
+    return [
+        column
+        if column.size != 1
+        else Column(
+            plc.Column.from_scalar(column.obj_scalar, nrows),
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
+            name=column.name,
+            dtype=column.dtype,
+        )
+        for column in columns
+    ]
```
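The length rule in `broadcast` can be exercised without a GPU. The following pure-Python sketch mirrors only the length-resolution logic (an illustration, not the shipped code):

```python
def resolve_length(lengths: set[int], target_length: int | None = None) -> int:
    """Mirror broadcast's rule: lengths must be a subset of {1, n}."""
    if lengths == {1}:
        return 1 if target_length is None else target_length
    (n,) = lengths.difference([1])  # ValueError if two distinct non-unit lengths
    if target_length is not None and n != target_length:
        raise RuntimeError(f"Cannot broadcast length {n} to {target_length}")
    return n


print(resolve_length({1, 4}))   # 4: length-1 columns are repeated to 4 rows
print(resolve_length({1}, 10))  # 10: all-scalar input stretches to the target
# resolve_length({2, 3}) raises: no common n exists
```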
cudf_polars/dsl/utils/rolling.py

```diff
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for rolling window aggregations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.expressions.base import ExecutionContext
+from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation
+from cudf_polars.dsl.utils.naming import unique_names
+from cudf_polars.dsl.utils.windows import offsets_to_windows
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from typing import Any
+
+    from cudf_polars.typing import Schema
+    from cudf_polars.utils import config
+
+__all__ = ["rewrite_rolling"]
+
+
+def rewrite_rolling(
+    options: Any,
+    schema: Schema,
+    keys: Sequence[expr.NamedExpr],
+    aggs: Sequence[expr.NamedExpr],
+    config_options: config.ConfigOptions,
+    inp: ir.IR,
+) -> ir.IR:
+    """
+    Rewrite a rolling plan node into something we can handle.
+
+    Parameters
+    ----------
+    options
+        Rolling-specific group options.
+    schema
+        Schema of the rolling plan node.
+    keys
+        Grouping keys for the rolling node (may be empty).
+    aggs
+        Originally requested rolling aggregations.
+    config_options
+        Configuration options (currently unused).
+    inp
+        Input plan node to the rolling aggregation.
+
+    Returns
+    -------
+    New plan node representing the rolling aggregations
+
+    Raises
+    ------
+    NotImplementedError
+        If any of the requested aggregations are unsupported.
+
+    Notes
+    -----
+    Since libcudf can only perform rolling aggregations on columns
+    (not arbitrary expressions), the approach is to split each
+    aggregation into a pre-selection phase (evaluating expressions
+    that live within an aggregation), the aggregation phase (now
+    acting on columns only), and a post-selection phase (evaluating
+    expressions of aggregated results).
+    This scheme does not permit nested aggregations, so those are
+    unsupported.
+    """
+    index_name = options.rolling.index_column
+    index_dtype = schema[index_name]
+    index_col = expr.Col(index_dtype, index_name)
+    if plc.traits.is_integral(index_dtype.plc) and index_dtype.id() != plc.TypeId.INT64:
+        plc_index_dtype = plc.DataType(plc.TypeId.INT64)
+    else:
+        plc_index_dtype = index_dtype.plc
+    index = expr.NamedExpr(index_name, index_col)
+    temp_prefix = "_" * max(map(len, schema))
+    if len(aggs) > 0:
+        aggs, rolling_schema, apply_post_evaluation = apply_pre_evaluation(
+            schema,
+            keys,
+            aggs,
+            unique_names(temp_prefix),
+            ExecutionContext.ROLLING,
+            index,
+        )
+    else:
+        rolling_schema = schema
+        apply_post_evaluation = lambda inp: inp  # noqa: E731
+    preceding, following = offsets_to_windows(
+        plc_index_dtype, options.rolling.offset, options.rolling.period
+    )
+    if (n := len(keys)) > 0:
+        # Grouped rolling in polars sorts the output by the groups.
+        inp = ir.Sort(
+            inp.schema,
+            keys,
+            [plc.types.Order.ASCENDING] * n,
+            [plc.types.NullOrder.BEFORE] * n,
+            True,  # noqa: FBT003
+            None,
+            inp,
+        )
+    return apply_post_evaluation(
+        ir.Rolling(
+            rolling_schema,
+            index,
+            preceding,
+            following,
+            options.rolling.closed_window,
+            keys,
+            aggs,
+            options.slice,
+            inp,
+        )
+    )
```
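One detail worth calling out in `rewrite_rolling`: an integral index column narrower than INT64 is widened before the window bounds are built, presumably so the offsets (computed as 64-bit integers by `duration_to_int` below) share the index type. A sketch of that normalization in isolation, operating directly on the pylibcudf dtype:

```python
import pylibcudf as plc


def normalized_index_dtype(dtype: plc.DataType) -> plc.DataType:
    # Same test as in rewrite_rolling: non-INT64 integral types widen to INT64.
    if plc.traits.is_integral(dtype) and dtype.id() != plc.TypeId.INT64:
        return plc.DataType(plc.TypeId.INT64)
    return dtype


assert normalized_index_dtype(plc.DataType(plc.TypeId.INT32)).id() == plc.TypeId.INT64
```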
cudf_polars/dsl/utils/windows.py

```diff
@@ -0,0 +1,192 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for rolling window aggregations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+import pylibcudf as plc
+
+if TYPE_CHECKING:
+    from cudf_polars.typing import ClosedInterval, Duration
+
+
+__all__ = [
+    "duration_to_int",
+    "duration_to_scalar",
+    "offsets_to_windows",
+    "range_window_bounds",
+]
+
+
+def duration_to_int(
+    dtype: plc.DataType,
+    months: int,
+    weeks: int,
+    days: int,
+    nanoseconds: int,
+    parsed_int: bool,  # noqa: FBT001
+    negative: bool,  # noqa: FBT001
+) -> int:
+    """
+    Convert a polars duration value to an integer.
+
+    Parameters
+    ----------
+    dtype
+        The type of the column being added to.
+    months
+        Number of months
+    weeks
+        Number of weeks
+    days
+        Number of days
+    nanoseconds
+        Number of nanoseconds
+    parsed_int
+        Is this actually a representation of an integer, not a duration?
+    negative
+        Is this a negative duration?
+
+    Returns
+    -------
+    int
+        The total number of nanoseconds represented by this duration,
+        or just an integer if `parsed_int` was true.
+
+    Raises
+    ------
+    NotImplementedError
+        For unsupported durations or datatypes.
+    """
+    if months != 0:
+        raise NotImplementedError("Month durations in rolling windows")
+    if parsed_int and (weeks != 0 or days != 0 or dtype.id() != plc.TypeId.INT64):
+        raise NotImplementedError(
+            "Invalid duration for parsed_int"
+        )  # pragma: no cover; polars raises first
+    elif not parsed_int and dtype.id() == plc.TypeId.INT64:
+        raise pl.exceptions.InvalidOperationError("Duration must be a parsed integer")
+    value = nanoseconds + 24 * 60 * 60 * 10**9 * (days + 7 * weeks)
+    return -value if negative else value
+
+
+def duration_to_scalar(dtype: plc.DataType, value: int) -> plc.Scalar:
+    """
+    Convert a raw polars duration value to a pylibcudf scalar.
+
+    Parameters
+    ----------
+    dtype
+        The type of the column being added to.
+    value
+        The raw value as an integer. If `dtype` represents a timestamp
+        type, this should be in nanoseconds.
+
+    Returns
+    -------
+    pylibcudf.Scalar
+        With datatype matching the provided dtype.
+
+    Raises
+    ------
+    NotImplementedError
+        For unsupported durations or datatypes.
+    """
+    tid = dtype.id()
+    if tid == plc.TypeId.INT64:
+        return plc.Scalar.from_py(value, dtype)
+    elif tid == plc.TypeId.TIMESTAMP_NANOSECONDS:
+        return plc.Scalar.from_py(value, plc.DataType(plc.TypeId.DURATION_NANOSECONDS))
+    elif tid == plc.TypeId.TIMESTAMP_MICROSECONDS:
+        return plc.Scalar.from_py(
+            value // 1000, plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+        )
+    elif tid == plc.TypeId.TIMESTAMP_MILLISECONDS:
+        return plc.Scalar.from_py(
+            value // 1_000_000, plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
+        )
+    elif tid == plc.TypeId.TIMESTAMP_DAYS:
+        return plc.Scalar.from_py(
+            value // 86_400_000_000_000, plc.DataType(plc.TypeId.DURATION_DAYS)
+        )
+    else:
+        raise NotImplementedError(
+            "Unsupported data type in rolling window offset"
+        )  # pragma: no cover; polars raises first
+
+
+def offsets_to_windows(
+    dtype: plc.DataType,
+    offset: Duration,
+    period: Duration,
+) -> tuple[plc.Scalar, plc.Scalar]:
+    """
+    Convert polars offset/period pair to preceding/following windows.
+
+    Parameters
+    ----------
+    dtype
+        Datatype of column defining windows
+    offset
+        Offset duration
+    period
+        Period of window
+
+    Returns
+    -------
+    tuple of preceding and following windows as pylibcudf scalars.
+    """
+    offset_i = duration_to_int(dtype, *offset)
+    period_i = duration_to_int(dtype, *period)
+    # Polars uses current_row + offset, ..., current_row + offset + period
+    # Libcudf uses current_row - preceding, ..., current_row + following
+    return duration_to_scalar(dtype, -offset_i), duration_to_scalar(
+        dtype, offset_i + period_i
+    )
+
+
+def range_window_bounds(
+    preceding: plc.Scalar, following: plc.Scalar, closed_window: ClosedInterval
+) -> tuple[plc.rolling.RangeWindowType, plc.rolling.RangeWindowType]:
+    """
+    Convert preceding and following scalars to range window specs.
+
+    Parameters
+    ----------
+    preceding
+        The preceding window scalar.
+    following
+        The following window scalar.
+    closed_window
+        How the window interval endpoints are treated.
+
+    Returns
+    -------
+    tuple
+        Of preceding and following range window types.
+    """
+    if closed_window == "both":
+        return (
+            plc.rolling.BoundedClosed(preceding),
+            plc.rolling.BoundedClosed(following),
+        )
+    elif closed_window == "left":
+        return (
+            plc.rolling.BoundedClosed(preceding),
+            plc.rolling.BoundedOpen(following),
+        )
+    elif closed_window == "right":
+        return (
+            plc.rolling.BoundedOpen(preceding),
+            plc.rolling.BoundedClosed(following),
+        )
+    else:
+        return (
+            plc.rolling.BoundedOpen(preceding),
+            plc.rolling.BoundedOpen(following),
+        )
```
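The comment in `offsets_to_windows` encodes the whole mapping: polars describes a window as [t + offset, t + offset + period], while libcudf wants [t - preceding, t + following]. A worked integer example in nanoseconds (closedness is handled separately by `range_window_bounds`):

```python
DAY = 24 * 60 * 60 * 10**9  # nanoseconds per day, as in duration_to_int

# A polars window of period "3d" with the documented default offset of
# -period, i.e. the window ending at the current row: [t - 3d, t].
offset = -3 * DAY
period = 3 * DAY

preceding = -offset           # 3 days back from the current row
following = offset + period   # 0: the window ends at the current row
print(preceding // DAY, following // DAY)  # 3 0
```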
cudf_polars/experimental/__init__.py

```diff
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Experimental features, which can change without any deprecation period."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
```