PyPI - cudf-polars-cu12 - Versions diffs - 24.8.0a281__py3-none-any.whl - Mend

cudf-polars-cu12 24.8.0a281__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

cudf_polars/VERSION +1 -0
cudf_polars/__init__.py +22 -0
cudf_polars/_version.py +21 -0
cudf_polars/callback.py +66 -0
cudf_polars/containers/__init__.py +11 -0
cudf_polars/containers/column.py +189 -0
cudf_polars/containers/dataframe.py +226 -0
cudf_polars/dsl/__init__.py +8 -0
cudf_polars/dsl/expr.py +1422 -0
cudf_polars/dsl/ir.py +1053 -0
cudf_polars/dsl/translate.py +535 -0
cudf_polars/py.typed +0 -0
cudf_polars/testing/__init__.py +8 -0
cudf_polars/testing/asserts.py +118 -0
cudf_polars/typing/__init__.py +106 -0
cudf_polars/utils/__init__.py +8 -0
cudf_polars/utils/dtypes.py +159 -0
cudf_polars/utils/sorting.py +53 -0
cudf_polars_cu12-24.8.0a281.dist-info/LICENSE +201 -0
cudf_polars_cu12-24.8.0a281.dist-info/METADATA +126 -0
cudf_polars_cu12-24.8.0a281.dist-info/RECORD +23 -0
cudf_polars_cu12-24.8.0a281.dist-info/WHEEL +5 -0
cudf_polars_cu12-24.8.0a281.dist-info/top_level.txt +1 -0

cudf_polars/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 24.08.00a281

cudf_polars/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""
+An executor for polars logical plans.
+This package implements an executor for polars logical plans using
+pylibcudf to execute the plans on device.
+"""
+from __future__ import annotations
+from cudf_polars._version import __git_commit__, __version__
+from cudf_polars.callback import execute_with_cudf
+from cudf_polars.dsl.translate import translate_ir
+# Names re-exported as the public API of the top-level package.
+__all__: list[str] = [
+    "execute_with_cudf",
+    "translate_ir",
+    "__git_commit__",
+    "__version__",
+]

cudf_polars/_version.py ADDED Viewed

@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+import importlib.resources
+# The version string is the stripped text of the VERSION resource of this package.
+__version__ = (
+    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
+)
+# The commit string is read the same way from a GIT_COMMIT resource.
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    # GIT_COMMIT could not be found: fall back to an empty commit string.
+    __git_commit__ = ""
+__all__ = ["__git_commit__", "__version__"]

cudf_polars/callback.py ADDED Viewed

@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Callback for the polars collect function to execute on device."""
+from __future__ import annotations
+from functools import partial
+from typing import TYPE_CHECKING
+import nvtx
+from cudf_polars.dsl.translate import translate_ir
+if TYPE_CHECKING:
+    import polars as pl
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.typing import NodeTraverser
+# Only ``execute_with_cudf`` is listed as public; ``_callback`` is private.
+__all__: list[str] = ["execute_with_cudf"]
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+) -> pl.DataFrame:
+    assert with_columns is None
+    assert pyarrow_predicate is None
+    assert n_rows is None
+    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
+        return ir.evaluate(cache={}).to_polars()
+def execute_with_cudf(
+    nt: NodeTraverser,
+    *,
+    raise_on_fail: bool = False,
+    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
+) -> None:
+    """
+    A post optimization callback that attempts to execute the plan with cudf.
+    Parameters
+    ----------
+    nt
+        NodeTraverser
+    raise_on_fail
+        Should conversion raise an exception rather than continuing
+        without setting a callback.
+    exception
+        Optional exception, or tuple of exceptions, to catch during
+        translation. Defaults to ``Exception``.
+    The NodeTraverser is mutated if the libcudf executor can handle the plan.
+    """
+    try:
+        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+            nt.set_udf(partial(_callback, translate_ir(nt)))
+    except exception:
+        if raise_on_fail:
+            raise

cudf_polars/containers/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Containers of concrete data."""
+from __future__ import annotations
+# Public names of the containers package; the classes are imported below.
+__all__: list[str] = ["DataFrame", "Column", "NamedColumn"]
+from cudf_polars.containers.column import Column, NamedColumn
+from cudf_polars.containers.dataframe import DataFrame

cudf_polars/containers/column.py ADDED Viewed

@@ -0,0 +1,189 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""A column, with some properties."""
+from __future__ import annotations
+import functools
+from typing import TYPE_CHECKING
+import cudf._lib.pylibcudf as plc
+if TYPE_CHECKING:
+    from typing_extensions import Self
+# Public names defined by this module.
+__all__: list[str] = ["Column", "NamedColumn"]
+class Column:
+    """An immutable column with sortedness metadata."""
+    obj: plc.Column
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    is_scalar: bool
+    def __init__(
+        self,
+        column: plc.Column,
+        *,
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ):
+        self.obj = column
+        self.is_scalar = self.obj.size() == 1
+        if self.obj.size() <= 1:
+            is_sorted = plc.types.Sorted.YES
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
+    @functools.cached_property
+    def obj_scalar(self) -> plc.Scalar:
+        """
+        A copy of the column object as a pylibcudf Scalar.
+        Returns
+        -------
+        pylibcudf Scalar object.
+        Raises
+        ------
+        ValueError
+            If the column is not length-1.
+        """
+        if not self.is_scalar:
+            raise ValueError(
+                f"Cannot convert a column of length {self.obj.size()} to scalar"
+            )
+        return plc.copying.get_element(self.obj, 0)
+    def sorted_like(self, like: Column, /) -> Self:
+        """
+        Copy sortedness properties from a column onto self.
+        Parameters
+        ----------
+        like
+            The column to copy sortedness metadata from.
+        Returns
+        -------
+        Self with metadata set.
+        See Also
+        --------
+        set_sorted
+        """
+        return self.set_sorted(
+            is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
+        )
+    def set_sorted(
+        self,
+        *,
+        is_sorted: plc.types.Sorted,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> Self:
+        """
+        Modify sortedness metadata in place.
+        Parameters
+        ----------
+        is_sorted
+            Is the column sorted
+        order
+            The order if sorted
+        null_order
+            Where nulls sort, if sorted
+        Returns
+        -------
+        Self with metadata set.
+        """
+        if self.obj.size() <= 1:
+            is_sorted = plc.types.Sorted.YES
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
+        return self
+    def copy(self) -> Self:
+        """
+        A shallow copy of the column.
+        Returns
+        -------
+        New column sharing data with self.
+        """
+        return type(self)(
+            self.obj,
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )
+    def mask_nans(self) -> Self:
+        """Return a copy of self with nans masked out."""
+        if self.nan_count > 0:
+            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
+        return self.copy()
+    @functools.cached_property
+    def nan_count(self) -> int:
+        """Return the number of NaN values in the column."""
+        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+            return 0
+        return plc.interop.to_arrow(
+            plc.reduce.reduce(
+                plc.unary.is_nan(self.obj),
+                plc.aggregation.sum(),
+                # TODO: pylibcudf needs to have a SizeType DataType singleton
+                plc.DataType(plc.TypeId.INT32),
+            )
+        ).as_py()
+class NamedColumn(Column):
+    """A column with a name."""
+    name: str
+    def __init__(
+        self,
+        column: plc.Column,
+        name: str,
+        *,
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ) -> None:
+        super().__init__(
+            column, is_sorted=is_sorted, order=order, null_order=null_order
+        )
+        self.name = name
+    def copy(self, *, new_name: str | None = None) -> Self:
+        """
+        A shallow copy of the column.
+        Parameters
+        ----------
+        new_name
+            Optional new name for the copied column.
+        Returns
+        -------
+        New column sharing data with self.
+        """
+        return type(self)(
+            self.obj,
+            self.name if new_name is None else new_name,
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )

cudf_polars/containers/dataframe.py ADDED Viewed

@@ -0,0 +1,226 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""A dataframe, with some properties."""
+from __future__ import annotations
+import itertools
+from functools import cached_property
+from typing import TYPE_CHECKING, cast
+import polars as pl
+import cudf._lib.pylibcudf as plc
+from cudf_polars.containers.column import NamedColumn
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence, Set
+    import pyarrow as pa
+    from typing_extensions import Self
+    import cudf
+    from cudf_polars.containers import Column
+# Public names defined by this module.
+__all__: list[str] = ["DataFrame"]
+class DataFrame:
+    """A representation of a dataframe."""
+    columns: list[NamedColumn]
+    table: plc.Table
+    def __init__(self, columns: Sequence[NamedColumn]) -> None:
+        self.columns = list(columns)
+        self._column_map = {c.name: c for c in self.columns}
+        self.table = plc.Table([c.obj for c in columns])
+    def copy(self) -> Self:
+        """Return a shallow copy of self."""
+        return type(self)([c.copy() for c in self.columns])
+    def to_polars(self) -> pl.DataFrame:
+        """Convert to a polars DataFrame."""
+        table: pa.Table = plc.interop.to_arrow(
+            self.table,
+            [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
+        )
+        return cast(pl.DataFrame, pl.from_arrow(table))
+    @cached_property
+    def column_names_set(self) -> frozenset[str]:
+        """Return the column names as a set."""
+        return frozenset(c.name for c in self.columns)
+    @cached_property
+    def column_names(self) -> list[str]:
+        """Return a list of the column names."""
+        return [c.name for c in self.columns]
+    @cached_property
+    def num_columns(self) -> int:
+        """Number of columns."""
+        return len(self.columns)
+    @cached_property
+    def num_rows(self) -> int:
+        """Number of rows."""
+        return 0 if len(self.columns) == 0 else self.table.num_rows()
+    @classmethod
+    def from_cudf(cls, df: cudf.DataFrame) -> Self:
+        """Create from a cudf dataframe."""
+        return cls(
+            [
+                NamedColumn(c.to_pylibcudf(mode="read"), name)
+                for name, c in df._data.items()
+            ]
+        )
+    @classmethod
+    def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
+        """
+        Create from a pylibcudf table.
+        Parameters
+        ----------
+        table
+            Pylibcudf table to obtain columns from
+        names
+            Names for the columns
+        Returns
+        -------
+        New dataframe sharing data with the input table.
+        Raises
+        ------
+        ValueError
+            If the number of provided names does not match the
+            number of columns in the table.
+        """
+        if table.num_columns() != len(names):
+            raise ValueError("Mismatching name and table length.")
+        return cls(
+            # TODO: strict=True when we drop py39
+            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+        )
+    def sorted_like(
+        self, like: DataFrame, /, *, subset: Set[str] | None = None
+    ) -> Self:
+        """
+        Copy sortedness from a dataframe onto self.
+        Parameters
+        ----------
+        like
+            The dataframe to copy from
+        subset
+            Optional subset of columns from which to copy data.
+        Returns
+        -------
+        Self with metadata set.
+        Raises
+        ------
+        ValueError
+            If there is a name mismatch between self and like.
+        """
+        if like.column_names != self.column_names:
+            raise ValueError("Can only copy from identically named frame")
+        subset = self.column_names_set if subset is None else subset
+        self.columns = [
+            c.sorted_like(other) if c.name in subset else c
+            # TODO: strict=True when we drop py39
+            for c, other in zip(self.columns, like.columns)
+        ]
+        return self
+    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
+        """
+        Return a new dataframe with extra columns.
+        Parameters
+        ----------
+        columns
+            Columns to add
+        Returns
+        -------
+        New dataframe
+        Notes
+        -----
+        If column names overlap, newer names replace older ones.
+        """
+        columns = list(
+            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
+        )
+        return type(self)(columns)
+    def discard_columns(self, names: Set[str]) -> Self:
+        """Drop columns by name."""
+        return type(self)([c for c in self.columns if c.name not in names])
+    def select(self, names: Sequence[str]) -> Self:
+        """Select columns by name returning DataFrame."""
+        want = set(names)
+        if not want.issubset(self.column_names_set):
+            raise ValueError("Can't select missing names")
+        return type(self)([self._column_map[name] for name in names])
+    def replace_columns(self, *columns: NamedColumn) -> Self:
+        """Return a new dataframe with columns replaced by name."""
+        new = {c.name: c for c in columns}
+        if not set(new).issubset(self.column_names_set):
+            raise ValueError("Cannot replace with non-existing names")
+        return type(self)([new.get(c.name, c) for c in self.columns])
+    def rename_columns(self, mapping: Mapping[str, str]) -> Self:
+        """Rename some columns."""
+        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])
+    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
+        """Select columns by name."""
+        return [c for c in self.columns if c.name in names]
+    def filter(self, mask: Column) -> Self:
+        """Return a filtered table given a mask."""
+        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
+        return type(self).from_table(table, self.column_names).sorted_like(self)
+    def slice(self, zlice: tuple[int, int] | None) -> Self:
+        """
+        Slice a dataframe.
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+        Returns
+        -------
+        New dataframe (if zlice is not None) otherwise self (if it is)
+        """
+        if zlice is None:
+            return self
+        start, length = zlice
+        if start < 0:
+            start += self.num_rows
+        # Polars implementation wraps negative start by num_rows, then
+        # adds length to start to get the end, then clamps both to
+        # [0, num_rows)
+        end = start + length
+        start = max(min(start, self.num_rows), 0)
+        end = max(min(end, self.num_rows), 0)
+        (table,) = plc.copying.slice(self.table, [start, end])
+        return type(self).from_table(table, self.column_names).sorted_like(self)

cudf_polars/dsl/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""The domain-specific language (DSL) for the polars executor."""
+from __future__ import annotations
+# ``__all__`` is empty: a star-import of this package brings in no names.
+__all__: list[str] = []