cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/containers/dataframe.py CHANGED
@@ -20,8 +20,9 @@ if TYPE_CHECKING:
 
     from typing_extensions import Any, CapsuleType, Self
 
-    from cudf_polars.typing import ColumnOptions, DataFrameHeader, PolarsDataType, Slice
+    from rmm.pylibrmm.stream import Stream
 
+    from cudf_polars.typing import ColumnOptions, DataFrameHeader, PolarsDataType, Slice
 
 __all__: list[str] = ["DataFrame"]
 
@@ -55,15 +56,21 @@ def _create_polars_column_metadata(
 # This is also defined in pylibcudf.interop
 class _ObjectWithArrowMetadata:
     def __init__(
-        self, obj: plc.Table | plc.Column, metadata: list[plc.interop.ColumnMetadata]
+        self,
+        obj: plc.Table | plc.Column,
+        metadata: list[plc.interop.ColumnMetadata],
+        stream: Stream,
     ) -> None:
         self.obj = obj
         self.metadata = metadata
+        self.stream = stream
 
     def __arrow_c_array__(
         self, requested_schema: None = None
     ) -> tuple[CapsuleType, CapsuleType]:
-        return self.obj._to_schema(self.metadata), self.obj._to_host_array()
+        return self.obj._to_schema(self.metadata), self.obj._to_host_array(
+            stream=self.stream
+        )
 
 
 # Pacify the type checker. DataFrame init asserts that all the columns
@@ -78,8 +85,9 @@ class DataFrame:
     column_map: dict[str, Column]
     table: plc.Table
     columns: list[NamedColumn]
+    stream: Stream
 
-    def __init__(self, columns: Iterable[Column]) -> None:
+    def __init__(self, columns: Iterable[Column], stream: Stream) -> None:
         columns = list(columns)
         if any(c.name is None for c in columns):
             raise ValueError("All columns must have a name")
@@ -87,10 +95,11 @@ class DataFrame:
         self.dtypes = [c.dtype for c in self.columns]
         self.column_map = {c.name: c for c in self.columns}
         self.table = plc.Table([c.obj for c in self.columns])
+        self.stream = stream
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)(c.copy() for c in self.columns)
+        return type(self)((c.copy() for c in self.columns), stream=self.stream)
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
@@ -102,10 +111,12 @@ class DataFrame:
         # serialise with names we control and rename with that map.
         name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
         metadata = [
-            _create_polars_column_metadata(name, dtype.polars)
+            _create_polars_column_metadata(name, dtype.polars_type)
             for name, dtype in zip(name_map, self.dtypes, strict=True)
         ]
-        table_with_metadata = _ObjectWithArrowMetadata(self.table, metadata)
+        table_with_metadata = _ObjectWithArrowMetadata(
+            self.table, metadata, self.stream
+        )
         df = pl.DataFrame(table_with_metadata)
         return df.rename(name_map).with_columns(
             pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
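
The `_ObjectWithArrowMetadata` wrapper works because `pl.DataFrame(...)` accepts any object implementing the Arrow PyCapsule protocol, i.e. an `__arrow_c_array__` method returning a schema capsule and an array capsule, as the hunk above relies on. A minimal sketch of the same mechanism with pyarrow as the producer; the `Wrapper` class here is illustrative only and not part of cudf-polars:

    import polars as pl
    import pyarrow as pa


    class Wrapper:
        """Expose an Arrow table via the PyCapsule protocol."""

        def __init__(self, table: pa.Table) -> None:
            # RecordBatch already implements __arrow_c_array__ (pyarrow >= 14).
            self.batch = table.combine_chunks().to_batches()[0]

        def __arrow_c_array__(self, requested_schema=None):
            # Return (schema capsule, array capsule), as polars expects.
            return self.batch.__arrow_c_array__(requested_schema)


    # polars consumes the capsules directly, without a pandas/python detour.
    df = pl.DataFrame(Wrapper(pa.table({"a": [1, 2, 3]})))
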
@@ -135,7 +146,7 @@ class DataFrame:
         return self.table.num_rows() if self.column_map else 0
 
     @classmethod
-    def from_polars(cls, df: pl.DataFrame) -> Self:
+    def from_polars(cls, df: pl.DataFrame, stream: Stream) -> Self:
         """
         Create from a polars dataframe.
 
@@ -143,22 +154,34 @@ class DataFrame:
         ----------
         df
             Polars dataframe to convert
+        stream
+            CUDA stream used for device memory operations and kernel launches
+            on this dataframe.
 
         Returns
         -------
         New dataframe representing the input.
         """
-        plc_table = plc.Table.from_arrow(df)
+        plc_table = plc.Table.from_arrow(df, stream=stream)
         return cls(
-            Column(d_col, name=name, dtype=DataType(h_col.dtype)).copy_metadata(h_col)
-            for d_col, h_col, name in zip(
-                plc_table.columns(), df.iter_columns(), df.columns, strict=True
-            )
+            (
+                Column(d_col, name=name, dtype=DataType(h_col.dtype)).copy_metadata(
+                    h_col
+                )
+                for d_col, h_col, name in zip(
+                    plc_table.columns(), df.iter_columns(), df.columns, strict=True
+                )
+            ),
+            stream=stream,
         )
 
     @classmethod
     def from_table(
-        cls, table: plc.Table, names: Sequence[str], dtypes: Sequence[DataType]
+        cls,
+        table: plc.Table,
+        names: Sequence[str],
+        dtypes: Sequence[DataType],
+        stream: Stream,
     ) -> Self:
         """
         Create from a pylibcudf table.
@@ -171,6 +194,10 @@ class DataFrame:
             Names for the columns
         dtypes
             Dtypes for the columns
+        stream
+            CUDA stream used for device memory operations and kernel launches
+            on this dataframe. The caller is responsible for ensuring that
+            the data in ``table`` is valid on ``stream``.
 
         Returns
         -------
@@ -185,13 +212,19 @@ class DataFrame:
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
         return cls(
-            Column(c, name=name, dtype=dtype)
-            for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
+            (
+                Column(c, name=name, dtype=dtype)
+                for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
+            ),
+            stream=stream,
         )
 
     @classmethod
     def deserialize(
-        cls, header: DataFrameHeader, frames: tuple[memoryview, plc.gpumemoryview]
+        cls,
+        header: DataFrameHeader,
+        frames: tuple[memoryview[bytes], plc.gpumemoryview],
+        stream: Stream,
     ) -> Self:
         """
         Create a DataFrame from a serialized representation returned by `.serialize()`.
@@ -202,6 +235,10 @@ class DataFrame:
             The (unpickled) metadata required to reconstruct the object.
         frames
             Two-tuple of frames (a memoryview and a gpumemoryview).
+        stream
+            CUDA stream used for device memory operations and kernel launches
+            on this dataframe. The caller is responsible for ensuring that
+            the data in ``frames`` is valid on ``stream``.
 
         Returns
         -------
@@ -210,16 +247,22 @@ class DataFrame:
         """
         packed_metadata, packed_gpu_data = frames
         table = plc.contiguous_split.unpack_from_memoryviews(
-            packed_metadata, packed_gpu_data
+            packed_metadata,
+            packed_gpu_data,
+            stream,
         )
         return cls(
-            Column(c, **Column.deserialize_ctor_kwargs(kw))
-            for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
+            (
+                Column(c, **Column.deserialize_ctor_kwargs(kw))
+                for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
+            ),
+            stream=stream,
        )
 
     def serialize(
         self,
-    ) -> tuple[DataFrameHeader, tuple[memoryview, plc.gpumemoryview]]:
+        stream: Stream | None = None,
+    ) -> tuple[DataFrameHeader, tuple[memoryview[bytes], plc.gpumemoryview]]:
         """
         Serialize the table into header and frames.
 
@@ -231,6 +274,12 @@ class DataFrame:
         >>> from cudf_polars.experimental.dask_serialize import register
         >>> register()
 
+        Parameters
+        ----------
+        stream
+            CUDA stream used for device memory operations and kernel launches
+            on this dataframe.
+
         Returns
         -------
         header
@@ -238,7 +287,7 @@ class DataFrame:
         frames
             Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
         """
-        packed = plc.contiguous_split.pack(self.table)
+        packed = plc.contiguous_split.pack(self.table, stream=stream)
 
         # Keyword arguments for `Column.__init__`.
         columns_kwargs: list[ColumnOptions] = [
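
With `serialize` and `deserialize` both stream-aware, a round trip packs the table contiguously on one stream and unpacks it on a stream the caller guarantees the frames are valid on. A hedged sketch; the `cudf_polars.containers` import path and `DEFAULT_STREAM` (from rmm) are assumptions not shown in this diff:

    import polars as pl
    from rmm.pylibrmm.stream import DEFAULT_STREAM

    from cudf_polars.containers import DataFrame

    df = DataFrame.from_polars(pl.DataFrame({"a": [1, 2, 3]}), stream=DEFAULT_STREAM)

    # pack() the table contiguously on the frame's own stream ...
    header, frames = df.serialize(stream=df.stream)
    # ... then unpack; the caller must ensure `frames` is valid on the stream
    # handed to deserialize (trivially true here, since it is the same stream).
    roundtrip = DataFrame.deserialize(header, frames, stream=df.stream)
    assert roundtrip.column_names == df.column_names
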
@@ -276,12 +325,19 @@ class DataFrame:
             raise ValueError("Can only copy from identically named frame")
         subset = self.column_names_set if subset is None else subset
         return type(self)(
-            c.sorted_like(other) if c.name in subset else c
-            for c, other in zip(self.columns, like.columns, strict=True)
+            (
+                c.sorted_like(other) if c.name in subset else c
+                for c, other in zip(self.columns, like.columns, strict=True)
+            ),
+            stream=self.stream,
         )
 
     def with_columns(
-        self, columns: Iterable[Column], *, replace_only: bool = False
+        self,
+        columns: Iterable[Column],
+        *,
+        replace_only: bool = False,
+        stream: Stream,
     ) -> Self:
         """
         Return a new dataframe with extra columns.
@@ -292,6 +348,13 @@ class DataFrame:
             Columns to add
         replace_only
             If true, then only replacements are allowed (matching by name).
+        stream
+            CUDA stream used for device memory operations and kernel launches.
+            The caller is responsible for ensuring that
+
+            1. The data in ``columns`` is valid on ``stream``.
+            2. No additional operations occur on ``self.stream`` with the
+               original data in ``self``.
 
         Returns
         -------
@@ -305,33 +368,57 @@
         new = {c.name: c for c in columns}
         if replace_only and not self.column_names_set.issuperset(new.keys()):
             raise ValueError("Cannot replace with non-existing names")
-        return type(self)((self.column_map | new).values())
+        return type(self)((self.column_map | new).values(), stream=stream)
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
-        return type(self)(column for column in self.columns if column.name not in names)
+        return type(self)(
+            (column for column in self.columns if column.name not in names),
+            stream=self.stream,
+        )
 
     def select(self, names: Sequence[str] | Mapping[str, Any]) -> Self:
         """Select columns by name returning DataFrame."""
         try:
-            return type(self)(self.column_map[name] for name in names)
+            return type(self)(
+                (self.column_map[name] for name in names), stream=self.stream
+            )
         except KeyError as e:
             raise ValueError("Can't select missing names") from e
 
     def rename_columns(self, mapping: Mapping[str, str]) -> Self:
         """Rename some columns."""
-        return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns)
+        return type(self)(
+            (c.rename(mapping.get(c.name, c.name)) for c in self.columns),
+            stream=self.stream,
+        )
 
     def select_columns(self, names: Set[str]) -> list[Column]:
         """Select columns by name."""
         return [c for c in self.columns if c.name in names]
 
     def filter(self, mask: Column) -> Self:
-        """Return a filtered table given a mask."""
-        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
+        """
+        Return a filtered table given a mask.
+
+        Parameters
+        ----------
+        mask
+            Boolean mask to apply to the dataframe. It is the caller's
+            responsibility to ensure that ``mask`` is valid on ``self.stream``.
+            A mask that is derived from ``self`` via a computation on ``self.stream``
+            automatically satisfies this requirement.
+
+        Returns
+        -------
+        Filtered dataframe
+        """
+        table = plc.stream_compaction.apply_boolean_mask(
+            self.table, mask.obj, stream=self.stream
+        )
         return (
             type(self)
-            .from_table(table, self.column_names, self.dtypes)
+            .from_table(table, self.column_names, self.dtypes, self.stream)
            .sorted_like(self)
         )
 
@@ -352,10 +439,12 @@ class DataFrame:
         if zlice is None:
             return self
         (table,) = plc.copying.slice(
-            self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
+            self.table,
+            conversion.from_polars_slice(zlice, num_rows=self.num_rows),
+            stream=self.stream,
         )
         return (
             type(self)
-            .from_table(table, self.column_names, self.dtypes)
+            .from_table(table, self.column_names, self.dtypes, self.stream)
             .sorted_like(self)
         )
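
The pattern throughout this file: each operation launches its kernels on `self.stream` and threads that same stream into the result, so frames derived from a parent stay on the parent's stream, while constructors and `with_columns` take an explicit stream because their input columns may have been produced elsewhere. A hedged usage sketch (the `slice` method name, its tuple argument, and the import paths are assumptions based on the hunks above):

    import polars as pl
    from rmm.pylibrmm.stream import DEFAULT_STREAM

    from cudf_polars.containers import DataFrame

    df = DataFrame.from_polars(
        pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}), stream=DEFAULT_STREAM
    )

    # Derived frames inherit the parent's stream implicitly.
    sub = df.select(["a"]).slice((1, 2))  # assumed (offset, length) form
    assert sub.stream is df.stream
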
cudf_polars/containers/datatype.py CHANGED
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 from functools import cache
+from typing import TYPE_CHECKING, Literal, cast
 
 from typing_extensions import assert_never
 
@@ -13,8 +14,103 @@ import polars as pl
 
 import pylibcudf as plc
 
+if TYPE_CHECKING:
+    from cudf_polars.typing import (
+        DataTypeHeader,
+        PolarsDataType,
+    )
+
 __all__ = ["DataType"]
 
+SCALAR_NAME_TO_POLARS_TYPE_MAP: dict[str, pl.DataType] = {
+    "Boolean": pl.Boolean(),
+    "Int8": pl.Int8(),
+    "Int16": pl.Int16(),
+    "Int32": pl.Int32(),
+    "Int64": pl.Int64(),
+    "Object": pl.Object(),
+    "UInt8": pl.UInt8(),
+    "UInt16": pl.UInt16(),
+    "UInt32": pl.UInt32(),
+    "UInt64": pl.UInt64(),
+    "Float32": pl.Float32(),
+    "Float64": pl.Float64(),
+    "String": pl.String(),
+    "Null": pl.Null(),
+    "Date": pl.Date(),
+    "Time": pl.Time(),
+}
+
+
+def _dtype_to_header(dtype: pl.DataType) -> DataTypeHeader:
+    name = type(dtype).__name__
+    if name in SCALAR_NAME_TO_POLARS_TYPE_MAP:
+        return {"kind": "scalar", "name": name}
+    if isinstance(dtype, pl.Decimal):
+        # TODO: Add version guard once we support polars 1.34
+        # Also keep in mind the typing change in polars:
+        # https://github.com/pola-rs/polars/pull/25227
+        precision = dtype.precision if dtype.precision is not None else 38
+        return {
+            "kind": "decimal",
+            "precision": precision,
+            "scale": dtype.scale,
+        }
+    if isinstance(dtype, pl.Datetime):
+        return {
+            "kind": "datetime",
+            "time_unit": dtype.time_unit,
+            "time_zone": dtype.time_zone,
+        }
+    if isinstance(dtype, pl.Duration):
+        return {"kind": "duration", "time_unit": dtype.time_unit}
+    if isinstance(dtype, pl.List):
+        # isinstance narrows dtype to pl.List, but .inner returns DataTypeClass | DataType
+        return {
+            "kind": "list",
+            "inner": _dtype_to_header(cast(pl.DataType, dtype.inner)),
+        }
+    if isinstance(dtype, pl.Struct):
+        # isinstance narrows dtype to pl.Struct, but field.dtype returns DataTypeClass | DataType
+        return {
+            "kind": "struct",
+            "fields": [
+                {"name": f.name, "dtype": _dtype_to_header(cast(pl.DataType, f.dtype))}
+                for f in dtype.fields
+            ],
+        }
+    raise NotImplementedError(f"Unsupported dtype {dtype!r}")
+
+
+def _dtype_from_header(header: DataTypeHeader) -> pl.DataType:
+    if header["kind"] == "scalar":
+        name = header["name"]
+        try:
+            return SCALAR_NAME_TO_POLARS_TYPE_MAP[name]
+        except KeyError as err:
+            raise NotImplementedError(f"Unknown scalar dtype name: {name}") from err
+    if header["kind"] == "decimal":
+        return pl.Decimal(header["precision"], header["scale"])
+    if header["kind"] == "datetime":
+        return pl.Datetime(
+            time_unit=cast(Literal["ns", "us", "ms"], header["time_unit"]),
+            time_zone=header["time_zone"],
+        )
+    if header["kind"] == "duration":
+        return pl.Duration(
+            time_unit=cast(Literal["ns", "us", "ms"], header["time_unit"])
+        )
+    if header["kind"] == "list":
+        return pl.List(_dtype_from_header(header["inner"]))
+    if header["kind"] == "struct":
+        return pl.Struct(
+            [
+                pl.Field(f["name"], _dtype_from_header(f["dtype"]))
+                for f in header["fields"]
+            ]
+        )
+    raise NotImplementedError(f"Unsupported kind {header['kind']!r}")
+
 
 @cache
 def _from_polars(dtype: pl.DataType) -> plc.DataType:
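
The two new helpers give polars dtypes a plain-dict wire format (`DataTypeHeader`) that nests for list and struct types, so dtype metadata can travel alongside serialized frames. A sketch of the round trip, calling the private helpers directly purely for illustration:

    import polars as pl

    from cudf_polars.containers.datatype import _dtype_from_header, _dtype_to_header

    dtype = pl.Struct(
        {"x": pl.List(pl.Datetime(time_unit="us")), "y": pl.Decimal(38, 2)}
    )

    header = _dtype_to_header(dtype)
    # e.g. {"kind": "struct", "fields": [{"name": "x", "dtype": {"kind": "list", ...}}, ...]}
    assert _dtype_from_header(header) == dtype
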
@@ -102,36 +198,61 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType:
 class DataType:
     """A datatype, preserving polars metadata."""
 
-    polars: pl.datatypes.DataType
-    plc: plc.DataType
+    polars_type: pl.datatypes.DataType
+    plc_type: plc.DataType
 
-    def __init__(self, polars_dtype: pl.DataType) -> None:
-        self.polars = polars_dtype
-        self.plc = _from_polars(polars_dtype)
+    def __init__(self, polars_dtype: PolarsDataType) -> None:
+        # Convert DataTypeClass to DataType instance if needed
+        # polars allows both pl.Int64 (class) and pl.Int64() (instance)
+        if isinstance(polars_dtype, type):
+            polars_dtype = polars_dtype()
+        # After conversion, it's guaranteed to be a DataType instance
+        self.polars_type = cast(pl.DataType, polars_dtype)
+        self.plc_type = _from_polars(self.polars_type)
 
     def id(self) -> plc.TypeId:
         """The pylibcudf.TypeId of this DataType."""
-        return self.plc.id()
+        return self.plc_type.id()
 
     @property
     def children(self) -> list[DataType]:
         """The children types of this DataType."""
-        if self.plc.id() == plc.TypeId.STRUCT:
-            return [DataType(field.dtype) for field in self.polars.fields]
-        elif self.plc.id() == plc.TypeId.LIST:
-            return [DataType(self.polars.inner)]
+        # Type checker doesn't narrow polars_type through plc_type.id() checks
+        if self.plc_type.id() == plc.TypeId.STRUCT:
+            # field.dtype returns DataTypeClass | DataType, need to cast to DataType
+            return [
+                DataType(cast(pl.DataType, field.dtype))
+                for field in cast(pl.Struct, self.polars_type).fields
+            ]
+        elif self.plc_type.id() == plc.TypeId.LIST:
+            # .inner returns DataTypeClass | DataType, need to cast to DataType
+            return [DataType(cast(pl.DataType, cast(pl.List, self.polars_type).inner))]
         return []
 
+    def scale(self) -> int:
+        """The scale of this DataType."""
+        return self.plc_type.scale()
+
+    @staticmethod
+    def common_decimal_dtype(left: DataType, right: DataType) -> DataType:
+        """Return a common decimal DataType for the two inputs."""
+        if not (
+            plc.traits.is_fixed_point(left.plc_type)
+            and plc.traits.is_fixed_point(right.plc_type)
+        ):
+            raise ValueError("Both inputs required to be decimal types.")
+        return DataType(pl.Decimal(38, abs(min(left.scale(), right.scale()))))
+
     def __eq__(self, other: object) -> bool:
         """Equality of DataTypes."""
         if not isinstance(other, DataType):
             return False
-        return self.polars == other.polars
+        return self.polars_type == other.polars_type
 
     def __hash__(self) -> int:
         """Hash of the DataType."""
-        return hash(self.polars)
+        return hash(self.polars_type)
 
     def __repr__(self) -> str:
         """Representation of the DataType."""
-        return f"<DataType(polars={self.polars}, plc={self.id()!r})>"
+        return f"<DataType(polars={self.polars_type}, plc={self.id()!r})>"
cudf_polars/dsl/expr.py CHANGED
@@ -1,7 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
-# TODO: remove need for this
-# ruff: noqa: D101
 """
 DSL nodes for the polars expression language.
 