PyPI - cudf-polars-cu12 - Versions diffs - 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl - Mend

cudf-polars-cu12 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

cudf_polars/VERSION +1 -1
cudf_polars/callback.py +21 -12
cudf_polars/containers/__init__.py +4 -2
cudf_polars/containers/column.py +87 -42
cudf_polars/containers/dataframe.py +62 -22
cudf_polars/containers/datatype.py +135 -0
cudf_polars/dsl/expr.py +2 -0
cudf_polars/dsl/expressions/aggregation.py +31 -15
cudf_polars/dsl/expressions/base.py +5 -5
cudf_polars/dsl/expressions/binaryop.py +26 -5
cudf_polars/dsl/expressions/boolean.py +58 -37
cudf_polars/dsl/expressions/datetime.py +29 -35
cudf_polars/dsl/expressions/literal.py +23 -11
cudf_polars/dsl/expressions/rolling.py +37 -15
cudf_polars/dsl/expressions/selection.py +7 -7
cudf_polars/dsl/expressions/slicing.py +4 -5
cudf_polars/dsl/expressions/sorting.py +5 -4
cudf_polars/dsl/expressions/string.py +449 -60
cudf_polars/dsl/expressions/struct.py +138 -0
cudf_polars/dsl/expressions/ternary.py +6 -3
cudf_polars/dsl/expressions/unary.py +127 -25
cudf_polars/dsl/ir.py +284 -225
cudf_polars/dsl/nodebase.py +10 -3
cudf_polars/dsl/to_ast.py +60 -21
cudf_polars/dsl/tracing.py +16 -0
cudf_polars/dsl/translate.py +53 -61
cudf_polars/dsl/traversal.py +64 -15
cudf_polars/dsl/utils/aggregations.py +12 -3
cudf_polars/dsl/utils/groupby.py +2 -6
cudf_polars/dsl/utils/replace.py +19 -4
cudf_polars/dsl/utils/reshape.py +74 -0
cudf_polars/dsl/utils/rolling.py +5 -3
cudf_polars/dsl/utils/windows.py +1 -1
cudf_polars/experimental/base.py +114 -2
cudf_polars/experimental/benchmarks/pdsds.py +216 -0
cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
cudf_polars/experimental/benchmarks/pdsh.py +11 -478
cudf_polars/experimental/benchmarks/utils.py +725 -0
cudf_polars/experimental/dask_registers.py +13 -9
cudf_polars/experimental/dispatch.py +22 -7
cudf_polars/experimental/distinct.py +39 -19
cudf_polars/experimental/expressions.py +49 -23
cudf_polars/experimental/groupby.py +79 -43
cudf_polars/experimental/io.py +617 -69
cudf_polars/experimental/join.py +51 -15
cudf_polars/experimental/parallel.py +76 -12
cudf_polars/experimental/select.py +41 -1
cudf_polars/experimental/shuffle.py +33 -25
cudf_polars/experimental/utils.py +13 -1
cudf_polars/testing/asserts.py +85 -26
cudf_polars/testing/plugin.py +64 -67
cudf_polars/typing/__init__.py +41 -22
cudf_polars/utils/config.py +335 -83
cudf_polars/utils/dtypes.py +3 -123
cudf_polars/utils/versions.py +6 -4
{cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/METADATA +12 -19
cudf_polars_cu12-25.8.0.dist-info/RECORD +81 -0
cudf_polars_cu12-25.6.0.dist-info/RECORD +0 -73
{cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/WHEEL +0 -0
{cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/licenses/LICENSE +0 -0
{cudf_polars_cu12-25.6.0.dist-info → cudf_polars_cu12-25.8.0.dist-info}/top_level.txt +0 -0

cudf_polars/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 25.06.00
1	+ 25.08.00

cudf_polars/callback.py CHANGED Viewed

@@ -7,6 +7,7 @@ from __future__ import annotations
 import contextlib
 import os
+import textwrap
 import time
 import warnings
 from functools import cache, partial
@@ -21,7 +22,9 @@ import pylibcudf
 import rmm
 from rmm._cuda import gpu
+from cudf_polars.dsl.tracing import CUDF_POLARS_NVTX_DOMAIN
 from cudf_polars.dsl.translate import Translator
+from cudf_polars.utils.config import _env_get_int, get_total_device_memory
 from cudf_polars.utils.timer import Timer
 if TYPE_CHECKING:
@@ -45,13 +48,6 @@ _SUPPORTED_PREFETCHES = {
 }
-def _env_get_int(name: str, default: int) -> int:
-    try:
-        return int(os.getenv(name, default))
-    except (ValueError, TypeError):  # pragma: no cover
-        return default  # pragma: no cover
 @cache
 def default_memory_resource(
     device: int,
@@ -102,8 +98,7 @@ def default_memory_resource(
         ):
             raise ComputeError(
                 "GPU engine requested, but incorrect cudf-polars package installed. "
-                "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` "
-                "and install `cudf-polars-cu11`"
+                "cudf-polars requires CUDA 12.0+ to installed."
             ) from None
         else:
             raise
@@ -140,7 +135,11 @@ def set_memory_resource(
         mr = default_memory_resource(
             device=device,
             cuda_managed_memory=bool(
-                _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) != 0
+                _env_get_int(
+                    "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY",
+                    default=1 if get_total_device_memory() is not None else 0,
+                )
+                != 0
             ),
         )
     rmm.mr.set_current_device_resource(mr)
@@ -222,7 +221,7 @@ def _callback(
     if timer is not None:
         assert should_time
     with (
-        nvtx.annotate(message="ExecuteIR", domain="cudf_polars"),
+        nvtx.annotate(message="ExecuteIR", domain=CUDF_POLARS_NVTX_DOMAIN),
         # Device must be set before memory resource is obtained.
         set_device(config_options.device),
         set_memory_resource(memory_resource),
@@ -236,6 +235,16 @@ def _callback(
         elif config_options.executor.name == "streaming":
             from cudf_polars.experimental.parallel import evaluate_streaming
+            if timer is not None:
+                msg = textwrap.dedent("""\
+                    LazyFrame.profile() is not supported with the streaming executor.
+                    To profile execution with the streaming executor, use:
+                    - NVIDIA NSight Systems with the 'streaming' scheduler.
+                    - Dask's built-in profiling tools with the 'distributed' scheduler.
+                    """)
+                raise NotImplementedError(msg)
             return evaluate_streaming(ir, config_options).to_polars()
         assert_never(f"Unknown executor '{config_options.executor}'")
@@ -277,7 +286,7 @@ def execute_with_cudf(
     memory_resource = config.memory_resource
-    with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+    with nvtx.annotate(message="ConvertIR", domain=CUDF_POLARS_NVTX_DOMAIN):
         translator = Translator(nt, config)
         ir = translator.translate_ir()
         ir_translation_errors = translator.errors

cudf_polars/containers/__init__.py CHANGED Viewed

@@ -1,11 +1,13 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Containers of concrete data."""
 from __future__ import annotations
-__all__: list[str] = ["Column", "DataFrame"]
+__all__: list[str] = ["Column", "DataFrame", "DataType"]
+# dataframe.py & column.py imports DataType, so import in this order to avoid circular import
+from cudf_polars.containers.datatype import DataType  # noqa: I001
 from cudf_polars.containers.column import Column
 from cudf_polars.containers.dataframe import DataFrame

cudf_polars/containers/column.py CHANGED Viewed

@@ -8,6 +8,8 @@ from __future__ import annotations
 import functools
 from typing import TYPE_CHECKING
+import polars as pl
+import polars.datatypes.convert
 from polars.exceptions import InvalidOperationError
 import pylibcudf as plc
@@ -19,19 +21,39 @@ from pylibcudf.strings.convert.convert_integers import (
 )
 from pylibcudf.traits import is_floating_point
+from cudf_polars.containers import DataType
 from cudf_polars.utils import conversion
 from cudf_polars.utils.dtypes import is_order_preserving_cast
 if TYPE_CHECKING:
     from typing_extensions import Self
-    import polars as pl
-    from cudf_polars.typing import ColumnHeader, ColumnOptions, Slice
+    from cudf_polars.typing import (
+        ColumnHeader,
+        ColumnOptions,
+        DeserializedColumnOptions,
+        Slice,
+    )
 __all__: list[str] = ["Column"]
+def _dtype_short_repr_to_dtype(dtype_str: str) -> pl.DataType:
+    """Convert a Polars dtype short repr to a Polars dtype."""
+    # limitations of dtype_short_repr_to_dtype described in
+    # py-polars/polars/datatypes/convert.py#L299
+    if dtype_str.startswith("list["):
+        stripped = dtype_str.removeprefix("list[").removesuffix("]")
+        return pl.List(_dtype_short_repr_to_dtype(stripped))
+    pl_type = polars.datatypes.convert.dtype_short_repr_to_dtype(dtype_str)
+    if pl_type is None:
+        raise ValueError(f"{dtype_str} was not able to be parsed by Polars.")
+    if isinstance(pl_type, polars.datatypes.DataTypeClass):
+        return pl_type()
+    else:
+        return pl_type
 class Column:
     """An immutable column with sortedness metadata."""
@@ -43,10 +65,12 @@ class Column:
     # Optional name, only ever set by evaluation of NamedExpr nodes
     # The internal evaluation should not care about the name.
     name: str | None
+    dtype: DataType
     def __init__(
         self,
         column: plc.Column,
+        dtype: DataType,
         *,
         is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
         order: plc.types.Order = plc.types.Order.ASCENDING,
@@ -56,6 +80,7 @@ class Column:
         self.obj = column
         self.is_scalar = self.size == 1
         self.name = name
+        self.dtype = dtype
         self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)
     @classmethod
@@ -81,7 +106,23 @@ class Column:
         (plc_column,) = plc.contiguous_split.unpack_from_memoryviews(
             packed_metadata, packed_gpu_data
         ).columns()
-        return cls(plc_column, **header["column_kwargs"])
+        return cls(plc_column, **cls.deserialize_ctor_kwargs(header["column_kwargs"]))
+    @staticmethod
+    def deserialize_ctor_kwargs(
+        column_kwargs: ColumnOptions,
+    ) -> DeserializedColumnOptions:
+        """Deserialize the constructor kwargs for a Column."""
+        dtype = DataType(  # pragma: no cover
+            _dtype_short_repr_to_dtype(column_kwargs["dtype"])
+        )
+        return {
+            "is_sorted": column_kwargs["is_sorted"],
+            "order": column_kwargs["order"],
+            "null_order": column_kwargs["null_order"],
+            "name": column_kwargs["name"],
+            "dtype": dtype,
+        }
     def serialize(
         self,
@@ -105,17 +146,21 @@ class Column:
             Two-tuple of frames suitable for passing to `plc.contiguous_split.unpack_from_memoryviews`
         """
         packed = plc.contiguous_split.pack(plc.Table([self.obj]))
-        column_kwargs: ColumnOptions = {
+        header: ColumnHeader = {
+            "column_kwargs": self.serialize_ctor_kwargs(),
+            "frame_count": 2,
+        }
+        return header, packed.release()
+    def serialize_ctor_kwargs(self) -> ColumnOptions:
+        """Serialize the constructor kwargs for self."""
+        return {
             "is_sorted": self.is_sorted,
             "order": self.order,
             "null_order": self.null_order,
             "name": self.name,
+            "dtype": pl.polars.dtype_str_repr(self.dtype.polars),
         }
-        header: ColumnHeader = {
-            "column_kwargs": column_kwargs,
-            "frame_count": 2,
-        }
-        return header, packed.release()
     @functools.cached_property
     def obj_scalar(self) -> plc.Scalar:
@@ -172,6 +217,7 @@ class Column:
         return type(self)(
             self.obj,
             name=self.name,
+            dtype=self.dtype,
             is_sorted=like.is_sorted,
             order=like.order,
             null_order=like.null_order,
@@ -202,11 +248,11 @@ class Column:
         If the sortedness flag is not set, this launches a kernel to
         check sortedness.
         """
-        if self.obj.size() <= 1 or self.obj.size() == self.obj.null_count():
+        if self.size <= 1 or self.size == self.null_count:
             return True
         if self.is_sorted == plc.types.Sorted.YES:
             return self.order == order and (
-                self.obj.null_count() == 0 or self.null_order == null_order
+                self.null_count == 0 or self.null_order == null_order
             )
         if plc.sorting.is_sorted(plc.Table([self.obj]), [order], [null_order]):
             self.sorted = plc.types.Sorted.YES
@@ -215,7 +261,7 @@ class Column:
             return True
         return False
-    def astype(self, dtype: plc.DataType) -> Column:
+    def astype(self, dtype: DataType) -> Column:
         """
         Cast the column to as the requested dtype.
@@ -238,14 +284,18 @@ class Column:
         This only produces a copy if the requested dtype doesn't match
         the current one.
         """
-        if self.obj.type() == dtype:
+        plc_dtype = dtype.plc
+        if self.obj.type() == plc_dtype:
             return self
-        if dtype.id() == plc.TypeId.STRING or self.obj.type().id() == plc.TypeId.STRING:
-            return Column(self._handle_string_cast(dtype))
+        if (
+            plc_dtype.id() == plc.TypeId.STRING
+            or self.obj.type().id() == plc.TypeId.STRING
+        ):
+            return Column(self._handle_string_cast(plc_dtype), dtype=dtype)
         else:
-            result = Column(plc.unary.cast(self.obj, dtype))
-            if is_order_preserving_cast(self.obj.type(), dtype):
+            result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype)
+            if is_order_preserving_cast(self.obj.type(), plc_dtype):
                 return result.sorted_like(self)
             return result
@@ -258,24 +308,20 @@ class Column:
         else:
             if is_floating_point(dtype):
                 floats = is_float(self.obj)
-                if not plc.interop.to_arrow(
-                    plc.reduce.reduce(
-                        floats,
-                        plc.aggregation.all(),
-                        plc.DataType(plc.TypeId.BOOL8),
-                    )
-                ).as_py():
+                if not plc.reduce.reduce(
+                    floats,
+                    plc.aggregation.all(),
+                    plc.DataType(plc.TypeId.BOOL8),
+                ).to_py():
                     raise InvalidOperationError("Conversion from `str` failed.")
                 return to_floats(self.obj, dtype)
             else:
                 integers = is_integer(self.obj)
-                if not plc.interop.to_arrow(
-                    plc.reduce.reduce(
-                        integers,
-                        plc.aggregation.all(),
-                        plc.DataType(plc.TypeId.BOOL8),
-                    )
-                ).as_py():
+                if not plc.reduce.reduce(
+                    integers,
+                    plc.aggregation.all(),
+                    plc.DataType(plc.TypeId.BOOL8),
+                ).to_py():
                     raise InvalidOperationError("Conversion from `str` failed.")
                 return to_integers(self.obj, dtype)
@@ -361,6 +407,7 @@ class Column:
             order=self.order,
             null_order=self.null_order,
             name=self.name,
+            dtype=self.dtype,
         )
     def mask_nans(self) -> Self:
@@ -368,7 +415,7 @@ class Column:
         if plc.traits.is_floating_point(self.obj.type()):
             old_count = self.null_count
             mask, new_count = plc.transform.nans_to_nulls(self.obj)
-            result = type(self)(self.obj.with_mask(mask, new_count))
+            result = type(self)(self.obj.with_mask(mask, new_count), self.dtype)
             if old_count == new_count:
                 return result.sorted_like(self)
             return result
@@ -377,14 +424,12 @@ class Column:
     @functools.cached_property
     def nan_count(self) -> int:
         """Return the number of NaN values in the column."""
-        if plc.traits.is_floating_point(self.obj.type()):
-            return plc.interop.to_arrow(
-                plc.reduce.reduce(
-                    plc.unary.is_nan(self.obj),
-                    plc.aggregation.sum(),
-                    plc.types.SIZE_TYPE,
-                )
-            ).as_py()
+        if self.size > 0 and plc.traits.is_floating_point(self.obj.type()):
+            return plc.reduce.reduce(
+                plc.unary.is_nan(self.obj),
+                plc.aggregation.sum(),
+                plc.types.SIZE_TYPE,
+            ).to_py()
         return 0
     @property
@@ -418,4 +463,4 @@ class Column:
             conversion.from_polars_slice(zlice, num_rows=self.size),
         )
         (column,) = table.columns()
-        return type(self)(column, name=self.name).sorted_like(self)
+        return type(self)(column, name=self.name, dtype=self.dtype).sorted_like(self)

cudf_polars/containers/dataframe.py CHANGED Viewed

@@ -12,20 +12,51 @@ import polars as pl
 import pylibcudf as plc
-from cudf_polars.containers import Column
+from cudf_polars.containers import Column, DataType
 from cudf_polars.utils import conversion
 if TYPE_CHECKING:
     from collections.abc import Iterable, Mapping, Sequence, Set
-    from typing_extensions import Any, Self
+    from typing_extensions import Any, CapsuleType, Self
-    from cudf_polars.typing import ColumnOptions, DataFrameHeader, Slice
+    from cudf_polars.typing import ColumnOptions, DataFrameHeader, PolarsDataType, Slice
 __all__: list[str] = ["DataFrame"]
+def _create_polars_column_metadata(
+    name: str, dtype: PolarsDataType
+) -> plc.interop.ColumnMetadata:
+    """Create ColumnMetadata preserving pl.Struct field names."""
+    if isinstance(dtype, pl.Struct):
+        children_meta = [
+            _create_polars_column_metadata(field.name, field.dtype)
+            for field in dtype.fields
+        ]
+    else:
+        children_meta = []
+    timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None
+    return plc.interop.ColumnMetadata(
+        name=name, timezone=timezone or "", children_meta=children_meta
+    )
+# This is also defined in pylibcudf.interop
+class _ObjectWithArrowMetadata:
+    def __init__(
+        self, obj: plc.Table, metadata: list[plc.interop.ColumnMetadata]
+    ) -> None:
+        self.obj = obj
+        self.metadata = metadata
+    def __arrow_c_array__(
+        self, requested_schema: None = None
+    ) -> tuple[CapsuleType, CapsuleType]:
+        return self.obj._to_schema(self.metadata), self.obj._to_host_array()
 # Pacify the type checker. DataFrame init asserts that all the columns
 # have a string name, so let's narrow the type.
 class NamedColumn(Column):
@@ -44,6 +75,7 @@ class DataFrame:
         if any(c.name is None for c in columns):
             raise ValueError("All columns must have a name")
         self.columns = [cast(NamedColumn, c) for c in columns]
+        self.dtypes = [c.dtype for c in self.columns]
         self.column_map = {c.name: c for c in self.columns}
         self.table = plc.Table([c.obj for c in self.columns])
@@ -60,11 +92,12 @@ class DataFrame:
         # To guarantee we produce correct names, we therefore
         # serialise with names we control and rename with that map.
         name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)}
-        table = plc.interop.to_arrow(
-            self.table,
-            [plc.interop.ColumnMetadata(name=name) for name in name_map],
-        )
-        df: pl.DataFrame = pl.from_arrow(table)
+        metadata = [
+            _create_polars_column_metadata(name, dtype.polars)
+            for name, dtype in zip(name_map, self.dtypes, strict=True)
+        ]
+        table_with_metadata = _ObjectWithArrowMetadata(self.table, metadata)
+        df = pl.DataFrame(table_with_metadata)
         return df.rename(name_map).with_columns(
             pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING)
             if c.is_sorted
@@ -106,16 +139,18 @@ class DataFrame:
         -------
         New dataframe representing the input.
         """
-        plc_table = plc.Table(df)
+        plc_table = plc.Table.from_arrow(df)
         return cls(
-            Column(d_col, name=name).copy_metadata(h_col)
+            Column(d_col, name=name, dtype=DataType(h_col.dtype)).copy_metadata(h_col)
             for d_col, h_col, name in zip(
                 plc_table.columns(), df.iter_columns(), df.columns, strict=True
             )
         )
     @classmethod
-    def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
+    def from_table(
+        cls, table: plc.Table, names: Sequence[str], dtypes: Sequence[DataType]
+    ) -> Self:
         """
         Create from a pylibcudf table.
@@ -125,6 +160,8 @@ class DataFrame:
             Pylibcudf table to obtain columns from
         names
             Names for the columns
+        dtypes
+            Dtypes for the columns
         Returns
         -------
@@ -139,7 +176,8 @@ class DataFrame:
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
         return cls(
-            Column(c, name=name) for c, name in zip(table.columns(), names, strict=True)
+            Column(c, name=name, dtype=dtype)
+            for c, name, dtype in zip(table.columns(), names, dtypes, strict=True)
         )
     @classmethod
@@ -166,7 +204,7 @@ class DataFrame:
             packed_metadata, packed_gpu_data
         )
         return cls(
-            Column(c, **kw)
+            Column(c, **Column.deserialize_ctor_kwargs(kw))
             for c, kw in zip(table.columns(), header["columns_kwargs"], strict=True)
         )
@@ -195,13 +233,7 @@ class DataFrame:
         # Keyword arguments for `Column.__init__`.
         columns_kwargs: list[ColumnOptions] = [
-            {
-                "is_sorted": col.is_sorted,
-                "order": col.order,
-                "null_order": col.null_order,
-                "name": col.name,
-            }
-            for col in self.columns
+            col.serialize_ctor_kwargs() for col in self.columns
         ]
         header: DataFrameHeader = {
             "columns_kwargs": columns_kwargs,
@@ -288,7 +320,11 @@ class DataFrame:
     def filter(self, mask: Column) -> Self:
         """Return a filtered table given a mask."""
         table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
-        return type(self).from_table(table, self.column_names).sorted_like(self)
+        return (
+            type(self)
+            .from_table(table, self.column_names, self.dtypes)
+            .sorted_like(self)
+        )
     def slice(self, zlice: Slice | None) -> Self:
         """
@@ -309,4 +345,8 @@ class DataFrame:
         (table,) = plc.copying.slice(
             self.table, conversion.from_polars_slice(zlice, num_rows=self.num_rows)
         )
-        return type(self).from_table(table, self.column_names).sorted_like(self)
+        return (
+            type(self)
+            .from_table(table, self.column_names, self.dtypes)
+            .sorted_like(self)
+        )

cudf_polars/containers/datatype.py ADDED Viewed

@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""A datatype, preserving polars metadata."""
+from __future__ import annotations
+from functools import cache
+from typing_extensions import assert_never
+import polars as pl
+import pylibcudf as plc
+__all__ = ["DataType"]
+@cache
+def _from_polars(dtype: pl.DataType) -> plc.DataType:
+    """
+    Convert a polars datatype to a pylibcudf one.
+    Parameters
+    ----------
+    dtype
+        Polars dtype to convert
+    Returns
+    -------
+    Matching pylibcudf DataType object.
+    Raises
+    ------
+    NotImplementedError
+        For unsupported conversions.
+    """
+    if isinstance(dtype, pl.Boolean):
+        return plc.DataType(plc.TypeId.BOOL8)
+    elif isinstance(dtype, pl.Int8):
+        return plc.DataType(plc.TypeId.INT8)
+    elif isinstance(dtype, pl.Int16):
+        return plc.DataType(plc.TypeId.INT16)
+    elif isinstance(dtype, pl.Int32):
+        return plc.DataType(plc.TypeId.INT32)
+    elif isinstance(dtype, pl.Int64):
+        return plc.DataType(plc.TypeId.INT64)
+    if isinstance(dtype, pl.UInt8):
+        return plc.DataType(plc.TypeId.UINT8)
+    elif isinstance(dtype, pl.UInt16):
+        return plc.DataType(plc.TypeId.UINT16)
+    elif isinstance(dtype, pl.UInt32):
+        return plc.DataType(plc.TypeId.UINT32)
+    elif isinstance(dtype, pl.UInt64):
+        return plc.DataType(plc.TypeId.UINT64)
+    elif isinstance(dtype, pl.Float32):
+        return plc.DataType(plc.TypeId.FLOAT32)
+    elif isinstance(dtype, pl.Float64):
+        return plc.DataType(plc.TypeId.FLOAT64)
+    elif isinstance(dtype, pl.Date):
+        return plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
+    elif isinstance(dtype, pl.Time):
+        raise NotImplementedError("Time of day dtype not implemented")
+    elif isinstance(dtype, pl.Datetime):
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
+        assert dtype.time_unit is not None  # pragma: no cover
+        assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.Duration):
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
+        assert dtype.time_unit is not None  # pragma: no cover
+        assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.String):
+        return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Null):
+        # TODO: Hopefully
+        return plc.DataType(plc.TypeId.EMPTY)
+    elif isinstance(dtype, pl.List):
+        # Recurse to catch unsupported inner types
+        _ = _from_polars(dtype.inner)
+        return plc.DataType(plc.TypeId.LIST)
+    elif isinstance(dtype, pl.Struct):
+        # Recurse to catch unsupported field types
+        for field in dtype.fields:
+            _ = _from_polars(field.dtype)
+        return plc.DataType(plc.TypeId.STRUCT)
+    else:
+        raise NotImplementedError(f"{dtype=} conversion not supported")
+class DataType:
+    """A datatype, preserving polars metadata."""
+    polars: pl.datatypes.DataType
+    plc: plc.DataType
+    def __init__(self, polars_dtype: pl.DataType) -> None:
+        self.polars = polars_dtype
+        self.plc = _from_polars(polars_dtype)
+    def id(self) -> plc.TypeId:
+        """The pylibcudf.TypeId of this DataType."""
+        return self.plc.id()
+    @property
+    def children(self) -> list[DataType]:
+        """The children types of this DataType."""
+        if self.plc.id() == plc.TypeId.STRUCT:
+            return [DataType(field.dtype) for field in self.polars.fields]
+        elif self.plc.id() == plc.TypeId.LIST:
+            return [DataType(self.polars.inner)]
+        return []
+    def __eq__(self, other: object) -> bool:
+        """Equality of DataTypes."""
+        if not isinstance(other, DataType):
+            return False
+        return self.polars == other.polars
+    def __hash__(self) -> int:
+        """Hash of the DataType."""
+        return hash(self.polars)
+    def __repr__(self) -> str:
+        """Representation of the DataType."""
+        return f"<DataType(polars={self.polars}, plc={self.id()!r})>"

cudf_polars/dsl/expr.py CHANGED Viewed

@@ -33,6 +33,7 @@ from cudf_polars.dsl.expressions.selection import Filter, Gather
 from cudf_polars.dsl.expressions.slicing import Slice
 from cudf_polars.dsl.expressions.sorting import Sort, SortBy
 from cudf_polars.dsl.expressions.string import StringFunction
+from cudf_polars.dsl.expressions.struct import StructFunction
 from cudf_polars.dsl.expressions.ternary import Ternary
 from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction
@@ -58,6 +59,7 @@ __all__ = [
     "Sort",
     "SortBy",
     "StringFunction",
+    "StructFunction",
     "TemporalFunction",
     "Ternary",
     "UnaryFunction",

cudf-polars-cu12 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

cudf-polars-cu12 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl