cudf-polars-cu12 25.2.2-py3-none-any.whl → 25.6.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- cudf_polars/VERSION +1 -1
- cudf_polars/callback.py +82 -65
- cudf_polars/containers/column.py +138 -7
- cudf_polars/containers/dataframe.py +26 -39
- cudf_polars/dsl/expr.py +3 -1
- cudf_polars/dsl/expressions/aggregation.py +27 -63
- cudf_polars/dsl/expressions/base.py +40 -72
- cudf_polars/dsl/expressions/binaryop.py +5 -41
- cudf_polars/dsl/expressions/boolean.py +25 -53
- cudf_polars/dsl/expressions/datetime.py +97 -17
- cudf_polars/dsl/expressions/literal.py +27 -33
- cudf_polars/dsl/expressions/rolling.py +110 -9
- cudf_polars/dsl/expressions/selection.py +8 -26
- cudf_polars/dsl/expressions/slicing.py +47 -0
- cudf_polars/dsl/expressions/sorting.py +5 -18
- cudf_polars/dsl/expressions/string.py +33 -36
- cudf_polars/dsl/expressions/ternary.py +3 -10
- cudf_polars/dsl/expressions/unary.py +35 -75
- cudf_polars/dsl/ir.py +749 -212
- cudf_polars/dsl/nodebase.py +8 -1
- cudf_polars/dsl/to_ast.py +5 -3
- cudf_polars/dsl/translate.py +319 -171
- cudf_polars/dsl/utils/__init__.py +8 -0
- cudf_polars/dsl/utils/aggregations.py +292 -0
- cudf_polars/dsl/utils/groupby.py +97 -0
- cudf_polars/dsl/utils/naming.py +34 -0
- cudf_polars/dsl/utils/replace.py +46 -0
- cudf_polars/dsl/utils/rolling.py +113 -0
- cudf_polars/dsl/utils/windows.py +186 -0
- cudf_polars/experimental/base.py +17 -19
- cudf_polars/experimental/benchmarks/__init__.py +4 -0
- cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
- cudf_polars/experimental/dask_registers.py +196 -0
- cudf_polars/experimental/distinct.py +174 -0
- cudf_polars/experimental/explain.py +127 -0
- cudf_polars/experimental/expressions.py +521 -0
- cudf_polars/experimental/groupby.py +288 -0
- cudf_polars/experimental/io.py +58 -29
- cudf_polars/experimental/join.py +353 -0
- cudf_polars/experimental/parallel.py +166 -93
- cudf_polars/experimental/repartition.py +69 -0
- cudf_polars/experimental/scheduler.py +155 -0
- cudf_polars/experimental/select.py +92 -7
- cudf_polars/experimental/shuffle.py +294 -0
- cudf_polars/experimental/sort.py +45 -0
- cudf_polars/experimental/spilling.py +151 -0
- cudf_polars/experimental/utils.py +100 -0
- cudf_polars/testing/asserts.py +146 -6
- cudf_polars/testing/io.py +72 -0
- cudf_polars/testing/plugin.py +78 -76
- cudf_polars/typing/__init__.py +59 -6
- cudf_polars/utils/config.py +353 -0
- cudf_polars/utils/conversion.py +40 -0
- cudf_polars/utils/dtypes.py +22 -5
- cudf_polars/utils/timer.py +39 -0
- cudf_polars/utils/versions.py +5 -4
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
- cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
- cudf_polars/experimental/dask_serialize.py +0 -59
- cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
- {cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0
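Beyond the raw line counts, the shape of the release is visible in the file list: the experimental executor gains multi-partition support (shuffle, join, groupby, distinct, repartition, spilling, plus a PDS-H benchmark harness), expression translation gains dedicated utilities under `cudf_polars/dsl/utils/`, and configuration is consolidated into `cudf_polars/utils/config.py`. As a quick orientation, a minimal smoke test of the wheel; this is illustrative and assumes a CUDA 12 environment with a supported driver:

```python
# pip install cudf-polars-cu12==25.6.0
import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).select(
    (pl.col("a") + pl.col("b")).alias("s")
)
# cudf-polars runs the query on the GPU, falling back to the CPU
# engine with a warning if an operation is unsupported.
print(q.collect(engine="gpu"))
```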
cudf_polars/dsl/expressions/string.py:

@@ -21,8 +21,6 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from typing_extensions import Self
 
     from polars.polars import _expr_nodes as pl_expr
@@ -107,10 +105,10 @@ class StringFunction(Expr):
         self.options = options
         self.name = name
         self.children = children
-        self.is_pointwise =
+        self.is_pointwise = self.name != StringFunction.Name.ConcatVertical
         self._validate_input()
 
-    def _validate_input(self):
+    def _validate_input(self) -> None:
         if self.name not in (
             StringFunction.Name.ConcatVertical,
             StringFunction.Name.Contains,
@@ -138,7 +136,7 @@ class StringFunction(Expr):
                 raise NotImplementedError(
                     "Regex contains only supports a scalar pattern"
                 )
-            pattern = self.children[1].value
+            pattern = self.children[1].value
             try:
                 self._regex_program = plc.strings.regex_program.RegexProgram.create(
                     pattern,
@@ -155,7 +153,9 @@ class StringFunction(Expr):
             if not all(isinstance(expr, Literal) for expr in self.children[1:]):
                 raise NotImplementedError("replace only supports scalar target")
             target = self.children[1]
-            if target
+            # Above, we raise NotImplementedError if the target is not a Literal,
+            # so we can safely access .value here.
+            if target.value == "":  # type: ignore[attr-defined]
                 raise NotImplementedError(
                     "libcudf replace does not support empty strings"
                 )
@@ -170,7 +170,14 @@ class StringFunction(Expr):
             ):
                 raise NotImplementedError("replace_many only supports literal inputs")
             target = self.children[1]
-            if
+            # Above, we raise NotImplementedError if the target is not a Literal,
+            # so we can safely access .value here.
+            if (isinstance(target, Literal) and target.value == "") or (
+                isinstance(target, LiteralColumn)
+                and pc.any(
+                    pc.equal(target.value.cast(pa.string()), "")  # type: ignore[attr-defined]
+                ).as_py()
+            ):
                 raise NotImplementedError(
                     "libcudf replace_many is implemented differently from polars "
                     "for empty strings"
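The widened `replace_many` guard rejects an empty-string target whether it arrives as a scalar `Literal` or inside a `LiteralColumn`, using pyarrow compute for the column case. A standalone sketch of that check; the array contents are illustrative:

```python
import pyarrow as pa
import pyarrow.compute as pc

# A literal column of replacement targets, one of which is empty.
targets = pa.array(["", "b"], type=pa.string())

# The same test the guard performs: does any target equal ""?
has_empty = pc.any(pc.equal(targets.cast(pa.string()), "")).as_py()
print(has_empty)  # True -> cudf-polars raises NotImplementedError
```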
@@ -199,36 +206,32 @@ class StringFunction(Expr):
         )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name is StringFunction.Name.ConcatVertical:
             (child,) = self.children
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
             delimiter, ignore_nulls = self.options
-            if column.
+            if column.null_count > 0 and not ignore_nulls:
                 return Column(plc.Column.all_null_like(column.obj, 1))
             return Column(
                 plc.strings.combine.join_strings(
                     column.obj,
-                    plc.
-                    plc.
+                    plc.Scalar.from_py(delimiter, plc.DataType(plc.TypeId.STRING)),
+                    plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)),
                 )
             )
         elif self.name is StringFunction.Name.Contains:
             child, arg = self.children
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
 
             literal, _ = self.options
             if literal:
-                pat = arg.evaluate(df, context=context, mapping=mapping)
+                pat = arg.evaluate(df, context=context)
                 pattern = (
                     pat.obj_scalar
-                    if pat.is_scalar and pat.
+                    if pat.is_scalar and pat.size != column.size
                     else pat.obj
                 )
                 return Column(plc.strings.find.contains(column.obj, pattern))
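Two themes of this hunk recur through the rest of the diff: the `mapping` keyword is dropped from every `do_evaluate` signature, and pylibcudf scalars are now built with `plc.Scalar.from_py`. At the polars level, the `ConcatVertical` branch backs `str.join`; an illustrative query, assuming a working GPU engine:

```python
import polars as pl

q = pl.LazyFrame({"s": ["a", None, "c"]}).select(
    pl.col("s").str.join("-", ignore_nulls=False)
)
# With a null present and ignore_nulls=False, the branch above returns
# an all-null length-1 column, matching the CPU engine's semantics.
print(q.collect(engine="gpu"))
```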
@@ -241,15 +244,15 @@ class StringFunction(Expr):
             assert isinstance(expr_offset, Literal)
             assert isinstance(expr_length, Literal)
 
-            column = child.evaluate(df, context=context, mapping=mapping)
+            column = child.evaluate(df, context=context)
             # libcudf slices via [start,stop).
             # polars slices with offset + length where start == offset
             # stop = start + length. Negative values for start look backward
             # from the last element of the string. If the end index would be
             # below zero, an empty string is returned.
             # Do this maths on the host
-            start = expr_offset.value
-            length = expr_length.value
+            start = expr_offset.value
+            length = expr_length.value
 
             if length == 0:
                 stop = start
@@ -262,8 +265,8 @@ class StringFunction(Expr):
             return Column(
                 plc.strings.slice.slice_strings(
                     column.obj,
-                    plc.
-                    plc.
+                    plc.Scalar.from_py(start, plc.DataType(plc.TypeId.INT32)),
+                    plc.Scalar.from_py(stop, plc.DataType(plc.TypeId.INT32)),
                 )
             )
         elif self.name in {
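The comment block in the slice hunk describes the offset/length arithmetic done on the host: polars' `(offset, length)` is converted to libcudf's `[start, stop)` interval, with negative offsets counting back from the end of each string. A worked example (illustrative):

```python
import polars as pl

q = pl.LazyFrame({"s": ["abcdef"]}).select(
    # offset=-4 resolves to index 2, length=2 gives [2, 4) -> "cd"
    pl.col("s").str.slice(-4, 2).alias("sliced")
)
print(q.collect(engine="gpu"))
```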
@@ -271,9 +274,7 @@ class StringFunction(Expr):
             StringFunction.Name.StripCharsStart,
             StringFunction.Name.StripCharsEnd,
         }:
-            column, chars = (
-                c.evaluate(df, context=context, mapping=mapping) for c in self.children
-            )
+            column, chars = (c.evaluate(df, context=context) for c in self.children)
             if self.name is StringFunction.Name.StripCharsStart:
                 side = plc.strings.SideType.LEFT
             elif self.name is StringFunction.Name.StripCharsEnd:
@@ -282,10 +283,7 @@ class StringFunction(Expr):
                 side = plc.strings.SideType.BOTH
             return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar))
 
-        columns = [
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
-        ]
+        columns = [child.evaluate(df, context=context) for child in self.children]
         if self.name is StringFunction.Name.Lowercase:
             (column,) = columns
             return Column(plc.strings.case.to_lower(column.obj))
@@ -298,7 +296,7 @@ class StringFunction(Expr):
                 plc.strings.find.ends_with(
                     column.obj,
                     suffix.obj_scalar
-                    if column.
+                    if column.size != suffix.size and suffix.is_scalar
                     else suffix.obj,
                 )
             )
@@ -308,14 +306,14 @@ class StringFunction(Expr):
                 plc.strings.find.starts_with(
                     column.obj,
                     prefix.obj_scalar
-                    if column.
+                    if column.size != prefix.size and prefix.is_scalar
                     else prefix.obj,
                 )
             )
         elif self.name is StringFunction.Name.Strptime:
             # TODO: ignores ambiguous
             format, strict, exact, cache = self.options
-            col = self.children[0].evaluate(df, context=context, mapping=mapping)
+            col = self.children[0].evaluate(df, context=context)
 
             is_timestamps = plc.strings.convert.convert_datetime.is_timestamp(
                 col.obj, format
@@ -334,8 +332,7 @@ class StringFunction(Expr):
             not_timestamps = plc.unary.unary_operation(
                 is_timestamps, plc.unary.UnaryOperator.NOT
             )
-
-            null = plc.interop.from_arrow(pa.scalar(None, type=pa.string()))
+            null = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
             res = plc.copying.boolean_mask_scatter(
                 [null], plc.Table([col.obj]), not_timestamps
            )
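A recurring mechanical change across these hunks: scalars are constructed directly with `plc.Scalar.from_py` rather than round-tripping through pyarrow, which also lets unary.py drop its `pyarrow` import below. Before and after, using the null-string scalar from the Strptime hunk (a minimal sketch, assuming pylibcudf 25.6):

```python
import pyarrow as pa
import pylibcudf as plc

# 25.2.2: build the scalar via pyarrow interop.
null_old = plc.interop.from_arrow(pa.scalar(None, type=pa.string()))

# 25.6.0: build it directly from a Python value and a pylibcudf type.
null_new = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
```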
cudf_polars/dsl/expressions/ternary.py:

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -17,8 +17,6 @@ from cudf_polars.dsl.expressions.base import (
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 
@@ -37,16 +35,11 @@ class Ternary(Expr):
         self.is_pointwise = True
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         when, then, otherwise = (
-            child.evaluate(df, context=context, mapping=mapping)
-            for child in self.children
+            child.evaluate(df, context=context) for child in self.children
         )
         then_obj = then.obj_scalar if then.is_scalar else then.obj
         otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj
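`Ternary` implements polars' `when/then/otherwise`; the edit here is the same signature cleanup applied throughout. For reference, a query that exercises this node (illustrative):

```python
import polars as pl

q = pl.LazyFrame({"a": [1, 2, 3]}).select(
    pl.when(pl.col("a") > 1).then(pl.col("a")).otherwise(0).alias("clipped")
)
print(q.collect(engine="gpu"))
```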
cudf_polars/dsl/expressions/unary.py:

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 """DSL nodes for unary operations."""
@@ -7,18 +7,15 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, ClassVar
 
-import pyarrow as pa
-
 import pylibcudf as plc
 
 from cudf_polars.containers import Column
-from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
 from cudf_polars.dsl.expressions.literal import Literal
 from cudf_polars.utils import dtypes
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     from cudf_polars.containers import DataFrame
 
 __all__ = ["Cast", "Len", "UnaryFunction"]
@@ -40,23 +37,13 @@ class Cast(Expr):
         )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
-        column = child.evaluate(df, context=context, mapping=mapping)
+        column = child.evaluate(df, context=context)
         return column.astype(self.dtype)
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        # TODO: Could do with sort-based groupby and segmented filter
-        (child,) = self.children
-        return child.collect_agg(depth=depth)
-
 
 class Len(Expr):
     """Class representing the length of an expression."""
@@ -67,28 +54,19 @@ class Len(Expr):
         self.is_pointwise = False
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         return Column(
             plc.Column.from_scalar(
-                plc.interop.from_arrow(
-                    pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
-                ),
+                plc.Scalar.from_py(df.num_rows, self.dtype),
                 1,
             )
         )
 
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        return AggInfo(
-            [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
-        )
+    @property
+    def agg_request(self) -> plc.aggregation.Aggregation:  # noqa: D102
+        return plc.aggregation.count(plc.types.NullPolicy.INCLUDE)
 
 
 class UnaryFunction(Expr):
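`Len` now publishes its groupby aggregation as an `agg_request` property instead of implementing `collect_agg`; judging by the file list, the collection machinery itself moved into the new `cudf_polars/dsl/utils/aggregations.py`. The request is a count with `NullPolicy.INCLUDE`, which matches `pl.len()` counting rows rather than non-null values (illustrative):

```python
import polars as pl

ldf = pl.LazyFrame({"g": ["x", "x", "y"], "a": [1, None, 3]})
# pl.len() counts every row in the group (nulls included);
# pl.col("a").count() would exclude the null.
print(ldf.group_by("g").agg(pl.len()).collect(engine="gpu"))
```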
@@ -119,6 +97,7 @@ class UnaryFunction(Expr):
         "abs": plc.unary.UnaryOperator.ABS,
         "bit_invert": plc.unary.UnaryOperator.BIT_INVERT,
         "not": plc.unary.UnaryOperator.NOT,
+        "negate": plc.unary.UnaryOperator.NEGATE,
     }
     _supported_misc_fns = frozenset(
         {
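The new `negate` entry maps onto `plc.unary.UnaryOperator.NEGATE`. Assuming polars lowers unary minus to this function (the diff itself does not confirm the lowering), an expression like the following would now execute on the GPU:

```python
import polars as pl

q = pl.LazyFrame({"a": [1, -2, 3]}).select((-pl.col("a")).alias("neg"))
print(q.collect(engine="gpu"))
```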
@@ -168,22 +147,15 @@ class UnaryFunction(Expr):
         )
 
     def do_evaluate(
-        self,
-        df: DataFrame,
-        *,
-        context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: Mapping[Expr, Column] | None = None,
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if self.name == "mask_nans":
             (child,) = self.children
-            return child.evaluate(df, context=context, mapping=mapping).mask_nans()
+            return child.evaluate(df, context=context).mask_nans()
         if self.name == "round":
             (decimal_places,) = self.options
-            (values,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (values,) = (child.evaluate(df, context=context) for child in self.children)
             return Column(
                 plc.round.round(
                     values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP
@@ -191,10 +163,7 @@ class UnaryFunction(Expr):
             ).sorted_like(values)
         elif self.name == "unique":
             (maintain_order,) = self.options
-            (values,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (values,) = (child.evaluate(df, context=context) for child in self.children)
             # Only one column, so keep_any is the same as keep_first
             # for stable distinct
             keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
@@ -224,10 +193,7 @@ class UnaryFunction(Expr):
                 return Column(column).sorted_like(values)
             return Column(column)
         elif self.name == "set_sorted":
-            (column,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (column,) = (child.evaluate(df, context=context) for child in self.children)
             (asc,) = self.options
             order = (
                 plc.types.Order.ASCENDING
@@ -235,7 +201,7 @@ class UnaryFunction(Expr):
                 else plc.types.Order.DESCENDING
             )
             null_order = plc.types.NullOrder.BEFORE
-            if column.
+            if column.null_count > 0 and (n := column.size) > 1:
                 # PERF: This invokes four stream synchronisations!
                 has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid()
                 has_nulls_last = not plc.copying.get_element(
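The `set_sorted` branch still has to decide where nulls sit relative to the sorted values; per the PERF comment, it inspects the first and last elements, at the cost of several host-device synchronisations. An expression that reaches this branch (illustrative):

```python
import polars as pl

q = pl.LazyFrame({"a": [None, 1, 2]}).select(
    # set_sorted asserts the column is already sorted, letting later
    # operations (max here) exploit the sortedness flag.
    pl.col("a").set_sorted().max()
)
print(q.collect(engine="gpu"))
```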
@@ -251,34 +217,41 @@ class UnaryFunction(Expr):
                 null_order=null_order,
             )
         elif self.name == "drop_nulls":
-            (column,) = (
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            )
+            (column,) = (child.evaluate(df, context=context) for child in self.children)
+            if column.null_count == 0:
+                return column
             return Column(
                 plc.stream_compaction.drop_nulls(
                     plc.Table([column.obj]), [0], 1
                 ).columns()[0]
             )
         elif self.name == "fill_null":
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
+            if column.null_count == 0:
+                return column
             if isinstance(self.children[1], Literal):
-                arg = plc.
+                arg = plc.Scalar.from_py(self.children[1].value, self.children[1].dtype)
             else:
-                evaluated = self.children[1].evaluate(
-                    df, context=context, mapping=mapping
-                )
+                evaluated = self.children[1].evaluate(df, context=context)
                 arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj
+            if (
+                not POLARS_VERSION_LT_128
+                and isinstance(arg, plc.Scalar)
+                and dtypes.can_cast(column.obj.type(), arg.type())
+            ):  # pragma: no cover
+                arg = plc.unary.cast(
+                    plc.Column.from_scalar(arg, 1), column.obj.type()
+                ).to_scalar()
             return Column(plc.replace.replace_nulls(column.obj, arg))
         elif self.name in self._OP_MAPPING:
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
             if column.obj.type().id() != self.dtype.id():
                 arg = plc.unary.cast(column.obj, self.dtype)
             else:
                 arg = column.obj
             return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name]))
         elif self.name in UnaryFunction._supported_cum_aggs:
-            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            column = self.children[0].evaluate(df, context=context)
             plc_col = column.obj
             col_type = column.obj.type()
             # cum_sum casts
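`drop_nulls` and `fill_null` now short-circuit when the column contains no nulls, and under the `POLARS_VERSION_LT_128` gate a scalar fill value is cast to the column's dtype when `dtypes.can_cast` allows it. Illustrative usage where that cast applies (an integer literal filling a Float64 column):

```python
import polars as pl

q = pl.LazyFrame({"a": [1.5, None, 2.5]}).with_columns(
    pl.col("a").fill_null(0)  # 0 is cast to Float64 before replace_nulls
)
print(q.collect(engine="gpu"))
```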
@@ -324,16 +297,3 @@ class UnaryFunction(Expr):
         raise NotImplementedError(
             f"Unimplemented unary function {self.name=}"
         )  # pragma: no cover; init trips first
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs:
-            raise NotImplementedError(f"{self.name} in groupby")
-        if depth == 1:
-            # inside aggregation, need to pre-evaluate, groupby
-            # construction has checked that we don't have nested aggs,
-            # so stop the recursion and return ourselves for pre-eval
-            return AggInfo([(self, plc.aggregation.collect_list(), self)])
-        else:
-            (child,) = self.children
-            return child.collect_agg(depth=depth)