cudf-polars-cu12 24.12.0__py3-none-any.whl → 25.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/__init__.py +1 -1
- cudf_polars/callback.py +28 -3
- cudf_polars/containers/__init__.py +1 -1
- cudf_polars/dsl/expr.py +16 -16
- cudf_polars/dsl/expressions/aggregation.py +21 -4
- cudf_polars/dsl/expressions/base.py +7 -2
- cudf_polars/dsl/expressions/binaryop.py +1 -0
- cudf_polars/dsl/expressions/boolean.py +65 -22
- cudf_polars/dsl/expressions/datetime.py +82 -20
- cudf_polars/dsl/expressions/literal.py +2 -0
- cudf_polars/dsl/expressions/rolling.py +3 -1
- cudf_polars/dsl/expressions/selection.py +3 -1
- cudf_polars/dsl/expressions/sorting.py +2 -0
- cudf_polars/dsl/expressions/string.py +118 -39
- cudf_polars/dsl/expressions/ternary.py +1 -0
- cudf_polars/dsl/expressions/unary.py +11 -1
- cudf_polars/dsl/ir.py +173 -122
- cudf_polars/dsl/to_ast.py +4 -6
- cudf_polars/dsl/translate.py +53 -21
- cudf_polars/dsl/traversal.py +10 -10
- cudf_polars/experimental/base.py +43 -0
- cudf_polars/experimental/dispatch.py +84 -0
- cudf_polars/experimental/io.py +325 -0
- cudf_polars/experimental/parallel.py +253 -0
- cudf_polars/experimental/select.py +36 -0
- cudf_polars/testing/asserts.py +14 -5
- cudf_polars/testing/plugin.py +60 -4
- cudf_polars/typing/__init__.py +5 -5
- cudf_polars/utils/dtypes.py +9 -7
- cudf_polars/utils/versions.py +4 -7
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +6 -6
- cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-24.12.0.dist-info/RECORD +0 -43
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
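
The headline addition in 25.2.0 is the new `cudf_polars/experimental/` package (base, dispatch, io, parallel, and select modules for multi-partition execution), alongside a migration of the IO layer onto pylibcudf's options-builder APIs. For orientation, a minimal sketch of how cudf-polars is driven from polars; `pl.GPUEngine` and `collect(engine=...)` are public polars API, while the `executor` keyword below is an assumption based on the new module layout and may differ in the released wheel.

```python
# Minimal sketch of executing a polars query through cudf-polars on GPU.
import polars as pl

q = pl.scan_parquet("data.parquet").group_by("key").agg(pl.col("x").sum())

# Default single-GPU execution; raise_on_fail surfaces CPU-fallback reasons.
result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))

# Hypothetical: the experimental multi-partition executor added in 25.2.0
# is plausibly selected via an engine option such as executor="...".
```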
cudf_polars/dsl/ir.py
CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """
 DSL nodes for the LogicalPlan of polars.
@@ -31,35 +31,36 @@ from cudf_polars.containers import Column, DataFrame
 from cudf_polars.dsl.nodebase import Node
 from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
 from cudf_polars.utils import dtypes
-from cudf_polars.utils.versions import POLARS_VERSION_GT_112
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Hashable, MutableMapping, Sequence
+    from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence
     from typing import Literal
 
+    from polars.polars import _expr_nodes as pl_expr
+
     from cudf_polars.typing import Schema
 
 
 __all__ = [
     "IR",
-    "ErrorNode",
-    "PythonScan",
-    "Scan",
     "Cache",
-    "DataFrameScan",
-    "Select",
-    "GroupBy",
-    "Join",
     "ConditionalJoin",
-    "HStack",
+    "DataFrameScan",
     "Distinct",
-    "Sort",
-    "Slice",
+    "ErrorNode",
     "Filter",
-    "Projection",
+    "GroupBy",
+    "HConcat",
+    "HStack",
+    "Join",
     "MapFunction",
+    "Projection",
+    "PythonScan",
+    "Scan",
+    "Select",
+    "Slice",
+    "Sort",
     "Union",
-    "HConcat",
 ]
 
 
@@ -130,7 +131,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column
 class IR(Node["IR"]):
     """Abstract plan node, representing an unevaluated dataframe."""
 
-    __slots__ = ("schema", "_non_child_args")
+    __slots__ = ("_non_child_args", "schema")
     # This annotation is needed because of https://github.com/python/mypy/issues/17981
     _non_child: ClassVar[tuple[str, ...]] = ("schema",)
     # Concrete classes should set this up with the arguments that will
@@ -253,16 +254,16 @@ class Scan(IR):
     """Input from files."""
 
     __slots__ = (
-        "typ",
-        "reader_options",
         "cloud_options",
         "config_options",
-        "paths",
-        "with_columns",
-        "skip_rows",
         "n_rows",
-        "row_index",
+        "paths",
         "predicate",
+        "reader_options",
+        "row_index",
+        "skip_rows",
+        "typ",
+        "with_columns",
     )
     _non_child = (
         "schema",
@@ -476,23 +477,28 @@ class Scan(IR):
             with path.open() as f:
                 while f.readline() == "\n":
                     skiprows += 1
-            tbl_w_meta = plc.io.csv.read_csv(
-                plc.io.SourceInfo([path]),
-                delimiter=sep,
-                quotechar=quote,
-                lineterminator=str(eol),
-                col_names=column_names,
-                header=header,
-                usecols=usecols,
-                na_filter=True,
-                na_values=null_values,
-                keep_default_na=False,
-                skiprows=skiprows,
-                comment=comment,
-                decimal=decimal,
-                dtypes=schema,
-                nrows=n_rows,
+            options = (
+                plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
+                .nrows(n_rows)
+                .skiprows(skiprows)
+                .lineterminator(str(eol))
+                .quotechar(str(quote))
+                .decimal(decimal)
+                .keep_default_na(keep_default_na=False)
+                .na_filter(na_filter=True)
+                .build()
             )
+            options.set_delimiter(str(sep))
+            if column_names is not None:
+                options.set_names([str(name) for name in column_names])
+            options.set_header(header)
+            options.set_dtypes(schema)
+            if usecols is not None:
+                options.set_use_cols_names([str(name) for name in usecols])
+            options.set_na_values(null_values)
+            if comment is not None:
+                options.set_comment(comment)
+            tbl_w_meta = plc.io.csv.read_csv(options)
             pieces.append(tbl_w_meta)
             if read_partial:
                 n_rows -= tbl_w_meta.tbl.num_rows()
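
This hunk moves CSV scanning from a keyword-argument `read_csv` call onto pylibcudf's `CsvReaderOptions` builder: fluent defaults go through the builder, everything conditional is applied with `set_*` methods on the built options object. A minimal standalone sketch of the same pattern, using only calls that appear in the hunk (the file name is hypothetical):

```python
import pylibcudf as plc

path = "example.csv"  # hypothetical input file
options = (
    plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
    .nrows(100)      # read at most 100 rows
    .skiprows(0)
    .build()
)
options.set_delimiter(",")
options.set_header(0)  # first line holds column names
tbl_w_meta = plc.io.csv.read_csv(options)
print(tbl_w_meta.tbl.num_rows())
```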
@@ -512,17 +518,22 @@ class Scan(IR):
         elif typ == "parquet":
             parquet_options = config_options.get("parquet_options", {})
             if parquet_options.get("chunked", True):
+                options = plc.io.parquet.ParquetReaderOptions.builder(
+                    plc.io.SourceInfo(paths)
+                ).build()
+                # We handle skip_rows != 0 by reading from the
+                # up to n_rows + skip_rows and slicing off the
+                # first skip_rows entries.
+                # TODO: Remove this workaround once
+                # https://github.com/rapidsai/cudf/issues/16186
+                # is fixed
+                nrows = n_rows + skip_rows
+                if nrows > -1:
+                    options.set_num_rows(nrows)
+                if with_columns is not None:
+                    options.set_columns(with_columns)
                 reader = plc.io.parquet.ChunkedParquetReader(
-                    plc.io.SourceInfo(paths),
-                    columns=with_columns,
-                    # We handle skip_rows != 0 by reading from the
-                    # up to n_rows + skip_rows and slicing off the
-                    # first skip_rows entries.
-                    # TODO: Remove this workaround once
-                    # https://github.com/rapidsai/cudf/issues/16186
-                    # is fixed
-                    nrows=n_rows + skip_rows,
-                    skip_rows=0,
+                    options,
                     chunk_read_limit=parquet_options.get(
                         "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE
                     ),
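
The chunked path now builds `ParquetReaderOptions` up front and passes the options object to `ChunkedParquetReader` as its first argument. A sketch of the read loop this enables; `has_next`/`read_chunk` mirror libcudf's chunked reader interface, but treat the exact iteration protocol as an assumption rather than something this diff shows:

```python
import pylibcudf as plc

options = plc.io.parquet.ParquetReaderOptions.builder(
    plc.io.SourceInfo(["example.parquet"])  # hypothetical path
).build()
reader = plc.io.parquet.ChunkedParquetReader(options, chunk_read_limit=2**26)

tables = []
while reader.has_next():          # assumed iteration protocol
    tables.append(reader.read_chunk().tbl)
tbl = plc.concatenate.concatenate(tables)  # stitch chunks back together
```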
@@ -568,13 +579,18 @@ class Scan(IR):
             if predicate is not None and row_index is None:
                 # Can't apply filters during read if we have a row index.
                 filters = to_parquet_filter(predicate.value)
-            tbl_w_meta = plc.io.parquet.read_parquet(
-                plc.io.SourceInfo(paths),
-                columns=with_columns,
-                filters=filters,
-                nrows=n_rows,
-                skip_rows=skip_rows,
-            )
+            options = plc.io.parquet.ParquetReaderOptions.builder(
+                plc.io.SourceInfo(paths)
+            ).build()
+            if n_rows != -1:
+                options.set_num_rows(n_rows)
+            if skip_rows != 0:
+                options.set_skip_rows(skip_rows)
+            if with_columns is not None:
+                options.set_columns(with_columns)
+            if filters is not None:
+                options.set_filter(filters)
+            tbl_w_meta = plc.io.parquet.read_parquet(options)
             df = DataFrame.from_table(
                 tbl_w_meta.tbl,
                 # TODO: consider nested column names?
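
The eager parquet path gets the same treatment: defaults come from the builder, and row limits, projections, and pushed-down filters are applied conditionally via setters. A sketch using only the calls visible in the hunk (path and column names hypothetical):

```python
import pylibcudf as plc

options = plc.io.parquet.ParquetReaderOptions.builder(
    plc.io.SourceInfo(["example.parquet"])  # hypothetical path
).build()
options.set_num_rows(1000)        # only set when a limit is pushed down
options.set_columns(["a", "b"])   # projection pushdown
tbl_w_meta = plc.io.parquet.read_parquet(options)
```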
@@ -589,10 +605,12 @@ class Scan(IR):
                 (name, typ, []) for name, typ in schema.items()
             ]
             plc_tbl_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(paths),
-                lines=True,
-                dtypes=json_schema,
-                prune_columns=True,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(paths),
+                    lines=True,
+                    dtypes=json_schema,
+                    prune_columns=True,
+                )
             )
             # TODO: I don't think cudf-polars supports nested types in general right now
             # (but when it does, we should pass child column names from nested columns in)
@@ -609,12 +627,7 @@ class Scan(IR):
             )  # pragma: no cover; post init trips first
         if row_index is not None:
             name, offset = row_index
-            if POLARS_VERSION_GT_112:
-                # If we sliced away some data from the start, that
-                # shifts the row index.
-                # But prior to 1.13, polars had this wrong, so we match behaviour
-                # https://github.com/pola-rs/polars/issues/19607
-                offset += skip_rows
+            offset += skip_rows
             dtype = schema[name]
             step = plc.interop.from_arrow(
                 pa.scalar(1, type=plc.interop.to_arrow(dtype))
@@ -683,27 +696,27 @@ class DataFrameScan(IR):
     This typically arises from ``q.collect().lazy()``
     """
 
-    __slots__ = ("df", "projection", "predicate")
-    _non_child = ("schema", "df", "projection", "predicate")
+    __slots__ = ("config_options", "df", "projection")
+    _non_child = ("schema", "df", "projection", "config_options")
     df: Any
     """Polars LazyFrame object."""
     projection: tuple[str, ...] | None
     """List of columns to project out."""
-    predicate: expr.NamedExpr | None
-    """Mask to apply."""
+    config_options: dict[str, Any]
+    """GPU-specific configuration options"""
 
     def __init__(
         self,
         schema: Schema,
         df: Any,
         projection: Sequence[str] | None,
-        predicate: expr.NamedExpr | None,
+        config_options: dict[str, Any],
     ):
         self.schema = schema
         self.df = df
         self.projection = tuple(projection) if projection is not None else None
-        self.predicate = predicate
-        self._non_child_args = (schema, df, self.projection, predicate)
+        self.config_options = config_options
+        self._non_child_args = (schema, df, self.projection)
         self.children = ()
 
     def get_hashable(self) -> Hashable:

@@ -714,7 +727,13 @@ class DataFrameScan(IR):
         not stable across runs, or repeat instances of the same equal dataframes.
         """
         schema_hash = tuple(self.schema.items())
-        return (type(self), schema_hash, id(self.df), self.projection, self.predicate)
+        return (
+            type(self),
+            schema_hash,
+            id(self.df),
+            self.projection,
+            json.dumps(self.config_options),
+        )
 
     @classmethod
     def do_evaluate(
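
DataFrameScan now carries the GPU config dict instead of a predicate, and `get_hashable` serialises it with `json.dumps` because dicts are unhashable while their JSON form is a stable string stand-in (assuming JSON-serialisable, insertion-order-stable options). A toy illustration of that design choice:

```python
import json

config_options = {"parquet_options": {"chunked": True}}
# hash(config_options) would raise TypeError: unhashable type: 'dict'
key = ("DataFrameScan", json.dumps(config_options))
assert isinstance(hash(key), int)  # the JSON string makes the tuple hashable
```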
@@ -722,7 +741,6 @@ class DataFrameScan(IR):
         schema: Schema,
         df: Any,
         projection: tuple[str, ...] | None,
-        predicate: expr.NamedExpr | None,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
         pdf = pl.DataFrame._from_pydf(df)

@@ -733,11 +751,7 @@ class DataFrameScan(IR):
             c.obj.type() == dtype
             for c, dtype in zip(df.columns, schema.values(), strict=True)
         )
-        if predicate is not None:
-            (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows)
-            return df.filter(mask)
-        else:
-            return df
+        return df
 
 
 class Select(IR):
@@ -814,11 +828,11 @@ class GroupBy(IR):
     """Perform a groupby."""
 
     __slots__ = (
+        "agg_infos",
         "agg_requests",
         "keys",
         "maintain_order",
         "options",
-        "agg_infos",
     )
     _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options")
     keys: tuple[expr.NamedExpr, ...]
@@ -988,10 +1002,30 @@ class GroupBy(IR):
 class ConditionalJoin(IR):
     """A conditional inner join of two dataframes on a predicate."""
 
-    __slots__ = ("predicate", "options", "ast_predicate")
+    __slots__ = ("ast_predicate", "options", "predicate")
     _non_child = ("schema", "predicate", "options")
     predicate: expr.Expr
-    options: tuple
+    """Expression predicate to join on"""
+    options: tuple[
+        tuple[
+            str,
+            pl_expr.Operator | Iterable[pl_expr.Operator],
+        ],
+        bool,
+        tuple[int, int] | None,
+        str,
+        bool,
+        Literal["none", "left", "right", "left_right", "right_left"],
+    ]
+    """
+    tuple of options:
+    - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions
+    - join_nulls: do nulls compare equal?
+    - slice: optional slice to perform after joining.
+    - suffix: string suffix for right columns if names match
+    - coalesce: should key columns be coalesced (only makes sense for outer joins)
+    - maintain_order: which DataFrame row order to preserve, if any
+    """
 
     def __init__(
         self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR

@@ -1001,15 +1035,16 @@ class ConditionalJoin(IR):
         self.options = options
         self.children = (left, right)
         self.ast_predicate = to_ast(predicate)
-        _, join_nulls, zlice, suffix, coalesce = self.options
+        _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options
         # Preconditions from polars
         assert not join_nulls
         assert not coalesce
+        assert maintain_order == "none"
         if self.ast_predicate is None:
             raise NotImplementedError(
                 f"Conditional join with predicate {predicate}"
             )  # pragma: no cover; polars never delivers expressions we can't handle
-        self._non_child_args = (self.ast_predicate, zlice, suffix)
+        self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order)
 
     @classmethod
     def do_evaluate(

@@ -1017,6 +1052,7 @@ class ConditionalJoin(IR):
         predicate: plc.expressions.Expression,
         zlice: tuple[int, int] | None,
         suffix: str,
+        maintain_order: Literal["none", "left", "right", "left_right", "right_left"],
         left: DataFrame,
         right: DataFrame,
     ) -> DataFrame:
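
For context, ConditionalJoin is the IR node that polars' inequality joins lower to. A small sketch of such a query; running it through cudf-polars only requires passing a GPU engine at collect time:

```python
import polars as pl

left = pl.LazyFrame({"a": [1, 2, 3]})
right = pl.LazyFrame({"b": [2, 3, 4]})

# join_where builds an inequality (conditional) join predicate
q = left.join_where(right, pl.col("a") < pl.col("b"))
print(q.collect())  # add engine="gpu" to route through cudf-polars
```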
@@ -1048,18 +1084,19 @@ class ConditionalJoin(IR):
 class Join(IR):
     """A join of two dataframes."""
 
-    __slots__ = ("left_on", "right_on", "options")
+    __slots__ = ("left_on", "options", "right_on")
     _non_child = ("schema", "left_on", "right_on", "options")
     left_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the left frame."""
     right_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
+        Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
         bool,
         tuple[int, int] | None,
         str,
         bool,
+        Literal["none", "left", "right", "left_right", "right_left"],
     ]
     """
     tuple of options:

@@ -1068,6 +1105,7 @@ class Join(IR):
     - slice: optional slice to perform after joining.
     - suffix: string suffix for right columns if names match
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
+    - maintain_order: which DataFrame row order to preserve, if any
     """
 
     def __init__(
@@ -1085,50 +1123,48 @@ class Join(IR):
         self.options = options
         self.children = (left, right)
         self._non_child_args = (self.left_on, self.right_on, self.options)
-        if any(
-            isinstance(e.value, expr.Literal)
-            for e in itertools.chain(self.left_on, self.right_on)
-        ):
-            raise NotImplementedError("Join with literal as join key.")
+        # TODO: Implement maintain_order
+        if options[5] != "none":
+            raise NotImplementedError("maintain_order not implemented yet")
 
     @staticmethod
     @cache
     def _joiners(
-        how: Literal["inner", "left", "right", "full", "semi", "anti"],
+        how: Literal["Inner", "Left", "Right", "Full", "Semi", "Anti"],
     ) -> tuple[
         Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
     ]:
-        if how == "inner":
+        if how == "Inner":
             return (
                 plc.join.inner_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
             )
-        elif how == "left" or how == "right":
+        elif how == "Left" or how == "Right":
             return (
                 plc.join.left_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "full":
+        elif how == "Full":
             return (
                 plc.join.full_join,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "semi":
+        elif how == "Semi":
             return (
                 plc.join.left_semi_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        elif how == "anti":
+        elif how == "Anti":
             return (
                 plc.join.left_anti_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        assert_never(how)
+        assert_never(how)  # pragma: no cover
 
     @staticmethod
     def _reorder_maps(
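
The new sixth options field tracks polars' `maintain_order` join argument (same literal values as in the type above); 25.2.0 still rejects anything but "none" with the `NotImplementedError` in `__init__`, so such queries fall back to CPU. A sketch of the CPU-side behaviour being tracked, assuming a polars version where `join` accepts `maintain_order`:

```python
import polars as pl

left = pl.LazyFrame({"k": [1, 2], "x": ["a", "b"]})
right = pl.LazyFrame({"k": [2, 1], "y": ["c", "d"]})

# Preserve the left frame's row order in the join result.
q = left.join(right, on="k", how="inner", maintain_order="left")
print(q.collect())
```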
@@ -1189,18 +1225,19 @@ class Join(IR):
         left_on_exprs: Sequence[expr.NamedExpr],
         right_on_exprs: Sequence[expr.NamedExpr],
         options: tuple[
-            Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
+            Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
             bool,
             tuple[int, int] | None,
             str,
             bool,
+            Literal["none", "left", "right", "left_right", "right_left"],
         ],
         left: DataFrame,
         right: DataFrame,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        how, join_nulls, zlice, suffix, coalesce = options
-        if how == "cross":
+        how, join_nulls, zlice, suffix, coalesce, _ = options
+        if how == "Cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
             columns = plc.join.cross_join(left.table, right.table).columns()

@@ -1237,25 +1274,32 @@ class Join(IR):
             table = plc.copying.gather(left.table, lg, left_policy)
             result = DataFrame.from_table(table, left.column_names)
         else:
-            if how == "right":
+            if how == "Right":
                 # Right join is a left join with the tables swapped
                 left, right = right, left
                 left_on, right_on = right_on, left_on
             lg, rg = join_fn(left_on.table, right_on.table, null_equality)
-            if how == "left" or how == "right":
+            if how == "Left" or how == "Right":
                 # Order of left table is preserved
                 lg, rg = cls._reorder_maps(
                     left.num_rows, lg, left_policy, right.num_rows, rg, right_policy
                 )
-            if coalesce and how != "full":
-                right = right.discard_columns(right_on.column_names_set)
+            if coalesce:
+                if how == "Full":
+                    # In this case, keys must be column references,
+                    # possibly with dtype casting. We should use them in
+                    # preference to the columns from the original tables.
+                    left = left.with_columns(left_on.columns, replace_only=True)
+                    right = right.with_columns(right_on.columns, replace_only=True)
+                else:
+                    right = right.discard_columns(right_on.column_names_set)
             left = DataFrame.from_table(
                 plc.copying.gather(left.table, lg, left_policy), left.column_names
             )
             right = DataFrame.from_table(
                 plc.copying.gather(right.table, rg, right_policy), right.column_names
             )
-            if coalesce and how == "full":
+            if coalesce and how == "Full":
                 left = left.with_columns(
                     (
                         Column(
@@ -1271,7 +1315,7 @@ class Join(IR):
                     replace_only=True,
                 )
             right = right.discard_columns(right_on.column_names_set)
-        if how == "right":
+        if how == "Right":
             # Undo the swap for right join before gluing together.
             left, right = right, left
         right = right.rename_columns(
@@ -1316,7 +1360,9 @@ class HStack(IR):
         """Evaluate and return a dataframe."""
         columns = [c.evaluate(df) for c in exprs]
         if should_broadcast:
-            columns = broadcast(*columns, target_length=df.num_rows)
+            columns = broadcast(
+                *columns, target_length=df.num_rows if df.num_columns != 0 else None
+            )
         else:
             # Polars ensures this is true, but let's make sure nothing
             # went wrong. In this case, the parent node is a
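
The guarded `target_length` covers `with_columns` on a zero-column frame, where there is no existing row count to broadcast against. A CPU-side query with that shape (my reading of the edge case, not a documented behaviour guarantee):

```python
import polars as pl

# A frame with no columns has no row count; broadcasting a literal into it
# should produce a single row rather than erroring on a length mismatch.
q = pl.LazyFrame({}).with_columns(pl.lit(1).alias("x"))
print(q.collect())  # expected shape: (1, 1)
```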
@@ -1332,7 +1378,7 @@ class HStack(IR):
 class Distinct(IR):
     """Produce a new dataframe with distinct rows."""
 
-    __slots__ = ("keep", "subset", "zlice", "stable")
+    __slots__ = ("keep", "stable", "subset", "zlice")
     _non_child = ("schema", "keep", "subset", "zlice", "stable")
     keep: plc.stream_compaction.DuplicateKeepOption
     """Which distinct value to keep."""

@@ -1419,7 +1465,7 @@ class Distinct(IR):
 class Sort(IR):
     """Sort a dataframe."""
 
-    __slots__ = ("by", "order", "null_order", "stable", "zlice")
+    __slots__ = ("by", "null_order", "order", "stable", "zlice")
     _non_child = ("schema", "by", "order", "null_order", "stable", "zlice")
     by: tuple[expr.NamedExpr, ...]
     """Sort keys."""

@@ -1500,7 +1546,7 @@ class Sort(IR):
 class Slice(IR):
     """Slice a dataframe."""
 
-    __slots__ = ("offset", "length")
+    __slots__ = ("length", "offset")
     _non_child = ("schema", "offset", "length")
     offset: int
     """Start of the slice."""
@@ -1599,13 +1645,15 @@ class MapFunction(IR):
                 # polars requires that all to-explode columns have the
                 # same sub-shapes
                 raise NotImplementedError("Explode with more than one column")
+            self.options = (tuple(to_explode),)
         elif self.name == "rename":
-            old, new, _ = self.options
+            old, new, strict = self.options
             # TODO: perhaps polars should validate renaming in the IR?
             if len(new) != len(set(new)) or (
                 set(new) & (set(df.schema.keys()) - set(old))
             ):
                 raise NotImplementedError("Duplicate new names in rename.")
+            self.options = (tuple(old), tuple(new), strict)
         elif self.name == "unpivot":
             indices, pivotees, variable_name, value_name = self.options
             value_name = "value" if value_name is None else value_name
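
The rename options now retain polars' `strict` flag when the options tuple is rebuilt. On the polars side that flag looks like this (a sketch, assuming a polars version where `rename` accepts `strict`):

```python
import polars as pl

lf = pl.LazyFrame({"a": [1], "b": [2]})
# strict=False ignores mapping keys missing from the schema
print(lf.rename({"a": "x", "missing": "y"}, strict=False).collect_schema())
```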
@@ -1623,13 +1671,15 @@ class MapFunction(IR):
             self.options = (
                 tuple(indices),
                 tuple(pivotees),
-                (variable_name, schema[variable_name]),
-                (value_name, schema[value_name]),
+                variable_name,
+                value_name,
             )
-        self._non_child_args = (name, self.options)
+        self._non_child_args = (schema, name, self.options)
 
     @classmethod
-    def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, schema: Schema, name: str, options: Any, df: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if name == "rechunk":
             # No-op in our data model
@@ -1651,8 +1701,8 @@ class MapFunction(IR):
             (
                 indices,
                 pivotees,
-                (variable_name, variable_dtype),
-                (value_name, value_dtype),
+                variable_name,
+                value_name,
             ) = options
             npiv = len(pivotees)
             index_columns = [
@@ -1669,7 +1719,7 @@ class MapFunction(IR):
                 plc.interop.from_arrow(
                     pa.array(
                         pivotees,
-                        type=plc.interop.to_arrow(variable_dtype),
+                        type=plc.interop.to_arrow(schema[variable_name]),
                     ),
                 )
             ]
@@ -1677,7 +1727,10 @@ class MapFunction(IR):
             df.num_rows,
         ).columns()
         value_column = plc.concatenate.concatenate(
-            [df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees]
+            [
+                df.column_map[pivotee].astype(schema[value_name]).obj
+                for pivotee in pivotees
+            ]
         )
         return DataFrame(
             [
@@ -1704,8 +1757,6 @@ class Union(IR):
         self._non_child_args = (zlice,)
         self.children = children
         schema = self.children[0].schema
-        if not all(s.schema == schema for s in self.children[1:]):
-            raise NotImplementedError("Schema mismatch")
 
     @classmethod
     def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame:
cudf_polars/dsl/to_ast.py
CHANGED
@@ -8,8 +8,6 @@ from __future__ import annotations
 from functools import partial, reduce, singledispatch
 from typing import TYPE_CHECKING, TypeAlias
 
-from polars.polars import _expr_nodes as pl_expr
-
 import pylibcudf as plc
 from pylibcudf import expressions as plc_expr
 
@@ -185,7 +183,7 @@ def _(node: expr.BinOp, self: Transformer) -> plc_expr.Expression:
 
 @_to_ast.register
 def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
-    if node.name == pl_expr.BooleanFunction.IsIn:
+    if node.name is expr.BooleanFunction.Name.IsIn:
         needles, haystack = node.children
         if isinstance(haystack, expr.LiteralColumn) and len(haystack.value) < 16:
             # 16 is an arbitrary limit
@@ -204,14 +202,14 @@ def _(node: expr.BooleanFunction, self: Transformer) -> plc_expr.Expression:
         raise NotImplementedError(
             f"Parquet filters don't support {node.name} on columns"
         )
-    if node.name == pl_expr.BooleanFunction.IsNull:
+    if node.name is expr.BooleanFunction.Name.IsNull:
         return plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0]))
-    elif node.name == pl_expr.BooleanFunction.IsNotNull:
+    elif node.name is expr.BooleanFunction.Name.IsNotNull:
         return plc_expr.Operation(
             plc_expr.ASTOperator.NOT,
             plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, self(node.children[0])),
         )
-    elif node.name == pl_expr.BooleanFunction.Not:
+    elif node.name is expr.BooleanFunction.Name.Not:
         return plc_expr.Operation(plc_expr.ASTOperator.NOT, self(node.children[0]))
     raise NotImplementedError(f"AST conversion does not support {node.name}")
 
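
The pattern behind these to_ast.py changes: compare against an enum owned by cudf_polars (`expr.BooleanFunction.Name`) using `is`, instead of reaching into polars' private `_expr_nodes` module at import time. A generic sketch of the idiom:

```python
from enum import Enum, auto

class Name(Enum):
    IsIn = auto()
    IsNull = auto()
    Not = auto()

def to_ast_name(name: Name) -> str:
    # Identity comparison is the idiomatic membership check for enums.
    if name is Name.IsNull:
        return "IS_NULL"
    raise NotImplementedError(f"AST conversion does not support {name}")

print(to_ast_name(Name.IsNull))
```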