PyPI - cudf-polars-cu12 - Versions diffs - 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl - Mend

cudf-polars-cu12 25.2.2 py3-none-any.whl → 25.6.0 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

cudf_polars/VERSION +1 -1
cudf_polars/callback.py +82 -65
cudf_polars/containers/column.py +138 -7
cudf_polars/containers/dataframe.py +26 -39
cudf_polars/dsl/expr.py +3 -1
cudf_polars/dsl/expressions/aggregation.py +27 -63
cudf_polars/dsl/expressions/base.py +40 -72
cudf_polars/dsl/expressions/binaryop.py +5 -41
cudf_polars/dsl/expressions/boolean.py +25 -53
cudf_polars/dsl/expressions/datetime.py +97 -17
cudf_polars/dsl/expressions/literal.py +27 -33
cudf_polars/dsl/expressions/rolling.py +110 -9
cudf_polars/dsl/expressions/selection.py +8 -26
cudf_polars/dsl/expressions/slicing.py +47 -0
cudf_polars/dsl/expressions/sorting.py +5 -18
cudf_polars/dsl/expressions/string.py +33 -36
cudf_polars/dsl/expressions/ternary.py +3 -10
cudf_polars/dsl/expressions/unary.py +35 -75
cudf_polars/dsl/ir.py +749 -212
cudf_polars/dsl/nodebase.py +8 -1
cudf_polars/dsl/to_ast.py +5 -3
cudf_polars/dsl/translate.py +319 -171
cudf_polars/dsl/utils/__init__.py +8 -0
cudf_polars/dsl/utils/aggregations.py +292 -0
cudf_polars/dsl/utils/groupby.py +97 -0
cudf_polars/dsl/utils/naming.py +34 -0
cudf_polars/dsl/utils/replace.py +46 -0
cudf_polars/dsl/utils/rolling.py +113 -0
cudf_polars/dsl/utils/windows.py +186 -0
cudf_polars/experimental/base.py +17 -19
cudf_polars/experimental/benchmarks/__init__.py +4 -0
cudf_polars/experimental/benchmarks/pdsh.py +1279 -0
cudf_polars/experimental/dask_registers.py +196 -0
cudf_polars/experimental/distinct.py +174 -0
cudf_polars/experimental/explain.py +127 -0
cudf_polars/experimental/expressions.py +521 -0
cudf_polars/experimental/groupby.py +288 -0
cudf_polars/experimental/io.py +58 -29
cudf_polars/experimental/join.py +353 -0
cudf_polars/experimental/parallel.py +166 -93
cudf_polars/experimental/repartition.py +69 -0
cudf_polars/experimental/scheduler.py +155 -0
cudf_polars/experimental/select.py +92 -7
cudf_polars/experimental/shuffle.py +294 -0
cudf_polars/experimental/sort.py +45 -0
cudf_polars/experimental/spilling.py +151 -0
cudf_polars/experimental/utils.py +100 -0
cudf_polars/testing/asserts.py +146 -6
cudf_polars/testing/io.py +72 -0
cudf_polars/testing/plugin.py +78 -76
cudf_polars/typing/__init__.py +59 -6
cudf_polars/utils/config.py +353 -0
cudf_polars/utils/conversion.py +40 -0
cudf_polars/utils/dtypes.py +22 -5
cudf_polars/utils/timer.py +39 -0
cudf_polars/utils/versions.py +5 -4
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/METADATA +10 -7
cudf_polars_cu12-25.6.0.dist-info/RECORD +73 -0
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/WHEEL +1 -1
cudf_polars/experimental/dask_serialize.py +0 -59
cudf_polars_cu12-25.2.2.dist-info/RECORD +0 -48
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info/licenses}/LICENSE +0 -0
{cudf_polars_cu12-25.2.2.dist-info → cudf_polars_cu12-25.6.0.dist-info}/top_level.txt +0 -0

cudf_polars/dsl/ir.py CHANGED Viewed

@@ -15,6 +15,8 @@ from __future__ import annotations
 import itertools
 import json
+import random
+import time
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -28,17 +30,25 @@ import pylibcudf as plc
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import Column, DataFrame
+from cudf_polars.dsl.expressions import rolling
+from cudf_polars.dsl.expressions.base import ExecutionContext
 from cudf_polars.dsl.nodebase import Node
 from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
+from cudf_polars.dsl.utils.windows import range_window_bounds
 from cudf_polars.utils import dtypes
+from cudf_polars.utils.versions import POLARS_VERSION_LT_128
 if TYPE_CHECKING:
-    from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence
+    from collections.abc import Callable, Hashable, Iterable, Sequence
     from typing import Literal
+    from typing_extensions import Self
     from polars.polars import _expr_nodes as pl_expr
-    from cudf_polars.typing import Schema
+    from cudf_polars.typing import CSECache, ClosedInterval, Schema, Slice as Zlice
+    from cudf_polars.utils.config import ConfigOptions
+    from cudf_polars.utils.timer import Timer
 __all__ = [
@@ -47,6 +57,7 @@ __all__ = [
     "ConditionalJoin",
     "DataFrameScan",
     "Distinct",
+    "Empty",
     "ErrorNode",
     "Filter",
     "GroupBy",
@@ -54,10 +65,14 @@ __all__ = [
     "HStack",
     "Join",
     "MapFunction",
+    "MergeSorted",
     "Projection",
     "PythonScan",
+    "Reduce",
+    "Rolling",
     "Scan",
     "Select",
+    "Sink",
     "Slice",
     "Sort",
     "Union",
@@ -100,7 +115,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column
     """
     if len(columns) == 0:
         return []
-    lengths: set[int] = {column.obj.size() for column in columns}
+    lengths: set[int] = {column.size for column in columns}
     if lengths == {1}:
         if target_length is None:
             return list(columns)
@@ -116,7 +131,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column
             )
     return [
         column
-        if column.obj.size() != 1
+        if column.size != 1
         else Column(
             plc.Column.from_scalar(column.obj_scalar, nrows),
             is_sorted=plc.types.Sorted.YES,
@@ -181,7 +196,7 @@ class IR(Node["IR"]):
         translation phase should fail earlier.
     """
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: CSECache, timer: Timer | None) -> DataFrame:
         """
         Evaluate the node (recursively) and return a dataframe.
@@ -190,6 +205,9 @@ class IR(Node["IR"]):
         cache
             Mapping from cached node ids to constructed DataFrames.
             Used to implement evaluation of the `Cache` node.
+        timer
+            If not None, a Timer object to record timings for the
+            evaluation of the node.
         Notes
         -----
@@ -208,10 +226,16 @@ class IR(Node["IR"]):
             If evaluation fails. Ideally this should not occur, since the
             translation phase should fail earlier.
         """
-        return self.do_evaluate(
-            *self._non_child_args,
-            *(child.evaluate(cache=cache) for child in self.children),
-        )
+        children = [child.evaluate(cache=cache, timer=timer) for child in self.children]
+        if timer is not None:
+            start = time.monotonic_ns()
+            result = self.do_evaluate(*self._non_child_args, *children)
+            end = time.monotonic_ns()
+            # TODO: Set better names on each class object.
+            timer.store(start, end, type(self).__name__)
+            return result
+        else:
+            return self.do_evaluate(*self._non_child_args, *children)
 class ErrorNode(IR):
@@ -256,6 +280,7 @@ class Scan(IR):
     __slots__ = (
         "cloud_options",
         "config_options",
+        "include_file_paths",
         "n_rows",
         "paths",
         "predicate",
@@ -276,6 +301,7 @@ class Scan(IR):
         "skip_rows",
         "n_rows",
         "row_index",
+        "include_file_paths",
         "predicate",
     )
     typ: str
@@ -284,7 +310,7 @@ class Scan(IR):
     """Reader-specific options, as dictionary."""
     cloud_options: dict[str, Any] | None
     """Cloud-related authentication options, currently ignored."""
-    config_options: dict[str, Any]
+    config_options: ConfigOptions
     """GPU-specific configuration options"""
     paths: list[str]
     """List of paths to read from."""
@@ -296,6 +322,8 @@ class Scan(IR):
     """Number of rows to read after skipping."""
     row_index: tuple[str, int] | None
     """If not None add an integer index column of the given name."""
+    include_file_paths: str | None
+    """Include the path of the source file(s) as a column with this name."""
     predicate: expr.NamedExpr | None
     """Mask to apply to the read dataframe."""
@@ -308,12 +336,13 @@ class Scan(IR):
         typ: str,
         reader_options: dict[str, Any],
         cloud_options: dict[str, Any] | None,
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
         n_rows: int,
         row_index: tuple[str, int] | None,
+        include_file_paths: str | None,
         predicate: expr.NamedExpr | None,
     ):
         self.schema = schema
@@ -326,6 +355,7 @@ class Scan(IR):
         self.skip_rows = skip_rows
         self.n_rows = n_rows
         self.row_index = row_index
+        self.include_file_paths = include_file_paths
         self.predicate = predicate
         self._non_child_args = (
             schema,
@@ -337,6 +367,7 @@ class Scan(IR):
             skip_rows,
             n_rows,
             row_index,
+            include_file_paths,
             predicate,
         )
         self.children = ()
@@ -350,7 +381,9 @@ class Scan(IR):
             # TODO: polars has this implemented for parquet,
             # maybe we can do this too?
             raise NotImplementedError("slice pushdown for negative slices")
-        if self.typ in {"csv"} and self.skip_rows != 0:  # pragma: no cover
+        if (
+            POLARS_VERSION_LT_128 and self.typ in {"csv"} and self.skip_rows != 0
+        ):  # pragma: no cover
             # This comes from slice pushdown, but that
             # optimization doesn't happen right now
             raise NotImplementedError("skipping rows in CSV reader")
@@ -360,7 +393,7 @@ class Scan(IR):
             raise NotImplementedError(
                 "Read from cloud storage"
             )  # pragma: no cover; no test yet
-        if any(p.startswith("https://") for p in self.paths):
+        if any(str(p).startswith("https:/") for p in self.paths):
             raise NotImplementedError("Read from https")
         if self.typ == "csv":
             if self.reader_options["skip_rows_after_header"] != 0:
@@ -379,9 +412,18 @@ class Scan(IR):
                     "Multi-character comment prefix not supported for CSV reader"
                 )
             if not self.reader_options["has_header"]:
-                # Need to do some file introspection to get the number
-                # of columns so that column projection works right.
-                raise NotImplementedError("Reading CSV without header")
+                # TODO: To support reading headerless CSV files without requiring new
+                # column names, we would need to do file introspection to infer the number
+                # of columns so column projection works right.
+                reader_schema = self.reader_options.get("schema")
+                if not (
+                    reader_schema
+                    and isinstance(schema, dict)
+                    and "fields" in reader_schema
+                ):
+                    raise NotImplementedError(
+                        "Reading CSV without header requires user-provided column names via new_columns"
+                    )
         elif self.typ == "ndjson":
             # TODO: consider handling the low memory option here
             # (maybe use chunked JSON reader)
@@ -389,6 +431,9 @@ class Scan(IR):
                 raise NotImplementedError(
                     "ignore_errors is not supported in the JSON reader"
                 )
+            if include_file_paths is not None:
+                # TODO: Need to populate num_rows_per_source in read_json in libcudf
+                raise NotImplementedError("Including file paths in a json scan.")
         elif (
             self.typ == "parquet"
             and self.row_index is not None
@@ -413,31 +458,60 @@ class Scan(IR):
             self.typ,
             json.dumps(self.reader_options),
             json.dumps(self.cloud_options),
-            json.dumps(self.config_options),
+            self.config_options,
             tuple(self.paths),
             tuple(self.with_columns) if self.with_columns is not None else None,
             self.skip_rows,
             self.n_rows,
             self.row_index,
+            self.include_file_paths,
             self.predicate,
         )
+    @staticmethod
+    def add_file_paths(
+        name: str, paths: list[str], rows_per_path: list[int], df: DataFrame
+    ) -> DataFrame:
+        """
+        Add a Column of file paths to the DataFrame.
+        Each path is repeated according to the number of rows read from it.
+        """
+        (filepaths,) = plc.filling.repeat(
+            # TODO: Remove call from_arrow when we support python list to Column
+            plc.Table([plc.interop.from_arrow(pa.array(map(str, paths)))]),
+            plc.interop.from_arrow(pa.array(rows_per_path, type=pa.int32())),
+        ).columns()
+        return df.with_columns([Column(filepaths, name=name)])
     @classmethod
     def do_evaluate(
         cls,
         schema: Schema,
         typ: str,
         reader_options: dict[str, Any],
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
         paths: list[str],
         with_columns: list[str] | None,
         skip_rows: int,
         n_rows: int,
         row_index: tuple[str, int] | None,
+        include_file_paths: str | None,
         predicate: expr.NamedExpr | None,
-    ):
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if typ == "csv":
+            def read_csv_header(
+                path: Path | str, sep: str
+            ) -> list[str]:  # pragma: no cover
+                with Path(path).open() as f:
+                    for line in f:
+                        stripped = line.strip()
+                        if stripped:
+                            return stripped.split(sep)
+                return []
             parse_options = reader_options["parse_options"]
             sep = chr(parse_options["separator"])
             quote = chr(parse_options["quote_char"])
@@ -449,8 +523,8 @@ class Scan(IR):
                 # file provides column names
                 column_names = None
             usecols = with_columns
-            # TODO: support has_header=False
-            header = 0
+            has_header = reader_options["has_header"]
+            header = 0 if has_header else -1
             # polars defaults to no null recognition
             null_values = [""]
@@ -470,6 +544,7 @@ class Scan(IR):
             # polars skips blank lines at the beginning of the file
             pieces = []
+            seen_paths = []
             read_partial = n_rows != -1
             for p in paths:
                 skiprows = reader_options["skip_rows"]
@@ -480,7 +555,9 @@ class Scan(IR):
                 options = (
                     plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
                     .nrows(n_rows)
-                    .skiprows(skiprows)
+                    .skiprows(
+                        skiprows if POLARS_VERSION_LT_128 else skiprows + skip_rows
+                    )  # pragma: no cover
                     .lineterminator(str(eol))
                     .quotechar(str(quote))
                     .decimal(decimal)
@@ -491,6 +568,13 @@ class Scan(IR):
                 options.set_delimiter(str(sep))
                 if column_names is not None:
                     options.set_names([str(name) for name in column_names])
+                else:
+                    if (
+                        not POLARS_VERSION_LT_128 and header > -1 and skip_rows > header
+                    ):  # pragma: no cover
+                        # We need to read the header otherwise we would skip it
+                        column_names = read_csv_header(path, str(sep))
+                        options.set_names(column_names)
                 options.set_header(header)
                 options.set_dtypes(schema)
                 if usecols is not None:
@@ -500,6 +584,8 @@ class Scan(IR):
                     options.set_comment(comment)
                 tbl_w_meta = plc.io.csv.read_csv(options)
                 pieces.append(tbl_w_meta)
+                if include_file_paths is not None:
+                    seen_paths.append(p)
                 if read_partial:
                     n_rows -= tbl_w_meta.tbl.num_rows()
                     if n_rows <= 0:
@@ -515,12 +601,26 @@ class Scan(IR):
                 plc.concatenate.concatenate(list(tables)),
                 colnames[0],
             )
+            if include_file_paths is not None:
+                df = Scan.add_file_paths(
+                    include_file_paths,
+                    seen_paths,
+                    [t.num_rows() for t in tables],
+                    df,
+                )
         elif typ == "parquet":
-            parquet_options = config_options.get("parquet_options", {})
-            if parquet_options.get("chunked", True):
-                options = plc.io.parquet.ParquetReaderOptions.builder(
-                    plc.io.SourceInfo(paths)
-                ).build()
+            filters = None
+            if predicate is not None and row_index is None:
+                # Can't apply filters during read if we have a row index.
+                filters = to_parquet_filter(predicate.value)
+            options = plc.io.parquet.ParquetReaderOptions.builder(
+                plc.io.SourceInfo(paths)
+            ).build()
+            if with_columns is not None:
+                options.set_columns(with_columns)
+            if filters is not None:
+                options.set_filter(filters)
+            if config_options.parquet_options.chunked:
                 # We handle skip_rows != 0 by reading from the
                 # up to n_rows + skip_rows and slicing off the
                 # first skip_rows entries.
@@ -530,21 +630,15 @@ class Scan(IR):
                 nrows = n_rows + skip_rows
                 if nrows > -1:
                     options.set_num_rows(nrows)
-                if with_columns is not None:
-                    options.set_columns(with_columns)
                 reader = plc.io.parquet.ChunkedParquetReader(
                     options,
-                    chunk_read_limit=parquet_options.get(
-                        "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE
-                    ),
-                    pass_read_limit=parquet_options.get(
-                        "pass_read_limit", cls.PARQUET_DEFAULT_PASS_LIMIT
-                    ),
+                    chunk_read_limit=config_options.parquet_options.chunk_read_limit,
+                    pass_read_limit=config_options.parquet_options.pass_read_limit,
                 )
-                chk = reader.read_chunk()
+                chunk = reader.read_chunk()
                 rows_left_to_skip = skip_rows
-                def slice_skip(tbl: plc.Table):
+                def slice_skip(tbl: plc.Table) -> plc.Table:
                     nonlocal rows_left_to_skip
                     if rows_left_to_skip > 0:
                         table_rows = tbl.num_rows()
@@ -556,12 +650,13 @@ class Scan(IR):
                         rows_left_to_skip -= chunk_skip
                     return tbl
-                tbl = slice_skip(chk.tbl)
+                tbl = slice_skip(chunk.tbl)
                 # TODO: Nested column names
-                names = chk.column_names(include_children=False)
+                names = chunk.column_names(include_children=False)
                 concatenated_columns = tbl.columns()
                 while reader.has_next():
-                    tbl = slice_skip(reader.read_chunk().tbl)
+                    chunk = reader.read_chunk()
+                    tbl = slice_skip(chunk.tbl)
                     for i in range(tbl.num_columns()):
                         concatenated_columns[i] = plc.concatenate.concatenate(
@@ -574,31 +669,28 @@ class Scan(IR):
                     plc.Table(concatenated_columns),
                     names=names,
                 )
+                if include_file_paths is not None:
+                    df = Scan.add_file_paths(
+                        include_file_paths, paths, chunk.num_rows_per_source, df
+                    )
             else:
-                filters = None
-                if predicate is not None and row_index is None:
-                    # Can't apply filters during read if we have a row index.
-                    filters = to_parquet_filter(predicate.value)
-                options = plc.io.parquet.ParquetReaderOptions.builder(
-                    plc.io.SourceInfo(paths)
-                ).build()
                 if n_rows != -1:
                     options.set_num_rows(n_rows)
                 if skip_rows != 0:
                     options.set_skip_rows(skip_rows)
-                if with_columns is not None:
-                    options.set_columns(with_columns)
-                if filters is not None:
-                    options.set_filter(filters)
                 tbl_w_meta = plc.io.parquet.read_parquet(options)
                 df = DataFrame.from_table(
                     tbl_w_meta.tbl,
                     # TODO: consider nested column names?
                     tbl_w_meta.column_names(include_children=False),
                 )
-                if filters is not None:
-                    # Mask must have been applied.
-                    return df
+                if include_file_paths is not None:
+                    df = Scan.add_file_paths(
+                        include_file_paths, paths, tbl_w_meta.num_rows_per_source, df
+                    )
+            if filters is not None:
+                # Mask must have been applied.
+                return df
         elif typ == "ndjson":
             json_schema: list[plc.io.json.NameAndType] = [
@@ -629,20 +721,18 @@ class Scan(IR):
             name, offset = row_index
             offset += skip_rows
             dtype = schema[name]
-            step = plc.interop.from_arrow(
-                pa.scalar(1, type=plc.interop.to_arrow(dtype))
-            )
-            init = plc.interop.from_arrow(
-                pa.scalar(offset, type=plc.interop.to_arrow(dtype))
-            )
-            index = Column(
+            step = plc.Scalar.from_py(1, dtype)
+            init = plc.Scalar.from_py(offset, dtype)
+            index_col = Column(
                 plc.filling.sequence(df.num_rows, init, step),
                 is_sorted=plc.types.Sorted.YES,
                 order=plc.types.Order.ASCENDING,
                 null_order=plc.types.NullOrder.AFTER,
                 name=name,
             )
-            df = DataFrame([index, *df.columns])
+            df = DataFrame([index_col, *df.columns])
+            if next(iter(schema)) != name:
+                df = df.select(schema)
         assert all(c.obj.type() == schema[name] for name, c in df.column_map.items())
         if predicate is None:
             return df
@@ -651,6 +741,193 @@ class Scan(IR):
             return df.filter(mask)
+class Sink(IR):
+    """Sink a dataframe to a file."""
+    __slots__ = ("cloud_options", "kind", "options", "path")
+    _non_child = ("schema", "kind", "path", "options", "cloud_options")
+    kind: str
+    path: str
+    options: dict[str, Any]
+    def __init__(
+        self,
+        schema: Schema,
+        kind: str,
+        path: str,
+        options: dict[str, Any],
+        cloud_options: dict[str, Any],
+        df: IR,
+    ):
+        self.schema = schema
+        self.kind = kind
+        self.path = path
+        self.options = options
+        self.cloud_options = cloud_options
+        self.children = (df,)
+        self._non_child_args = (schema, kind, path, options)
+        if self.cloud_options is not None and any(
+            self.cloud_options.get(k) is not None
+            for k in ("config", "credential_provider")
+        ):
+            raise NotImplementedError(
+                "Write to cloud storage"
+            )  # pragma: no cover; no test yet
+        sync_on_close = options.get("sync_on_close")
+        if sync_on_close not in {"None", None}:
+            raise NotImplementedError(
+                f"sync_on_close='{sync_on_close}' is not supported."
+            )  # pragma: no cover; no test yet
+        child_schema = df.schema.values()
+        if kind == "Csv":
+            if not all(
+                plc.io.csv.is_supported_write_csv(dtype) for dtype in child_schema
+            ):
+                # Nested types are unsupported in polars and libcudf
+                raise NotImplementedError(
+                    "Contains unsupported types for CSV writing"
+                )  # pragma: no cover
+            serialize = options["serialize_options"]
+            if options["include_bom"]:
+                raise NotImplementedError("include_bom is not supported.")
+            for key in (
+                "date_format",
+                "time_format",
+                "datetime_format",
+                "float_scientific",
+                "float_precision",
+            ):
+                if serialize[key] is not None:
+                    raise NotImplementedError(f"{key} is not supported.")
+            if serialize["quote_style"] != "Necessary":
+                raise NotImplementedError("Only quote_style='Necessary' is supported.")
+            if chr(serialize["quote_char"]) != '"':
+                raise NotImplementedError("Only quote_char='\"' is supported.")
+        elif kind == "Parquet":
+            compression = options["compression"]
+            if isinstance(compression, dict):
+                if len(compression) != 1:
+                    raise NotImplementedError(
+                        "Compression dict with more than one entry."
+                    )  # pragma: no cover
+                compression, compression_level = next(iter(compression.items()))
+                options["compression"] = compression
+                if compression_level is not None:
+                    raise NotImplementedError(
+                        "Setting compression_level is not supported."
+                    )
+            if compression == "Lz4Raw":
+                compression = "Lz4"
+                options["compression"] = compression
+            if (
+                compression != "Uncompressed"
+                and not plc.io.parquet.is_supported_write_parquet(
+                    getattr(plc.io.types.CompressionType, compression.upper())
+                )
+            ):
+                raise NotImplementedError(
+                    f"Compression type '{compression}' is not supported."
+                )
+        elif (
+            kind == "Json"
+        ):  # pragma: no cover; options are validated on the polars side
+            if not all(
+                plc.io.json.is_supported_write_json(dtype) for dtype in child_schema
+            ):
+                # Nested types are unsupported in polars and libcudf
+                raise NotImplementedError(
+                    "Contains unsupported types for JSON writing"
+                )  # pragma: no cover
+            shared_writer_options = {"sync_on_close", "maintain_order", "mkdir"}
+            if set(options) - shared_writer_options:
+                raise NotImplementedError("Unsupported options passed JSON writer.")
+        else:
+            raise NotImplementedError(
+                f"Unhandled sink kind: {kind}"
+            )  # pragma: no cover
+    def get_hashable(self) -> Hashable:
+        """
+        Hashable representation of the node.
+        The option dictionary is serialised for hashing purposes.
+        """
+        schema_hash = tuple(self.schema.items())  # pragma: no cover
+        return (
+            type(self),
+            schema_hash,
+            self.kind,
+            self.path,
+            json.dumps(self.options),
+            json.dumps(self.cloud_options),
+        )  # pragma: no cover
+    @classmethod
+    def do_evaluate(
+        cls,
+        schema: Schema,
+        kind: str,
+        path: str,
+        options: dict[str, Any],
+        df: DataFrame,
+    ) -> DataFrame:
+        """Write the dataframe to a file."""
+        target = plc.io.SinkInfo([path])
+        if options.get("mkdir", False):
+            Path(path).parent.mkdir(parents=True, exist_ok=True)
+        if kind == "Csv":
+            serialize = options["serialize_options"]
+            options = (
+                plc.io.csv.CsvWriterOptions.builder(target, df.table)
+                .include_header(options["include_header"])
+                .names(df.column_names if options["include_header"] else [])
+                .na_rep(serialize["null"])
+                .line_terminator(serialize["line_terminator"])
+                .inter_column_delimiter(chr(serialize["separator"]))
+                .build()
+            )
+            plc.io.csv.write_csv(options)
+        elif kind == "Parquet":
+            metadata = plc.io.types.TableInputMetadata(df.table)
+            for i, name in enumerate(df.column_names):
+                metadata.column_metadata[i].set_name(name)
+            builder = plc.io.parquet.ParquetWriterOptions.builder(target, df.table)
+            compression = options["compression"]
+            if compression != "Uncompressed":
+                builder.compression(
+                    getattr(plc.io.types.CompressionType, compression.upper())
+                )
+            writer_options = builder.metadata(metadata).build()
+            if options["data_page_size"] is not None:
+                writer_options.set_max_page_size_bytes(options["data_page_size"])
+            if options["row_group_size"] is not None:
+                writer_options.set_row_group_size_rows(options["row_group_size"])
+            plc.io.parquet.write_parquet(writer_options)
+        elif kind == "Json":
+            metadata = plc.io.TableWithMetadata(
+                df.table, [(col, []) for col in df.column_names]
+            )
+            options = (
+                plc.io.json.JsonWriterOptions.builder(target, df.table)
+                .lines(val=True)
+                .na_rep("null")
+                .include_nulls(val=True)
+                .metadata(metadata)
+                .utf8_escaped(val=False)
+                .build()
+            )
+            plc.io.json.write_json(options)
+        return DataFrame([])
 class Cache(IR):
     """
     Return a cached plan node.
@@ -658,35 +935,59 @@ class Cache(IR):
     Used for CSE at the plan level.
     """
-    __slots__ = ("key",)
-    _non_child = ("schema", "key")
+    __slots__ = ("key", "refcount")
+    _non_child = ("schema", "key", "refcount")
     key: int
     """The cache key."""
+    refcount: int
+    """The number of cache hits."""
-    def __init__(self, schema: Schema, key: int, value: IR):
+    def __init__(self, schema: Schema, key: int, refcount: int, value: IR):
         self.schema = schema
         self.key = key
+        self.refcount = refcount
         self.children = (value,)
-        self._non_child_args = (key,)
+        self._non_child_args = (key, refcount)
+    def get_hashable(self) -> Hashable:  # noqa: D102
+        # Polars arranges that the keys are unique across all cache
+        # nodes that reference the same child, so we don't need to
+        # hash the child.
+        return (type(self), self.key, self.refcount)
+    def is_equal(self, other: Self) -> bool:  # noqa: D102
+        if self.key == other.key and self.refcount == other.refcount:
+            self.children = other.children
+            return True
+        return False
     @classmethod
     def do_evaluate(
-        cls, key: int, df: DataFrame
+        cls, key: int, refcount: int, df: DataFrame
     ) -> DataFrame:  # pragma: no cover; basic evaluation never calls this
         """Evaluate and return a dataframe."""
         # Our value has already been computed for us, so let's just
         # return it.
         return df
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: CSECache, timer: Timer | None) -> DataFrame:
         """Evaluate and return a dataframe."""
         # We must override the recursion scheme because we don't want
         # to recurse if we're in the cache.
         try:
-            return cache[self.key]
+            (result, hits) = cache[self.key]
         except KeyError:
             (value,) = self.children
-            return cache.setdefault(self.key, value.evaluate(cache=cache))
+            result = value.evaluate(cache=cache, timer=timer)
+            cache[self.key] = (result, 0)
+            return result
+        else:
+            hits += 1
+            if hits == self.refcount:
+                del cache[self.key]
+            else:
+                cache[self.key] = (result, hits)
+            return result
 class DataFrameScan(IR):
@@ -696,13 +997,13 @@ class DataFrameScan(IR):
     This typically arises from ``q.collect().lazy()``
     """
-    __slots__ = ("config_options", "df", "projection")
+    __slots__ = ("_id_for_hash", "config_options", "df", "projection")
     _non_child = ("schema", "df", "projection", "config_options")
     df: Any
-    """Polars LazyFrame object."""
+    """Polars internal PyDataFrame object."""
     projection: tuple[str, ...] | None
     """List of columns to project out."""
-    config_options: dict[str, Any]
+    config_options: ConfigOptions
     """GPU-specific configuration options"""
     def __init__(
@@ -710,29 +1011,35 @@ class DataFrameScan(IR):
         schema: Schema,
         df: Any,
         projection: Sequence[str] | None,
-        config_options: dict[str, Any],
+        config_options: ConfigOptions,
     ):
         self.schema = schema
         self.df = df
         self.projection = tuple(projection) if projection is not None else None
         self.config_options = config_options
-        self._non_child_args = (schema, df, self.projection)
+        self._non_child_args = (
+            schema,
+            pl.DataFrame._from_pydf(df),
+            self.projection,
+        )
         self.children = ()
+        self._id_for_hash = random.randint(0, 2**64 - 1)
     def get_hashable(self) -> Hashable:
         """
         Hashable representation of the node.
-        The (heavy) dataframe object is hashed as its id, so this is
-        not stable across runs, or repeat instances of the same equal dataframes.
+        The (heavy) dataframe object is not hashed. No two instances of
+        ``DataFrameScan`` will have the same hash, even if they have the
+        same schema, projection, and config options, and data.
         """
         schema_hash = tuple(self.schema.items())
         return (
             type(self),
             schema_hash,
-            id(self.df),
+            self._id_for_hash,
             self.projection,
-            json.dumps(self.config_options),
+            self.config_options,
         )
     @classmethod
@@ -743,10 +1050,9 @@ class DataFrameScan(IR):
         projection: tuple[str, ...] | None,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        pdf = pl.DataFrame._from_pydf(df)
         if projection is not None:
-            pdf = pdf.select(projection)
-        df = DataFrame.from_polars(pdf)
+            df = df.select(projection)
+        df = DataFrame.from_polars(df)
         assert all(
             c.obj.type() == dtype
             for c, dtype in zip(df.columns, schema.values(), strict=True)
@@ -820,29 +1126,191 @@ class Reduce(IR):
     ) -> DataFrame:  # pragma: no cover; not exposed by polars yet
         """Evaluate and return a dataframe."""
         columns = broadcast(*(e.evaluate(df) for e in exprs))
-        assert all(column.obj.size() == 1 for column in columns)
+        assert all(column.size == 1 for column in columns)
         return DataFrame(columns)
+class Rolling(IR):
+    """Perform a (possibly grouped) rolling aggregation."""
+    __slots__ = (
+        "agg_requests",
+        "closed_window",
+        "following",
+        "index",
+        "keys",
+        "preceding",
+        "zlice",
+    )
+    _non_child = (
+        "schema",
+        "index",
+        "preceding",
+        "following",
+        "closed_window",
+        "keys",
+        "agg_requests",
+        "zlice",
+    )
+    index: expr.NamedExpr
+    """Column being rolled over."""
+    preceding: plc.Scalar
+    """Preceding window extent defining start of window."""
+    following: plc.Scalar
+    """Following window extent defining end of window."""
+    closed_window: ClosedInterval
+    """Treatment of window endpoints."""
+    keys: tuple[expr.NamedExpr, ...]
+    """Grouping keys."""
+    agg_requests: tuple[expr.NamedExpr, ...]
+    """Aggregation expressions."""
+    zlice: Zlice | None
+    """Optional slice"""
+    def __init__(
+        self,
+        schema: Schema,
+        index: expr.NamedExpr,
+        preceding: plc.Scalar,
+        following: plc.Scalar,
+        closed_window: ClosedInterval,
+        keys: Sequence[expr.NamedExpr],
+        agg_requests: Sequence[expr.NamedExpr],
+        zlice: Zlice | None,
+        df: IR,
+    ):
+        self.schema = schema
+        self.index = index
+        self.preceding = preceding
+        self.following = following
+        self.closed_window = closed_window
+        self.keys = tuple(keys)
+        self.agg_requests = tuple(agg_requests)
+        if not all(
+            plc.rolling.is_valid_rolling_aggregation(
+                agg.value.dtype, agg.value.agg_request
+            )
+            for agg in self.agg_requests
+        ):
+            raise NotImplementedError("Unsupported rolling aggregation")
+        if any(
+            agg.value.agg_request.kind() == plc.aggregation.Kind.COLLECT_LIST
+            for agg in self.agg_requests
+        ):
+            raise NotImplementedError(
+                "Incorrect handling of empty groups for list collection"
+            )
+        self.zlice = zlice
+        self.children = (df,)
+        self._non_child_args = (
+            index,
+            preceding,
+            following,
+            closed_window,
+            keys,
+            agg_requests,
+            zlice,
+        )
+    @classmethod
+    def do_evaluate(
+        cls,
+        index: expr.NamedExpr,
+        preceding: plc.Scalar,
+        following: plc.Scalar,
+        closed_window: ClosedInterval,
+        keys_in: Sequence[expr.NamedExpr],
+        aggs: Sequence[expr.NamedExpr],
+        zlice: Zlice | None,
+        df: DataFrame,
+    ) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        keys = broadcast(*(k.evaluate(df) for k in keys_in), target_length=df.num_rows)
+        orderby = index.evaluate(df)
+        # Polars casts integral orderby to int64, but only for calculating window bounds
+        if (
+            plc.traits.is_integral(orderby.obj.type())
+            and orderby.obj.type().id() != plc.TypeId.INT64
+        ):
+            orderby_obj = plc.unary.cast(orderby.obj, plc.DataType(plc.TypeId.INT64))
+        else:
+            orderby_obj = orderby.obj
+        preceding_window, following_window = range_window_bounds(
+            preceding, following, closed_window
+        )
+        if orderby.obj.null_count() != 0:
+            raise RuntimeError(
+                f"Index column '{index.name}' in rolling may not contain nulls"
+            )
+        if len(keys_in) > 0:
+            # Must always check sortedness
+            table = plc.Table([*(k.obj for k in keys), orderby_obj])
+            n = table.num_columns()
+            if not plc.sorting.is_sorted(
+                table, [plc.types.Order.ASCENDING] * n, [plc.types.NullOrder.BEFORE] * n
+            ):
+                raise RuntimeError("Input for grouped rolling is not sorted")
+        else:
+            if not orderby.check_sorted(
+                order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.BEFORE
+            ):
+                raise RuntimeError(
+                    f"Index column '{index.name}' in rolling is not sorted, please sort first"
+                )
+        values = plc.rolling.grouped_range_rolling_window(
+            plc.Table([k.obj for k in keys]),
+            orderby_obj,
+            plc.types.Order.ASCENDING,  # Polars requires ascending orderby.
+            plc.types.NullOrder.BEFORE,  # Doesn't matter, polars doesn't allow nulls in orderby
+            preceding_window,
+            following_window,
+            [rolling.to_request(request.value, orderby, df) for request in aggs],
+        )
+        return DataFrame(
+            itertools.chain(
+                keys,
+                [orderby],
+                (
+                    Column(col, name=name)
+                    for col, name in zip(
+                        values.columns(),
+                        (request.name for request in aggs),
+                        strict=True,
+                    )
+                ),
+            )
+        ).slice(zlice)
 class GroupBy(IR):
     """Perform a groupby."""
     __slots__ = (
-        "agg_infos",
         "agg_requests",
+        "config_options",
         "keys",
         "maintain_order",
-        "options",
+        "zlice",
+    )
+    _non_child = (
+        "schema",
+        "keys",
+        "agg_requests",
+        "maintain_order",
+        "zlice",
+        "config_options",
     )
-    _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options")
     keys: tuple[expr.NamedExpr, ...]
     """Grouping keys."""
     agg_requests: tuple[expr.NamedExpr, ...]
     """Aggregation expressions."""
     maintain_order: bool
     """Preserve order in groupby."""
-    options: Any
-    """Arbitrary options."""
+    zlice: Zlice | None
+    """Optional slice to apply after grouping."""
+    config_options: ConfigOptions
+    """GPU-specific configuration options"""
     def __init__(
         self,
@@ -850,70 +1318,33 @@ class GroupBy(IR):
         keys: Sequence[expr.NamedExpr],
         agg_requests: Sequence[expr.NamedExpr],
         maintain_order: bool,  # noqa: FBT001
-        options: Any,
+        zlice: Zlice | None,
+        config_options: ConfigOptions,
         df: IR,
     ):
         self.schema = schema
         self.keys = tuple(keys)
         self.agg_requests = tuple(agg_requests)
         self.maintain_order = maintain_order
-        self.options = options
+        self.zlice = zlice
+        self.config_options = config_options
         self.children = (df,)
-        if self.options.rolling:
-            raise NotImplementedError(
-                "rolling window/groupby"
-            )  # pragma: no cover; rollingwindow constructor has already raised
-        if self.options.dynamic:
-            raise NotImplementedError("dynamic group by")
-        if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
-            raise NotImplementedError("Nested aggregations in groupby")
-        self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
         self._non_child_args = (
             self.keys,
             self.agg_requests,
             maintain_order,
-            options,
-            self.agg_infos,
+            self.zlice,
         )
-    @staticmethod
-    def check_agg(agg: expr.Expr) -> int:
-        """
-        Determine if we can handle an aggregation expression.
-        Parameters
-        ----------
-        agg
-            Expression to check
-        Returns
-        -------
-        depth of nesting
-        Raises
-        ------
-        NotImplementedError
-            For unsupported expression nodes.
-        """
-        if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)):
-            return max(GroupBy.check_agg(child) for child in agg.children)
-        elif isinstance(agg, expr.Agg):
-            return 1 + max(GroupBy.check_agg(child) for child in agg.children)
-        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)):
-            return 0
-        else:
-            raise NotImplementedError(f"No handler for {agg=}")
     @classmethod
     def do_evaluate(
         cls,
         keys_in: Sequence[expr.NamedExpr],
         agg_requests: Sequence[expr.NamedExpr],
         maintain_order: bool,  # noqa: FBT001
-        options: Any,
-        agg_infos: Sequence[expr.AggInfo],
+        zlice: Zlice | None,
         df: DataFrame,
-    ):
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         keys = broadcast(*(k.evaluate(df) for k in keys_in), target_length=df.num_rows)
         sorted = (
@@ -928,32 +1359,38 @@ class GroupBy(IR):
             column_order=[k.order for k in keys],
             null_precedence=[k.null_order for k in keys],
         )
-        # TODO: uniquify
         requests = []
-        replacements: list[expr.Expr] = []
-        for info in agg_infos:
-            for pre_eval, req, rep in info.requests:
-                if pre_eval is None:
-                    # A count aggregation, doesn't touch the column,
-                    # but we need to have one. Rather than evaluating
-                    # one, just use one of the key columns.
-                    col = keys[0].obj
+        names = []
+        for request in agg_requests:
+            name = request.name
+            value = request.value
+            if isinstance(value, expr.Len):
+                # A count aggregation, we need a column so use a key column
+                col = keys[0].obj
+            elif isinstance(value, expr.Agg):
+                if value.name == "quantile":
+                    child = value.children[0]
                 else:
-                    col = pre_eval.evaluate(df).obj
-                requests.append(plc.groupby.GroupByRequest(col, [req]))
-                replacements.append(rep)
+                    (child,) = value.children
+                col = child.evaluate(df, context=ExecutionContext.GROUPBY).obj
+            else:
+                # Anything else, we pre-evaluate
+                col = value.evaluate(df, context=ExecutionContext.GROUPBY).obj
+            requests.append(plc.groupby.GroupByRequest(col, [value.agg_request]))
+            names.append(name)
         group_keys, raw_tables = grouper.aggregate(requests)
-        raw_columns: list[Column] = []
-        for i, table in enumerate(raw_tables):
-            (column,) = table.columns()
-            raw_columns.append(Column(column, name=f"tmp{i}"))
-        mapping = dict(zip(replacements, raw_columns, strict=True))
+        results = [
+            Column(column, name=name)
+            for name, column in zip(
+                names,
+                itertools.chain.from_iterable(t.columns() for t in raw_tables),
+                strict=True,
+            )
+        ]
         result_keys = [
             Column(grouped_key, name=key.name)
             for key, grouped_key in zip(keys, group_keys.columns(), strict=True)
         ]
-        result_subs = DataFrame(raw_columns)
-        results = [req.evaluate(result_subs, mapping=mapping) for req in agg_requests]
         broadcasted = broadcast(*result_keys, *results)
         # Handle order preservation of groups
         if maintain_order and not sorted:
@@ -996,12 +1433,26 @@ class GroupBy(IR):
                     ordered_table.columns(), broadcasted, strict=True
                 )
             ]
-        return DataFrame(broadcasted).slice(options.slice)
+        return DataFrame(broadcasted).slice(zlice)
 class ConditionalJoin(IR):
     """A conditional inner join of two dataframes on a predicate."""
+    class Predicate:
+        """Serializable wrapper for a predicate expression."""
+        predicate: expr.Expr
+        ast: plc.expressions.Expression
+        def __init__(self, predicate: expr.Expr):
+            self.predicate = predicate
+            self.ast = to_ast(predicate)
+        def __reduce__(self) -> tuple[Any, ...]:
+            """Pickle a Predicate object."""
+            return (type(self), (self.predicate,))
     __slots__ = ("ast_predicate", "options", "predicate")
     _non_child = ("schema", "predicate", "options")
     predicate: expr.Expr
@@ -1012,7 +1463,7 @@ class ConditionalJoin(IR):
             pl_expr.Operator | Iterable[pl_expr.Operator],
         ],
         bool,
-        tuple[int, int] | None,
+        Zlice | None,
         str,
         bool,
         Literal["none", "left", "right", "left_right", "right_left"],
@@ -1020,7 +1471,7 @@ class ConditionalJoin(IR):
     """
     tuple of options:
     - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions
-    - join_nulls: do nulls compare equal?
+    - nulls_equal: do nulls compare equal?
     - slice: optional slice to perform after joining.
     - suffix: string suffix for right columns if names match
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
@@ -1034,30 +1485,34 @@ class ConditionalJoin(IR):
         self.predicate = predicate
         self.options = options
         self.children = (left, right)
-        self.ast_predicate = to_ast(predicate)
-        _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options
+        predicate_wrapper = self.Predicate(predicate)
+        _, nulls_equal, zlice, suffix, coalesce, maintain_order = self.options
         # Preconditions from polars
-        assert not join_nulls
+        assert not nulls_equal
         assert not coalesce
         assert maintain_order == "none"
-        if self.ast_predicate is None:
+        if predicate_wrapper.ast is None:
             raise NotImplementedError(
                 f"Conditional join with predicate {predicate}"
             )  # pragma: no cover; polars never delivers expressions we can't handle
-        self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order)
+        self._non_child_args = (predicate_wrapper, zlice, suffix, maintain_order)
     @classmethod
     def do_evaluate(
         cls,
-        predicate: plc.expressions.Expression,
-        zlice: tuple[int, int] | None,
+        predicate_wrapper: Predicate,
+        zlice: Zlice | None,
         suffix: str,
         maintain_order: Literal["none", "left", "right", "left_right", "right_left"],
         left: DataFrame,
         right: DataFrame,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        lg, rg = plc.join.conditional_inner_join(left.table, right.table, predicate)
+        lg, rg = plc.join.conditional_inner_join(
+            left.table,
+            right.table,
+            predicate_wrapper.ast,
+        )
         left = DataFrame.from_table(
             plc.copying.gather(
                 left.table, lg, plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -1084,8 +1539,8 @@ class ConditionalJoin(IR):
 class Join(IR):
     """A join of two dataframes."""
-    __slots__ = ("left_on", "options", "right_on")
-    _non_child = ("schema", "left_on", "right_on", "options")
+    __slots__ = ("config_options", "left_on", "options", "right_on")
+    _non_child = ("schema", "left_on", "right_on", "options", "config_options")
     left_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the left frame."""
     right_on: tuple[expr.NamedExpr, ...]
@@ -1093,7 +1548,7 @@ class Join(IR):
     options: tuple[
         Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
         bool,
-        tuple[int, int] | None,
+        Zlice | None,
         str,
         bool,
         Literal["none", "left", "right", "left_right", "right_left"],
@@ -1101,12 +1556,14 @@ class Join(IR):
     """
     tuple of options:
     - how: join type
-    - join_nulls: do nulls compare equal?
+    - nulls_equal: do nulls compare equal?
     - slice: optional slice to perform after joining.
     - suffix: string suffix for right columns if names match
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     - maintain_order: which DataFrame row order to preserve, if any
     """
+    config_options: ConfigOptions
+    """GPU-specific configuration options"""
     def __init__(
         self,
@@ -1114,6 +1571,7 @@ class Join(IR):
         left_on: Sequence[expr.NamedExpr],
         right_on: Sequence[expr.NamedExpr],
         options: Any,
+        config_options: ConfigOptions,
         left: IR,
         right: IR,
     ):
@@ -1121,6 +1579,7 @@ class Join(IR):
         self.left_on = tuple(left_on)
         self.right_on = tuple(right_on)
         self.options = options
+        self.config_options = config_options
         self.children = (left, right)
         self._non_child_args = (self.left_on, self.right_on, self.options)
         # TODO: Implement maintain_order
@@ -1203,9 +1662,8 @@ class Join(IR):
         left keys, and is stable wrt the right keys. For all other
         joins, there is no order obligation.
         """
-        dt = plc.interop.to_arrow(plc.types.SIZE_TYPE)
-        init = plc.interop.from_arrow(pa.scalar(0, type=dt))
-        step = plc.interop.from_arrow(pa.scalar(1, type=dt))
+        init = plc.Scalar.from_py(0, plc.types.SIZE_TYPE)
+        step = plc.Scalar.from_py(1, plc.types.SIZE_TYPE)
         left_order = plc.copying.gather(
             plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy
         )
@@ -1227,7 +1685,7 @@ class Join(IR):
         options: tuple[
             Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
             bool,
-            tuple[int, int] | None,
+            Zlice | None,
             str,
             bool,
             Literal["none", "left", "right", "left_right", "right_left"],
@@ -1236,7 +1694,7 @@ class Join(IR):
         right: DataFrame,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        how, join_nulls, zlice, suffix, coalesce, _ = options
+        how, nulls_equal, zlice, suffix, coalesce, _ = options
         if how == "Cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
@@ -1264,7 +1722,7 @@ class Join(IR):
         right_on = DataFrame(broadcast(*(e.evaluate(right) for e in right_on_exprs)))
         null_equality = (
             plc.types.NullEquality.EQUAL
-            if join_nulls
+            if nulls_equal
             else plc.types.NullEquality.UNEQUAL
         )
         join_fn, left_policy, right_policy = cls._joiners(how)
@@ -1385,7 +1843,7 @@ class Distinct(IR):
     subset: frozenset[str] | None
     """Which columns should be used to define distinctness. If None,
     then all columns are used."""
-    zlice: tuple[int, int] | None
+    zlice: Zlice | None
     """Optional slice to apply to the result."""
     stable: bool
     """Should the result maintain ordering."""
@@ -1395,7 +1853,7 @@ class Distinct(IR):
         schema: Schema,
         keep: plc.stream_compaction.DuplicateKeepOption,
         subset: frozenset[str] | None,
-        zlice: tuple[int, int] | None,
+        zlice: Zlice | None,
         stable: bool,  # noqa: FBT001
         df: IR,
     ):
@@ -1419,10 +1877,10 @@ class Distinct(IR):
         cls,
         keep: plc.stream_compaction.DuplicateKeepOption,
         subset: frozenset[str] | None,
-        zlice: tuple[int, int] | None,
+        zlice: Zlice | None,
         stable: bool,  # noqa: FBT001
         df: DataFrame,
-    ):
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if subset is None:
             indices = list(range(df.num_columns))
@@ -1475,7 +1933,7 @@ class Sort(IR):
     """Null sorting location for each sort key."""
     stable: bool
     """Should the sort be stable?"""
-    zlice: tuple[int, int] | None
+    zlice: Zlice | None
     """Optional slice to apply to the result."""
     def __init__(
@@ -1485,7 +1943,7 @@ class Sort(IR):
         order: Sequence[plc.types.Order],
         null_order: Sequence[plc.types.NullOrder],
         stable: bool,  # noqa: FBT001
-        zlice: tuple[int, int] | None,
+        zlice: Zlice | None,
         df: IR,
     ):
         self.schema = schema
@@ -1510,17 +1968,11 @@ class Sort(IR):
         order: Sequence[plc.types.Order],
         null_order: Sequence[plc.types.NullOrder],
         stable: bool,  # noqa: FBT001
-        zlice: tuple[int, int] | None,
+        zlice: Zlice | None,
         df: DataFrame,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
         sort_keys = broadcast(*(k.evaluate(df) for k in by), target_length=df.num_rows)
-        # TODO: More robust identification here.
-        keys_in_result = {
-            k.name: i
-            for i, k in enumerate(sort_keys)
-            if k.name in df.column_map and k.obj is df.column_map[k.name].obj
-        }
         do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
         table = do_sort(
             df.table,
@@ -1528,19 +1980,17 @@ class Sort(IR):
             list(order),
             list(null_order),
         )
-        columns: list[Column] = []
-        for name, c in zip(df.column_map, table.columns(), strict=True):
-            column = Column(c, name=name)
-            # If a sort key is in the result table, set the sortedness property
-            if name in keys_in_result:
-                i = keys_in_result[name]
-                column = column.set_sorted(
-                    is_sorted=plc.types.Sorted.YES,
-                    order=order[i],
-                    null_order=null_order[i],
-                )
-            columns.append(column)
-        return DataFrame(columns).slice(zlice)
+        result = DataFrame.from_table(table, df.column_names)
+        first_key = sort_keys[0]
+        name = by[0].name
+        first_key_in_result = (
+            name in df.column_map and first_key.obj is df.column_map[name].obj
+        )
+        if first_key_in_result:
+            result.column_map[name].set_sorted(
+                is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0]
+            )
+        return result.slice(zlice)
 class Slice(IR):
@@ -1608,6 +2058,42 @@ class Projection(IR):
         return DataFrame(columns)
+class MergeSorted(IR):
+    """Merge sorted operation."""
+    __slots__ = ("key",)
+    _non_child = ("schema", "key")
+    key: str
+    """Key that is sorted."""
+    def __init__(self, schema: Schema, key: str, left: IR, right: IR):
+        assert isinstance(left, Sort)
+        assert isinstance(right, Sort)
+        assert left.order == right.order
+        assert len(left.schema.keys()) <= len(right.schema.keys())
+        self.schema = schema
+        self.key = key
+        self.children = (left, right)
+        self._non_child_args = (key,)
+    @classmethod
+    def do_evaluate(cls, key: str, *dfs: DataFrame) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        left, right = dfs
+        right = right.discard_columns(right.column_names_set - left.column_names_set)
+        on_col_left = left.select_columns({key})[0]
+        on_col_right = right.select_columns({key})[0]
+        return DataFrame.from_table(
+            plc.merge.merge(
+                [right.table, left.table],
+                [left.column_names.index(key), right.column_names.index(key)],
+                [on_col_left.order, on_col_right.order],
+                [on_col_left.null_order, on_col_right.null_order],
+            ),
+            left.column_names,
+        )
 class MapFunction(IR):
     """Apply some function to a dataframe."""
@@ -1621,13 +2107,10 @@ class MapFunction(IR):
     _NAMES: ClassVar[frozenset[str]] = frozenset(
         [
             "rechunk",
-            # libcudf merge is not stable wrt order of inputs, since
-            # it uses a priority queue to manage the tables it produces.
-            # See: https://github.com/rapidsai/cudf/issues/16010
-            # "merge_sorted",
             "rename",
             "explode",
             "unpivot",
+            "row_index",
         ]
     )
@@ -1636,8 +2119,12 @@ class MapFunction(IR):
         self.name = name
         self.options = options
         self.children = (df,)
-        if self.name not in MapFunction._NAMES:
-            raise NotImplementedError(f"Unhandled map function {self.name}")
+        if (
+            self.name not in MapFunction._NAMES
+        ):  # pragma: no cover; need more polars rust functions
+            raise NotImplementedError(
+                f"Unhandled map function {self.name}"
+            )  # pragma: no cover
         if self.name == "explode":
             (to_explode,) = self.options
             if len(to_explode) > 1:
@@ -1674,6 +2161,9 @@ class MapFunction(IR):
                 variable_name,
                 value_name,
             )
+        elif self.name == "row_index":
+            col_name, offset = options
+            self.options = (col_name, offset)
         self._non_child_args = (schema, name, self.options)
     @classmethod
@@ -1739,6 +2229,19 @@ class MapFunction(IR):
                     Column(value_column, name=value_name),
                 ]
             )
+        elif name == "row_index":
+            col_name, offset = options
+            dtype = schema[col_name]
+            step = plc.Scalar.from_py(1, dtype)
+            init = plc.Scalar.from_py(offset, dtype)
+            index_col = Column(
+                plc.filling.sequence(df.num_rows, init, step),
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.AFTER,
+                name=col_name,
+            )
+            return DataFrame([index_col, *df.columns])
         else:
             raise AssertionError("Should never be reached")  # pragma: no cover
@@ -1748,10 +2251,10 @@ class Union(IR):
     __slots__ = ("zlice",)
     _non_child = ("schema", "zlice")
-    zlice: tuple[int, int] | None
+    zlice: Zlice | None
     """Optional slice to apply to the result."""
-    def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR):
+    def __init__(self, schema: Schema, zlice: Zlice | None, *children: IR):
         self.schema = schema
         self.zlice = zlice
         self._non_child_args = (zlice,)
@@ -1759,7 +2262,7 @@ class Union(IR):
         schema = self.children[0].schema
     @classmethod
-    def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame:
+    def do_evaluate(cls, zlice: Zlice | None, *dfs: DataFrame) -> DataFrame:
         """Evaluate and return a dataframe."""
         # TODO: only evaluate what we need if we have a slice?
         return DataFrame.from_table(
@@ -1771,12 +2274,18 @@ class Union(IR):
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
-    __slots__ = ()
-    _non_child = ("schema",)
+    __slots__ = ("should_broadcast",)
+    _non_child = ("schema", "should_broadcast")
-    def __init__(self, schema: Schema, *children: IR):
+    def __init__(
+        self,
+        schema: Schema,
+        should_broadcast: bool,  # noqa: FBT001
+        *children: IR,
+    ):
         self.schema = schema
-        self._non_child_args = ()
+        self.should_broadcast = should_broadcast
+        self._non_child_args = (should_broadcast,)
         self.children = children
     @staticmethod
@@ -1808,8 +2317,19 @@ class HConcat(IR):
         )
     @classmethod
-    def do_evaluate(cls, *dfs: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls,
+        should_broadcast: bool,  # noqa: FBT001
+        *dfs: DataFrame,
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
+        # Special should_broadcast case.
+        # Used to recombine decomposed expressions
+        if should_broadcast:
+            return DataFrame(
+                broadcast(*itertools.chain.from_iterable(df.columns for df in dfs))
+            )
         max_rows = max(df.num_rows for df in dfs)
         # Horizontal concatenation extends shorter tables with nulls
         return DataFrame(
@@ -1826,3 +2346,20 @@ class HConcat(IR):
                 )
             )
         )
+class Empty(IR):
+    """Represents an empty DataFrame."""
+    __slots__ = ()
+    _non_child = ()
+    def __init__(self) -> None:
+        self.schema = {}
+        self._non_child_args = ()
+        self.children = ()
+    @classmethod
+    def do_evaluate(cls) -> DataFrame:  # pragma: no cover
+        """Evaluate and return a dataframe."""
+        return DataFrame([])

cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl

cudf-polars-cu12 25.2.2__py3-none-any.whl → 25.6.0__py3-none-any.whl