cudf-polars-cu12 24.12.0__py3-none-any.whl → 25.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cudf_polars/VERSION +1 -1
- cudf_polars/__init__.py +1 -1
- cudf_polars/callback.py +28 -3
- cudf_polars/containers/__init__.py +1 -1
- cudf_polars/dsl/expr.py +16 -16
- cudf_polars/dsl/expressions/aggregation.py +21 -4
- cudf_polars/dsl/expressions/base.py +7 -2
- cudf_polars/dsl/expressions/binaryop.py +1 -0
- cudf_polars/dsl/expressions/boolean.py +65 -22
- cudf_polars/dsl/expressions/datetime.py +82 -20
- cudf_polars/dsl/expressions/literal.py +2 -0
- cudf_polars/dsl/expressions/rolling.py +3 -1
- cudf_polars/dsl/expressions/selection.py +3 -1
- cudf_polars/dsl/expressions/sorting.py +2 -0
- cudf_polars/dsl/expressions/string.py +118 -39
- cudf_polars/dsl/expressions/ternary.py +1 -0
- cudf_polars/dsl/expressions/unary.py +11 -1
- cudf_polars/dsl/ir.py +173 -122
- cudf_polars/dsl/to_ast.py +4 -6
- cudf_polars/dsl/translate.py +53 -21
- cudf_polars/dsl/traversal.py +10 -10
- cudf_polars/experimental/base.py +43 -0
- cudf_polars/experimental/dispatch.py +84 -0
- cudf_polars/experimental/io.py +325 -0
- cudf_polars/experimental/parallel.py +253 -0
- cudf_polars/experimental/select.py +36 -0
- cudf_polars/testing/asserts.py +14 -5
- cudf_polars/testing/plugin.py +60 -4
- cudf_polars/typing/__init__.py +5 -5
- cudf_polars/utils/dtypes.py +9 -7
- cudf_polars/utils/versions.py +4 -7
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/METADATA +6 -6
- cudf_polars_cu12-25.2.0.dist-info/RECORD +48 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/WHEEL +1 -1
- cudf_polars_cu12-24.12.0.dist-info/RECORD +0 -43
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/LICENSE +0 -0
- {cudf_polars_cu12-24.12.0.dist-info → cudf_polars_cu12-25.2.0.dist-info}/top_level.txt +0 -0
cudf_polars/dsl/translate.py
CHANGED
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Translate polars IR representation to ours."""
@@ -84,7 +84,7 @@ class Translator:
         # IR is versioned with major.minor, minor is bumped for backwards
         # compatible changes (e.g. adding new nodes), major is bumped for
         # incompatible changes (e.g. renaming nodes).
-        if (version := self.visitor.version()) >= (
+        if (version := self.visitor.version()) >= (5, 1):
            e = NotImplementedError(
                f"No support for polars IR {version=}"
            )  # pragma: no cover; no such version for now.
@@ -260,9 +260,7 @@ def _(
         schema,
         node.df,
         node.projection,
-
-        if node.selection is not None
-        else None,
+        translator.config.config.copy(),
     )
 
 
@@ -301,25 +299,52 @@ def _(
     # Join key dtypes are dependent on the schema of the left and
     # right inputs, so these must be translated with the relevant
    # input active.
+    def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
+        if literal.dtype.id() == plc.types.TypeId.INT32:
+            plc_int64 = plc.types.DataType(plc.types.TypeId.INT64)
+            return expr.Literal(
+                plc_int64,
+                pa.scalar(literal.value.as_py(), type=plc.interop.to_arrow(plc_int64)),
+            )
+        return literal
+
+    def maybe_adjust_binop(e) -> expr.Expr:
+        if isinstance(e.value, expr.BinOp):
+            left, right = e.value.children
+            if isinstance(left, expr.Col) and isinstance(right, expr.Literal):
+                e.value.children = (left, adjust_literal_dtype(right))
+            elif isinstance(left, expr.Literal) and isinstance(right, expr.Col):
+                e.value.children = (adjust_literal_dtype(left), right)
+        return e
+
+    def translate_expr_and_maybe_fix_binop_args(translator, exprs):
+        return [
+            maybe_adjust_binop(translate_named_expr(translator, n=e)) for e in exprs
+        ]
+
     with set_node(translator.visitor, node.input_left):
         inp_left = translator.translate_ir(n=None)
-
+        # TODO: There's bug in the polars type coercion phase. Use
+        # translate_named_expr directly once it is resolved.
+        # Tracking issue: https://github.com/pola-rs/polars/issues/20935
+        left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on)
     with set_node(translator.visitor, node.input_right):
         inp_right = translator.translate_ir(n=None)
-        right_on =
+        right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on)
+
     if (how := node.options[0]) in {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        "Inner",
+        "Left",
+        "Right",
+        "Full",
+        "Cross",
+        "Semi",
+        "Anti",
     }:
         return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
     else:
-        how, op1, op2 =
-        if how != "
+        how, op1, op2 = node.options[0]
+        if how != "IEJoin":
            raise NotImplementedError(
                f"Unsupported join type {how}"
            )  # pragma: no cover; asof joins not yet exposed
@@ -531,10 +556,16 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
                column.dtype,
                pa.scalar("", type=plc.interop.to_arrow(column.dtype)),
            )
-        return expr.StringFunction(
+        return expr.StringFunction(
+            dtype,
+            expr.StringFunction.Name.from_polars(name),
+            options,
+            column,
+            chars,
+        )
    return expr.StringFunction(
        dtype,
-        name,
+        expr.StringFunction.Name.from_polars(name),
        options,
        *(translator.translate_expr(n=n) for n in node.input),
    )
@@ -551,7 +582,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
        )
    return expr.BooleanFunction(
        dtype,
-        name,
+        expr.BooleanFunction.Name.from_polars(name),
        options,
        *(translator.translate_expr(n=n) for n in node.input),
    )
@@ -571,7 +602,7 @@ def _(node: pl_expr.Function, translator: Translator, dtype: plc.DataType) -> ex
    }
    result_expr = expr.TemporalFunction(
        dtype,
-        name,
+        expr.TemporalFunction.Name.from_polars(name),
        options,
        *(translator.translate_expr(n=n) for n in node.input),
    )
@@ -633,9 +664,10 @@ def _(node: pl_expr.Sort, translator: Translator, dtype: plc.DataType) -> expr.E
 
 @_translate_expr.register
 def _(node: pl_expr.SortBy, translator: Translator, dtype: plc.DataType) -> expr.Expr:
+    options = node.sort_options
    return expr.SortBy(
        dtype,
-
+        (options[0], tuple(options[1]), tuple(options[2])),
        translator.translate_expr(n=node.expr),
        *(translator.translate_expr(n=n) for n in node.by),
    )
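
Note: the join-translation hunk above works around a polars type-coercion bug by upcasting INT32 literals that appear in IEJoin key expressions to INT64 before building the expression. A minimal standalone sketch of the same upcast using only pyarrow (the helper name here is illustrative and not part of cudf_polars):

import pyarrow as pa

def upcast_i32_literal(value: pa.Scalar) -> pa.Scalar:
    # Mirror the adjust_literal_dtype workaround: rebuild an int32 scalar
    # as int64 so the literal's dtype matches the coerced column dtype.
    if pa.types.is_int32(value.type):
        return pa.scalar(value.as_py(), type=pa.int64())
    return value

assert upcast_i32_literal(pa.scalar(2, type=pa.int32())).type == pa.int64()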
cudf_polars/dsl/traversal.py
CHANGED
@@ -10,35 +10,35 @@ from typing import TYPE_CHECKING, Any, Generic
 from cudf_polars.typing import U_contra, V_co
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Generator, Mapping, MutableMapping
+    from collections.abc import Callable, Generator, Mapping, MutableMapping, Sequence
 
     from cudf_polars.typing import GenericTransformer, NodeT
 
 
 __all__: list[str] = [
-    "traversal",
-    "reuse_if_unchanged",
-    "make_recursive",
     "CachingVisitor",
+    "make_recursive",
+    "reuse_if_unchanged",
+    "traversal",
 ]
 
 
-def traversal(
+def traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]:
     """
     Pre-order traversal of nodes in an expression.
 
     Parameters
     ----------
-
-
+    nodes
+        Roots of expressions to traverse.
 
     Yields
     ------
-    Unique nodes in the
+    Unique nodes in the expressions, parent before child, children
     in-order from left to right.
     """
-    seen =
-    lifo =
+    seen = set(nodes)
+    lifo = list(nodes)
 
     while lifo:
         node = lifo.pop()
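
Note: `traversal` now accepts a sequence of root nodes instead of a single node, seeding both the `seen` set and the LIFO stack from all roots. A self-contained sketch of the same pre-order, de-duplicating pattern over objects that expose a `children` tuple (that attribute layout is assumed here for illustration, not taken from the diff):

from collections.abc import Generator, Sequence
from typing import Any

def traversal_sketch(nodes: Sequence[Any]) -> Generator[Any, None, None]:
    # Pre-order walk over multiple roots; each unique node is yielded once.
    seen = set(nodes)
    lifo = list(nodes)
    while lifo:
        node = lifo.pop()
        yield node
        for child in reversed(node.children):  # assumes a `children` attribute
            if child not in seen:
                seen.add(child)
                lifo.append(child)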
cudf_polars/experimental/base.py
ADDED
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition base classes."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.ir import Union
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Sequence
+
+    from cudf_polars.containers import DataFrame
+    from cudf_polars.dsl.nodebase import Node
+
+
+class PartitionInfo:
+    """
+    Partitioning information.
+
+    This class only tracks the partition count (for now).
+    """
+
+    __slots__ = ("count",)
+
+    def __init__(self, count: int):
+        self.count = count
+
+    def keys(self, node: Node) -> Iterator[tuple[str, int]]:
+        """Return the partitioned keys for a given node."""
+        name = get_key_name(node)
+        yield from ((name, i) for i in range(self.count))
+
+
+def get_key_name(node: Node) -> str:
+    """Generate the key name for a Node."""
+    return f"{type(node).__name__.lower()}-{hash(node)}"
+
+
+def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
+    # Concatenate a sequence of DataFrames vertically
+    return Union.do_evaluate(None, *dfs)
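
Note: `get_key_name` and `PartitionInfo.keys` together define the task keys used for a node's partitions: a name derived from the node's class and hash, paired with a partition index. A small illustration with a hypothetical stand-in object (not a real cudf_polars IR node):

class FakeNode:
    # Hypothetical stand-in; real usage passes a cudf_polars IR/expression node.
    pass

node = FakeNode()
name = f"{type(node).__name__.lower()}-{hash(node)}"  # what get_key_name computes
keys = [(name, i) for i in range(3)]  # same result as list(PartitionInfo(count=3).keys(node))
# e.g. [("fakenode-87…", 0), ("fakenode-87…", 1), ("fakenode-87…", 2)]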
cudf_polars/experimental/dispatch.py
ADDED
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition dispatch functions."""
+
+from __future__ import annotations
+
+from functools import singledispatch
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+    from typing import TypeAlias
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.base import PartitionInfo
+    from cudf_polars.typing import GenericTransformer
+
+
+LowerIRTransformer: TypeAlias = (
+    "GenericTransformer[IR, tuple[IR, MutableMapping[IR, PartitionInfo]]]"
+)
+"""Protocol for Lowering IR nodes."""
+
+
+@singledispatch
+def lower_ir_node(
+    ir: IR, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    """
+    Rewrite an IR node and extract partitioning information.
+
+    Parameters
+    ----------
+    ir
+        IR node to rewrite.
+    rec
+        Recursive LowerIRTransformer callable.
+
+    Returns
+    -------
+    new_ir, partition_info
+        The rewritten node, and a mapping from unique nodes in
+        the full IR graph to associated partitioning information.
+
+    Notes
+    -----
+    This function is used by `lower_ir_graph`.
+
+    See Also
+    --------
+    lower_ir_graph
+    """
+    raise AssertionError(f"Unhandled type {type(ir)}")  # pragma: no cover
+
+
+@singledispatch
+def generate_ir_tasks(
+    ir: IR, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    """
+    Generate a task graph for evaluation of an IR node.
+
+    Parameters
+    ----------
+    ir
+        IR node to generate tasks for.
+    partition_info
+        Partitioning information, obtained from :func:`lower_ir_graph`.
+
+    Returns
+    -------
+    mapping
+        A (partial) dask task graph for the evaluation of an ir node.
+
+    Notes
+    -----
+    Task generation should only produce the tasks for the current node,
+    referring to child tasks by name.
+
+    See Also
+    --------
+    task_graph
+    """
+    raise AssertionError(f"Unhandled type {type(ir)}")  # pragma: no cover
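
Note: both entry points are `functools.singledispatch` functions whose base implementations raise, so each IR type opts in by registering an overload (the `experimental/io.py` module below registers `DataFrameScan` and `Scan`). A minimal sketch of the registration pattern, using a hypothetical stand-in class rather than a real IR subclass:

from cudf_polars.experimental.base import PartitionInfo
from cudf_polars.experimental.dispatch import lower_ir_node

class HypotheticalNode:
    # Stand-in type; a real handler registers a cudf_polars IR subclass.
    pass

@lower_ir_node.register(HypotheticalNode)
def _(ir, rec):
    # Simplest possible lowering: leave the node unchanged and report a
    # single partition, matching the single-partition fallback used in io.py.
    return ir, {ir: PartitionInfo(count=1)}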
cudf_polars/experimental/io.py
ADDED
@@ -0,0 +1,325 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Multi-partition IO Logic."""
+
+from __future__ import annotations
+
+import enum
+import math
+import random
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any
+
+import pylibcudf as plc
+
+from cudf_polars.dsl.ir import IR, DataFrameScan, Scan, Union
+from cudf_polars.experimental.base import PartitionInfo
+from cudf_polars.experimental.dispatch import lower_ir_node
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.dsl.expr import NamedExpr
+    from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.typing import Schema
+
+
+@lower_ir_node.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    rows_per_partition = ir.config_options.get("executor_options", {}).get(
+        "max_rows_per_partition", 1_000_000
+    )
+
+    nrows = max(ir.df.shape()[0], 1)
+    count = math.ceil(nrows / rows_per_partition)
+
+    if count > 1:
+        length = math.ceil(nrows / count)
+        slices = [
+            DataFrameScan(
+                ir.schema,
+                ir.df.slice(offset, length),
+                ir.projection,
+                ir.config_options,
+            )
+            for offset in range(0, nrows, length)
+        ]
+        new_node = Union(ir.schema, None, *slices)
+        return new_node, {slice: PartitionInfo(count=1) for slice in slices} | {
+            new_node: PartitionInfo(count=count)
+        }
+
+    return ir, {ir: PartitionInfo(count=1)}
+
+
+class ScanPartitionFlavor(IntEnum):
+    """Flavor of Scan partitioning."""
+
+    SINGLE_FILE = enum.auto()  # 1:1 mapping between files and partitions
+    SPLIT_FILES = enum.auto()  # Split each file into >1 partition
+    FUSED_FILES = enum.auto()  # Fuse multiple files into each partition
+
+
+class ScanPartitionPlan:
+    """
+    Scan partitioning plan.
+
+    Notes
+    -----
+    The meaning of `factor` depends on the value of `flavor`:
+      - SINGLE_FILE: `factor` must be `1`.
+      - SPLIT_FILES: `factor` is the number of partitions per file.
+      - FUSED_FILES: `factor` is the number of files per partition.
+    """
+
+    __slots__ = ("factor", "flavor")
+    factor: int
+    flavor: ScanPartitionFlavor
+
+    def __init__(self, factor: int, flavor: ScanPartitionFlavor) -> None:
+        if (
+            flavor == ScanPartitionFlavor.SINGLE_FILE and factor != 1
+        ):  # pragma: no cover
+            raise ValueError(f"Expected factor == 1 for {flavor}, got: {factor}")
+        self.factor = factor
+        self.flavor = flavor
+
+    @staticmethod
+    def from_scan(ir: Scan) -> ScanPartitionPlan:
+        """Extract the partitioning plan of a Scan operation."""
+        if ir.typ == "parquet":
+            # TODO: Use system info to set default blocksize
+            parallel_options = ir.config_options.get("executor_options", {})
+            blocksize: int = parallel_options.get("parquet_blocksize", 1024**3)
+            stats = _sample_pq_statistics(ir)
+            file_size = sum(float(stats[column]) for column in ir.schema)
+            if file_size > 0:
+                if file_size > blocksize:
+                    # Split large files
+                    return ScanPartitionPlan(
+                        math.ceil(file_size / blocksize),
+                        ScanPartitionFlavor.SPLIT_FILES,
+                    )
+                else:
+                    # Fuse small files
+                    return ScanPartitionPlan(
+                        max(blocksize // int(file_size), 1),
+                        ScanPartitionFlavor.FUSED_FILES,
+                    )
+
+        # TODO: Use file sizes for csv and json
+        return ScanPartitionPlan(1, ScanPartitionFlavor.SINGLE_FILE)
+
+
+class SplitScan(IR):
+    """
+    Input from a split file.
+
+    This class wraps a single-file `Scan` object. At
+    IO/evaluation time, this class will only perform
+    a partial read of the underlying file. The range
+    (skip_rows and n_rows) is calculated at IO time.
+    """
+
+    __slots__ = (
+        "base_scan",
+        "schema",
+        "split_index",
+        "total_splits",
+    )
+    _non_child = (
+        "schema",
+        "base_scan",
+        "split_index",
+        "total_splits",
+    )
+    base_scan: Scan
+    """Scan operation this node is based on."""
+    split_index: int
+    """Index of the current split."""
+    total_splits: int
+    """Total number of splits."""
+
+    def __init__(
+        self, schema: Schema, base_scan: Scan, split_index: int, total_splits: int
+    ):
+        self.schema = schema
+        self.base_scan = base_scan
+        self.split_index = split_index
+        self.total_splits = total_splits
+        self._non_child_args = (
+            split_index,
+            total_splits,
+            *base_scan._non_child_args,
+        )
+        self.children = ()
+        if base_scan.typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(
+                f"Unhandled Scan type for file splitting: {base_scan.typ}"
+            )
+
+    @classmethod
+    def do_evaluate(
+        cls,
+        split_index: int,
+        total_splits: int,
+        schema: Schema,
+        typ: str,
+        reader_options: dict[str, Any],
+        config_options: dict[str, Any],
+        paths: list[str],
+        with_columns: list[str] | None,
+        skip_rows: int,
+        n_rows: int,
+        row_index: tuple[str, int] | None,
+        predicate: NamedExpr | None,
+    ):
+        """Evaluate and return a dataframe."""
+        if typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}")
+
+        if len(paths) > 1:  # pragma: no cover
+            raise ValueError(f"Expected a single path, got: {paths}")
+
+        # Parquet logic:
+        # - We are one of "total_splits" SplitScan nodes
+        #   assigned to the same file.
+        # - We know our index within this file ("split_index")
+        # - We can also use parquet metadata to query the
+        #   total number of rows in each row-group of the file.
+        # - We can use all this information to calculate the
+        #   "skip_rows" and "n_rows" options to use locally.
+
+        rowgroup_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(paths)
+        ).rowgroup_metadata()
+        total_row_groups = len(rowgroup_metadata)
+        if total_splits <= total_row_groups:
+            # We have enough row-groups in the file to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Calculate which row-groups to include
+            # in the current read, and use metadata to translate
+            # the row-group indices to "skip_rows" and "n_rows".
+            rg_stride = total_row_groups // total_splits
+            skip_rgs = rg_stride * split_index
+            skip_rows = sum(rg["num_rows"] for rg in rowgroup_metadata[:skip_rgs])
+            n_rows = sum(
+                rg["num_rows"]
+                for rg in rowgroup_metadata[skip_rgs : skip_rgs + rg_stride]
+            )
+        else:
+            # There are not enough row-groups to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Use metadata to directly calculate
+            # "skip_rows" and "n_rows" for the current read.
+            total_rows = sum(rg["num_rows"] for rg in rowgroup_metadata)
+            n_rows = total_rows // total_splits
+            skip_rows = n_rows * split_index
+
+        # Last split should always read to end of file
+        if split_index == (total_splits - 1):
+            n_rows = -1
+
+        # Perform the partial read
+        return Scan.do_evaluate(
+            schema,
+            typ,
+            reader_options,
+            config_options,
+            paths,
+            with_columns,
+            skip_rows,
+            n_rows,
+            row_index,
+            predicate,
+        )
+
+
+def _sample_pq_statistics(ir: Scan) -> dict[str, float]:
+    import numpy as np
+    import pyarrow.dataset as pa_ds
+
+    # Use average total_uncompressed_size of three files
+    # TODO: Use plc.io.parquet_metadata.read_parquet_metadata
+    n_sample = 3
+    column_sizes = {}
+    ds = pa_ds.dataset(random.sample(ir.paths, n_sample), format="parquet")
+    for i, frag in enumerate(ds.get_fragments()):
+        md = frag.metadata
+        for rg in range(md.num_row_groups):
+            row_group = md.row_group(rg)
+            for col in range(row_group.num_columns):
+                column = row_group.column(col)
+                name = column.path_in_schema
+                if name not in column_sizes:
+                    column_sizes[name] = np.zeros(n_sample, dtype="int64")
+                column_sizes[name][i] += column.total_uncompressed_size
+
+    return {name: np.mean(sizes) for name, sizes in column_sizes.items()}
+
+
+@lower_ir_node.register(Scan)
+def _(
+    ir: Scan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    partition_info: MutableMapping[IR, PartitionInfo]
+    if ir.typ in ("csv", "parquet", "ndjson") and ir.n_rows == -1 and ir.skip_rows == 0:
+        plan = ScanPartitionPlan.from_scan(ir)
+        paths = list(ir.paths)
+        if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
+            # Disable chunked reader when splitting files
+            config_options = ir.config_options.copy()
+            config_options["parquet_options"] = config_options.get(
+                "parquet_options", {}
+            ).copy()
+            config_options["parquet_options"]["chunked"] = False
+
+            slices: list[SplitScan] = []
+            for path in paths:
+                base_scan = Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    config_options,
+                    [path],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.predicate,
+                )
+                slices.extend(
+                    SplitScan(ir.schema, base_scan, sindex, plan.factor)
+                    for sindex in range(plan.factor)
+                )
+            new_node = Union(ir.schema, None, *slices)
+            partition_info = {slice: PartitionInfo(count=1) for slice in slices} | {
+                new_node: PartitionInfo(count=len(slices))
+            }
+        else:
+            groups: list[Scan] = [
+                Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    ir.config_options,
+                    paths[i : i + plan.factor],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.predicate,
+                )
+                for i in range(0, len(paths), plan.factor)
+            ]
+            new_node = Union(ir.schema, None, *groups)
+            partition_info = {group: PartitionInfo(count=1) for group in groups} | {
+                new_node: PartitionInfo(count=len(groups))
+            }
+        return new_node, partition_info
+
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
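
Note: for a `DataFrameScan`, the partition count follows from ceiling arithmetic over the frame's row count and the `max_rows_per_partition` option (default 1_000_000, as shown above). A worked example of the slicing math with illustrative numbers:

import math

nrows = 2_500_000
rows_per_partition = 1_000_000  # the default shown above

count = math.ceil(nrows / rows_per_partition)  # 3 partitions
length = math.ceil(nrows / count)              # 833_334 rows per slice
offsets = list(range(0, nrows, length))        # [0, 833334, 1666668]
# Three DataFrameScan slices: two of 833_334 rows and a final slice of 833_332 rows.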