cudf_polars_cu13-25.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. cudf_polars/GIT_COMMIT +1 -0
  2. cudf_polars/VERSION +1 -0
  3. cudf_polars/__init__.py +28 -0
  4. cudf_polars/_version.py +21 -0
  5. cudf_polars/callback.py +318 -0
  6. cudf_polars/containers/__init__.py +13 -0
  7. cudf_polars/containers/column.py +495 -0
  8. cudf_polars/containers/dataframe.py +361 -0
  9. cudf_polars/containers/datatype.py +137 -0
  10. cudf_polars/dsl/__init__.py +8 -0
  11. cudf_polars/dsl/expr.py +66 -0
  12. cudf_polars/dsl/expressions/__init__.py +8 -0
  13. cudf_polars/dsl/expressions/aggregation.py +226 -0
  14. cudf_polars/dsl/expressions/base.py +272 -0
  15. cudf_polars/dsl/expressions/binaryop.py +120 -0
  16. cudf_polars/dsl/expressions/boolean.py +326 -0
  17. cudf_polars/dsl/expressions/datetime.py +271 -0
  18. cudf_polars/dsl/expressions/literal.py +97 -0
  19. cudf_polars/dsl/expressions/rolling.py +643 -0
  20. cudf_polars/dsl/expressions/selection.py +74 -0
  21. cudf_polars/dsl/expressions/slicing.py +46 -0
  22. cudf_polars/dsl/expressions/sorting.py +85 -0
  23. cudf_polars/dsl/expressions/string.py +1002 -0
  24. cudf_polars/dsl/expressions/struct.py +137 -0
  25. cudf_polars/dsl/expressions/ternary.py +49 -0
  26. cudf_polars/dsl/expressions/unary.py +517 -0
  27. cudf_polars/dsl/ir.py +2607 -0
  28. cudf_polars/dsl/nodebase.py +164 -0
  29. cudf_polars/dsl/to_ast.py +359 -0
  30. cudf_polars/dsl/tracing.py +16 -0
  31. cudf_polars/dsl/translate.py +939 -0
  32. cudf_polars/dsl/traversal.py +224 -0
  33. cudf_polars/dsl/utils/__init__.py +8 -0
  34. cudf_polars/dsl/utils/aggregations.py +481 -0
  35. cudf_polars/dsl/utils/groupby.py +98 -0
  36. cudf_polars/dsl/utils/naming.py +34 -0
  37. cudf_polars/dsl/utils/replace.py +61 -0
  38. cudf_polars/dsl/utils/reshape.py +74 -0
  39. cudf_polars/dsl/utils/rolling.py +121 -0
  40. cudf_polars/dsl/utils/windows.py +192 -0
  41. cudf_polars/experimental/__init__.py +8 -0
  42. cudf_polars/experimental/base.py +386 -0
  43. cudf_polars/experimental/benchmarks/__init__.py +4 -0
  44. cudf_polars/experimental/benchmarks/pdsds.py +220 -0
  45. cudf_polars/experimental/benchmarks/pdsds_queries/__init__.py +4 -0
  46. cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +88 -0
  47. cudf_polars/experimental/benchmarks/pdsds_queries/q10.py +225 -0
  48. cudf_polars/experimental/benchmarks/pdsds_queries/q2.py +244 -0
  49. cudf_polars/experimental/benchmarks/pdsds_queries/q3.py +65 -0
  50. cudf_polars/experimental/benchmarks/pdsds_queries/q4.py +359 -0
  51. cudf_polars/experimental/benchmarks/pdsds_queries/q5.py +462 -0
  52. cudf_polars/experimental/benchmarks/pdsds_queries/q6.py +92 -0
  53. cudf_polars/experimental/benchmarks/pdsds_queries/q7.py +79 -0
  54. cudf_polars/experimental/benchmarks/pdsds_queries/q8.py +524 -0
  55. cudf_polars/experimental/benchmarks/pdsds_queries/q9.py +137 -0
  56. cudf_polars/experimental/benchmarks/pdsh.py +814 -0
  57. cudf_polars/experimental/benchmarks/utils.py +832 -0
  58. cudf_polars/experimental/dask_registers.py +200 -0
  59. cudf_polars/experimental/dispatch.py +156 -0
  60. cudf_polars/experimental/distinct.py +197 -0
  61. cudf_polars/experimental/explain.py +157 -0
  62. cudf_polars/experimental/expressions.py +590 -0
  63. cudf_polars/experimental/groupby.py +327 -0
  64. cudf_polars/experimental/io.py +943 -0
  65. cudf_polars/experimental/join.py +391 -0
  66. cudf_polars/experimental/parallel.py +423 -0
  67. cudf_polars/experimental/repartition.py +69 -0
  68. cudf_polars/experimental/scheduler.py +155 -0
  69. cudf_polars/experimental/select.py +188 -0
  70. cudf_polars/experimental/shuffle.py +354 -0
  71. cudf_polars/experimental/sort.py +609 -0
  72. cudf_polars/experimental/spilling.py +151 -0
  73. cudf_polars/experimental/statistics.py +795 -0
  74. cudf_polars/experimental/utils.py +169 -0
  75. cudf_polars/py.typed +0 -0
  76. cudf_polars/testing/__init__.py +8 -0
  77. cudf_polars/testing/asserts.py +448 -0
  78. cudf_polars/testing/io.py +122 -0
  79. cudf_polars/testing/plugin.py +236 -0
  80. cudf_polars/typing/__init__.py +219 -0
  81. cudf_polars/utils/__init__.py +8 -0
  82. cudf_polars/utils/config.py +741 -0
  83. cudf_polars/utils/conversion.py +40 -0
  84. cudf_polars/utils/dtypes.py +118 -0
  85. cudf_polars/utils/sorting.py +53 -0
  86. cudf_polars/utils/timer.py +39 -0
  87. cudf_polars/utils/versions.py +27 -0
  88. cudf_polars_cu13-25.10.0.dist-info/METADATA +136 -0
  89. cudf_polars_cu13-25.10.0.dist-info/RECORD +92 -0
  90. cudf_polars_cu13-25.10.0.dist-info/WHEEL +5 -0
  91. cudf_polars_cu13-25.10.0.dist-info/licenses/LICENSE +201 -0
  92. cudf_polars_cu13-25.10.0.dist-info/top_level.txt +1 -0
cudf_polars/testing/io.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""IO testing utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+__all__: list[str] = ["make_partitioned_source"]
+
+
+def make_partitioned_source(
+    df: pl.DataFrame,
+    path: str | Path,
+    fmt: Literal["csv", "ndjson", "parquet", "chunked_parquet"],
+    *,
+    n_files: int = 1,
+    row_group_size: int | None = None,
+    write_kwargs: dict | None = None,
+) -> None:
+    """
+    Write the Polars DataFrame to one or more files of the desired format.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to write.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+    fmt : Literal["csv", "ndjson", "parquet", "chunked_parquet"]
+        The format to write in.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+    row_group_size : int, optional
+        Only used for Parquet. Specifies the row group size per file.
+    write_kwargs : dict, optional
+        Additional keyword arguments to pass to the write_* functions.
+    """
+    path = Path(path)
+    write_kwargs = write_kwargs or {}
+
+    def write(part: pl.DataFrame, file_path: Path) -> None:
+        match fmt:
+            case "csv":
+                part.write_csv(file_path, **write_kwargs)
+            case "ndjson":
+                part.write_ndjson(file_path, **write_kwargs)
+            case "parquet" | "chunked_parquet":
+                part.write_parquet(
+                    file_path,
+                    row_group_size=row_group_size or (len(part) // 2),
+                    **write_kwargs,
+                )
+            case _:
+                raise ValueError(f"Unsupported format: {fmt}")
+
+    if n_files == 1:
+        if path.is_dir():
+            path = path / f"part.0.{fmt}"
+        write(df, path)
+    else:
+        stride = len(df) // n_files
+        for i, part in enumerate(df.iter_slices(stride)):
+            file_path = path / f"part.{i}.{fmt}"
+            write(part, file_path)
+
+
+def make_lazy_frame(
+    df: pl.DataFrame,
+    fmt: Literal["csv", "parquet", "frame"],
+    *,
+    path: str | Path | None = None,
+    n_files: int = 1,
+    n_rows: int | None = None,
+) -> pl.LazyFrame:
+    """
+    Return a pl.LazyFrame built from a pl.DataFrame.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        The input DataFrame to convert to a LazyFrame.
+    path : str | pathlib.Path
+        The base path to write the file(s) to.
+        This option is ignored if fmt is "frame".
+    fmt : Literal["parquet", "csv", "frame"]
+        The format to use for IO.
+    n_files : int, default 1
+        If greater than 1, splits the data into multiple files.
+        This option is ignored if fmt is "frame".
+    n_rows : int, optional
+        Slice to apply to the final LazyFrame before returning.
+    """
+    from cudf_polars.experimental.io import _clear_source_info_cache
+
+    _clear_source_info_cache()
+
+    if fmt == "frame":
+        if n_rows is not None:
+            return df.slice(0, n_rows).lazy()
+        return df.lazy()
+    else:
+        assert path is not None, f"path is required for fmt={fmt}."
+        row_group_size: int | None = None
+        if fmt == "parquet":
+            read = pl.scan_parquet
+            row_group_size = 10
+        elif fmt == "csv":
+            read = pl.scan_csv
+        else:  # pragma: no cover
+            raise ValueError(f"Unsupported format: {fmt}")
+        make_partitioned_source(
+            df, path, fmt=fmt, n_files=n_files, row_group_size=row_group_size
+        )
+        return read(path, n_rows=n_rows) if n_rows is not None else read(path)
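For orientation, the sketch below exercises make_partitioned_source from the file above. It is not part of the package; the DataFrame contents, temporary directory, and glob pattern are illustrative assumptions.

    # Hypothetical usage of cudf_polars.testing.io.make_partitioned_source.
    import tempfile
    from pathlib import Path

    import polars as pl

    from cudf_polars.testing.io import make_partitioned_source

    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})

    with tempfile.TemporaryDirectory() as tmp:
        # n_files=2 splits the frame into part.0.parquet and part.1.parquet.
        make_partitioned_source(df, Path(tmp), "parquet", n_files=2)
        # Scan the parts back lazily; a glob picks up both files.
        lf = pl.scan_parquet(f"{tmp}/part.*.parquet")
        assert lf.collect().height == 4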
cudf_polars/testing/plugin.py
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plugin for running polars test suite setting GPU engine as default."""
+
+from __future__ import annotations
+
+from functools import partialmethod
+from typing import TYPE_CHECKING
+
+import pytest
+
+import polars
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Add plugin-specific options."""
+    group = parser.getgroup(
+        "cudf-polars", "Plugin to set GPU as default engine for polars tests"
+    )
+    group.addoption(
+        "--cudf-polars-no-fallback",
+        action="store_true",
+        help="Turn off fallback to CPU when running tests (default use fallback)",
+    )
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Enable use of this module as a pytest plugin to enable GPU collection."""
+    no_fallback = config.getoption("--cudf-polars-no-fallback")
+    if no_fallback:
+        collect = polars.LazyFrame.collect
+        engine = polars.GPUEngine(raise_on_fail=no_fallback)
+        # https://github.com/python/mypy/issues/2427
+        polars.LazyFrame.collect = partialmethod(collect, engine=engine)  # type: ignore[method-assign,assignment]
+    else:
+        polars.Config.set_engine_affinity("gpu")
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*GPU engine does not support streaming or background collection",
+    )
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*Query execution with GPU not possible",
+    )
+
+
+EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
+    "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
+    "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_schema_evolution_nested_struct_field_19915": "Need to expose hive partitioning",
+    "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
+    "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp_nested": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df2-sorts2-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df3-b-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df4-sorts4-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df0-a-io_type3]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type0]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type1]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_partition.py::test_partition_to_memory_sort_by[df1-a-io_type2]": "partition sinks not yet supported in standard engine.",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
+    "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394",
    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_field_overwrites_metadata": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_flat": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.",
+    "tests/unit/io/test_parquet_field_overwrites.py::test_required_struct": "cannot serialize in-memory sink target.",
+    "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed",
+    "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed",
+    "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
+    "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU",
+    "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match",
+    "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "polars returns decimal column with precision=None",
+    "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
+    "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
+    "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
+    "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
+    "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[False]": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[True]": "Incorrect broadcasting of literals in groupby-agg",
+    "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
+    "tests/unit/operations/test_join.py::test_join_filter_pushdown_iejoin": "Row order differs due to multiple matches per left row index; join results are correct but unsorted",
+    "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU",
+    "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong",
+    "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv",
+    "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception",
+    "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
+    "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630",
+    "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
+    "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU",
+    "tests/unit/io/test_scan_row_deletion.py::test_scan_row_deletion_skips_file_with_all_rows_deleted": "The test intentionally corrupts the parquet file, so we cannot read the row count from the header.",
+    "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match",
+    "tests/unit/functions/range/test_linear_space.py::test_linear_space_date": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_temporal.py::test_implicit_temporal_strings[dt IN ('1960-01-07','2077-01-01','2222-02-22')-expected15]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_operators.py::test_in_not_in[dt NOT IN ('1950-12-24', '1997-07-05')]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/sql/test_operators.py::test_in_not_in[dt IN ('2020-10-10', '2077-03-18')]": "Needs https://github.com/pola-rs/polars/issues/23020",
+    "tests/unit/datatypes/test_struct.py::test_struct_agg_all": "Needs nested list[struct] support",
+    "tests/unit/constructors/test_structs.py::test_constructor_non_strict_schema_17956": "Needs nested list[struct] support",
+    "tests/unit/io/test_delta.py::test_read_delta_arrow_map_type": "Needs nested list[struct] support",
+    "tests/unit/datatypes/test_struct.py::test_struct_null_cast": "pylibcudf.Scalar does not support struct scalars",
+    "tests/unit/datatypes/test_struct.py::test_struct_outer_nullability_zip_18119": "pylibcudf.Scalar does not support struct scalars",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-none]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899",
+    "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899",
+}
+
+
+TESTS_TO_SKIP: Mapping[str, str] = {
+    "tests/unit/operations/test_profile.py::test_profile_with_cse": "Shape assertion won't match",
+    # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
+    # for obsolete timezone names. However, the chrono_tz package that
+    # polars uses doesn't read /usr/share/zoneinfo, instead packaging
+    # the current zoneinfo database from IANA. Consequently, when this
+    # hypothesis-generated test runs and generates timezones from the
+    # available zoneinfo-reported timezones, we can get an error from
+    # polars that the requested timezone is unknown.
+    # Since this is random, just skip it, rather than xfailing.
+    "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # Tests performance difference of CPU engine
+    "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine",
+    "tests/unit/operations/namespaces/list/test_list.py::test_list_struct_field_perf": "Tests CPU Engine perf",
+    "tests/benchmark/test_with_columns.py::test_with_columns_quadratic_19503": "Tests performance bug in CPU engine",
+    # The test may segfault with the legacy streaming engine. We should
+    # remove this skip when all polars tests use the new streaming engine.
+    "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
+    # Fails in CI, but passes locally
+    "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
+    # Remove when polars supports Pydantic V3
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_pydantic_2x": "pydantic deprecation warning",
+    "tests/unit/constructors/test_constructors.py::test_init_structured_objects_nested[_TestFooPD-_TestBarPD-_TestBazPD]": "pydantic deprecation warning",
+    "tests/unit/series/test_series.py::test_init_structured_objects": "pydantic deprecation warning",
+    "tests/unit/series/test_describe.py::test_series_describe_float": "https://github.com/rapidsai/cudf/issues/19324",
+    "tests/unit/series/test_describe.py::test_series_describe_int": "https://github.com/rapidsai/cudf/issues/19324",
+    "tests/unit/streaming/test_streaming.py::test_streaming_apply": "https://github.com/pola-rs/polars/issues/22558",
+    # New iceberg release causes this test to fail. We can remove in the next polars version bump: https://github.com/rapidsai/cudf/pull/19912
+    "tests/unit/io/test_iceberg.py::test_fill_missing_fields_with_identity_partition_values[False]": "https://github.com/pola-rs/polars/pull/24456",
+}
+
+
+def pytest_collection_modifyitems(
+    session: pytest.Session, config: pytest.Config, items: list[pytest.Item]
+) -> None:
+    """Mark known failing tests."""
+    if config.getoption("--cudf-polars-no-fallback"):
+        # Don't xfail tests if running without fallback
+        return
+    for item in items:
+        if (reason := TESTS_TO_SKIP.get(item.nodeid, None)) is not None:
+            item.add_marker(pytest.mark.skip(reason=reason))
+        elif (entry := EXPECTED_FAILURES.get(item.nodeid, None)) is not None:
+            if isinstance(entry, tuple):
+                # the second entry in the tuple is the condition to xfail on
+                reason, condition = entry
+                item.add_marker(
+                    pytest.mark.xfail(
+                        condition=condition,
+                        reason=reason,
+                    ),
+                )
+            else:
+                item.add_marker(pytest.mark.xfail(reason=entry))
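This module is a pytest plugin, typically loaded when running the upstream polars test suite (for example via pytest's -p option, e.g. -p cudf_polars.testing.plugin). Note that EXPECTED_FAILURES accepts either a bare reason string or a (reason, condition) tuple; pytest_collection_modifyitems unpacks the tuple and applies the xfail marker only when the condition holds. The sketch below shows one entry of each kind; the second node id and its condition are invented for illustration and are not part of the shipped mapping.

    # Hypothetical entries only; structure mirrors EXPECTED_FAILURES above.
    import sys

    example_failures: dict[str, str | tuple[str, bool]] = {
        # Unconditional: always marked xfail with this reason.
        "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
        # Conditional: xfail only when the boolean is true (here, Python >= 3.13).
        "tests/unit/example/test_example.py::test_case": (
            "only expected to fail on newer Python",
            sys.version_info >= (3, 13),
        ),
    }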
cudf_polars/typing/__init__.py
@@ -0,0 +1,219 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Typing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+import sys
+from collections.abc import Hashable, MutableMapping
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    NewType,
+    Protocol,
+    TypeVar,
+    Union,
+)
+
+import polars as pl
+import polars.datatypes
+from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import TypeAlias
+
+    import pylibcudf as plc
+
+    from cudf_polars.containers import DataFrame, DataType
+    from cudf_polars.dsl import nodebase
+
+
+if sys.version_info >= (3, 11):
+    # Inheriting from TypedDict + Generic added in python 3.11
+    from typing import TypedDict  # pragma: no cover
+else:
+    from typing_extensions import TypedDict  # pragma: no cover
+
+
+__all__: list[str] = [
+    "ClosedInterval",
+    "ColumnHeader",
+    "ColumnOptions",
+    "DataFrameHeader",
+    "GenericTransformer",
+    "NodeTraverser",
+    "OptimizationArgs",
+    "PolarsExpr",
+    "PolarsIR",
+    "Schema",
+    "Slice",
+]
+
+PolarsIR: TypeAlias = Union[
+    pl_ir.PythonScan,
+    pl_ir.Scan,
+    pl_ir.Cache,
+    pl_ir.DataFrameScan,
+    pl_ir.Select,
+    pl_ir.GroupBy,
+    pl_ir.Join,
+    pl_ir.HStack,
+    pl_ir.Distinct,
+    pl_ir.Sort,
+    pl_ir.Slice,
+    pl_ir.Filter,
+    pl_ir.SimpleProjection,
+    pl_ir.MapFunction,
+    pl_ir.Union,
+    pl_ir.HConcat,
+    pl_ir.ExtContext,
+]
+
+PolarsExpr: TypeAlias = Union[
+    pl_expr.Function,
+    pl_expr.Window,
+    pl_expr.Literal,
+    pl_expr.Sort,
+    pl_expr.SortBy,
+    pl_expr.Gather,
+    pl_expr.Filter,
+    pl_expr.Cast,
+    pl_expr.Column,
+    pl_expr.Agg,
+    pl_expr.BinaryExpr,
+    pl_expr.Len,
+    pl_expr.PyExprIR,
+]
+
+PolarsSchema: TypeAlias = dict[str, pl.DataType]
+Schema: TypeAlias = dict[str, "DataType"]
+
+PolarsDataType: TypeAlias = polars.datatypes.DataTypeClass | polars.datatypes.DataType
+
+Slice: TypeAlias = tuple[int, int | None]
+
+CSECache: TypeAlias = MutableMapping[int, tuple["DataFrame", int]]
+
+ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"]
+
+Duration = NewType("Duration", tuple[int, int, int, int, bool, bool])
+
+
+class NodeTraverser(Protocol):
+    """Abstract protocol for polars NodeTraverser."""
+
+    def get_node(self) -> int:
+        """Return current plan node id."""
+        ...
+
+    def set_node(self, n: int) -> None:
+        """Set the current plan node to n."""
+        ...
+
+    def view_current_node(self) -> PolarsIR:
+        """Convert current plan node to python rep."""
+        ...
+
+    def get_schema(self) -> PolarsSchema:
+        """Get the schema of the current plan node."""
+        ...
+
+    def get_dtype(self, n: int) -> pl.DataType:
+        """Get the datatype of the given expression id."""
+        ...
+
+    def view_expression(self, n: int) -> PolarsExpr:
+        """Convert the given expression to python rep."""
+        ...
+
+    def version(self) -> tuple[int, int]:
+        """The IR version as `(major, minor)`."""
+        ...
+
+    def set_udf(
+        self,
+        callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame],
+    ) -> None:
+        """Set the callback replacing the current node in the plan."""
+        ...
+
+
+OptimizationArgs: TypeAlias = Literal[
+    "type_coercion",
+    "predicate_pushdown",
+    "projection_pushdown",
+    "simplify_expression",
+    "slice_pushdown",
+    "comm_subplan_elim",
+    "comm_subexpr_elim",
+    "cluster_with_columns",
+    "no_optimization",
+]
+
+
+U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True)
+V_co = TypeVar("V_co", covariant=True)
+StateT_co = TypeVar("StateT_co", covariant=True)
+NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]")
+
+
+class GenericTransformer(Protocol[U_contra, V_co, StateT_co]):
+    """Abstract protocol for recursive visitors."""
+
+    def __call__(self, __value: U_contra) -> V_co:
+        """Apply the visitor to the node."""
+        ...
+
+    @property
+    def state(self) -> StateT_co:
+        """Transform-specific immutable state."""
+        ...
+
+
+class ColumnOptions(TypedDict):
+    """
+    Column constructor options.
+
+    Notes
+    -----
+    Used to serialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+    dtype: str
+
+
+class DeserializedColumnOptions(TypedDict):
+    """
+    Deserialized Column constructor options.
+
+    Notes
+    -----
+    Used to deserialize Column and DataFrame containers.
+    """
+
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+    name: str | None
+    dtype: DataType
+
+
+class ColumnHeader(TypedDict):
+    """Column serialization header."""
+
+    column_kwargs: ColumnOptions
+    frame_count: int
+
+
+class DataFrameHeader(TypedDict):
+    """DataFrame serialization header."""
+
+    columns_kwargs: list[ColumnOptions]
+    frame_count: int
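GenericTransformer above is a structural protocol: any callable object that maps an input node to a result and exposes a read-only state property conforms to it. The sketch below is not from the package; the class name and its string-based "nodes" are invented purely to illustrate the shape of a conforming visitor.

    # Hypothetical object satisfying GenericTransformer[str, int, dict].
    from typing import Any


    class LengthVisitor:
        """Toy visitor: maps a string "node" to its length, carrying fixed state."""

        def __init__(self, state: dict[str, Any]) -> None:
            self._state = state

        def __call__(self, value: str) -> int:
            # Visit a node; real transformers in cudf_polars walk IR/expression nodes.
            return len(value)

        @property
        def state(self) -> dict[str, Any]:
            return self._state


    visitor = LengthVisitor({"depth": 0})
    assert visitor("Select") == 6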
cudf_polars/utils/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities."""
+
+from __future__ import annotations
+
+__all__: list[str] = []