cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES.
  # SPDX-License-Identifier: Apache-2.0
  """Sorting Logic."""

  from __future__ import annotations

+ from functools import partial
  from typing import TYPE_CHECKING, Any, Literal, TypedDict

  import polars as pl
@@ -22,11 +23,19 @@ from cudf_polars.experimental.repartition import Repartition
  from cudf_polars.experimental.shuffle import _simple_shuffle_graph
  from cudf_polars.experimental.utils import _concat, _fallback_inform, _lower_ir_fallback
  from cudf_polars.utils.config import ShuffleMethod
+ from cudf_polars.utils.cuda_stream import (
+ get_dask_cuda_stream,
+ get_joined_cuda_stream,
+ join_cuda_streams,
+ )

  if TYPE_CHECKING:
  from collections.abc import MutableMapping, Sequence

+ from rmm.pylibrmm.stream import Stream
+
  from cudf_polars.dsl.expr import NamedExpr
+ from cudf_polars.dsl.ir import IRExecutionContext
  from cudf_polars.experimental.dispatch import LowerIRTransformer
  from cudf_polars.typing import Schema

@@ -37,6 +46,7 @@ def find_sort_splits(
  my_part_id: int,
  column_order: Sequence[plc.types.Order],
  null_order: Sequence[plc.types.NullOrder],
+ stream: Stream,
  ) -> list[int]:
  """
  Find local sort splits given all (global) split candidates.
@@ -59,6 +69,10 @@ def find_sort_splits(
  The order in which tbl is sorted.
  null_order
  The null order in which tbl is sorted.
+ stream
+ CUDA stream used for device memory operations and kernel launches.
+ The values in both ``tbl`` and ``sort_boundaries`` must be valid on
+ ``stream``.

  Returns
  -------
@@ -69,28 +83,44 @@ def find_sort_splits(

  # We now need to find the local split points. To do this, first split out
  # the partition id and the local row number of the final split values
- *sort_boundaries, split_part_id, split_local_row = sort_boundaries.columns()
- sort_boundaries = plc.Table(sort_boundaries)
+ *boundary_cols, split_part_id, split_local_row = sort_boundaries.columns()
+ sort_boundaries = plc.Table(boundary_cols)
  # Now we find the first and last row in the local table corresponding to the split value
  # (first and last, because there may be multiple rows with the same split value)
  split_first_col = plc.search.lower_bound(
- tbl, sort_boundaries, column_order, null_order
+ tbl,
+ sort_boundaries,
+ column_order,
+ null_order,
+ stream=stream,
  )
  split_last_col = plc.search.upper_bound(
- tbl, sort_boundaries, column_order, null_order
+ tbl,
+ sort_boundaries,
+ column_order,
+ null_order,
+ stream=stream,
  )
  # And convert to list for final processing
- split_first_col = pl.Series(split_first_col).to_list()
- split_last_col = pl.Series(split_last_col).to_list()
- split_part_id = pl.Series(split_part_id).to_list()
- split_local_row = pl.Series(split_local_row).to_list()
+ # The type ignores are for cross-library boundaries: plc.Column -> pl.Series
+ # These work at runtime via the Arrow C Data Interface protocol
+ # TODO: Find a way for pylibcudf types to show they export the Arrow protocol
+ # (mypy wasn't happy with a custom protocol)
+ split_first_list = pl.Series(split_first_col).to_list()
+ split_last_list = pl.Series(split_last_col).to_list()
+ split_part_id_list = pl.Series(split_part_id).to_list()
+ split_local_row_list = pl.Series(split_local_row).to_list()

  # Find the final split points. This is slightly tricky because of the possibility
  # of equal values, which is why we need the part_id and local_row.
  # Consider for example the case when all data is equal.
  split_points = []
  for first, last, part_id, local_row in zip(
- split_first_col, split_last_col, split_part_id, split_local_row, strict=False
+ split_first_list,
+ split_last_list,
+ split_part_id_list,
+ split_local_row_list,
+ strict=False,
  ):
  if part_id < my_part_id:
  # Local data is globally later so split at first valid row.
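
The hunk above threads an explicit CUDA stream through the lower_bound/upper_bound searches that locate each boundary value in the locally sorted table. A minimal, self-contained sketch of that search step follows; it reuses the pylibcudf search calls shown in the hunk, while the data values, the dtype construction, and the omission of the stream argument are illustrative only, not taken from the package:

import polars as pl
import pylibcudf as plc

# A locally sorted key column and two global boundary values to locate.
keys = plc.Column.from_iterable_of_py([1, 3, 3, 3, 7, 9], plc.DataType(plc.TypeId.INT64))
bounds = plc.Column.from_iterable_of_py([3, 8], plc.DataType(plc.TypeId.INT64))
order = [plc.types.Order.ASCENDING]
nulls = [plc.types.NullOrder.AFTER]

# First and last possible insertion points for each boundary value.
first = plc.search.lower_bound(plc.Table([keys]), plc.Table([bounds]), order, nulls)
last = plc.search.upper_bound(plc.Table([keys]), plc.Table([bounds]), order, nulls)

print(pl.Series(first).to_list())  # [1, 5]: first row that is >= each boundary
print(pl.Series(last).to_list())   # [4, 5]: first row that is > each boundary

Because equal keys can straddle a boundary (first != last for the value 3 above), find_sort_splits also carries the partition id and local row number of each boundary so that every rank picks a consistent global split point.
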
@@ -126,31 +156,42 @@ def _select_local_split_candidates(
  [
  *df.columns,
  Column(
- plc.column_factories.make_empty_column(part_id_dtype.plc),
+ plc.column_factories.make_empty_column(
+ part_id_dtype.plc_type, stream=df.stream
+ ),
  dtype=part_id_dtype,
  name=next(name_gen),
  ),
  Column(
- plc.column_factories.make_empty_column(part_id_dtype.plc),
+ plc.column_factories.make_empty_column(
+ part_id_dtype.plc_type, stream=df.stream
+ ),
  dtype=part_id_dtype,
  name=next(name_gen),
  ),
- ]
+ ],
+ stream=df.stream,
  )

  candidates = [i * df.num_rows // num_partitions for i in range(num_partitions)]
- row_id = plc.Column.from_iterable_of_py(candidates, part_id_dtype.plc)
+ row_id = plc.Column.from_iterable_of_py(
+ candidates, part_id_dtype.plc_type, stream=df.stream
+ )

- res = plc.copying.gather(df.table, row_id, plc.copying.OutOfBoundsPolicy.DONT_CHECK)
+ res = plc.copying.gather(
+ df.table, row_id, plc.copying.OutOfBoundsPolicy.DONT_CHECK, stream=df.stream
+ )
  part_id = plc.Column.from_scalar(
- plc.Scalar.from_py(my_part_id, part_id_dtype.plc),
+ plc.Scalar.from_py(my_part_id, part_id_dtype.plc_type, stream=df.stream),
  len(candidates),
+ stream=df.stream,
  )

  return DataFrame.from_table(
  plc.Table([*res.columns(), part_id, row_id]),
  [*df.column_names, next(name_gen), next(name_gen)],
  [*df.dtypes, part_id_dtype, part_id_dtype],
+ stream=df.stream,
  )


@@ -159,7 +200,7 @@ def _get_final_sort_boundaries(
  column_order: Sequence[plc.types.Order],
  null_order: Sequence[plc.types.NullOrder],
  num_partitions: int,
- ) -> plc.Table:
+ ) -> DataFrame:
  """
  Find the global sort split boundaries from all gathered split candidates.

@@ -186,22 +227,28 @@ def _get_final_sort_boundaries(
  # split candidates has the additional partition_id and row_number columns
  column_order + [plc.types.Order.ASCENDING] * 2,
  null_order + [plc.types.NullOrder.AFTER] * 2,
+ stream=sort_boundaries_candidates.stream,
  )
  selected_candidates = plc.Column.from_iterable_of_py(
  [
  i * sorted_candidates.num_rows() // num_partitions
  for i in range(1, num_partitions)
- ]
+ ],
+ stream=sort_boundaries_candidates.stream,
  )
  # Get the actual values at which we will split the data
  sort_boundaries = plc.copying.gather(
- sorted_candidates, selected_candidates, plc.copying.OutOfBoundsPolicy.DONT_CHECK
+ sorted_candidates,
+ selected_candidates,
+ plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+ stream=sort_boundaries_candidates.stream,
  )

  return DataFrame.from_table(
  sort_boundaries,
  sort_boundaries_candidates.column_names,
  sort_boundaries_candidates.dtypes,
+ stream=sort_boundaries_candidates.stream,
  )


@@ -211,6 +258,7 @@ def _sort_boundaries_graph(
  column_order: Sequence[plc.types.Order],
  null_order: Sequence[plc.types.NullOrder],
  count: int,
+ context: IRExecutionContext,
  ) -> tuple[str, MutableMapping[Any, Any]]:
  """Graph to get the boundaries from all partitions."""
  local_boundaries_name = f"sort-boundaries_local-{name_in}"
@@ -229,7 +277,7 @@ def _sort_boundaries_graph(
  )
  _concat_list.append((local_boundaries_name, part_id))

- graph[concat_boundaries_name] = (_concat, *_concat_list)
+ graph[concat_boundaries_name] = (partial(_concat, context=context), *_concat_list)
  graph[global_boundaries_name] = (
  _get_final_sort_boundaries,
  concat_boundaries_name,
@@ -276,6 +324,11 @@ class RMPFIntegrationSortedShuffle: # pragma: no cover
  context = get_worker_context()

  by = options["by"]
+ data_streams = [
+ df.stream,
+ sort_boundaries.stream,
+ ]
+ stream = get_joined_cuda_stream(get_dask_cuda_stream, upstreams=data_streams)

  splits = find_sort_splits(
  df.select(by).table,
@@ -283,15 +336,20 @@ class RMPFIntegrationSortedShuffle: # pragma: no cover
  partition_id,
  options["order"],
  options["null_order"],
+ stream=stream,
  )
  packed_inputs = split_and_pack(
  df.table,
  splits=splits,
  br=context.br,
- stream=DEFAULT_STREAM,
+ stream=stream,
  )
+ # TODO: figure out handoff with rapidsmpf
+ # https://github.com/rapidsai/cudf/issues/20337
  shuffler.insert_chunks(packed_inputs)

+ join_cuda_streams(downstreams=data_streams, upstreams=[stream])
+
  @staticmethod
  def extract_partition(
  partition_id: int,
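
The added lines in the two hunks above follow a fork/join pattern around CUDA streams: a working stream is obtained that is ordered after the streams that produced the inputs, all device work for the step is launched on it, and the input streams are then joined back so that later users of the inputs wait on that work. A condensed sketch of the pattern is below; the helper names and call shapes are copied from the hunks above, their definitions live in the new cudf_polars/utils/cuda_stream.py module (not shown in this excerpt), and do_device_work is a placeholder:

from cudf_polars.utils.cuda_stream import (
    get_dask_cuda_stream,
    get_joined_cuda_stream,
    join_cuda_streams,
)


def shuffle_step(df, sort_boundaries, do_device_work):
    # Fork: obtain a stream that is ordered after every input's stream.
    data_streams = [df.stream, sort_boundaries.stream]
    stream = get_joined_cuda_stream(get_dask_cuda_stream, upstreams=data_streams)

    # Launch all device work for this step on that single stream, e.g.
    # find_sort_splits(..., stream=stream) and split_and_pack(..., stream=stream)
    # in insert_partition above.
    result = do_device_work(df, sort_boundaries, stream=stream)

    # Join: make the input streams wait on the work just enqueued, so later
    # users of df / sort_boundaries remain correctly ordered.
    join_cuda_streams(downstreams=data_streams, upstreams=[stream])
    return result
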
@@ -316,8 +374,12 @@ class RMPFIntegrationSortedShuffle: # pragma: no cover
  column_names = options["column_names"]
  column_dtypes = options["column_dtypes"]

+ stream = DEFAULT_STREAM
+
  # TODO: When sorting, this step should finalize with a merge (unless we
  # require stability, as cudf merge is not stable).
+ # TODO: figure out handoff with rapidsmpf
+ # https://github.com/rapidsai/cudf/issues/20337
  return DataFrame.from_table(
  unpack_and_concat(
  unspill_partitions(
@@ -327,10 +389,11 @@ class RMPFIntegrationSortedShuffle: # pragma: no cover
  statistics=context.statistics,
  ),
  br=context.br,
- stream=DEFAULT_STREAM,
+ stream=stream,
  ),
  column_names,
  column_dtypes,
+ stream=stream,
  )


@@ -359,7 +422,11 @@ def _sort_partition_dataframe(
  """
  if df.num_rows == 0: # pragma: no cover
  # Fast path for empty DataFrame
- return {i: df for i in range(partition_count)}
+ return dict.fromkeys(range(partition_count), df)
+
+ stream = get_joined_cuda_stream(
+ get_dask_cuda_stream, upstreams=(df.stream, sort_boundaries.stream)
+ )

  splits = find_sort_splits(
  df.select(options["by"]).table,
@@ -367,6 +434,7 @@ def _sort_partition_dataframe(
  partition_id,
  options["order"],
  options["null_order"],
+ stream=stream,
  )

  # Split and return the partitioned result
@@ -375,8 +443,9 @@ def _sort_partition_dataframe(
  split,
  df.column_names,
  df.dtypes,
+ stream=df.stream,
  )
- for i, split in enumerate(plc.copying.split(df.table, splits))
+ for i, split in enumerate(plc.copying.split(df.table, splits, stream=stream))
  }


@@ -428,6 +497,8 @@ class ShuffleSorted(IR):
  null_order: tuple[plc.types.NullOrder, ...],
  shuffle_method: ShuffleMethod,
  df: DataFrame,
+ *,
+ context: IRExecutionContext,
  ) -> DataFrame: # pragma: no cover
  """Evaluate and return a dataframe."""
  # Single-partition ShuffleSorted evaluation is a no-op
@@ -532,7 +603,9 @@ def _(

  @generate_ir_tasks.register(ShuffleSorted)
  def _(
- ir: ShuffleSorted, partition_info: MutableMapping[IR, PartitionInfo]
+ ir: ShuffleSorted,
+ partition_info: MutableMapping[IR, PartitionInfo],
+ context: IRExecutionContext,
  ) -> MutableMapping[Any, Any]:
  by = [ne.value.name for ne in ir.by if isinstance(ne.value, Col)]
  if len(by) != len(ir.by): # pragma: no cover
@@ -547,6 +620,7 @@ def _(
  ir.order,
  ir.null_order,
  partition_info[child].count,
+ context,
  )

  options = {
@@ -596,7 +670,7 @@ def _(

  # Simple task-based fall-back
  graph.update(
- _simple_shuffle_graph(
+ partial(_simple_shuffle_graph, context=context)(
  get_key_name(child),
  get_key_name(ir),
  partition_info[child].count,
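
Several hunks above use functools.partial to thread the new IRExecutionContext keyword into task graphs without changing the (callable, *args) task-tuple layout, e.g. (partial(_concat, context=context), *_concat_list). A minimal, generic illustration of that binding pattern (the callable and context below are stand-ins, not cudf-polars types):

from functools import partial


def concat(*dfs, context):
    # Stand-in for a task callable that now requires a keyword-only context.
    return {"parts": dfs, "context": context}


context = {"executor": "streaming"}  # stand-in for an IRExecutionContext

# Dask-style task tuple: (callable, *arguments). Binding the keyword with
# functools.partial keeps the positional tuple shape intact.
graph = {"concat-0": (partial(concat, context=context), "chunk-0", "chunk-1")}

func, *args = graph["concat-0"]
print(func(*args))  # {'parts': ('chunk-0', 'chunk-1'), 'context': {'executor': 'streaming'}}
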
@@ -8,9 +8,9 @@ from typing import TYPE_CHECKING, Any

  from dask.sizeof import sizeof
  from distributed import get_worker
- from rapidsmpf.buffer.buffer import MemoryType
  from rapidsmpf.integrations.dask.core import get_worker_context
  from rapidsmpf.integrations.dask.spilling import SpillableWrapper
+ from rapidsmpf.memory.buffer import MemoryType
  from cudf_polars.containers import DataFrame


@@ -37,6 +37,7 @@ from cudf_polars.experimental.dispatch import (
  from cudf_polars.experimental.expressions import _SUPPORTED_AGGS
  from cudf_polars.experimental.utils import _leaf_column_names
  from cudf_polars.utils import conversion
+ from cudf_polars.utils.cuda_stream import get_cuda_stream

  if TYPE_CHECKING:
  from collections.abc import Mapping, Sequence
@@ -47,7 +48,10 @@ if TYPE_CHECKING:
  from cudf_polars.utils.config import ConfigOptions, StatsPlanningOptions


- def collect_statistics(root: IR, config_options: ConfigOptions) -> StatsCollector:
+ def collect_statistics(
+ root: IR,
+ config_options: ConfigOptions,
+ ) -> StatsCollector:
  """
  Collect column statistics for a query.

@@ -607,7 +611,12 @@ def _(ir: IR, stats: StatsCollector, config_options: ConfigOptions) -> None:


  @update_column_stats.register(DataFrameScan)
- def _(ir: DataFrameScan, stats: StatsCollector, config_options: ConfigOptions) -> None:
+ def _(
+ ir: DataFrameScan,
+ stats: StatsCollector,
+ config_options: ConfigOptions,
+ ) -> None:
+ stream = get_cuda_stream()
  # Use datasource row-count estimate.
  if stats.column_stats[ir]:
  stats.row_count[ir] = next(
@@ -620,15 +629,23 @@ def _(ir: DataFrameScan, stats: StatsCollector, config_options: ConfigOptions) -
  for column_stats in stats.column_stats[ir].values():
  if column_stats.source_info.implied_unique_count.value is None:
  # We don't have a unique-count estimate, so we need to sample the data.
- source_unique_stats = column_stats.source_info.unique_stats(force=False)
+ source_unique_stats = column_stats.source_info.unique_stats(
+ force=False,
+ )
  if source_unique_stats.count.value is not None:
  column_stats.unique_count = source_unique_stats.count
  else:
  column_stats.unique_count = column_stats.source_info.implied_unique_count

+ stream.synchronize()
+

  @update_column_stats.register(Scan)
- def _(ir: Scan, stats: StatsCollector, config_options: ConfigOptions) -> None:
+ def _(
+ ir: Scan,
+ stats: StatsCollector,
+ config_options: ConfigOptions,
+ ) -> None:
  # Use datasource row-count estimate.
  if stats.column_stats[ir]:
  stats.row_count[ir] = next(
@@ -649,7 +666,9 @@ def _(ir: Scan, stats: StatsCollector, config_options: ConfigOptions) -> None:
  for column_stats in stats.column_stats[ir].values():
  if column_stats.source_info.implied_unique_count.value is None:
  # We don't have a unique-count estimate, so we need to sample the data.
- source_unique_stats = column_stats.source_info.unique_stats(force=False)
+ source_unique_stats = column_stats.source_info.unique_stats(
+ force=False,
+ )
  if source_unique_stats.count.value is not None:
  column_stats.unique_count = source_unique_stats.count
  elif (
@@ -20,15 +20,15 @@ if TYPE_CHECKING:

  from cudf_polars.containers import DataFrame
  from cudf_polars.dsl.expr import Expr
- from cudf_polars.dsl.ir import IR
+ from cudf_polars.dsl.ir import IR, IRExecutionContext
  from cudf_polars.experimental.base import ColumnStats
  from cudf_polars.experimental.dispatch import LowerIRTransformer
  from cudf_polars.utils.config import ConfigOptions


- def _concat(*dfs: DataFrame) -> DataFrame:
+ def _concat(*dfs: DataFrame, context: IRExecutionContext) -> DataFrame:
  # Concatenate a sequence of DataFrames vertically
- return Union.do_evaluate(None, *dfs)
+ return dfs[0] if len(dfs) == 1 else Union.do_evaluate(None, *dfs, context=context)


  def _fallback_inform(msg: str, config_options: ConfigOptions) -> None:
@@ -63,23 +63,41 @@ def _lower_ir_fallback(
  # those children will be collapsed with `Repartition`.
  from cudf_polars.experimental.repartition import Repartition

+ # TODO: (IMPORTANT) Since Repartition is a local operation,
+ # the current fallback logic will only work for one rank!
+ # For multiple ranks, we will need to AllGather the data
+ # on all ranks.
+ config_options = rec.state["config_options"]
+ assert config_options.executor.name == "streaming", (
+ "'in-memory' executor not supported in 'generate_ir_sub_network'"
+ )
+ if (
+ (rapidsmpf_engine := config_options.executor.runtime == "rapidsmpf")
+ and config_options.executor.scheduler == "distributed"
+ ): # pragma: no cover; Requires distributed
+ raise NotImplementedError(
+ "Fallback is not yet supported distributed execution "
+ "with the RAPIDS-MPF streaming runtime."
+ )
+
  # Lower children
  lowered_children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True)
  partition_info = reduce(operator.or_, _partition_info)

  # Ensure all children are single-partitioned
  children = []
- fallback = False
+ inform = False
  for c in lowered_children:
  child = c
- if partition_info[c].count > 1:
+ if multi_partitioned := partition_info[c].count > 1:
+ inform = True
+ if multi_partitioned or rapidsmpf_engine:
  # Fall-back logic
- fallback = True
  child = Repartition(child.schema, child)
  partition_info[child] = PartitionInfo(count=1)
  children.append(child)

- if fallback and msg:
+ if inform and msg:
  # Warn/raise the user if any children were collapsed
  # and the "fallback_mode" configuration is not "silent"
  _fallback_inform(msg, rec.state["config_options"])
@@ -28,9 +28,10 @@ __all__: list[str] = [
  ]

  # Will be overriden by `conftest.py` with the value from the `--executor`
- # and `--scheduler` command-line arguments
+ # and `--cluster` command-line arguments
  DEFAULT_EXECUTOR = "in-memory"
- DEFAULT_SCHEDULER = "synchronous"
+ DEFAULT_RUNTIME = "tasks"
+ DEFAULT_CLUSTER = "single"
  DEFAULT_BLOCKSIZE_MODE: Literal["small", "default"] = "default"


@@ -111,8 +112,8 @@ def assert_gpu_result_equal(

  # These keywords are correct, but mypy doesn't see that.
  # the 'misc' is for 'error: Keywords must be strings'
- expect = lazydf.collect(**final_polars_collect_kwargs) # type: ignore[call-overload,misc]
- got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[call-overload,misc]
+ expect = lazydf.collect(**final_polars_collect_kwargs) # type: ignore[misc, call-overload]
+ got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[misc, call-overload]

  assert_kwargs_bool: dict[str, bool] = {
  "check_row_order": check_row_order,
@@ -128,11 +129,14 @@ def assert_gpu_result_equal(
  else:
  tol_kwargs = {"rel_tol": rtol, "abs_tol": atol}

+ # the type checker errors with:
+ # Argument 4 to "assert_frame_equal" has incompatible type "**dict[str, float]"; expected "bool" [arg-type]
+ # which seems to be a bug in the type checker / type annotations.
  assert_frame_equal(
  expect,
  got,
  **assert_kwargs_bool,
- **tol_kwargs,
+ **tol_kwargs, # type: ignore[arg-type]
  )


@@ -202,7 +206,8 @@ def get_default_engine(
  executor_options: dict[str, Any] = {}
  executor = executor or DEFAULT_EXECUTOR
  if executor == "streaming":
- executor_options["scheduler"] = DEFAULT_SCHEDULER
+ executor_options["cluster"] = DEFAULT_CLUSTER
+ executor_options["runtime"] = DEFAULT_RUNTIME

  blocksize_mode = blocksize_mode or DEFAULT_BLOCKSIZE_MODE
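
The get_default_engine hunk above replaces the single scheduler default with separate cluster and runtime options for the streaming executor. A hedged sketch of how such a configuration might be assembled: the option names and default values come from this diff, while routing them through pl.GPUEngine's executor/executor_options keywords is an assumption here rather than code shown in this excerpt:

import polars as pl

# Defaults taken from the hunk above (DEFAULT_RUNTIME / DEFAULT_CLUSTER replace
# the old DEFAULT_SCHEDULER = "synchronous").
executor = "streaming"
executor_options = {"cluster": "single", "runtime": "tasks"}

# Assumption: GPUEngine accepts executor/executor_options keywords, as the
# cudf-polars engine configuration does; raise_on_fail also appears in
# assert_collect_raises below.
engine = pl.GPUEngine(
    executor=executor,
    executor_options=executor_options,
    raise_on_fail=True,
)

out = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum()).collect(engine=engine)
print(out)
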
@@ -289,7 +294,7 @@ def assert_collect_raises(
  )

  try:
- lazydf.collect(**final_polars_collect_kwargs) # type: ignore[call-overload,misc]
+ lazydf.collect(**final_polars_collect_kwargs) # type: ignore[misc, call-overload]
  except polars_except:
  pass
  except Exception as e:
@@ -302,7 +307,7 @@ def assert_collect_raises(

  engine = GPUEngine(raise_on_fail=True)
  try:
- lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[call-overload,misc]
+ lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[misc, call-overload]
  except cudf_except:
  pass
  except Exception as e:
cudf_polars/testing/io.py CHANGED
@@ -11,6 +11,7 @@ from typing import TYPE_CHECKING
  import polars as pl

  if TYPE_CHECKING:
+ from collections.abc import Callable
  from typing import Literal

  __all__: list[str] = ["make_partitioned_source"]
@@ -110,7 +111,7 @@ def make_lazy_frame(
  assert path is not None, f"path is required for fmt={fmt}."
  row_group_size: int | None = None
  if fmt == "parquet":
- read = pl.scan_parquet
+ read: Callable[..., pl.LazyFrame] = pl.scan_parquet
  row_group_size = 10
  elif fmt == "csv":
  read = pl.scan_csv