cudf-polars-cu13 25.10.0__py3-none-any.whl → 26.2.0__py3-none-any.whl

This diff compares publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Files changed (76)
  1. cudf_polars/GIT_COMMIT +1 -1
  2. cudf_polars/VERSION +1 -1
  3. cudf_polars/callback.py +60 -15
  4. cudf_polars/containers/column.py +137 -77
  5. cudf_polars/containers/dataframe.py +123 -34
  6. cudf_polars/containers/datatype.py +134 -13
  7. cudf_polars/dsl/expr.py +0 -2
  8. cudf_polars/dsl/expressions/aggregation.py +80 -28
  9. cudf_polars/dsl/expressions/binaryop.py +34 -14
  10. cudf_polars/dsl/expressions/boolean.py +110 -37
  11. cudf_polars/dsl/expressions/datetime.py +59 -30
  12. cudf_polars/dsl/expressions/literal.py +11 -5
  13. cudf_polars/dsl/expressions/rolling.py +460 -119
  14. cudf_polars/dsl/expressions/selection.py +9 -8
  15. cudf_polars/dsl/expressions/slicing.py +1 -1
  16. cudf_polars/dsl/expressions/string.py +256 -114
  17. cudf_polars/dsl/expressions/struct.py +19 -7
  18. cudf_polars/dsl/expressions/ternary.py +33 -3
  19. cudf_polars/dsl/expressions/unary.py +126 -64
  20. cudf_polars/dsl/ir.py +1053 -350
  21. cudf_polars/dsl/to_ast.py +30 -13
  22. cudf_polars/dsl/tracing.py +194 -0
  23. cudf_polars/dsl/translate.py +307 -107
  24. cudf_polars/dsl/utils/aggregations.py +43 -30
  25. cudf_polars/dsl/utils/reshape.py +14 -2
  26. cudf_polars/dsl/utils/rolling.py +12 -8
  27. cudf_polars/dsl/utils/windows.py +35 -20
  28. cudf_polars/experimental/base.py +55 -2
  29. cudf_polars/experimental/benchmarks/pdsds.py +12 -126
  30. cudf_polars/experimental/benchmarks/pdsh.py +792 -2
  31. cudf_polars/experimental/benchmarks/utils.py +596 -39
  32. cudf_polars/experimental/dask_registers.py +47 -20
  33. cudf_polars/experimental/dispatch.py +9 -3
  34. cudf_polars/experimental/distinct.py +2 -0
  35. cudf_polars/experimental/explain.py +15 -2
  36. cudf_polars/experimental/expressions.py +30 -15
  37. cudf_polars/experimental/groupby.py +25 -4
  38. cudf_polars/experimental/io.py +156 -124
  39. cudf_polars/experimental/join.py +53 -23
  40. cudf_polars/experimental/parallel.py +68 -19
  41. cudf_polars/experimental/rapidsmpf/__init__.py +8 -0
  42. cudf_polars/experimental/rapidsmpf/collectives/__init__.py +9 -0
  43. cudf_polars/experimental/rapidsmpf/collectives/allgather.py +90 -0
  44. cudf_polars/experimental/rapidsmpf/collectives/common.py +96 -0
  45. cudf_polars/experimental/rapidsmpf/collectives/shuffle.py +253 -0
  46. cudf_polars/experimental/rapidsmpf/core.py +488 -0
  47. cudf_polars/experimental/rapidsmpf/dask.py +172 -0
  48. cudf_polars/experimental/rapidsmpf/dispatch.py +153 -0
  49. cudf_polars/experimental/rapidsmpf/io.py +696 -0
  50. cudf_polars/experimental/rapidsmpf/join.py +322 -0
  51. cudf_polars/experimental/rapidsmpf/lower.py +74 -0
  52. cudf_polars/experimental/rapidsmpf/nodes.py +735 -0
  53. cudf_polars/experimental/rapidsmpf/repartition.py +216 -0
  54. cudf_polars/experimental/rapidsmpf/union.py +115 -0
  55. cudf_polars/experimental/rapidsmpf/utils.py +374 -0
  56. cudf_polars/experimental/repartition.py +9 -2
  57. cudf_polars/experimental/select.py +177 -14
  58. cudf_polars/experimental/shuffle.py +46 -12
  59. cudf_polars/experimental/sort.py +100 -26
  60. cudf_polars/experimental/spilling.py +1 -1
  61. cudf_polars/experimental/statistics.py +24 -5
  62. cudf_polars/experimental/utils.py +25 -7
  63. cudf_polars/testing/asserts.py +13 -8
  64. cudf_polars/testing/io.py +2 -1
  65. cudf_polars/testing/plugin.py +93 -17
  66. cudf_polars/typing/__init__.py +86 -32
  67. cudf_polars/utils/config.py +473 -58
  68. cudf_polars/utils/cuda_stream.py +70 -0
  69. cudf_polars/utils/versions.py +5 -4
  70. cudf_polars_cu13-26.2.0.dist-info/METADATA +181 -0
  71. cudf_polars_cu13-26.2.0.dist-info/RECORD +108 -0
  72. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/WHEEL +1 -1
  73. cudf_polars_cu13-25.10.0.dist-info/METADATA +0 -136
  74. cudf_polars_cu13-25.10.0.dist-info/RECORD +0 -92
  75. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/licenses/LICENSE +0 -0
  76. {cudf_polars_cu13-25.10.0.dist-info → cudf_polars_cu13-26.2.0.dist-info}/top_level.txt +0 -0
@@ -28,7 +28,10 @@ import importlib.util
 import json
 import os
 import warnings
-from typing import TYPE_CHECKING, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, Literal, TypeVar
+
+from rmm.pylibrmm.cuda_stream import CudaStreamFlags
+from rmm.pylibrmm.cuda_stream_pool import CudaStreamPool
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -37,13 +40,18 @@ if TYPE_CHECKING:
 
     import polars.lazyframe.engine_config
 
+    import rmm.mr
+
 
 __all__ = [
+    "Cluster",
     "ConfigOptions",
     "InMemoryExecutor",
     "ParquetOptions",
-    "Scheduler",
+    "Runtime",
+    "Scheduler",  # Deprecated, kept for backward compatibility
     "ShuffleMethod",
+    "ShufflerInsertionMethod",
     "StatsPlanningOptions",
     "StreamingExecutor",
     "StreamingFallbackMode",
@@ -57,8 +65,9 @@ def _env_get_int(name: str, default: int) -> int:
         return default  # pragma: no cover
 
 
-def get_total_device_memory() -> int | None:
-    """Return the total memory of the current device."""
+@functools.cache
+def get_device_handle() -> Any:
+    # Gets called for each IR.do_evaluate node, so we'll cache it.
     import pynvml
 
     try:
@@ -74,12 +83,27 @@ def get_total_device_memory() -> int | None:
                 handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle(handle)
         else:
             handle = pynvml.nvmlDeviceGetHandleByIndex(int(index))
-
-        return pynvml.nvmlDeviceGetMemoryInfo(handle).total
-
     except pynvml.NVMLError_NotSupported:  # pragma: no cover
         # System doesn't have proper "GPU memory".
         return None
+    else:
+        return handle
+
+
+def get_total_device_memory() -> int | None:
+    """Return the total memory of the current device."""
+    import pynvml
+
+    maybe_handle = get_device_handle()
+
+    if maybe_handle is not None:
+        try:
+            return pynvml.nvmlDeviceGetMemoryInfo(maybe_handle).total
+        except pynvml.NVMLError_NotSupported:  # pragma: no cover
+            # System doesn't have proper "GPU memory".
+            return None
+    else:  # pragma: no cover
+        return None
 
 
 @functools.cache
@@ -120,14 +144,43 @@ class StreamingFallbackMode(str, enum.Enum):
     SILENT = "silent"
 
 
+class Runtime(str, enum.Enum):
+    """
+    The runtime to use for the streaming executor.
+
+    * ``Runtime.TASKS`` : Use the task-based runtime.
+      This is the default runtime.
+    * ``Runtime.RAPIDSMPF`` : Use the coroutine-based streaming runtime (rapidsmpf).
+      This runtime is experimental.
+    """
+
+    TASKS = "tasks"
+    RAPIDSMPF = "rapidsmpf"
+
+
+class Cluster(str, enum.Enum):
+    """
+    The cluster configuration for the streaming executor.
+
+    * ``Cluster.SINGLE`` : Single-GPU execution. Currently uses a zero-dependency,
+      synchronous, single-threaded task scheduler.
+    * ``Cluster.DISTRIBUTED`` : Multi-GPU distributed execution. Currently
+      uses a Dask-based distributed scheduler and requires an
+      active Dask cluster.
+    """
+
+    SINGLE = "single"
+    DISTRIBUTED = "distributed"
+
+
 class Scheduler(str, enum.Enum):
     """
-    The scheduler to use for the streaming executor.
+    **Deprecated**: Use :class:`Cluster` instead.
+
+    The scheduler to use for the task-based streaming executor.
 
-    * ``Scheduler.SYNCHRONOUS`` : A zero-dependency, synchronous,
-      single-threaded scheduler.
-    * ``Scheduler.DISTRIBUTED`` : A Dask-based distributed scheduler.
-      Using this scheduler requires an active Dask cluster.
+    * ``Scheduler.SYNCHRONOUS`` : Single-GPU execution (use ``Cluster.SINGLE`` instead)
+    * ``Scheduler.DISTRIBUTED`` : Multi-GPU execution (use ``Cluster.DISTRIBUTED`` instead)
     """
 
     SYNCHRONOUS = "synchronous"
@@ -143,12 +196,12 @@ class ShuffleMethod(str, enum.Enum):
     * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler.
 
     With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None``
-    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler,
+    will attempt to use ``ShuffleMethod.RAPIDSMPF`` for a distributed cluster,
     but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed.
 
     The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly.
     A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process
-    shuffler automatically when the 'synchronous' scheduler is active.
+    shuffler automatically when using single-GPU execution.
     """
 
     TASKS = "tasks"
@@ -156,6 +209,20 @@ class ShuffleMethod(str, enum.Enum):
     _RAPIDSMPF_SINGLE = "rapidsmpf-single"
 
 
+class ShufflerInsertionMethod(str, enum.Enum):
+    """
+    The method to use for inserting chunks into the rapidsmpf shuffler.
+
+    * ``ShufflerInsertionMethod.INSERT_CHUNKS`` : Use insert_chunks for inserting data.
+    * ``ShufflerInsertionMethod.CONCAT_INSERT`` : Use concat_insert for inserting data.
+
+    Only applicable with the "rapidsmpf" shuffle method and the "tasks" runtime.
+    """
+
+    INSERT_CHUNKS = "insert_chunks"
+    CONCAT_INSERT = "concat_insert"
+
+
 T = TypeVar("T")
 
 
@@ -215,6 +282,10 @@ class ParquetOptions:
 
         Set to 0 to avoid row-group sampling. Note that row-group sampling
        will also be skipped if ``max_footer_samples`` is 0.
+    use_rapidsmpf_native
+        Whether to use the native rapidsmpf node for parquet reading.
+        This option is only used when the rapidsmpf runtime is enabled.
+        Default is True.
     """
 
     _env_prefix = "CUDF_POLARS__PARQUET_OPTIONS"
@@ -249,6 +320,13 @@ class ParquetOptions:
             f"{_env_prefix}__MAX_ROW_GROUP_SAMPLES", int, default=1
         )
     )
+    use_rapidsmpf_native: bool = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__USE_RAPIDSMPF_NATIVE",
+            _bool_converter,
+            default=True,
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
         if not isinstance(self.chunked, bool):
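A hedged sketch of how the new flag could be toggled, using the ``parquet_options`` engine key this class is built from and the ``CUDF_POLARS__PARQUET_OPTIONS__USE_RAPIDSMPF_NATIVE`` variable wired up above; the exact strings ``_bool_converter`` accepts are not shown in this hunk, so the environment form is an assumption:

    import os

    import polars as pl

    # Illustrative only: keyword form, forwarded into ParquetOptions(**...).
    engine = pl.GPUEngine(parquet_options={"use_rapidsmpf_native": False})

    # Illustrative only: environment form, read by the field's default factory,
    # so it must be set before the options object is constructed.
    os.environ["CUDF_POLARS__PARQUET_OPTIONS__USE_RAPIDSMPF_NATIVE"] = "0"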
@@ -263,9 +341,11 @@ class ParquetOptions:
             raise TypeError("max_footer_samples must be an int")
         if not isinstance(self.max_row_group_samples, int):
             raise TypeError("max_row_group_samples must be an int")
+        if not isinstance(self.use_rapidsmpf_native, bool):
+            raise TypeError("use_rapidsmpf_native must be a bool")
 
 
-def default_blocksize(scheduler: str) -> int:
+def default_blocksize(cluster: str) -> int:
     """Return the default blocksize."""
     device_size = get_total_device_memory()
     if device_size is None:  # pragma: no cover
@@ -274,7 +354,7 @@ def default_blocksize(scheduler: str) -> int:
         return 1_000_000_000
 
     if (
-        scheduler == "distributed"
+        cluster == "distributed"
         or _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 0
     ):
         # Distributed execution requires a conservative
@@ -370,6 +450,102 @@ class StatsPlanningOptions:
             raise TypeError("default_selectivity must be a float")
 
 
+@dataclasses.dataclass(frozen=True, eq=True)
+class MemoryResourceConfig:
+    """
+    Configuration for the default memory resource.
+
+    Parameters
+    ----------
+    qualname
+        The fully qualified name of the memory resource class to use.
+    options
+        This can be either a dictionary representing the options to pass
+        to the memory resource class, or, a dictionary representing a
+        nested memory resource configuration. The presence of "qualname"
+        field indicates a nested memory resource configuration.
+
+    Examples
+    --------
+    Create a memory resource config for a single memory resource:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.CudaAsyncMemoryResource",
+    ...     options={"initial_pool_size": 100},
+    ... )
+
+    Create a memory resource config for a nested memory resource configuration:
+    >>> MemoryResourceConfig(
+    ...     qualname="rmm.mr.PrefetchResourceAdaptor",
+    ...     options={
+    ...         "upstream_mr": {
+    ...             "qualname": "rmm.mr.PoolMemoryResource",
+    ...             "options": {
+    ...                 "upstream_mr": {
+    ...                     "qualname": "rmm.mr.ManagedMemoryResource",
+    ...                 },
+    ...                 "initial_pool_size": 256,
+    ...             },
+    ...         }
+    ...     },
+    ... )
+    """
+
+    _env_prefix = "CUDF_POLARS__MEMORY_RESOURCE_CONFIG"
+    qualname: str = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__QUALNAME",
+            str,
+            # We shouldn't reach here if qualname isn't set in the environment.
+            default=None,  # type: ignore[assignment]
+        )
+    )
+    options: dict[str, Any] | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__OPTIONS",
+            json.loads,
+            default=None,
+        )
+    )
+
+    def __post_init__(self) -> None:
+        if self.qualname.count(".") < 1:
+            raise ValueError(
+                f"MemoryResourceConfig.qualname '{self.qualname}' must be a fully qualified name to a class, including the module name."
+            )
+
+    def create_memory_resource(self) -> rmm.mr.DeviceMemoryResource:
+        """Create a memory resource from the configuration."""
+
+        def create_mr(
+            qualname: str, options: dict[str, Any] | None
+        ) -> rmm.mr.DeviceMemoryResource:
+            module_name, class_name = qualname.rsplit(".", 1)
+            module = importlib.import_module(module_name)
+            cls = getattr(module, class_name)
+            return cls(**options or {})
+
+        def process_options(opts: dict[str, Any] | None) -> dict[str, Any]:
+            if opts is None:
+                return {}
+
+            processed = {}
+            for key, value in opts.items():
+                if isinstance(value, dict) and "qualname" in value:
+                    # This is a nested memory resource config
+                    nested_qualname = value["qualname"]
+                    nested_options = process_options(value.get("options"))
+                    processed[key] = create_mr(nested_qualname, nested_options)
+                else:
+                    processed[key] = value
+            return processed
+
+        # Create the top-level memory resource
+        return create_mr(self.qualname, process_options(self.options))
+
+    def __hash__(self) -> int:
+        return hash((self.qualname, json.dumps(self.options, sort_keys=True)))
+
+
 @dataclasses.dataclass(frozen=True, eq=True)
 class StreamingExecutor:
     """
@@ -380,11 +556,25 @@ class StreamingExecutor:
 
     Parameters
     ----------
+    runtime
+        The runtime to use for the streaming executor.
+        ``Runtime.TASKS`` by default.
+    cluster
+        The cluster configuration for the streaming executor.
+        ``Cluster.SINGLE`` by default.
+
+        This setting applies to both task-based and rapidsmpf execution modes:
+
+        * ``Cluster.SINGLE``: Single-GPU execution
+        * ``Cluster.DISTRIBUTED``: Multi-GPU distributed execution (requires
+          an active Dask cluster)
+
     scheduler
-        The scheduler to use for the streaming executor. ``Scheduler.SYNCHRONOUS``
-        by default.
+        **Deprecated**: Use ``cluster`` instead.
 
-        Note ``scheduler="distributed"`` requires a Dask cluster to be running.
+        For backward compatibility:
+        * ``Scheduler.SYNCHRONOUS`` maps to ``Cluster.SINGLE``
+        * ``Scheduler.DISTRIBUTED`` maps to ``Cluster.DISTRIBUTED``
     fallback_mode
         How to handle errors when the GPU engine fails to execute a query.
         ``StreamingFallbackMode.WARN`` by default.
@@ -414,10 +604,10 @@ class StreamingExecutor:
         - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable
 
         By default, cudf-polars uses a target partition size that's a fraction
-        of the device memory, where the fraction depends on the scheduler:
+        of the device memory, where the fraction depends on the cluster:
 
         - distributed: 1/40th of the device memory
-        - synchronous: 1/16th of the device memory
+        - single: 1/16th of the device memory
 
         The optional pynvml dependency is used to query the device memory size. If
         pynvml is not available, a warning is emitted and the device size is assumed
@@ -434,36 +624,62 @@ class StreamingExecutor:
         a broadcast join.
     shuffle_method
         The method to use for shuffling data between workers. Defaults to
-        'rapidsmpf' for distributed scheduler if available (otherwise 'tasks'),
-        and 'tasks' for synchronous scheduler.
+        'rapidsmpf' for distributed cluster if available (otherwise 'tasks'),
+        and 'tasks' for single-GPU cluster.
+    shuffler_insertion_method
+        The method to use for inserting chunks with the rapidsmpf shuffler.
+        Can be 'insert_chunks' (default) or 'concat_insert'.
+
+        Only applicable with ``shuffle_method="rapidsmpf"`` and ``runtime="tasks"``.
     rapidsmpf_spill
         Whether to wrap task arguments and output in objects that are
         spillable by 'rapidsmpf'.
+    client_device_threshold
+        Threshold for spilling data from device memory in rapidsmpf.
+        Default is 50% of device memory on the client process.
+        This argument is only used by the "rapidsmpf" runtime.
     sink_to_directory
         Whether multi-partition sink operations should write to a directory
         rather than a single file. By default, this will be set to True for
-        the 'distributed' scheduler and False otherwise. The 'distrubuted'
-        scheduler does not currently support ``sink_to_directory=False``.
+        the 'distributed' cluster and False otherwise. The 'distributed'
+        cluster does not currently support ``sink_to_directory=False``.
     stats_planning
         Options controlling statistics-based query planning. See
         :class:`~cudf_polars.utils.config.StatsPlanningOptions` for more.
+    max_io_threads
+        Maximum number of IO threads for the rapidsmpf runtime. Default is 2.
+        This controls the parallelism of IO operations when reading data.
 
     Notes
     -----
     The streaming executor does not currently support profiling a query via
     the ``.profile()`` method. We recommend using nsys to profile queries
-    with the 'synchronous' scheduler and Dask's built-in profiling tools
-    with the 'distributed' scheduler.
+    with single-GPU execution and Dask's built-in profiling tools
+    with distributed execution.
     """
 
     _env_prefix = "CUDF_POLARS__EXECUTOR"
 
     name: Literal["streaming"] = dataclasses.field(default="streaming", init=False)
-    scheduler: Scheduler = dataclasses.field(
+    runtime: Runtime = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__RUNTIME",
+            Runtime.__call__,
+            default=Runtime.TASKS,
+        )
+    )
+    cluster: Cluster | None = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLUSTER",
+            Cluster.__call__,
+            default=None,
+        )
+    )
+    scheduler: Scheduler | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SCHEDULER",
             Scheduler.__call__,
-            default=Scheduler.SYNCHRONOUS,
+            default=None,
         )
     )
     fallback_mode: StreamingFallbackMode = dataclasses.field(
@@ -505,11 +721,23 @@ class StreamingExecutor:
             default=ShuffleMethod.TASKS,
         )
     )
+    shuffler_insertion_method: ShufflerInsertionMethod = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__SHUFFLER_INSERTION_METHOD",
+            ShufflerInsertionMethod.__call__,
+            default=ShufflerInsertionMethod.INSERT_CHUNKS,
+        )
+    )
     rapidsmpf_spill: bool = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False
        )
    )
+    client_device_threshold: float = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__CLIENT_DEVICE_THRESHOLD", float, default=0.5
+        )
+    )
     sink_to_directory: bool | None = dataclasses.field(
         default_factory=_make_default_factory(
             f"{_env_prefix}__SINK_TO_DIRECTORY", _bool_converter, default=None
@@ -518,12 +746,64 @@ class StreamingExecutor:
     stats_planning: StatsPlanningOptions = dataclasses.field(
         default_factory=StatsPlanningOptions
     )
+    max_io_threads: int = dataclasses.field(
+        default_factory=_make_default_factory(
+            f"{_env_prefix}__MAX_IO_THREADS", int, default=2
+        )
+    )
 
     def __post_init__(self) -> None:  # noqa: D105
+        # Check for rapidsmpf runtime
+        if self.runtime == "rapidsmpf":  # pragma: no cover; requires rapidsmpf runtime
+            if not rapidsmpf_single_available():
+                raise ValueError("The rapidsmpf streaming engine requires rapidsmpf.")
+            if self.shuffle_method == "tasks":
+                raise ValueError(
+                    "The rapidsmpf streaming engine does not support task-based shuffling."
+                )
+            object.__setattr__(self, "shuffle_method", "rapidsmpf")
+
+        # Handle backward compatibility for deprecated scheduler parameter
+        if self.scheduler is not None:
+            if self.cluster is not None:
+                raise ValueError(
+                    "Cannot specify both 'scheduler' and 'cluster'. "
+                    "The 'scheduler' parameter is deprecated. "
+                    "Please use only 'cluster' instead."
+                )
+            else:
+                warnings.warn(
+                    """The 'scheduler' parameter is deprecated. Please use 'cluster' instead.
+                    Use 'cluster="single"' instead of 'scheduler="synchronous"' and "
+                    'cluster="distributed"' instead of 'scheduler="distributed"'.""",
+                    FutureWarning,
+                    stacklevel=2,
+                )
+                # Map old scheduler values to new cluster values
+                if self.scheduler == "synchronous":
+                    object.__setattr__(self, "cluster", Cluster.SINGLE)
+                elif self.scheduler == "distributed":
+                    object.__setattr__(self, "cluster", Cluster.DISTRIBUTED)
+                # Clear scheduler to avoid confusion
+                object.__setattr__(self, "scheduler", None)
+        elif self.cluster is None:
+            object.__setattr__(self, "cluster", Cluster.SINGLE)
+        assert self.cluster is not None, "Expected cluster to be set."
+
+        # Warn loudly that multi-GPU execution is under construction
+        # for the rapidsmpf runtime
+        if self.cluster == "distributed" and self.runtime == "rapidsmpf":
+            warnings.warn(
+                "UNDER CONSTRUCTION!!!"
+                "The rapidsmpf runtime does NOT support distributed execution yet. "
+                "Use at your own risk!!!",
+                stacklevel=2,
+            )
+
         # Handle shuffle_method defaults for streaming executor
         if self.shuffle_method is None:
-            if self.scheduler == "distributed" and rapidsmpf_distributed_available():
-                # For distributed scheduler, prefer rapidsmpf if available
+            if self.cluster == "distributed" and rapidsmpf_distributed_available():
+                # For distributed cluster, prefer rapidsmpf if available
                 object.__setattr__(self, "shuffle_method", "rapidsmpf")
             else:
                 # Otherwise, use task-based shuffle for now.
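To make the deprecation path in ``__post_init__`` concrete, a sketch of the old and new spellings, again assuming the ``executor_options`` passthrough; the old form still runs, but it is mapped to a ``cluster`` value and emits the ``FutureWarning`` above once the options are materialized:

    import polars as pl

    # Deprecated spelling: mapped to Cluster.SINGLE, with a FutureWarning.
    old = pl.GPUEngine(executor_options={"scheduler": "synchronous"})

    # Preferred spelling going forward.
    new = pl.GPUEngine(executor_options={"cluster": "single"})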
@@ -534,19 +814,16 @@ class StreamingExecutor:
             raise ValueError("rapidsmpf-single is not a supported shuffle method.")
         elif self.shuffle_method == "rapidsmpf":
             # Check that we have rapidsmpf installed
-            if (
-                self.scheduler == "distributed"
-                and not rapidsmpf_distributed_available()
-            ):
+            if self.cluster == "distributed" and not rapidsmpf_distributed_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed."
                 )
-            elif self.scheduler == "synchronous" and not rapidsmpf_single_available():
+            elif self.cluster == "single" and not rapidsmpf_single_available():
                 raise ValueError(
                     "rapidsmpf shuffle method requested, but rapidsmpf is not installed."
                 )
-            # Select "rapidsmpf-single" for the synchronous
-            if self.scheduler == "synchronous":
+            # Select "rapidsmpf-single" for single-GPU
+            if self.cluster == "single":
                 object.__setattr__(self, "shuffle_method", "rapidsmpf-single")
 
         # frozen dataclass, so use object.__setattr__
@@ -555,17 +832,24 @@ class StreamingExecutor:
         )
         if self.target_partition_size == 0:
             object.__setattr__(
-                self, "target_partition_size", default_blocksize(self.scheduler)
+                self,
+                "target_partition_size",
+                default_blocksize(self.cluster),
             )
         if self.broadcast_join_limit == 0:
             object.__setattr__(
                 self,
                 "broadcast_join_limit",
-                # Usually better to avoid shuffling for single gpu
-                2 if self.scheduler == "distributed" else 32,
+                # Usually better to avoid shuffling for single gpu with UVM
+                2 if self.cluster == "distributed" else 32,
             )
-        object.__setattr__(self, "scheduler", Scheduler(self.scheduler))
+        object.__setattr__(self, "cluster", Cluster(self.cluster))
         object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method))
+        object.__setattr__(
+            self,
+            "shuffler_insertion_method",
+            ShufflerInsertionMethod(self.shuffler_insertion_method),
+        )
 
         # Make sure stats_planning is a dataclass
         if isinstance(self.stats_planning, dict):
@@ -575,10 +859,10 @@ class StreamingExecutor:
                 StatsPlanningOptions(**self.stats_planning),
             )
 
-        if self.scheduler == "distributed":
+        if self.cluster == "distributed":
             if self.sink_to_directory is False:
                 raise ValueError(
-                    "The distributed scheduler requires sink_to_directory=True"
+                    "The distributed cluster requires sink_to_directory=True"
                 )
             object.__setattr__(self, "sink_to_directory", True)
         elif self.sink_to_directory is None:
@@ -599,13 +883,17 @@ class StreamingExecutor:
             raise TypeError("rapidsmpf_spill must be bool")
         if not isinstance(self.sink_to_directory, bool):
             raise TypeError("sink_to_directory must be bool")
+        if not isinstance(self.client_device_threshold, float):
+            raise TypeError("client_device_threshold must be a float")
+        if not isinstance(self.max_io_threads, int):
+            raise TypeError("max_io_threads must be an int")
 
-        # RapidsMPF spill is only supported for the distributed scheduler for now.
+        # RapidsMPF spill is only supported for distributed clusters for now.
         # This is because the spilling API is still within the RMPF-Dask integration.
         # (See https://github.com/rapidsai/rapidsmpf/issues/439)
-        if self.scheduler == "synchronous" and self.rapidsmpf_spill:  # pragma: no cover
+        if self.cluster == "single" and self.rapidsmpf_spill:  # pragma: no cover
             raise ValueError(
-                "rapidsmpf_spill is not supported for the synchronous scheduler."
+                "rapidsmpf_spill is not supported for single-GPU execution."
             )
 
     def __hash__(self) -> int:  # noqa: D105
@@ -622,14 +910,83 @@ class InMemoryExecutor:
     """
     Configuration for the cudf-polars in-memory executor.
 
+    The in-memory executor only supports single-GPU execution.
+    """
+
+    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+
+
+@dataclasses.dataclass(frozen=True, eq=True)
+class CUDAStreamPoolConfig:
+    """
+    Configuration for the CUDA stream pool.
+
     Parameters
     ----------
-    scheduler:
-        The scheduler to use for the in-memory executor. Currently
-        only ``Scheduler.SYNCHRONOUS`` is supported for the in-memory executor.
+    pool_size
+        The size of the CUDA stream pool.
+    flags
+        The flags to use for the CUDA stream pool.
     """
 
-    name: Literal["in-memory"] = dataclasses.field(default="in-memory", init=False)
+    pool_size: int = 16
+    flags: CudaStreamFlags = CudaStreamFlags.NON_BLOCKING
+
+    def build(self) -> CudaStreamPool:
+        return CudaStreamPool(
+            pool_size=self.pool_size,
+            flags=self.flags,
+        )
+
+
+class CUDAStreamPolicy(str, enum.Enum):
+    """
+    The policy to use for acquiring new CUDA streams.
+
+    * ``CUDAStreamPolicy.DEFAULT`` : Use the default CUDA stream.
+    * ``CUDAStreamPolicy.NEW`` : Create a new CUDA stream.
+    """
+
+    DEFAULT = "default"
+    NEW = "new"
+
+
+def _convert_cuda_stream_policy(
+    user_cuda_stream_policy: dict | str,
+) -> CUDAStreamPolicy | CUDAStreamPoolConfig:
+    match user_cuda_stream_policy:
+        case "default" | "new":
+            return CUDAStreamPolicy(user_cuda_stream_policy)
+        case "pool":
+            return CUDAStreamPoolConfig()
+        case dict():
+            return CUDAStreamPoolConfig(**user_cuda_stream_policy)
+        case str():
+            # assume it's a JSON encoded CUDAStreamPoolConfig
+            try:
+                d = json.loads(user_cuda_stream_policy)
+            except json.JSONDecodeError:
+                raise ValueError(
+                    f"Invalid CUDA stream policy: '{user_cuda_stream_policy}'"
+                ) from None
+            match d:
+                case {"pool_size": int(), "flags": int()}:
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"], flags=CudaStreamFlags(d["flags"])
+                    )
+                case {"pool_size": int(), "flags": str()}:
+                    # convert the string names to enums
+                    return CUDAStreamPoolConfig(
+                        pool_size=d["pool_size"],
+                        flags=CudaStreamFlags(CudaStreamFlags.__members__[d["flags"]]),
+                    )
+                case _:
+                    try:
+                        return CUDAStreamPoolConfig(**d)
+                    except TypeError:
+                        raise ValueError(
+                            f"Invalid CUDA stream policy: {user_cuda_stream_policy}"
+                        ) from None
 
 
 @dataclasses.dataclass(frozen=True, eq=True)
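Following the ``match`` arms in ``_convert_cuda_stream_policy``, a sketch of the accepted spellings; the import path is an assumption based on the file-list entry for cudf_polars/utils/config.py, and the values are illustrative:

    from cudf_polars.utils.config import _convert_cuda_stream_policy

    # Plain strings map onto the CUDAStreamPolicy enum.
    _convert_cuda_stream_policy("default")  # CUDAStreamPolicy.DEFAULT
    _convert_cuda_stream_policy("new")      # CUDAStreamPolicy.NEW

    # "pool", a dict, or a JSON-encoded dict produce a CUDAStreamPoolConfig.
    _convert_cuda_stream_policy("pool")
    _convert_cuda_stream_policy({"pool_size": 8})
    _convert_cuda_stream_policy('{"pool_size": 8, "flags": "NON_BLOCKING"}')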
@@ -651,6 +1008,8 @@ class ConfigOptions:
     device
         The GPU used to run the query. If not provided, the
         query uses the current CUDA device.
+    cuda_stream_policy
+        The policy to use for acquiring new CUDA streams. See :class:`~cudf_polars.utils.config.CUDAStreamPolicy` for more.
     """
 
     raise_on_fail: bool = False
@@ -659,6 +1018,14 @@ class ConfigOptions:
         default_factory=StreamingExecutor
     )
     device: int | None = None
+    memory_resource_config: MemoryResourceConfig | None = None
+    cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig = dataclasses.field(
+        default_factory=_make_default_factory(
+            "CUDF_POLARS__CUDA_STREAM_POLICY",
+            CUDAStreamPolicy.__call__,
+            default=CUDAStreamPolicy.DEFAULT,
+        )
+    )
 
     @classmethod
     def from_polars_engine(
@@ -672,6 +1039,8 @@ class ConfigOptions:
             "executor_options",
             "parquet_options",
             "raise_on_fail",
+            "memory_resource_config",
+            "cuda_stream_policy",
         }
 
         extra_options = set(engine.config.keys()) - valid_options
@@ -684,8 +1053,20 @@ class ConfigOptions:
         user_executor = os.environ.get(f"{env_prefix}__EXECUTOR", "streaming")
         user_executor_options = engine.config.get("executor_options", {})
         user_parquet_options = engine.config.get("parquet_options", {})
+        if user_parquet_options is None:
+            user_parquet_options = {}
         # This is set in polars, and so can't be overridden by the environment
         user_raise_on_fail = engine.config.get("raise_on_fail", False)
+        user_memory_resource_config = engine.config.get("memory_resource_config", None)
+        if user_memory_resource_config is None and (
+            os.environ.get(f"{MemoryResourceConfig._env_prefix}__QUALNAME", "") != ""
+        ):
+            # We'll pick up the qualname / options from the environment.
+            user_memory_resource_config = MemoryResourceConfig()
+        elif isinstance(user_memory_resource_config, dict):
+            user_memory_resource_config = MemoryResourceConfig(
+                **user_memory_resource_config
+            )
 
         # Backward compatibility for "cardinality_factor"
         # TODO: Remove this in 25.10
@@ -717,7 +1098,7 @@ class ConfigOptions:
             case "streaming":
                 user_executor_options = user_executor_options.copy()
                 # Handle the interaction between the default shuffle method, the
-                # scheduler, and whether rapidsmpf is available.
+                # cluster, and whether rapidsmpf is available.
                 env_shuffle_method = os.environ.get(
                     "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None
                 )
@@ -733,9 +1114,43 @@ class ConfigOptions:
             case _:  # pragma: no cover; Unreachable
                 raise ValueError(f"Unsupported executor: {user_executor}")
 
-        return cls(
-            raise_on_fail=user_raise_on_fail,
-            parquet_options=ParquetOptions(**user_parquet_options),
-            executor=executor,
-            device=engine.device,
-        )
+        kwargs = {
+            "raise_on_fail": user_raise_on_fail,
+            "parquet_options": ParquetOptions(**user_parquet_options),
+            "executor": executor,
+            "device": engine.device,
+            "memory_resource_config": user_memory_resource_config,
+        }
+
+        # Handle "cuda-stream-policy".
+        # The default will depend on the runtime and executor.
+        user_cuda_stream_policy = engine.config.get(
+            "cuda_stream_policy", None
+        ) or os.environ.get("CUDF_POLARS__CUDA_STREAM_POLICY", None)
+
+        cuda_stream_policy: CUDAStreamPolicy | CUDAStreamPoolConfig
+
+        if user_cuda_stream_policy is None:
+            if (
+                executor.name == "streaming" and executor.runtime == Runtime.RAPIDSMPF
+            ):  # pragma: no cover; requires rapidsmpf runtime
+                # the rapidsmpf runtime defaults to using a stream pool
+                cuda_stream_policy = CUDAStreamPoolConfig()
+            else:
+                # everything else defaults to the default stream
+                cuda_stream_policy = CUDAStreamPolicy.DEFAULT
+        else:
+            cuda_stream_policy = _convert_cuda_stream_policy(user_cuda_stream_policy)
+
+        # Pool policy is only supported by the rapidsmpf runtime.
+        if isinstance(cuda_stream_policy, CUDAStreamPoolConfig) and (
+            (executor.name != "streaming")
+            or (executor.name == "streaming" and executor.runtime != Runtime.RAPIDSMPF)
+        ):
+            raise ValueError(
+                "CUDAStreamPolicy.POOL is only supported by the rapidsmpf runtime."
+            )
+
+        kwargs["cuda_stream_policy"] = cuda_stream_policy
+
+        return cls(**kwargs)
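Putting the new pieces together, a sketch of a full engine configuration using the config keys validated in ``from_polars_engine`` above (``executor_options``, ``parquet_options``, ``memory_resource_config``, ``cuda_stream_policy``), assuming ``pl.GPUEngine`` forwards these keyword arguments to cudf-polars as it does for ``executor_options`` today; every value is illustrative, and a pool stream policy would only be accepted with the rapidsmpf runtime, per the check above:

    import polars as pl

    engine = pl.GPUEngine(
        raise_on_fail=True,
        executor_options={"cluster": "single"},
        parquet_options={"chunked": True},
        memory_resource_config={
            "qualname": "rmm.mr.CudaAsyncMemoryResource",
            "options": {"initial_pool_size": 100},
        },
        cuda_stream_policy="default",
    )

    # Hypothetical query and file path, for illustration only.
    query = pl.scan_parquet("data.parquet").group_by("key").agg(pl.col("value").sum())
    result = query.collect(engine=engine)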