dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
- dask_cuda/get_device_memory_objects.py +3 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +12 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +109 -25
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +30 -12
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +159 -52
- dask_cuda/tests/test_proxify_host_file.py +19 -3
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +7 -0
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +261 -38
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.6.0.dist-info/RECORD +0 -57
- dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/explicit_comms/dataframe/shuffle.py CHANGED

@@ -65,13 +65,13 @@ def get_no_comm_postprocess(
 ) -> Callable[[DataFrame], DataFrame]:
     """Get function for post-processing partitions not communicated

-    In cuDF, the
+    In cuDF, the ``group_split_dispatch`` uses ``scatter_by_map`` to create
     the partitions, which is implemented by splitting a single base dataframe
     into multiple partitions. This means that memory are not freed until
     ALL partitions are deleted.

     In order to free memory ASAP, we can deep copy partitions NOT being
-    communicated. We do this when
+    communicated. We do this when ``num_rounds != batchsize``.

     Parameters
     ----------
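Why the deep copy helps: splits produced from one base dataframe keep that base allocated until every split is dropped. A minimal pandas sketch of the idea (illustrative only; the ``keep_local_splits`` helper below is hypothetical, not dask-cuda's code):

import pandas as pd

def keep_local_splits(splits: dict, local_ids: set) -> dict:
    # Deep copy only the splits this worker keeps; once the remaining splits
    # are sent and dropped, the shared base dataframe can be freed.
    return {
        part_id: (df.copy(deep=True) if part_id in local_ids else df)
        for part_id, df in splits.items()
    }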
@@ -116,7 +116,7 @@ async def send(
     rank_to_out_part_ids: Dict[int, Set[int]],
     out_part_id_to_dataframe: Dict[int, DataFrame],
 ) -> None:
-    """Notice, items sent are removed from
+    """Notice, items sent are removed from ``out_part_id_to_dataframe``"""
     futures = []
     for rank, out_part_ids in rank_to_out_part_ids.items():
         if rank != myrank:
@@ -135,7 +135,7 @@ async def recv(
     out_part_id_to_dataframe_list: Dict[int, List[DataFrame]],
     proxify: Proxify,
 ) -> None:
-    """Notice, received items are appended to
+    """Notice, received items are appended to ``out_parts_list``"""

     async def read_msg(rank: int) -> None:
         msg: Dict[int, DataFrame] = nested_deserialize(await eps[rank].read())
@@ -150,11 +150,11 @@ async def recv(
 def compute_map_index(
     df: DataFrame, column_names: List[str], npartitions: int
 ) -> Series:
-    """Return a Series that maps each row
+    """Return a Series that maps each row ``df`` to a partition ID

     The partitions are determined by hashing the columns given by column_names
-    unless if
-
+    unless if ``column_names[0] == "_partitions"``, in which case the values of
+    ``column_names[0]`` are used as index.

     Parameters
     ----------
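The mapping just described is straightforward to sketch with pandas (a minimal illustration, not dask-cuda's implementation, which dispatches to backend-specific hash functions):

import pandas as pd

def compute_map_index_sketch(df, column_names, npartitions):
    if column_names[0] == "_partitions":
        # The caller supplied explicit partition IDs; use them directly.
        return df[column_names[0]]
    hashed = pd.util.hash_pandas_object(df[column_names], index=False)
    return (hashed % npartitions).astype("int64")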
@@ -168,7 +168,7 @@ def compute_map_index(
     Returns
     -------
     Series
-        Series that maps each row
+        Series that maps each row ``df`` to a partition ID
     """

     if column_names[0] == "_partitions":
@@ -193,8 +193,8 @@ def partition_dataframe(
     """Partition dataframe to a dict of dataframes

     The partitions are determined by hashing the columns given by column_names
-    unless
-
+    unless ``column_names[0] == "_partitions"``, in which case the values of
+    ``column_names[0]`` are used as index.

     Parameters
     ----------
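For context, the splitting step these docstrings reference can be exercised directly through Dask's dispatch machinery; a small sketch (the call pattern is an assumption based on Dask's ``group_split_dispatch`` API, and dask-cuda's real code path adds proxying and cuDF specifics):

import pandas as pd
from dask.dataframe.dispatch import group_split_dispatch

df = pd.DataFrame({"key": [0, 1, 2, 3], "val": range(4)})
# Map each row to one of two partitions by hashing the key column.
codes = (pd.util.hash_pandas_object(df[["key"]], index=False) % 2).to_numpy().astype("intp")
# Returns a dict keyed by partition ID (0..k-1), matching the docstrings above.
parts = group_split_dispatch(df, codes, 2, ignore_index=True)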
@@ -301,13 +301,13 @@ async def send_recv_partitions(
     rank_to_out_part_ids
         dict that for each worker rank specifies a set of output partition IDs.
         If the worker shouldn't return any partitions, it is excluded from the
-        dict. Partition IDs are global integers
-        to the dict keys returned by
+        dict. Partition IDs are global integers ``0..npartitions`` and corresponds
+        to the dict keys returned by ``group_split_dispatch``.
     out_part_id_to_dataframe
         Mapping from partition ID to dataframe. This dict is cleared on return.
     no_comm_postprocess
         Function to post-process partitions not communicated.
-        See
+        See ``get_no_comm_postprocess``
     proxify
         Function to proxify object.
     out_part_id_to_dataframe_list
@@ -365,8 +365,8 @@ async def shuffle_task(
     rank_to_out_part_ids: dict
         dict that for each worker rank specifies a set of output partition IDs.
         If the worker shouldn't return any partitions, it is excluded from the
-        dict. Partition IDs are global integers
-        to the dict keys returned by
+        dict. Partition IDs are global integers ``0..npartitions`` and corresponds
+        to the dict keys returned by ``group_split_dispatch``.
     column_names: list of strings
         List of column names on which we want to split.
     npartitions: int
@@ -449,7 +449,7 @@ def shuffle(
         List of column names on which we want to split.
     npartitions: int or None
         The desired number of output partitions. If None, the number of output
-        partitions equals
+        partitions equals ``df.npartitions``
     ignore_index: bool
         Ignore index during shuffle. If True, performance may improve,
         but index values will not be preserved.
@@ -460,7 +460,7 @@ def shuffle(
         If -1, each worker will handle all its partitions in a single round and
         all techniques to reduce memory usage are disabled, which might be faster
         when memory pressure isn't an issue.
-        If None, the value of
+        If None, the value of ``DASK_EXPLICIT_COMMS_BATCHSIZE`` is used or 1 if not
         set thus by default, we prioritize robustness over performance.

     Returns
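A hedged sketch of that fallback (assuming ``DASK_EXPLICIT_COMMS_BATCHSIZE`` is the environment variable backing an ``explicit-comms-batchsize`` Dask config key):

import dask

def resolve_batchsize(batchsize=None) -> int:
    if batchsize is None:
        # Default to one round per batch: robustness over performance.
        batchsize = dask.config.get("explicit-comms-batchsize", 1)
    return int(batchsize)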
@@ -471,12 +471,12 @@ def shuffle(
     Developer Notes
     ---------------
     The implementation consist of three steps:
-      (a) Stage the partitions of
+      (a) Stage the partitions of ``df`` on all workers and then cancel them
           thus at this point the Dask Scheduler doesn't know about any of the
           the partitions.
       (b) Submit a task on each worker that shuffle (all-to-all communicate)
           the staged partitions and return a list of dataframe-partitions.
-      (c) Submit a dask graph that extract (using
+      (c) Submit a dask graph that extract (using ``getitem()``) individual
           dataframe-partitions from (b).
     """
     c = comms.default_comms()
@@ -594,7 +594,7 @@ def _contains_shuffle_expr(*args) -> bool:
     """
     Check whether any of the arguments is a Shuffle expression.

-    This is called by
+    This is called by ``compute``, which is given a sequence of Dask Collections
     to process. For each of those, we'll check whether the expresion contains a
     Shuffle operation.
     """
@@ -712,9 +712,9 @@ def patch_shuffle_expression() -> None:
     """Patch Dasks Shuffle expression.

     Notice, this is monkey patched into Dask at dask_cuda
-    import, and it changes
-    an
-    config is set to
+    import, and it changes ``Shuffle._layer`` to lower into
+    an ``ECShuffle`` expression when the 'explicit-comms'
+    config is set to ``True``.
     """
     dask.base.compute = _patched_compute

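The config-gated monkey-patch pattern described here, sketched with a stand-in class (illustrative; the real patch lowers into dask-cuda's ``ECShuffle`` expression rather than returning strings):

import dask

class Shuffle:  # stand-in for Dask's Shuffle expression
    def _layer(self):
        return "task-based shuffle layer"

_original_layer = Shuffle._layer

def _patched_layer(self):
    # Lower into the explicit-comms variant only when the config is enabled.
    if dask.config.get("explicit-comms", False):
        return "explicit-comms (ECShuffle) layer"
    return _original_layer(self)

Shuffle._layer = _patched_layer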
dask_cuda/get_device_memory_objects.py CHANGED

@@ -27,10 +27,10 @@ class DeviceMemoryId:


 def get_device_memory_ids(obj) -> Set[DeviceMemoryId]:
-    """Find all CUDA device objects in
+    """Find all CUDA device objects in ``obj``

-    Search through
-    that either are known to
+    Search through ``obj`` and find all CUDA device objects, which are objects
+    that either are known to ``dispatch`` or implement ``__cuda_array_interface__``.

     Parameters
     ----------
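The second detection rule is simple to check in isolation (minimal sketch; the ``dispatch`` registry lookup dask-cuda also performs is omitted):

def implements_cuda_array_interface(obj) -> bool:
    # CuPy arrays, Numba device arrays, etc. expose this attribute.
    return hasattr(obj, "__cuda_array_interface__")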
dask_cuda/initialize.py CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import os

@@ -7,7 +10,7 @@ import numba.cuda
 import dask
 from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context

-from .utils import get_ucx_config
+from .utils import _get_active_ucx_implementation_name, get_ucx_config

 logger = logging.getLogger(__name__)

@@ -22,65 +25,97 @@ def _create_cuda_context_handler():
         numba.cuda.current_context()


-def
-    if protocol not in ["ucx", "ucxx"]:
-        return
+def _warn_generic():
     try:
+        # TODO: update when UCX-Py is removed, see
+        # https://github.com/rapidsai/dask-cuda/issues/1517
+        import distributed.comm.ucx
+
         # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
         # context directly from the UCX module, thus avoiding a similar warning there.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        cuda_visible_device = get_device_index_and_uuid(
+            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+        )
+        ctx = has_cuda_context()
+        if (
+            ctx.has_context
+            and not distributed.comm.ucx.cuda_context_created.has_context
+        ):
+            distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
+
+        _create_cuda_context_handler()
+
+        if not distributed.comm.ucx.cuda_context_created.has_context:
+            ctx = has_cuda_context()
+            if ctx.has_context and ctx.device_info != cuda_visible_device:
+                distributed.comm.ucx._warn_cuda_context_wrong_device(
+                    cuda_visible_device, ctx.device_info, os.getpid()
+                )
+
+    except Exception:
+        logger.error("Unable to start CUDA Context", exc_info=True)
+
+
+def _initialize_ucx():
+    try:
+        import distributed.comm.ucx
+
+        distributed.comm.ucx.init_once()
+    except ModuleNotFoundError:
+        # UCX initialization has to be delegated to Distributed, it will take care
+        # of setting correct environment variables and importing `ucp` after that.
+        # Therefore if ``import ucp`` fails we can just continue here.
+        pass
+
+
+def _initialize_ucxx():
+    try:
+        # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
+        # context directly from the UCX module, thus avoiding a similar warning there.
+        import distributed_ucxx.ucxx
+
+        distributed_ucxx.ucxx.init_once()

         cuda_visible_device = get_device_index_and_uuid(
             os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
         )
         ctx = has_cuda_context()
-        if
-
-
-
-        )
-            distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
-        elif protocol == "ucxx":
-            if (
-                ctx.has_context
-                and not distributed_ucxx.ucxx.cuda_context_created.has_context
-            ):
-                distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())
+        if (
+            ctx.has_context
+            and not distributed_ucxx.ucxx.cuda_context_created.has_context
+        ):
+            distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())

         _create_cuda_context_handler()

-        if
-
-
-
-
-
-        )
-        elif protocol == "ucxx":
-            if not distributed_ucxx.ucxx.cuda_context_created.has_context:
-                ctx = has_cuda_context()
-                if ctx.has_context and ctx.device_info != cuda_visible_device:
-                    distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
-                        cuda_visible_device, ctx.device_info, os.getpid()
-                    )
+        if not distributed_ucxx.ucxx.cuda_context_created.has_context:
+            ctx = has_cuda_context()
+            if ctx.has_context and ctx.device_info != cuda_visible_device:
+                distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
+                    cuda_visible_device, ctx.device_info, os.getpid()
+                )

     except Exception:
         logger.error("Unable to start CUDA Context", exc_info=True)


+def _create_cuda_context(protocol="ucx"):
+    if protocol not in ["ucx", "ucxx", "ucx-old"]:
+        return
+
+    try:
+        ucx_implementation = _get_active_ucx_implementation_name(protocol)
+    except ValueError:
+        # Not a UCX protocol, just raise CUDA context warnings if needed.
+        _warn_generic()
+    else:
+        if ucx_implementation == "ucxx":
+            _initialize_ucxx()
+        else:
+            _initialize_ucx()
+            _warn_generic()
+
+
 def initialize(
     create_cuda_context=True,
     enable_tcp_over_ucx=None,
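Condensed, the refactored flow reads as below (a sketch under assumptions: the implementation-name mapping is guessed, and the ``*_sketch`` stubs stand in for the real ``_warn_generic``/``_initialize_ucx``/``_initialize_ucxx`` bodies shown in the hunk):

def resolve_ucx_implementation(protocol: str) -> str:
    # Assumed mapping: "ucx-old" selects the legacy UCX-Py stack, everything
    # else the newer UCXX stack; the real helper is
    # dask_cuda.utils._get_active_ucx_implementation_name.
    if protocol not in ("ucx", "ucxx", "ucx-old"):
        raise ValueError(f"not a UCX protocol: {protocol}")
    return "ucx" if protocol == "ucx-old" else "ucxx"

def create_cuda_context_sketch(protocol: str = "ucx") -> None:
    if protocol not in ("ucx", "ucxx", "ucx-old"):
        return
    try:
        impl = resolve_ucx_implementation(protocol)
    except ValueError:
        warn_generic_sketch()         # non-UCX: only emit CUDA-context warnings
    else:
        if impl == "ucxx":
            initialize_ucxx_sketch()  # distributed_ucxx.ucxx.init_once() path
        else:
            initialize_ucx_sketch()   # legacy distributed.comm.ucx path
            warn_generic_sketch()

def warn_generic_sketch() -> None: ...
def initialize_ucx_sketch() -> None: ...
def initialize_ucxx_sketch() -> None: ...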
@@ -138,6 +173,7 @@ def initialize(
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
         enable_rdmacm=enable_rdmacm,
+        protocol=protocol,
     )
     dask.config.set({"distributed.comm.ucx": ucx_config})

dask_cuda/local_cuda_cluster.py CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import logging
 import os
@@ -8,18 +11,15 @@ import dask
 from distributed import LocalCluster, Nanny, Worker
 from distributed.worker_memory import parse_memory_limit

-from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
-from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
-    get_cpu_affinity,
     get_ucx_config,
     nvml_device_index,
     parse_cuda_visible_device,
     parse_device_memory_limit,
 )
+from .worker_common import worker_data_function, worker_plugins


 class LoggedWorker(Worker):
@@ -68,11 +68,16 @@ class LocalCUDACluster(LocalCluster):
         starts spilling to disk (not available if JIT-Unspill is enabled). Can be an
         integer (bytes), float (fraction of total system memory), string (like ``"5GB"``
         or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management.
-    device_memory_limit : int, float, str, or None, default
+    device_memory_limit : int, float, str, or None, default "default"
         Size of the CUDA device LRU cache, which is used to determine when the worker
         starts spilling to host memory. Can be an integer (bytes), float (fraction of
-        total device memory), string (like ``"5GB"`` or ``"5000M"``),
+        total device memory), string (like ``"5GB"`` or ``"5000M"``), ``"auto"``, ``0``
         or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+        Another special value ``"default"`` (which happens to be the default) is also
+        available and uses the recommended Dask-CUDA's defaults and means 80% of the
+        total device memory (analogous to ``0.8``), and disabled spilling (analogous
+        to ``auto``/``0``) on devices without a dedicated memory resource, such as
+        system on a chip (SoC) devices.
     enable_cudf_spill : bool, default False
         Enable automatic cuDF spilling.

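A sketch of how the new ``"default"`` value could resolve per this description (the dedicated-memory predicate is a hypothetical stand-in; the real logic lives in dask-cuda's utils):

def resolve_device_memory_limit(value, total_bytes: int, has_dedicated_memory: bool):
    if value == "default":
        # 80% of total device memory on discrete GPUs; 0 (spilling disabled)
        # on devices without dedicated memory, such as SoCs.
        return int(total_bytes * 0.8) if has_dedicated_memory else 0
    return value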
@@ -87,7 +92,7 @@ class LocalCUDACluster(LocalCluster):
         ``dask.temporary-directory`` in the local Dask configuration, using the current
         working directory if this is not set.
     shared_filesystem: bool or None, default None
-        Whether the
+        Whether the ``local_directory`` above is shared between all workers or not.
         If ``None``, the "jit-unspill-shared-fs" config value are used, which
         defaults to True. Notice, in all other cases this option defaults to False,
         but on a local cluster it defaults to True -- we assume all workers use the
@@ -100,13 +105,16 @@ class LocalCUDACluster(LocalCluster):
         are not supported or disabled.
     enable_infiniband : bool, default None
         Set environment variables to enable UCX over InfiniBand, requires
-        ``protocol="ucx"``
+        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
+        ``enable_tcp_over_ucx=True`` when ``True``.
     enable_nvlink : bool, default None
-        Set environment variables to enable UCX over NVLink, requires
-
+        Set environment variables to enable UCX over NVLink, requires
+        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
+        ``enable_tcp_over_ucx=True`` when ``True``.
     enable_rdmacm : bool, default None
         Set environment variables to enable UCX RDMA connection manager support,
-        requires ``protocol="ucx"``
+        requires ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``,
+        and ``enable_infiniband=True``.
     rmm_pool_size : int, str or None, default None
         RMM pool size to initialize each worker with. Can be an integer (bytes), float
         (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or
@@ -123,8 +131,8 @@ class LocalCUDACluster(LocalCluster):
         and to set the maximum pool size.

         .. note::
-            When paired with
-            due to fragmentation.
+            When paired with ``--enable-rmm-async`` the maximum size cannot be
+            guaranteed due to fragmentation.

         .. note::
             This size is a per-worker configuration, and not cluster-wide.
@@ -140,9 +148,8 @@ class LocalCUDACluster(LocalCluster):
         See ``rmm.mr.CudaAsyncMemoryResource`` for more info.

         .. warning::
-            The asynchronous allocator
-
-            result in an exception.
+            The asynchronous allocator is incompatible with RMM pools and managed
+            memory. Trying to enable both will result in an exception.
     rmm_allocator_external_lib_list: str, list or None, default None
         List of external libraries for which to set RMM as the allocator.
         Supported options are: ``["torch", "cupy"]``. Can be a comma-separated string
@@ -201,7 +208,8 @@ class LocalCUDACluster(LocalCluster):
     Raises
     ------
     TypeError
-        If InfiniBand or NVLink are enabled and
+        If InfiniBand or NVLink are enabled and
+        ``protocol not in ("ucx", "ucxx", "ucx-old")``.
     ValueError
         If RMM pool, RMM managed memory or RMM async allocator are requested but RMM
         cannot be imported.
@@ -221,10 +229,9 @@ class LocalCUDACluster(LocalCluster):
         n_workers=None,
         threads_per_worker=1,
         memory_limit="auto",
-        device_memory_limit=
+        device_memory_limit="default",
         enable_cudf_spill=False,
         cudf_spill_stats=0,
-        data=None,
         local_directory=None,
         shared_filesystem=None,
         protocol=None,
@@ -242,7 +249,6 @@ class LocalCUDACluster(LocalCluster):
         rmm_track_allocations=False,
         jit_unspill=None,
         log_spilling=False,
-        worker_class=None,
         pre_import=None,
         **kwargs,
     ):
@@ -339,40 +345,29 @@ class LocalCUDACluster(LocalCluster):
             jit_unspill = dask.config.get("jit-unspill", default=False)
         data = kwargs.pop("data", None)
         if data is None:
-
-
-
-
-
-
-
-            )
-
-            data = (
-                ProxifyHostFile,
-                {
-                    "device_memory_limit": self.device_memory_limit,
-                    "memory_limit": self.memory_limit,
-                    "shared_filesystem": shared_filesystem,
-                },
-            )
-        else:
-            data = (
-                DeviceHostFile,
-                {
-                    "device_memory_limit": self.device_memory_limit,
-                    "memory_limit": self.memory_limit,
-                    "log_spilling": log_spilling,
-                },
-            )
+            self.data = worker_data_function(
+                device_memory_limit=self.device_memory_limit,
+                memory_limit=self.memory_limit,
+                jit_unspill=jit_unspill,
+                enable_cudf_spill=enable_cudf_spill,
+                shared_filesystem=shared_filesystem,
+            )

         if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
             if protocol is None:
-
-
+                ucx_protocol = dask.config.get(
+                    "distributed.comm.ucx.ucx-protocol", default=None
+                )
+                if ucx_protocol is not None:
+                    # TODO: remove when UCX-Py is removed,
+                    # see https://github.com/rapidsai/dask-cuda/issues/1517
+                    protocol = ucx_protocol
+                else:
+                    protocol = "ucx"
+            elif protocol not in ("ucx", "ucxx", "ucx-old"):
                 raise TypeError(
-                    "Enabling InfiniBand or NVLink requires protocol='ucx'
-                    "protocol='ucxx'"
+                    "Enabling InfiniBand or NVLink requires protocol='ucx', "
+                    "protocol='ucxx' or protocol='ucx-old'"
                 )

         self.host = kwargs.get("host", None)
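The shape of the new ``worker_data_function`` factory, as far as this diff shows it (a hedged sketch: the real helper lives in the new ``dask_cuda/worker_common.py`` and presumably returns the per-device ``data`` spec previously built inline, ``ProxifyHostFile`` under JIT-Unspill and ``DeviceHostFile`` otherwise; string class names below are stand-ins):

def worker_data_function_sketch(
    device_memory_limit, memory_limit, jit_unspill, enable_cudf_spill, shared_filesystem
):
    def make_data(device_index):
        if jit_unspill:
            return ("ProxifyHostFile", {
                "device_memory_limit": device_memory_limit,
                "memory_limit": memory_limit,
                "shared_filesystem": shared_filesystem,
            })
        return ("DeviceHostFile", {
            "device_memory_limit": device_memory_limit,
            "memory_limit": memory_limit,
        })
    return make_data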
@@ -385,6 +380,7 @@ class LocalCUDACluster(LocalCluster):
             enable_rdmacm=enable_rdmacm,
         )

+        worker_class = kwargs.pop("worker_class", None)
         if worker_class is not None:
             if log_spilling is True:
                 raise ValueError(
@@ -441,28 +437,29 @@ class LocalCUDACluster(LocalCluster):
         spec = copy.deepcopy(self.new_spec)
         worker_count = self.cuda_visible_devices.index(name)
         visible_devices = cuda_visible_devices(worker_count, self.cuda_visible_devices)
+        device_index = nvml_device_index(0, visible_devices)
         spec["options"].update(
             {
                 "env": {
                     "CUDA_VISIBLE_DEVICES": visible_devices,
                 },
-                "
-
-
-
-
-
-
-
-
-
-
-
-                    external_lib_list=self.rmm_allocator_external_lib_list,
+                **({"data": self.data(device_index)} if hasattr(self, "data") else {}),
+                "plugins": worker_plugins(
+                    device_index=device_index,
+                    rmm_initial_pool_size=self.rmm_pool_size,
+                    rmm_maximum_pool_size=self.rmm_maximum_pool_size,
+                    rmm_managed_memory=self.rmm_managed_memory,
+                    rmm_async_alloc=self.rmm_async,
+                    rmm_release_threshold=self.rmm_release_threshold,
+                    rmm_log_directory=self.rmm_log_directory,
+                    rmm_track_allocations=self.rmm_track_allocations,
+                    rmm_allocator_external_lib_list=(
+                        self.rmm_allocator_external_lib_list
                     ),
-
-
-
+                    pre_import=self.pre_import,
+                    enable_cudf_spill=self.enable_cudf_spill,
+                    cudf_spill_stats=self.cudf_spill_stats,
+                ),
             }
         )

|
dask_cuda/plugins.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
1
4
|
import importlib
|
|
2
5
|
import logging
|
|
3
6
|
import os
|
|
@@ -5,7 +8,7 @@ from typing import Callable, Dict

 from distributed import WorkerPlugin

-from .utils import get_rmm_log_file_name,
+from .utils import get_rmm_log_file_name, parse_device_bytes


 class CPUAffinity(WorkerPlugin):
@@ -75,28 +78,26 @@ class RMMSetup(WorkerPlugin):
         self.external_lib_list = external_lib_list

     def setup(self, worker=None):
-
-        self.initial_pool_size =
-
-        )
+        self.initial_pool_size = parse_device_bytes(
+            self.initial_pool_size, alignment_size=256
+        )

         if self.async_alloc:
             import rmm

-
-            self.release_threshold =
-
-            )
+            self.release_threshold = parse_device_bytes(
+                self.release_threshold, alignment_size=256
+            )

             mr = rmm.mr.CudaAsyncMemoryResource(
                 initial_pool_size=self.initial_pool_size,
                 release_threshold=self.release_threshold,
             )

+            self.maximum_pool_size = parse_device_bytes(
+                self.maximum_pool_size, alignment_size=256
+            )
             if self.maximum_pool_size is not None:
-                self.maximum_pool_size = parse_device_memory_limit(
-                    self.maximum_pool_size, alignment_size=256
-                )
                 mr = rmm.mr.LimitingResourceAdaptor(
                     mr, allocation_limit=self.maximum_pool_size
                 )
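What ``alignment_size=256`` plausibly implies for the new ``parse_device_bytes`` helper (rounding direction is an assumption for illustration):

from dask.utils import parse_bytes

def parse_device_bytes_sketch(value, alignment_size: int = 256):
    if value is None:
        return None
    nbytes = parse_bytes(value) if isinstance(value, str) else int(value)
    # Align to a multiple of the 256-byte allocation granularity RMM expects.
    return (nbytes // alignment_size) * alignment_size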
@@ -114,10 +115,9 @@ class RMMSetup(WorkerPlugin):
         pool_allocator = False if self.initial_pool_size is None else True

         if self.initial_pool_size is not None:
-
-            self.maximum_pool_size =
-
-            )
+            self.maximum_pool_size = parse_device_bytes(
+                self.maximum_pool_size, alignment_size=256
+            )

         rmm.reinitialize(
             pool_allocator=pool_allocator,
@@ -129,6 +129,7 @@ class RMMSetup(WorkerPlugin):
|
|
|
129
129
|
worker, self.logging, self.log_directory
|
|
130
130
|
),
|
|
131
131
|
)
|
|
132
|
+
|
|
132
133
|
if self.rmm_track_allocations:
|
|
133
134
|
import rmm
|
|
134
135
|
|