dask-cuda 23.12.0a231026__py3-none-any.whl → 24.2.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/VERSION +1 -0
- dask_cuda/__init__.py +1 -3
- dask_cuda/_version.py +20 -0
- dask_cuda/benchmarks/local_cudf_groupby.py +1 -1
- dask_cuda/benchmarks/local_cudf_merge.py +1 -1
- dask_cuda/benchmarks/local_cudf_shuffle.py +1 -1
- dask_cuda/benchmarks/local_cupy.py +1 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +1 -1
- dask_cuda/benchmarks/utils.py +1 -1
- dask_cuda/cuda_worker.py +1 -3
- dask_cuda/device_host_file.py +1 -1
- dask_cuda/initialize.py +47 -16
- dask_cuda/local_cuda_cluster.py +19 -19
- dask_cuda/plugins.py +122 -0
- dask_cuda/tests/test_dask_cuda_worker.py +3 -3
- dask_cuda/tests/test_dgx.py +45 -17
- dask_cuda/tests/test_explicit_comms.py +5 -5
- dask_cuda/tests/test_from_array.py +6 -2
- dask_cuda/tests/test_initialize.py +69 -21
- dask_cuda/tests/test_local_cuda_cluster.py +47 -14
- dask_cuda/tests/test_proxify_host_file.py +5 -1
- dask_cuda/tests/test_proxy.py +13 -3
- dask_cuda/tests/test_spill.py +3 -0
- dask_cuda/tests/test_utils.py +20 -6
- dask_cuda/utils.py +6 -140
- dask_cuda/utils_test.py +45 -0
- dask_cuda/worker_spec.py +2 -1
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/METADATA +2 -3
- dask_cuda-24.2.0a3.dist-info/RECORD +53 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/WHEEL +1 -1
- dask_cuda/compat.py +0 -118
- dask_cuda-23.12.0a231026.dist-info/RECORD +0 -50
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/LICENSE +0 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/entry_points.txt +0 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/top_level.txt +0 -0
dask_cuda/tests/test_initialize.py
CHANGED

@@ -10,9 +10,9 @@ from distributed.deploy.local import LocalCluster
 
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import get_ucx_config
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
 mp = mp.get_context("spawn")  # type: ignore
-ucp = pytest.importorskip("ucp")
 
 # Notice, all of the following tests is executed in a new process such
 # that UCX options of the different tests doesn't conflict.
@@ -20,15 +20,21 @@ ucp = pytest.importorskip("ucp")
 # of UCX before retrieving the current config.
 
 
-def _test_initialize_ucx_tcp():
+def _test_initialize_ucx_tcp(protocol):
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")
+
     kwargs = {"enable_tcp_over_ucx": True}
-    initialize(**kwargs)
+    initialize(protocol=protocol, **kwargs)
     with LocalCluster(
-        protocol="ucx",
+        protocol=protocol,
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -48,22 +54,34 @@ def _test_initialize_ucx_tcp():
             assert all(client.run(check_ucx_options).values())
 
 
-def test_initialize_ucx_tcp():
-    p = mp.Process(target=_test_initialize_ucx_tcp)
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+def test_initialize_ucx_tcp(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
+    p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,))
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_nvlink():
+def _test_initialize_ucx_nvlink(protocol):
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")
+
     kwargs = {"enable_nvlink": True}
-    initialize(**kwargs)
+    initialize(protocol=protocol, **kwargs)
     with LocalCluster(
-        protocol="ucx",
+        protocol=protocol,
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -84,22 +102,34 @@ def _test_initialize_ucx_nvlink():
             assert all(client.run(check_ucx_options).values())
 
 
-def test_initialize_ucx_nvlink():
-    p = mp.Process(target=_test_initialize_ucx_nvlink)
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+def test_initialize_ucx_nvlink(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
+    p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,))
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_infiniband():
+def _test_initialize_ucx_infiniband(protocol):
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")
+
     kwargs = {"enable_infiniband": True}
-    initialize(**kwargs)
+    initialize(protocol=protocol, **kwargs)
     with LocalCluster(
-        protocol="ucx",
+        protocol=protocol,
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -123,21 +153,33 @@ def _test_initialize_ucx_infiniband():
 @pytest.mark.skipif(
     "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found"
 )
-def test_initialize_ucx_infiniband():
-    p = mp.Process(target=_test_initialize_ucx_infiniband)
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+def test_initialize_ucx_infiniband(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
+    p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,))
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_all():
-    initialize()
+def _test_initialize_ucx_all(protocol):
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")
+
+    initialize(protocol=protocol)
     with LocalCluster(
-        protocol="ucx",
+        protocol=protocol,
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config()},
     ) as cluster:
         with Client(cluster) as client:
@@ -161,8 +203,14 @@ def _test_initialize_ucx_all():
             assert all(client.run(check_ucx_options).values())
 
 
-def test_initialize_ucx_all():
-    p = mp.Process(target=_test_initialize_ucx_all)
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+def test_initialize_ucx_all(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
+    p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,))
     p.start()
     p.join()
     assert not p.exitcode

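The pattern running through the `test_initialize.py` hunks above (and the test files below) is that every UCX test is now parametrized over the `ucx` (UCX-Py/`ucp`) and `ucxx` protocols, skips when the matching Python package is not installed, and passes `worker_class=IncreasedCloseTimeoutNanny` so worker shutdown gets a longer grace period. A condensed sketch of that pattern outside the diff; the test name and workload here are illustrative, not part of the package:

```python
import pytest
from dask.distributed import Client
from distributed.deploy.local import LocalCluster

from dask_cuda.initialize import initialize
from dask_cuda.utils import get_ucx_config
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny


@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
def test_protocol_roundtrip(protocol):  # illustrative test, not from the diff
    # "ucx" is backed by the ucx-py package ("ucp"); "ucxx" by the newer ucxx bindings.
    pytest.importorskip("ucp" if protocol == "ucx" else "ucxx")

    # Configure UCX globally before starting the cluster, mirroring the diff's usage.
    initialize(protocol=protocol, enable_tcp_over_ucx=True)
    with LocalCluster(
        protocol=protocol,
        n_workers=1,
        threads_per_worker=1,
        processes=True,
        dashboard_address=None,
        worker_class=IncreasedCloseTimeoutNanny,  # 30 s close timeout instead of 5 s
        config={"distributed.comm.ucx": get_ucx_config(enable_tcp_over_ucx=True)},
    ) as cluster:
        with Client(cluster) as client:
            assert client.submit(lambda x: x + 1, 1).result() == 2
```
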
dask_cuda/tests/test_local_cuda_cluster.py
CHANGED

@@ -13,13 +13,13 @@ from distributed.utils_test import gen_test, raises_with_cause
 from dask_cuda import CUDAWorker, LocalCUDACluster, utils
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import (
-    MockWorker,
     get_cluster_configuration,
     get_device_total_memory,
     get_gpu_count_mig,
     get_gpu_uuid_from_index,
     print_cluster_config,
 )
+from dask_cuda.utils_test import MockWorker
 
 
 @gen_test(timeout=20)
@@ -87,23 +87,38 @@ async def test_with_subset_of_cuda_visible_devices():
                    }
 
 
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
 @gen_test(timeout=20)
-async def test_ucx_protocol():
-    pytest.importorskip("ucp")
+async def test_ucx_protocol(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
 
     async with LocalCUDACluster(
-        protocol="ucx", asynchronous=True, data=dict
+        protocol=protocol, asynchronous=True, data=dict
     ) as cluster:
         assert all(
-            ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values()
+            ws.address.startswith(f"{protocol}://")
+            for ws in cluster.scheduler.workers.values()
         )
 
 
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
 @gen_test(timeout=20)
-async def test_explicit_ucx_with_protocol_none():
-    pytest.importorskip("ucp")
+async def test_explicit_ucx_with_protocol_none(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
 
-    initialize(enable_tcp_over_ucx=True)
+    initialize(protocol=protocol, enable_tcp_over_ucx=True)
     async with LocalCUDACluster(
         protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict
     ) as cluster:
@@ -113,11 +128,18 @@ async def test_explicit_ucx_with_protocol_none():
 
 
 @pytest.mark.filterwarnings("ignore:Exception ignored in")
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
 @gen_test(timeout=20)
-async def test_ucx_protocol_type_error():
-    pytest.importorskip("ucp")
+async def test_ucx_protocol_type_error(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
 
-    initialize(enable_tcp_over_ucx=True)
+    initialize(protocol=protocol, enable_tcp_over_ucx=True)
     with pytest.raises(TypeError):
         async with LocalCUDACluster(
             protocol="tcp", enable_tcp_over_ucx=True, asynchronous=True, data=dict
@@ -337,6 +359,7 @@ async def test_pre_import():
 
 
 # Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
 def test_pre_import_not_found():
     async def _test_pre_import_not_found():
         with raises_with_cause(RuntimeError, None, ImportError, None):
@@ -477,20 +500,30 @@ async def test_worker_fraction_limits():
            )
 
 
-def test_print_cluster_config(capsys):
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
+def test_print_cluster_config(capsys, protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
     pytest.importorskip("rich")
     with LocalCUDACluster(
-        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol="ucx"
+        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol
     ) as cluster:
         with Client(cluster) as client:
             print_cluster_config(client)
             captured = capsys.readouterr()
             assert "Dask Cluster Configuration" in captured.out
-            assert "ucx" in captured.out
+            assert protocol in captured.out
             assert "1 B" in captured.out
             assert "[plugin]" in captured.out
 
 
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
 def test_death_timeout_raises():
     with pytest.raises(asyncio.exceptions.TimeoutError):
         with LocalCUDACluster(

dask_cuda/tests/test_proxify_host_file.py
CHANGED

@@ -19,6 +19,7 @@ from dask_cuda.get_device_memory_objects import get_device_memory_ids
 from dask_cuda.proxify_host_file import ProxifyHostFile
 from dask_cuda.proxy_object import ProxyObject, asproxy, unproxy
 from dask_cuda.utils import get_device_total_memory
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
 cupy = pytest.importorskip("cupy")
 cupy.cuda.set_allocator(None)
@@ -393,7 +394,10 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
 
     with dask.config.set(jit_unspill_compatibility_mode=compatibility_mode):
         async with dask_cuda.LocalCUDACluster(
-            n_workers=1, jit_unspill=True, asynchronous=True
+            n_workers=1,
+            jit_unspill=True,
+            worker_class=IncreasedCloseTimeoutNanny,
+            asynchronous=True,
         ) as cluster:
             async with Client(cluster, asynchronous=True) as client:
                 ddf = dask.dataframe.from_pandas(

dask_cuda/tests/test_proxy.py
CHANGED

@@ -23,6 +23,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -399,10 +400,14 @@ class _PxyObjTest(proxy_object.ProxyObject):
 
 
 @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 @gen_test(timeout=120)
 async def test_communicating_proxy_objects(protocol, send_serializers):
     """Testing serialization of cuDF dataframe when communicating"""
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
     cudf = pytest.importorskip("cudf")
 
     def task(x):
@@ -411,7 +416,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         serializers_used = x._pxy_get().serializer
 
         # Check that `x` is serialized with the expected serializers
-        if protocol == "ucx":
+        if protocol in ["ucx", "ucxx"]:
             if send_serializers is None:
                 assert serializers_used == "cuda"
             else:
@@ -422,6 +427,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
     async with dask_cuda.LocalCUDACluster(
         n_workers=1,
         protocol=protocol,
+        worker_class=IncreasedCloseTimeoutNanny,
         asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
@@ -441,11 +447,15 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
             await client.submit(task, df)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 @pytest.mark.parametrize("shared_fs", [True, False])
 @gen_test(timeout=20)
 async def test_communicating_disk_objects(protocol, shared_fs):
     """Testing disk serialization of cuDF dataframe when communicating"""
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
     cudf = pytest.importorskip("cudf")
     ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
 

dask_cuda/tests/test_spill.py
CHANGED

@@ -12,6 +12,7 @@ from distributed.sizeof import sizeof
 from distributed.utils_test import gen_cluster, gen_test, loop  # noqa: F401
 
 from dask_cuda import LocalCUDACluster, utils
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
 if utils.get_device_total_memory() < 1e10:
     pytest.skip("Not enough GPU memory", allow_module_level=True)
@@ -160,6 +161,7 @@ async def test_cupy_cluster_device_spill(params):
         asynchronous=True,
         device_memory_limit=params["device_memory_limit"],
         memory_limit=params["memory_limit"],
+        worker_class=IncreasedCloseTimeoutNanny,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
 
@@ -263,6 +265,7 @@ async def test_cudf_cluster_device_spill(params):
         asynchronous=True,
         device_memory_limit=params["device_memory_limit"],
         memory_limit=params["memory_limit"],
+        worker_class=IncreasedCloseTimeoutNanny,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
 

dask_cuda/tests/test_utils.py
CHANGED

@@ -79,11 +79,18 @@ def test_get_device_total_memory():
     assert total_mem > 0
 
 
-def test_get_preload_options_default():
-    pytest.importorskip("ucp")
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
+def test_get_preload_options_default(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
 
     opts = get_preload_options(
-        protocol="ucx",
+        protocol=protocol,
         create_cuda_context=True,
     )
 
@@ -93,14 +100,21 @@ def test_get_preload_options_default():
     assert opts["preload_argv"] == ["--create-cuda-context"]
 
 
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
 @pytest.mark.parametrize("enable_tcp", [True, False])
 @pytest.mark.parametrize("enable_infiniband", [True, False])
 @pytest.mark.parametrize("enable_nvlink", [True, False])
-def test_get_preload_options(enable_tcp, enable_infiniband, enable_nvlink):
-    pytest.importorskip("ucp")
+def test_get_preload_options(protocol, enable_tcp, enable_infiniband, enable_nvlink):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
 
     opts = get_preload_options(
-        protocol="ucx",
+        protocol=protocol,
         create_cuda_context=True,
         enable_tcp_over_ucx=enable_tcp,
         enable_infiniband=enable_infiniband,

dask_cuda/utils.py
CHANGED

@@ -1,4 +1,3 @@
-import importlib
 import math
 import operator
 import os
@@ -18,7 +17,7 @@ import dask
 import distributed  # noqa: required for dask.config.get("distributed.comm.ucx")
 from dask.config import canonical_name
 from dask.utils import format_bytes, parse_bytes
-from distributed import Worker, WorkerPlugin, wait
+from distributed import wait
 from distributed.comm import parse_address
 
 try:
@@ -32,122 +31,6 @@ except ImportError:
         yield
 
 
-class CPUAffinity(WorkerPlugin):
-    def __init__(self, cores):
-        self.cores = cores
-
-    def setup(self, worker=None):
-        os.sched_setaffinity(0, self.cores)
-
-
-class RMMSetup(WorkerPlugin):
-    def __init__(
-        self,
-        initial_pool_size,
-        maximum_pool_size,
-        managed_memory,
-        async_alloc,
-        release_threshold,
-        log_directory,
-        track_allocations,
-    ):
-        if initial_pool_size is None and maximum_pool_size is not None:
-            raise ValueError(
-                "`rmm_maximum_pool_size` was specified without specifying "
-                "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool."
-            )
-        if async_alloc is True:
-            if managed_memory is True:
-                raise ValueError(
-                    "`rmm_managed_memory` is incompatible with the `rmm_async`."
-                )
-        if async_alloc is False and release_threshold is not None:
-            raise ValueError("`rmm_release_threshold` requires `rmm_async`.")
-
-        self.initial_pool_size = initial_pool_size
-        self.maximum_pool_size = maximum_pool_size
-        self.managed_memory = managed_memory
-        self.async_alloc = async_alloc
-        self.release_threshold = release_threshold
-        self.logging = log_directory is not None
-        self.log_directory = log_directory
-        self.rmm_track_allocations = track_allocations
-
-    def setup(self, worker=None):
-        if self.initial_pool_size is not None:
-            self.initial_pool_size = parse_device_memory_limit(
-                self.initial_pool_size, alignment_size=256
-            )
-
-        if self.async_alloc:
-            import rmm
-
-            if self.release_threshold is not None:
-                self.release_threshold = parse_device_memory_limit(
-                    self.release_threshold, alignment_size=256
-                )
-
-            mr = rmm.mr.CudaAsyncMemoryResource(
-                initial_pool_size=self.initial_pool_size,
-                release_threshold=self.release_threshold,
-            )
-
-            if self.maximum_pool_size is not None:
-                self.maximum_pool_size = parse_device_memory_limit(
-                    self.maximum_pool_size, alignment_size=256
-                )
-                mr = rmm.mr.LimitingResourceAdaptor(
-                    mr, allocation_limit=self.maximum_pool_size
-                )
-
-            rmm.mr.set_current_device_resource(mr)
-            if self.logging:
-                rmm.enable_logging(
-                    log_file_name=get_rmm_log_file_name(
-                        worker, self.logging, self.log_directory
-                    )
-                )
-        elif self.initial_pool_size is not None or self.managed_memory:
-            import rmm
-
-            pool_allocator = False if self.initial_pool_size is None else True
-
-            if self.initial_pool_size is not None:
-                if self.maximum_pool_size is not None:
-                    self.maximum_pool_size = parse_device_memory_limit(
-                        self.maximum_pool_size, alignment_size=256
-                    )
-
-            rmm.reinitialize(
-                pool_allocator=pool_allocator,
-                managed_memory=self.managed_memory,
-                initial_pool_size=self.initial_pool_size,
-                maximum_pool_size=self.maximum_pool_size,
-                logging=self.logging,
-                log_file_name=get_rmm_log_file_name(
-                    worker, self.logging, self.log_directory
-                ),
-            )
-        if self.rmm_track_allocations:
-            import rmm
-
-            mr = rmm.mr.get_current_device_resource()
-            rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
-
-
-class PreImport(WorkerPlugin):
-    def __init__(self, libraries):
-        if libraries is None:
-            libraries = []
-        elif isinstance(libraries, str):
-            libraries = libraries.split(",")
-        self.libraries = libraries
-
-    def setup(self, worker=None):
-        for l in self.libraries:
-            importlib.import_module(l)
-
-
 def unpack_bitmask(x, mask_bits=64):
     """Unpack a list of integers containing bitmasks.
 
@@ -404,7 +287,7 @@ def get_preload_options(
     if create_cuda_context:
         preload_options["preload_argv"].append("--create-cuda-context")
 
-    if protocol == "ucx":
+    if protocol in ["ucx", "ucxx"]:
         initialize_ucx_argv = []
         if enable_tcp_over_ucx:
             initialize_ucx_argv.append("--enable-tcp-over-ucx")
@@ -669,27 +552,6 @@ def parse_device_memory_limit(device_memory_limit, device_index=0, alignment_siz
     return _align(int(device_memory_limit), alignment_size)
 
 
-class MockWorker(Worker):
-    """Mock Worker class preventing NVML from getting used by SystemMonitor.
-
-    By preventing the Worker from initializing NVML in the SystemMonitor, we can
-    mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU
-    machines.
-    """
-
-    def __init__(self, *args, **kwargs):
-        distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count
-        self._device_get_count = distributed.diagnostics.nvml.device_get_count
-        super().__init__(*args, **kwargs)
-
-    def __del__(self):
-        distributed.diagnostics.nvml.device_get_count = self._device_get_count
-
-    @staticmethod
-    def device_get_count():
-        return 0
-
-
 def get_gpu_uuid_from_index(device_index=0):
     """Get GPU UUID from CUDA device index.
 
@@ -763,6 +625,10 @@ def get_worker_config(dask_worker):
         import ucp
 
         ret["ucx-transports"] = ucp.get_active_transports()
+    elif scheme == "ucxx":
+        import ucxx
+
+        ret["ucx-transports"] = ucxx.get_active_transports()
 
     # comm timeouts
     ret["distributed.comm.timeouts"] = dask.config.get("distributed.comm.timeouts")

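Read together with the file list at the top (`dask_cuda/plugins.py` is new with +122 lines, `dask_cuda/compat.py` is removed), the `utils.py` hunks above show the worker plugins (`CPUAffinity`, `RMMSetup`, `PreImport`) and the test-only `MockWorker` leaving `dask_cuda.utils`. A hedged import-migration sketch for downstream code: `CPUAffinity`'s and `MockWorker`'s new homes are confirmed by the `worker_spec.py` and test hunks in this diff, while `RMMSetup`/`PreImport` are assumed to live in the new `dask_cuda.plugins` module as well.

```python
# Import-migration sketch for code that used helpers removed from dask_cuda.utils
# in this release.

# 23.12 (old):
#   from dask_cuda.utils import CPUAffinity, MockWorker

# 24.02 (new):
from dask_cuda.plugins import CPUAffinity      # worker plugins (confirmed below)
from dask_cuda.utils_test import MockWorker    # test-only helpers
from dask_cuda.utils import get_cpu_affinity   # general utilities remain in utils

# Same usage as before, e.g. pinning a worker to the CPUs local to GPU 0:
affinity = CPUAffinity(get_cpu_affinity(0))
```
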
dask_cuda/utils_test.py
ADDED

@@ -0,0 +1,45 @@
+from typing import Literal
+
+import distributed
+from distributed import Nanny, Worker
+
+
+class MockWorker(Worker):
+    """Mock Worker class preventing NVML from getting used by SystemMonitor.
+
+    By preventing the Worker from initializing NVML in the SystemMonitor, we can
+    mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU
+    machines.
+    """
+
+    def __init__(self, *args, **kwargs):
+        distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count
+        self._device_get_count = distributed.diagnostics.nvml.device_get_count
+        super().__init__(*args, **kwargs)
+
+    def __del__(self):
+        distributed.diagnostics.nvml.device_get_count = self._device_get_count
+
+    @staticmethod
+    def device_get_count():
+        return 0
+
+
+class IncreasedCloseTimeoutNanny(Nanny):
+    """Increase `Nanny`'s close timeout.
+
+    The internal close timeout mechanism of `Nanny` recomputes the time left to kill
+    the `Worker` process based on elapsed time of the close task, which may leave
+    very little time for the subprocess to shutdown cleanly, which may cause tests
+    to fail when the system is under higher load. This class increases the default
+    close timeout of 5.0 seconds that `Nanny` sets by default, which can be overriden
+    via Distributed's public API.
+
+    This class can be used with the `worker_class` argument of `LocalCluster` or
+    `LocalCUDACluster` to provide a much higher default of 30.0 seconds.
+    """
+
+    async def close(  # type:ignore[override]
+        self, timeout: float = 30.0, reason: str = "nanny-close"
+    ) -> Literal["OK"]:
+        return await super().close(timeout=timeout, reason=reason)

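`IncreasedCloseTimeoutNanny` is what the test modules above now pass as `worker_class`. A minimal usage sketch, assuming a CUDA-capable machine with this dask-cuda release installed:

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny

if __name__ == "__main__":
    # worker_class is forwarded to the nanny supervising each worker process, so
    # shutdown gets a 30-second grace period instead of Nanny's 5-second default.
    with LocalCUDACluster(
        n_workers=1,
        worker_class=IncreasedCloseTimeoutNanny,
    ) as cluster:
        with Client(cluster) as client:
            print(client.submit(sum, [1, 2, 3]).result())  # -> 6
```
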
dask_cuda/worker_spec.py
CHANGED

@@ -5,7 +5,8 @@ from distributed.system import MEMORY_LIMIT
 
 from .initialize import initialize
 from .local_cuda_cluster import cuda_visible_devices
-from .utils import CPUAffinity, get_cpu_affinity, get_gpu_count
+from .plugins import CPUAffinity
+from .utils import get_cpu_affinity, get_gpu_count
 
 
 def worker_spec(