dask-cuda 23.12.0a231026__py3-none-any.whl → 24.2.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/VERSION +1 -0
- dask_cuda/__init__.py +1 -3
- dask_cuda/_version.py +20 -0
- dask_cuda/benchmarks/local_cudf_groupby.py +1 -1
- dask_cuda/benchmarks/local_cudf_merge.py +1 -1
- dask_cuda/benchmarks/local_cudf_shuffle.py +1 -1
- dask_cuda/benchmarks/local_cupy.py +1 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +1 -1
- dask_cuda/benchmarks/utils.py +1 -1
- dask_cuda/cuda_worker.py +1 -3
- dask_cuda/device_host_file.py +1 -1
- dask_cuda/initialize.py +47 -16
- dask_cuda/local_cuda_cluster.py +19 -19
- dask_cuda/plugins.py +122 -0
- dask_cuda/tests/test_dask_cuda_worker.py +3 -3
- dask_cuda/tests/test_dgx.py +45 -17
- dask_cuda/tests/test_explicit_comms.py +5 -5
- dask_cuda/tests/test_from_array.py +6 -2
- dask_cuda/tests/test_initialize.py +69 -21
- dask_cuda/tests/test_local_cuda_cluster.py +47 -14
- dask_cuda/tests/test_proxify_host_file.py +5 -1
- dask_cuda/tests/test_proxy.py +13 -3
- dask_cuda/tests/test_spill.py +3 -0
- dask_cuda/tests/test_utils.py +20 -6
- dask_cuda/utils.py +6 -140
- dask_cuda/utils_test.py +45 -0
- dask_cuda/worker_spec.py +2 -1
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/METADATA +2 -3
- dask_cuda-24.2.0a3.dist-info/RECORD +53 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/WHEEL +1 -1
- dask_cuda/compat.py +0 -118
- dask_cuda-23.12.0a231026.dist-info/RECORD +0 -50
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/LICENSE +0 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/entry_points.txt +0 -0
- {dask_cuda-23.12.0a231026.dist-info → dask_cuda-24.2.0a3.dist-info}/top_level.txt +0 -0
dask_cuda/VERSION ADDED

@@ -0,0 +1 @@
+24.02.00a3
dask_cuda/__init__.py CHANGED

@@ -11,6 +11,7 @@ import dask.dataframe.shuffle
 import dask.dataframe.multi
 import dask.bag.core

+from ._version import __git_commit__, __version__
 from .cuda_worker import CUDAWorker
 from .explicit_comms.dataframe.shuffle import (
     get_rearrange_by_column_wrapper,
@@ -19,9 +20,6 @@ from .explicit_comms.dataframe.shuffle import (
 from .local_cuda_cluster import LocalCUDACluster
 from .proxify_device_objects import proxify_decorator, unproxify_decorator

-__version__ = "23.12.00"
-
-from . import compat

 # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True`
 dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper(
dask_cuda/_version.py ADDED

@@ -0,0 +1,20 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip()
+)
+__git_commit__ = ""
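Both attributes are re-exported from the package root, so the file-based versioning is observable directly (a minimal sketch; the printed values assume this wheel is the one installed):

```python
import dask_cuda

# `__version__` is now read from the bundled VERSION file at import time
# instead of being hardcoded in `__init__.py`.
print(dask_cuda.__version__)     # "24.02.00a3" for this wheel
print(dask_cuda.__git_commit__)  # "" in this build
```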
dask_cuda/benchmarks/local_cudf_groupby.py CHANGED

@@ -139,7 +139,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol == "ucx":
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
dask_cuda/benchmarks/local_cudf_merge.py CHANGED

@@ -217,7 +217,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Frac-match", value=f"{args.frac_match}")
-    if args.protocol == "ucx":
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
dask_cuda/benchmarks/local_cudf_shuffle.py CHANGED

@@ -146,7 +146,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol == "ucx":
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
dask_cuda/benchmarks/local_cupy.py CHANGED

@@ -193,7 +193,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol == "ucx":
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
dask_cuda/benchmarks/local_cupy_map_overlap.py CHANGED

@@ -78,7 +78,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol == "ucx":
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
dask_cuda/benchmarks/utils.py CHANGED

@@ -73,7 +73,7 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]
     cluster_args.add_argument(
         "-p",
         "--protocol",
-        choices=["tcp", "ucx"],
+        choices=["tcp", "ucx", "ucxx"],
         default="tcp",
         type=str,
         help="The communication protocol to use.",
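For reference, a minimal standalone sketch of the widened flag; only the option names, choices, and help text mirror the hunk above, the rest is illustrative:

```python
import argparse

# "ucxx" now joins "tcp" and "ucx" as an accepted value for -p/--protocol.
parser = argparse.ArgumentParser(description="Generic dask-cuda Benchmark")
parser.add_argument(
    "-p",
    "--protocol",
    choices=["tcp", "ucx", "ucxx"],
    default="tcp",
    type=str,
    help="The communication protocol to use.",
)
print(parser.parse_args(["-p", "ucxx"]).protocol)  # -> "ucxx"
```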
dask_cuda/cuda_worker.py CHANGED

@@ -20,11 +20,9 @@ from distributed.worker_memory import parse_memory_limit

 from .device_host_file import DeviceHostFile
 from .initialize import initialize
+from .plugins import CPUAffinity, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
-    CPUAffinity,
-    PreImport,
-    RMMSetup,
     cuda_visible_devices,
     get_cpu_affinity,
     get_n_gpus,
dask_cuda/device_host_file.py CHANGED

@@ -17,7 +17,7 @@ from distributed.protocol import (
     serialize_bytelist,
 )
 from distributed.sizeof import safe_sizeof
-from distributed.spill import
+from distributed.spill import AnyKeyFile as KeyAsStringFile
 from distributed.utils import nbytes

 from .is_device_object import is_device_object
dask_cuda/initialize.py CHANGED

@@ -5,7 +5,6 @@ import click
 import numba.cuda

 import dask
-import distributed.comm.ucx
 from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context

 from .utils import get_ucx_config
@@ -23,12 +22,21 @@ def _create_cuda_context_handler():
     numba.cuda.current_context()


-def _create_cuda_context():
+def _create_cuda_context(protocol="ucx"):
+    if protocol not in ["ucx", "ucxx"]:
+        return
     try:
         # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
         # context directly from the UCX module, thus avoiding a similar warning there.
         try:
-            distributed.comm.ucx.init_once()
+            if protocol == "ucx":
+                import distributed.comm.ucx
+
+                distributed.comm.ucx.init_once()
+            elif protocol == "ucxx":
+                import distributed_ucxx.ucxx
+
+                distributed_ucxx.ucxx.init_once()
         except ModuleNotFoundError:
             # UCX initialization has to be delegated to Distributed, it will take care
             # of setting correct environment variables and importing `ucp` after that.
@@ -39,20 +47,35 @@ def _create_cuda_context():
             os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
         )
         ctx = has_cuda_context()
-        if (
-            ctx.has_context
-            and not distributed.comm.ucx.cuda_context_created.has_context
-        ):
-            distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
+        if protocol == "ucx":
+            if (
+                ctx.has_context
+                and not distributed.comm.ucx.cuda_context_created.has_context
+            ):
+                distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
+        elif protocol == "ucxx":
+            if (
+                ctx.has_context
+                and not distributed_ucxx.ucxx.cuda_context_created.has_context
+            ):
+                distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())

         _create_cuda_context_handler()

-        if not distributed.comm.ucx.cuda_context_created.has_context:
-            ctx = has_cuda_context()
-            if ctx.has_context and ctx.device_info != cuda_visible_device:
-                distributed.comm.ucx._warn_cuda_context_wrong_device(
-                    cuda_visible_device, ctx.device_info, os.getpid()
-                )
+        if protocol == "ucx":
+            if not distributed.comm.ucx.cuda_context_created.has_context:
+                ctx = has_cuda_context()
+                if ctx.has_context and ctx.device_info != cuda_visible_device:
+                    distributed.comm.ucx._warn_cuda_context_wrong_device(
+                        cuda_visible_device, ctx.device_info, os.getpid()
+                    )
+        elif protocol == "ucxx":
+            if not distributed_ucxx.ucxx.cuda_context_created.has_context:
+                ctx = has_cuda_context()
+                if ctx.has_context and ctx.device_info != cuda_visible_device:
+                    distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
+                        cuda_visible_device, ctx.device_info, os.getpid()
+                    )

     except Exception:
         logger.error("Unable to start CUDA Context", exc_info=True)
@@ -64,6 +87,7 @@ def initialize(
     enable_infiniband=None,
     enable_nvlink=None,
     enable_rdmacm=None,
+    protocol="ucx",
 ):
     """Create CUDA context and initialize UCX-Py, depending on user parameters.

@@ -118,7 +142,7 @@ def initialize(
     dask.config.set({"distributed.comm.ucx": ucx_config})

     if create_cuda_context:
-        _create_cuda_context()
+        _create_cuda_context(protocol=protocol)


 @click.command()
@@ -127,6 +151,12 @@ def initialize(
     default=False,
     help="Create CUDA context",
 )
+@click.option(
+    "--protocol",
+    default=None,
+    type=str,
+    help="Communication protocol, such as: 'tcp', 'tls', 'ucx' or 'ucxx'.",
+)
 @click.option(
     "--enable-tcp-over-ucx/--disable-tcp-over-ucx",
     default=False,
@@ -150,10 +180,11 @@ def initialize(
 def dask_setup(
     service,
     create_cuda_context,
+    protocol,
     enable_tcp_over_ucx,
     enable_infiniband,
     enable_nvlink,
     enable_rdmacm,
 ):
     if create_cuda_context:
-        _create_cuda_context()
+        _create_cuda_context(protocol=protocol)
dask_cuda/local_cuda_cluster.py CHANGED

@@ -3,7 +3,6 @@ import logging
 import os
 import warnings
 from functools import partial
-from typing import Literal

 import dask
 from distributed import LocalCluster, Nanny, Worker
@@ -11,11 +10,9 @@ from distributed.worker_memory import parse_memory_limit

 from .device_host_file import DeviceHostFile
 from .initialize import initialize
+from .plugins import CPUAffinity, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
-    CPUAffinity,
-    PreImport,
-    RMMSetup,
     cuda_visible_devices,
     get_cpu_affinity,
     get_ucx_config,
@@ -25,13 +22,6 @@ from .utils import (
 )


-class IncreasedCloseTimeoutNanny(Nanny):
-    async def close(  # type:ignore[override]
-        self, timeout: float = 10.0, reason: str = "nanny-close"
-    ) -> Literal["OK"]:
-        return await super().close(timeout=timeout, reason=reason)
-
-
 class LoggedWorker(Worker):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,7 +31,7 @@ class LoggedWorker(Worker):
         self.data.set_address(self.address)


-class LoggedNanny(IncreasedCloseTimeoutNanny):
+class LoggedNanny(Nanny):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, worker_class=LoggedWorker, **kwargs)

@@ -329,8 +319,11 @@ class LocalCUDACluster(LocalCluster):
         if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
             if protocol is None:
                 protocol = "ucx"
-            elif protocol != "ucx":
-                raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")
+            elif protocol not in ["ucx", "ucxx"]:
+                raise TypeError(
+                    "Enabling InfiniBand or NVLink requires protocol='ucx' or "
+                    "protocol='ucxx'"
+                )

         self.host = kwargs.get("host", None)

@@ -342,10 +335,17 @@ class LocalCUDACluster(LocalCluster):
             enable_rdmacm=enable_rdmacm,
         )

-        worker_class = partial(
-            LoggedNanny if log_spilling is True else IncreasedCloseTimeoutNanny,
-            worker_class=worker_class,
-        )
+        if worker_class is not None:
+            if log_spilling is True:
+                raise ValueError(
+                    "Cannot enable `log_spilling` when `worker_class` is specified. If "
+                    "logging is needed, ensure `worker_class` is a subclass of "
+                    "`distributed.local_cuda_cluster.LoggedNanny` or a subclass of "
+                    "`distributed.local_cuda_cluster.LoggedWorker`, and specify "
+                    "`log_spilling=False`."
+                )
+            if not issubclass(worker_class, Nanny):
+                worker_class = partial(Nanny, worker_class=worker_class)

         self.pre_import = pre_import

@@ -374,7 +374,7 @@ class LocalCUDACluster(LocalCluster):
         ) + ["dask_cuda.initialize"]
         self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
             "preload_argv", []
-        ) + ["--create-cuda-context"]
+        ) + ["--create-cuda-context", "--protocol", protocol]

         self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
         self.scale(n_workers)
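Taken together with the `initialize` changes, the new protocol option can be exercised as below (a sketch assuming a GPU host with NVLink and the `ucxx` stack available; with `protocol="ucxx"` the TypeError above is no longer raised):

```python
from distributed import Client

from dask_cuda import LocalCUDACluster

# NVLink requires a UCX-backed protocol; "ucxx" is now accepted next to "ucx".
with LocalCUDACluster(protocol="ucxx", enable_nvlink=True) as cluster:
    with Client(cluster) as client:
        print(client.run(lambda: "ok"))
```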
dask_cuda/plugins.py ADDED

@@ -0,0 +1,122 @@
+import importlib
+import os
+
+from distributed import WorkerPlugin
+
+from .utils import get_rmm_log_file_name, parse_device_memory_limit
+
+
+class CPUAffinity(WorkerPlugin):
+    def __init__(self, cores):
+        self.cores = cores
+
+    def setup(self, worker=None):
+        os.sched_setaffinity(0, self.cores)
+
+
+class RMMSetup(WorkerPlugin):
+    def __init__(
+        self,
+        initial_pool_size,
+        maximum_pool_size,
+        managed_memory,
+        async_alloc,
+        release_threshold,
+        log_directory,
+        track_allocations,
+    ):
+        if initial_pool_size is None and maximum_pool_size is not None:
+            raise ValueError(
+                "`rmm_maximum_pool_size` was specified without specifying "
+                "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool."
+            )
+        if async_alloc is True:
+            if managed_memory is True:
+                raise ValueError(
+                    "`rmm_managed_memory` is incompatible with the `rmm_async`."
+                )
+        if async_alloc is False and release_threshold is not None:
+            raise ValueError("`rmm_release_threshold` requires `rmm_async`.")
+
+        self.initial_pool_size = initial_pool_size
+        self.maximum_pool_size = maximum_pool_size
+        self.managed_memory = managed_memory
+        self.async_alloc = async_alloc
+        self.release_threshold = release_threshold
+        self.logging = log_directory is not None
+        self.log_directory = log_directory
+        self.rmm_track_allocations = track_allocations
+
+    def setup(self, worker=None):
+        if self.initial_pool_size is not None:
+            self.initial_pool_size = parse_device_memory_limit(
+                self.initial_pool_size, alignment_size=256
+            )
+
+        if self.async_alloc:
+            import rmm
+
+            if self.release_threshold is not None:
+                self.release_threshold = parse_device_memory_limit(
+                    self.release_threshold, alignment_size=256
+                )
+
+            mr = rmm.mr.CudaAsyncMemoryResource(
+                initial_pool_size=self.initial_pool_size,
+                release_threshold=self.release_threshold,
+            )
+
+            if self.maximum_pool_size is not None:
+                self.maximum_pool_size = parse_device_memory_limit(
+                    self.maximum_pool_size, alignment_size=256
+                )
+                mr = rmm.mr.LimitingResourceAdaptor(
+                    mr, allocation_limit=self.maximum_pool_size
+                )
+
+            rmm.mr.set_current_device_resource(mr)
+            if self.logging:
+                rmm.enable_logging(
+                    log_file_name=get_rmm_log_file_name(
+                        worker, self.logging, self.log_directory
+                    )
+                )
+        elif self.initial_pool_size is not None or self.managed_memory:
+            import rmm
+
+            pool_allocator = False if self.initial_pool_size is None else True
+
+            if self.initial_pool_size is not None:
+                if self.maximum_pool_size is not None:
+                    self.maximum_pool_size = parse_device_memory_limit(
+                        self.maximum_pool_size, alignment_size=256
+                    )
+
+            rmm.reinitialize(
+                pool_allocator=pool_allocator,
+                managed_memory=self.managed_memory,
+                initial_pool_size=self.initial_pool_size,
+                maximum_pool_size=self.maximum_pool_size,
+                logging=self.logging,
+                log_file_name=get_rmm_log_file_name(
+                    worker, self.logging, self.log_directory
+                ),
+            )
+        if self.rmm_track_allocations:
+            import rmm
+
+            mr = rmm.mr.get_current_device_resource()
+            rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
+
+
+class PreImport(WorkerPlugin):
+    def __init__(self, libraries):
+        if libraries is None:
+            libraries = []
+        elif isinstance(libraries, str):
+            libraries = libraries.split(",")
+        self.libraries = libraries
+
+    def setup(self, worker=None):
+        for l in self.libraries:
+            importlib.import_module(l)
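These classes previously lived in `dask_cuda.utils`; as public `WorkerPlugin`s they can also be registered manually, though dask-cuda wires them up itself via `CUDAWorker`/`LocalCUDACluster`. A sketch (the scheduler address and library list are illustrative):

```python
from distributed import Client

from dask_cuda.plugins import PreImport

client = Client("tcp://127.0.0.1:8786")  # hypothetical scheduler address
# Import heavy GPU libraries once per worker before any task runs.
client.register_worker_plugin(PreImport("cupy,cudf"))
```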
dask_cuda/tests/test_dask_cuda_worker.py CHANGED

@@ -40,7 +40,7 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
             str(nthreads),
             "--no-dashboard",
             "--worker-class",
-            "dask_cuda.utils.MockWorker",
+            "dask_cuda.utils_test.MockWorker",
         ]
     ):
         with Client("127.0.0.1:9359", loop=loop) as client:
@@ -329,7 +329,7 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa:
             str(nthreads),
             "--no-dashboard",
             "--worker-class",
-            "dask_cuda.utils.MockWorker",
+            "dask_cuda.utils_test.MockWorker",
         ]
     ):
         with Client("127.0.0.1:9359", loop=loop) as client:
@@ -364,7 +364,7 @@ def test_cuda_visible_devices_uuid(loop):  # noqa: F811
             "127.0.0.1",
             "--no-dashboard",
             "--worker-class",
-            "dask_cuda.utils.MockWorker",
+            "dask_cuda.utils_test.MockWorker",
         ]
     ):
         with Client("127.0.0.1:9359", loop=loop) as client:
dask_cuda/tests/test_dgx.py CHANGED

@@ -73,10 +73,13 @@ def test_default():
     assert not p.exitcode


-def _test_tcp_over_ucx():
-    ucp = pytest.importorskip("ucp")
+def _test_tcp_over_ucx(protocol):
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")

-    with LocalCUDACluster(enable_tcp_over_ucx=True) as cluster:
+    with LocalCUDACluster(protocol=protocol, enable_tcp_over_ucx=True) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
             res = res.sum().compute()
@@ -93,10 +96,17 @@ def _test_tcp_over_ucx():
     assert all(client.run(check_ucx_options).values())


-def test_tcp_over_ucx():
-    ucp = pytest.importorskip("ucp")
+@pytest.mark.parametrize(
+    "protocol",
+    ["ucx", "ucxx"],
+)
+def test_tcp_over_ucx(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")

-    p = mp.Process(target=_test_tcp_over_ucx)
+    p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,))
     p.start()
     p.join()
     assert not p.exitcode
@@ -117,9 +127,22 @@ def test_tcp_only():
     assert not p.exitcode


-def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm):
+def _test_ucx_infiniband_nvlink(
+    skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm
+):
     cupy = pytest.importorskip("cupy")
-    ucp = pytest.importorskip("ucp")
+    if protocol == "ucx":
+        ucp = pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        ucp = pytest.importorskip("ucxx")
+
+    if enable_infiniband and not any(
+        [at.startswith("rc") for at in ucp.get_active_transports()]
+    ):
+        skip_queue.put("No support available for 'rc' transport in UCX")
+        return
+    else:
+        skip_queue.put("ok")

     if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None:
         enable_tcp_over_ucx = None
@@ -135,6 +158,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm)
         cm_tls_priority = ["tcp"]

     initialize(
+        protocol=protocol,
         enable_tcp_over_ucx=enable_tcp_over_ucx,
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
@@ -142,6 +166,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm)
     )

     with LocalCUDACluster(
+        protocol=protocol,
         interface="ib0",
         enable_tcp_over_ucx=enable_tcp_over_ucx,
         enable_infiniband=enable_infiniband,
@@ -171,6 +196,7 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm)
     assert all(client.run(check_ucx_options).values())


+@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
 @pytest.mark.parametrize(
     "params",
     [
@@ -185,16 +211,19 @@ def _test_ucx_infiniband_nvlink(enable_infiniband, enable_nvlink, enable_rdmacm)
     _get_dgx_version() == DGXVersion.DGX_A100,
     reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(),
 )
-def test_ucx_infiniband_nvlink(params):
-    ucp = pytest.importorskip("ucp")
+def test_ucx_infiniband_nvlink(protocol, params):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")

-    if params["enable_infiniband"]:
-        if not any([at.startswith("rc") for at in ucp.get_active_transports()]):
-            pytest.skip("No support available for 'rc' transport in UCX")
+    skip_queue = mp.Queue()

     p = mp.Process(
         target=_test_ucx_infiniband_nvlink,
         args=(
+            skip_queue,
+            protocol,
             params["enable_infiniband"],
             params["enable_nvlink"],
             params["enable_rdmacm"],
@@ -203,9 +232,8 @@ def test_ucx_infiniband_nvlink(params):
     p.start()
     p.join()

-
-
-
-    ucp.reset()
+    skip_msg = skip_queue.get()
+    if skip_msg != "ok":
+        pytest.skip(skip_msg)

     assert not p.exitcode
dask_cuda/tests/test_explicit_comms.py CHANGED

@@ -17,7 +17,7 @@ from distributed.deploy.local import LocalCluster
 import dask_cuda
 from dask_cuda.explicit_comms import comms
 from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle
-from dask_cuda.local_cuda_cluster import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny

 mp = mp.get_context("spawn")  # type: ignore
 ucp = pytest.importorskip("ucp")
@@ -44,7 +44,7 @@ def _test_local_cluster(protocol):
     assert sum(c.run(my_rank, 0)) == sum(range(4))


-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 def test_local_cluster(protocol):
     p = mp.Process(target=_test_local_cluster, args=(protocol,))
     p.start()
@@ -160,7 +160,7 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):

 @pytest.mark.parametrize("nworkers", [1, 2, 3])
 @pytest.mark.parametrize("backend", ["pandas", "cudf"])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 @pytest.mark.parametrize("_partitions", [True, False])
 def test_dataframe_shuffle(backend, protocol, nworkers, _partitions):
     if backend == "cudf":
@@ -256,7 +256,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers):

 @pytest.mark.parametrize("nworkers", [1, 2, 4])
 @pytest.mark.parametrize("backend", ["pandas", "cudf"])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 def test_dataframe_shuffle_merge(backend, protocol, nworkers):
     if backend == "cudf":
         pytest.importorskip("cudf")
@@ -293,7 +293,7 @@ def _test_jit_unspill(protocol):
     assert_eq(got, expected)


-@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
 def test_jit_unspill(protocol):
     pytest.importorskip("cudf")
dask_cuda/tests/test_from_array.py CHANGED

@@ -5,12 +5,16 @@ from distributed import Client

 from dask_cuda import LocalCUDACluster

-pytest.importorskip("ucp")
 cupy = pytest.importorskip("cupy")


-@pytest.mark.parametrize("protocol", ["ucx", "tcp"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "tcp"])
 def test_ucx_from_array(protocol):
+    if protocol == "ucx":
+        pytest.importorskip("ucp")
+    elif protocol == "ucxx":
+        pytest.importorskip("ucxx")
+
     N = 10_000
     with LocalCUDACluster(protocol=protocol) as cluster:
         with Client(cluster):