dask-cuda 25.8.0__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +1 -1
- dask_cuda/benchmarks/local_cudf_merge.py +1 -1
- dask_cuda/benchmarks/local_cudf_shuffle.py +1 -1
- dask_cuda/benchmarks/local_cupy.py +1 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +1 -1
- dask_cuda/benchmarks/utils.py +1 -1
- dask_cuda/cuda_worker.py +1 -1
- dask_cuda/get_device_memory_objects.py +1 -4
- dask_cuda/initialize.py +140 -121
- dask_cuda/local_cuda_cluster.py +10 -25
- dask_cuda/tests/test_cudf_builtin_spilling.py +3 -1
- dask_cuda/tests/test_dask_setup.py +193 -0
- dask_cuda/tests/test_dgx.py +16 -32
- dask_cuda/tests/test_explicit_comms.py +11 -10
- dask_cuda/tests/test_from_array.py +1 -5
- dask_cuda/tests/test_initialize.py +230 -41
- dask_cuda/tests/test_local_cuda_cluster.py +16 -62
- dask_cuda/tests/test_proxify_host_file.py +9 -4
- dask_cuda/tests/test_proxy.py +8 -8
- dask_cuda/tests/test_spill.py +3 -3
- dask_cuda/tests/test_utils.py +8 -23
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +12 -66
- dask_cuda/utils_test.py +0 -13
- dask_cuda/worker_spec.py +7 -9
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/METADATA +11 -4
- dask_cuda-25.10.0.dist-info/RECORD +63 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +1 -1
- dask_cuda/tests/test_rdd_ucx.py +0 -160
- dask_cuda-25.8.0.dist-info/RECORD +0 -63
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/licenses/LICENSE +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/top_level.txt +0 -0
dask_cuda/GIT_COMMIT
CHANGED

```diff
@@ -1 +1 @@
-…
+472ca1ce6d1fe836104a5a4f10b284ca9a828ea9
```
dask_cuda/VERSION
CHANGED

```diff
@@ -1 +1 @@
-25.08.00
+25.10.00
```
dask_cuda/benchmarks/local_cudf_groupby.py
CHANGED

```diff
@@ -141,7 +141,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cudf_merge.py
CHANGED

```diff
@@ -227,7 +227,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Frac-match", value=f"{args.frac_match}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cudf_shuffle.py
CHANGED

```diff
@@ -152,7 +152,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cupy.py
CHANGED

```diff
@@ -195,7 +195,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cupy_map_overlap.py
CHANGED

```diff
@@ -80,7 +80,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
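All five benchmark scripts receive the same one-line fix: with the legacy "ucx-old" (UCX-Py) protocol removed, UCX transport flags are reported only for the two remaining UCX protocols. A minimal sketch of the gating, using a hypothetical `args` namespace in place of the benchmarks' parsed CLI options:

```python
from types import SimpleNamespace

# Hypothetical stand-in for the parsed benchmark CLI options.
args = SimpleNamespace(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_infiniband=False,
    enable_nvlink=True,
)

# As of 25.10 only "ucx" and "ucxx" pass the check; "ucx-old" no longer does.
if args.protocol in ["ucx", "ucxx"]:
    print(f"TCP over UCX: {args.enable_tcp_over_ucx}")
    print(f"InfiniBand:   {args.enable_infiniband}")
    print(f"NVLink:       {args.enable_nvlink}")
```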
dask_cuda/benchmarks/utils.py
CHANGED
dask_cuda/cuda_worker.py
CHANGED

```diff
@@ -210,7 +210,7 @@ class CUDAWorker(Server):
                 name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                 local_directory=local_directory,
                 config={
-                    "distributed.comm.ucx": get_ucx_config(
+                    "distributed-ucxx": get_ucx_config(
                         enable_tcp_over_ucx=enable_tcp_over_ucx,
                         enable_infiniband=enable_infiniband,
                         enable_nvlink=enable_nvlink,
```
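The per-worker configuration namespace moves from Distributed's built-in UCX comm (`distributed.comm.ucx` in 25.8) to the distributed-ucxx plugin. A sketch of what the change amounts to, assuming `get_ucx_config` returns a plain mapping of transport settings (the keys below are illustrative, not the exact dask-cuda internals):

```python
import dask

# Illustrative transport settings; in dask-cuda these come from
# dask_cuda.utils.get_ucx_config(enable_tcp_over_ucx=..., ...).
ucx_config = {"tcp": True, "infiniband": False, "nvlink": False}

# 25.10 writes the settings under the "distributed-ucxx" namespace, where the
# distributed-ucxx comm backend reads them, instead of "distributed.comm.ucx".
with dask.config.set({"distributed-ucxx": ucx_config}):
    assert dask.config.get("distributed-ucxx")["tcp"] is True
```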
dask_cuda/get_device_memory_objects.py
CHANGED

```diff
@@ -119,11 +119,8 @@ def get_device_memory_objects_register_cudf():
         return []
 
     @dispatch.register(cudf.core.index.Index)
-    def get_device_memory_objects_cudf_index(obj):
-        return dispatch(obj._values)
-
     @dispatch.register(cudf.core.multiindex.MultiIndex)
-    def get_device_memory_objects_cudf_multiindex(obj):
+    def get_device_memory_objects_cudf_index(obj):
         return dispatch(obj._columns)
 
     @dispatch.register(cudf.core.column.ColumnBase)
```
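The refactor stacks the `MultiIndex` registration on top of the `Index` handler, so both cudf types now share one code path that walks `obj._columns`. A minimal sketch of the decorator-stacking pattern, assuming dask-cuda's `dispatch` is a `dask.utils.Dispatch` instance (which its `@dispatch.register` usage suggests), with toy classes in place of the cudf types:

```python
from dask.utils import Dispatch

dispatch = Dispatch(name="get_device_memory_objects")

# Toy stand-ins for cudf.core.index.Index and cudf.core.multiindex.MultiIndex.
class Index: ...
class MultiIndex: ...

# Dispatch.register returns the function unchanged when used as a decorator,
# so two registrations can stack on a single handler.
@dispatch.register(Index)
@dispatch.register(MultiIndex)
def get_device_memory_objects_index(obj):
    return f"walking columns of {type(obj).__name__}"

print(dispatch(Index()))       # walking columns of Index
print(dispatch(MultiIndex()))  # walking columns of MultiIndex
```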
dask_cuda/initialize.py
CHANGED

```diff
@@ -5,126 +5,177 @@ import logging
 import os
 
 import click
-import numba.cuda
+import cuda.core.experimental
 
 import dask
-from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context
+from distributed.diagnostics.nvml import (
+    CudaDeviceInfo,
+    get_device_index_and_uuid,
+    has_cuda_context,
+)
 
-from .utils import _get_active_ucx_implementation_name, get_ucx_config
+from .utils import get_ucx_config
 
 logger = logging.getLogger(__name__)
 
 
+pre_existing_cuda_context = None
+cuda_context_created = None
+
+
+_warning_suffix = (
+    "This is often the result of a CUDA-enabled library calling a CUDA runtime "
+    "function before Dask-CUDA can spawn worker processes. Please make sure any such "
+    "function calls don't happen at import time or in the global scope of a program."
+)
+
+
+def _get_device_and_uuid_str(device_info: CudaDeviceInfo) -> str:
+    return f"{device_info.device_index} ({str(device_info.uuid)})"
+
+
+def _warn_existing_cuda_context(device_info: CudaDeviceInfo, pid: int) -> None:
+    device_uuid_str = _get_device_and_uuid_str(device_info)
+    logger.warning(
+        f"A CUDA context for device {device_uuid_str} already exists "
+        f"on process ID {pid}. {_warning_suffix}"
+    )
+
+
+def _warn_cuda_context_wrong_device(
+    device_info_expected: CudaDeviceInfo, device_info_actual: CudaDeviceInfo, pid: int
+) -> None:
+    expected_device_uuid_str = _get_device_and_uuid_str(device_info_expected)
+    actual_device_uuid_str = _get_device_and_uuid_str(device_info_actual)
+    logger.warning(
+        f"Worker with process ID {pid} should have a CUDA context assigned to device "
+        f"{expected_device_uuid_str}, but instead the CUDA context is on device "
+        f"{actual_device_uuid_str}. {_warning_suffix}"
+    )
+
+
+def _mock_test_device() -> bool:
+    """Check whether running tests in a single-GPU environment.
+
+    Returns
+    -------
+    Whether running tests in a single-GPU environment, determined by checking whether
+    `DASK_CUDA_TEST_SINGLE_GPU` environment variable is set to a value different than
+    `"0"`.
+    """
+    return int(os.environ.get("DASK_CUDA_TEST_SINGLE_GPU", "0")) != 0
+
+
+def _get_device_str() -> str:
+    """Get the device string.
+
+    Get a string with the first device (first element before the comma), which may be
+    an index or a UUID.
+
+    Always returns "0" when running tests in a single-GPU environment, determined by
+    the result returned by `_mock_test_device()`.
+
+    Returns
+    -------
+    The device string.
+    """
+    if _mock_test_device():
+        return "0"
+    else:
+        return os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+
+
 def _create_cuda_context_handler():
-    if int(os.environ.get("DASK_CUDA_TEST_SINGLE_GPU", "0")) != 0:
+    """Create a CUDA context on the current device.
+
+    A CUDA context is created on the current device if one does not exist yet, and not
+    running tests on a single-GPU environment, determined by the result returned by
+    `_mock_test_device()`.
+
+    Returns
+    -------
+    The device string.
+    """
+    if _mock_test_device():
         try:
-            numba.cuda.current_context()
-        except numba.cuda.cudadrv.error.CudaSupportError:
+            cuda.core.experimental.Device().set_current()
+        except Exception:
             pass
     else:
-        numba.cuda.current_context()
+        cuda.core.experimental.Device().set_current()
 
 
-def _initialize_ucx():
-
-    # TODO: update when UCX-Py is removed, see
-    # https://github.com/rapidsai/dask-cuda/issues/1517
-    import distributed.comm.ucx
+def _create_cuda_context_and_warn():
+    """Create CUDA context and warn depending on certain conditions.
 
-
-
-    cuda_visible_device = get_device_index_and_uuid(
-        os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
-    )
-    ctx = has_cuda_context()
-    if (
-        ctx.has_context
-        and not distributed.comm.ucx.cuda_context_created.has_context
-    ):
-        distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
-
-    _create_cuda_context_handler()
-
-    if not distributed.comm.ucx.cuda_context_created.has_context:
-        ctx = has_cuda_context()
-        if ctx.has_context and ctx.device_info != cuda_visible_device:
-            distributed.comm.ucx._warn_cuda_context_wrong_device(
-                cuda_visible_device, ctx.device_info, os.getpid()
-            )
+    Warns if a pre-existing CUDA context already existed or if the resulting CUDA
+    context was created in the wrong device.
 
-
-
+    This function is almost an identical duplicate from
+    `distributed_ucxx.ucxx.init_once`, the duplication is necessary because Dask-CUDA
+    needs to support `protocol="tcp"` as well, even when distributed-ucxx is not
+    installed, but this here runs _after_ comms have started, which is fine for TCP
+    because the time when CUDA context is created is not important. The code needs to
+    live also in distributed-ucxx because there the time when a CUDA context is created
+    matters, and it needs to happen _before_ UCX is initialized, but comms in
+    Distributed is initialized before preload, and thus only after this function
+    executes.
 
+    Raises
+    ------
+    Exception
+        If anything wrong happened during context initialization.
 
-…
+    Returns
+    -------
+    None
+    """
+    global pre_existing_cuda_context, cuda_context_created
+
+    cuda_visible_device = get_device_index_and_uuid(_get_device_str())
+    pre_existing_cuda_context = has_cuda_context()
+    if pre_existing_cuda_context.has_context:
+        _warn_existing_cuda_context(pre_existing_cuda_context.device_info, os.getpid())
+
+    _create_cuda_context_handler()
+
+    cuda_context_created = has_cuda_context()
+    if (
+        cuda_context_created.has_context
+        and cuda_context_created.device_info.uuid != cuda_visible_device.uuid
+    ):
+        _warn_cuda_context_wrong_device(
+            cuda_visible_device, cuda_context_created.device_info, os.getpid()
+        )
 
 
-def _initialize_ucxx():
+def _create_cuda_context():
     try:
         # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
         # context directly from the UCX module, thus avoiding a similar warning there.
         import distributed_ucxx.ucxx
+    except ImportError:
+        pass
+    else:
+        if distributed_ucxx.ucxx.ucxx is not None:
+            # UCXX has already initialized (and warned if necessary)
+            return
 
-
-
-    cuda_visible_device = get_device_index_and_uuid(
-        os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
-    )
-    ctx = has_cuda_context()
-    if (
-        ctx.has_context
-        and not distributed_ucxx.ucxx.cuda_context_created.has_context
-    ):
-        distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())
-
-    _create_cuda_context_handler()
-
-    if not distributed_ucxx.ucxx.cuda_context_created.has_context:
-        ctx = has_cuda_context()
-        if ctx.has_context and ctx.device_info != cuda_visible_device:
-            distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
-                cuda_visible_device, ctx.device_info, os.getpid()
-            )
-
+    try:
+        _create_cuda_context_and_warn()
     except Exception:
         logger.error("Unable to start CUDA Context", exc_info=True)
 
 
-def _create_cuda_context(protocol="ucx"):
-    if protocol not in ["ucx", "ucxx", "ucx-old"]:
-        return
-
-    try:
-        ucx_implementation = _get_active_ucx_implementation_name(protocol)
-    except ValueError:
-        # Not a UCX protocol, just raise CUDA context warnings if needed.
-        _warn_generic()
-    else:
-        if ucx_implementation == "ucxx":
-            _initialize_ucxx()
-        else:
-            _initialize_ucx()
-        _warn_generic()
-
-
 def initialize(
     create_cuda_context=True,
     enable_tcp_over_ucx=None,
     enable_infiniband=None,
     enable_nvlink=None,
     enable_rdmacm=None,
-    protocol="ucx",
 ):
-    """Create CUDA context and initialize UCX configuration.
+    """Create CUDA context and initialize UCXX configuration.
 
     Sometimes it is convenient to initialize the CUDA context, particularly before
     starting up Dask worker processes which create a variety of threads.
```
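One functional change in this hunk stands out: `_create_cuda_context_handler` now creates the context through the `cuda.core.experimental` API rather than Numba. A minimal sketch of that primitive in isolation, assuming the `cuda-core` package and at least one visible GPU (the `device_id` print is purely illustrative):

```python
import cuda.core.experimental

# Device() resolves the currently selected CUDA device (device 0 unless
# CUDA_VISIBLE_DEVICES narrows the set), and set_current() creates or
# activates a CUDA context on it — the side effect dask-cuda wants to
# happen early in each worker process.
dev = cuda.core.experimental.Device()
dev.set_current()
print(dev.device_id)
```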
```diff
@@ -173,12 +224,11 @@ def initialize(
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
         enable_rdmacm=enable_rdmacm,
-        protocol=protocol,
     )
-    dask.config.set({"distributed.comm.ucx": ucx_config})
+    dask.config.set({"distributed-ucxx": ucx_config})
 
     if create_cuda_context:
-        _create_cuda_context(protocol)
+        _create_cuda_context()
 
 
 @click.command()
```
```diff
@@ -187,40 +237,9 @@ def initialize(
     default=False,
     help="Create CUDA context",
 )
-@click.option(
-    "--protocol",
-    default=None,
-    type=str,
-    help="Communication protocol, such as: 'tcp', 'tls', 'ucx' or 'ucxx'.",
-)
-@click.option(
-    "--enable-tcp-over-ucx/--disable-tcp-over-ucx",
-    default=False,
-    help="Enable TCP communication over UCX",
-)
-@click.option(
-    "--enable-infiniband/--disable-infiniband",
-    default=False,
-    help="Enable InfiniBand communication",
-)
-@click.option(
-    "--enable-nvlink/--disable-nvlink",
-    default=False,
-    help="Enable NVLink communication",
-)
-@click.option(
-    "--enable-rdmacm/--disable-rdmacm",
-    default=False,
-    help="Enable RDMA connection manager, currently requires InfiniBand enabled.",
-)
 def dask_setup(
-    service,
+    worker,
     create_cuda_context,
-    protocol,
-    enable_tcp_over_ucx,
-    enable_infiniband,
-    enable_nvlink,
-    enable_rdmacm,
 ):
     if create_cuda_context:
-        _create_cuda_context(protocol)
+        _create_cuda_context()
```
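With `--protocol` and the UCX transport flags gone from the preload CLI, `dask_setup` shrinks to the single `--create-cuda-context` switch, and transport configuration flows only through the `initialize()` Python API. A sketch of client-side usage against the new signature (the scheduler address is hypothetical):

```python
from dask.distributed import Client
from dask_cuda.initialize import initialize

# Set UCX transport flags once in the client process; initialize() stores
# them under the distributed-ucxx config namespace and, by default, creates
# a CUDA context in this process as well.
initialize(
    create_cuda_context=True,
    enable_tcp_over_ucx=True,
    enable_infiniband=False,
    enable_nvlink=False,
)

client = Client("ucx://scheduler-host:8786")  # hypothetical address
```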
dask_cuda/local_cuda_cluster.py
CHANGED

```diff
@@ -47,8 +47,8 @@ class LocalCUDACluster(LocalCluster):
     respect this hardware as much as possible.
 
     Each worker process is automatically assigned the correct CPU cores and network
-    interface cards to maximize performance. If UCX and UCX-Py are available, InfiniBand
-    and NVLink connections can be used to optimize data transfer performance.
+    interface cards to maximize performance. If UCX and distributed-ucxx are available,
+    InfiniBand and NVLink connections can be used to optimize data transfer performance.
 
     Parameters
     ----------
@@ -105,16 +105,13 @@ class LocalCUDACluster(LocalCluster):
         are not supported or disabled.
     enable_infiniband : bool, default None
         Set environment variables to enable UCX over InfiniBand, requires
-        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
-        ``enable_tcp_over_ucx=True`` when ``True``.
+        ``protocol="ucx"``, and implies ``enable_tcp_over_ucx=True`` when ``True``.
     enable_nvlink : bool, default None
         Set environment variables to enable UCX over NVLink, requires
-        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
-        ``enable_tcp_over_ucx=True`` when ``True``.
+        ``protocol="ucx"``, and implies ``enable_tcp_over_ucx=True`` when ``True``.
     enable_rdmacm : bool, default None
         Set environment variables to enable UCX RDMA connection manager support,
-        requires ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``,
-        and ``enable_infiniband=True``.
+        requires ``protocol="ucx"``, and ``enable_infiniband=True``.
     rmm_pool_size : int, str or None, default None
         RMM pool size to initialize each worker with. Can be an integer (bytes), float
         (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or
@@ -208,8 +205,7 @@ class LocalCUDACluster(LocalCluster):
     Raises
     ------
     TypeError
-        If InfiniBand or NVLink are enabled and
-        ``protocol not in ("ucx", "ucxx", "ucx-old")``.
+        If InfiniBand or NVLink are enabled and ``protocol != "ucx"``.
     ValueError
         If RMM pool, RMM managed memory or RMM async allocator are requested but RMM
         cannot be imported.
@@ -355,20 +351,9 @@ class LocalCUDACluster(LocalCluster):
 
         if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
             if protocol is None:
-                ucx_protocol = …(
-                    …
-                )
-                if ucx_protocol is not None:
-                    # TODO: remove when UCX-Py is removed,
-                    # see https://github.com/rapidsai/dask-cuda/issues/1517
-                    protocol = ucx_protocol
-                else:
-                    protocol = "ucx"
-            elif protocol not in ("ucx", "ucxx", "ucx-old"):
-                raise TypeError(
-                    "Enabling InfiniBand or NVLink requires protocol='ucx', "
-                    "protocol='ucxx' or protocol='ucx-old'"
-                )
+                protocol = "ucx"
+            if protocol not in ("ucx", "ucxx"):
+                raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")
 
         self.host = kwargs.get("host", None)
 
@@ -420,7 +405,7 @@ class LocalCUDACluster(LocalCluster):
         ) + ["dask_cuda.initialize"]
         self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
             "preload_argv", []
-        ) + ["--create-cuda-context", "--protocol", protocol]
+        ) + ["--create-cuda-context"]
 
         self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
         self.scale(n_workers)
```
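For cluster users the visible effect is in argument validation: requesting a UCX transport without an explicit protocol now silently selects `"ucx"`, and anything other than `"ucx"`/`"ucxx"` raises `TypeError`. A short sketch of the common single-node setup:

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# protocol could be omitted here: enable_nvlink/enable_tcp_over_ucx with
# protocol=None now defaults to "ucx"; passing "ucx-old" raises TypeError.
cluster = LocalCUDACluster(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_nvlink=True,
)
client = Client(cluster)
```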
dask_cuda/tests/test_cudf_builtin_spilling.py
CHANGED

```diff
@@ -1,3 +1,5 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 
 from distributed.sizeof import safe_sizeof
@@ -6,6 +8,7 @@ from dask_cuda.device_host_file import DeviceHostFile
 from dask_cuda.is_spillable_object import is_spillable_object
 from dask_cuda.proxify_host_file import ProxifyHostFile
 
+cudf = pytest.importorskip("cudf")
 cupy = pytest.importorskip("cupy")
 pandas = pytest.importorskip("pandas")
 
@@ -14,7 +17,6 @@ pytest.importorskip(
     reason="Current version of cudf doesn't support built-in spilling",
 )
 
-import cudf  # noqa: E402
 from cudf.core.buffer.spill_manager import (  # noqa: E402
     SpillManager,
     get_global_manager,
```