dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (53) hide show
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/_compat.py +18 -0
  4. dask_cuda/benchmarks/common.py +4 -1
  5. dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
  6. dask_cuda/benchmarks/local_cudf_merge.py +5 -2
  7. dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
  8. dask_cuda/benchmarks/local_cupy.py +4 -1
  9. dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
  10. dask_cuda/benchmarks/utils.py +7 -4
  11. dask_cuda/cli.py +21 -15
  12. dask_cuda/cuda_worker.py +27 -57
  13. dask_cuda/device_host_file.py +31 -15
  14. dask_cuda/disk_io.py +7 -4
  15. dask_cuda/explicit_comms/comms.py +11 -7
  16. dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
  17. dask_cuda/get_device_memory_objects.py +18 -3
  18. dask_cuda/initialize.py +80 -44
  19. dask_cuda/is_device_object.py +4 -1
  20. dask_cuda/is_spillable_object.py +4 -1
  21. dask_cuda/local_cuda_cluster.py +63 -66
  22. dask_cuda/plugins.py +17 -16
  23. dask_cuda/proxify_device_objects.py +15 -10
  24. dask_cuda/proxify_host_file.py +30 -27
  25. dask_cuda/proxy_object.py +20 -17
  26. dask_cuda/tests/conftest.py +41 -0
  27. dask_cuda/tests/test_dask_cuda_worker.py +114 -27
  28. dask_cuda/tests/test_dgx.py +10 -18
  29. dask_cuda/tests/test_explicit_comms.py +51 -18
  30. dask_cuda/tests/test_from_array.py +7 -5
  31. dask_cuda/tests/test_initialize.py +16 -37
  32. dask_cuda/tests/test_local_cuda_cluster.py +164 -54
  33. dask_cuda/tests/test_proxify_host_file.py +33 -4
  34. dask_cuda/tests/test_proxy.py +18 -16
  35. dask_cuda/tests/test_rdd_ucx.py +160 -0
  36. dask_cuda/tests/test_spill.py +107 -27
  37. dask_cuda/tests/test_utils.py +106 -20
  38. dask_cuda/tests/test_worker_spec.py +5 -2
  39. dask_cuda/utils.py +319 -68
  40. dask_cuda/utils_test.py +23 -7
  41. dask_cuda/worker_common.py +196 -0
  42. dask_cuda/worker_spec.py +12 -5
  43. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
  44. dask_cuda-25.8.0.dist-info/RECORD +63 -0
  45. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
  46. dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
  47. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  48. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  49. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  50. dask_cuda-25.4.0.dist-info/RECORD +0 -56
  51. dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
  52. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
  53. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/proxy_object.py CHANGED
@@ -1,3 +1,6 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
1
4
  import copy as _copy
2
5
  import functools
3
6
  import operator
@@ -52,21 +55,21 @@ def asproxy(
52
55
  serializers: Optional[Iterable[str]] = None,
53
56
  subclass: Optional[Type["ProxyObject"]] = None,
54
57
  ) -> "ProxyObject":
55
- """Wrap `obj` in a ProxyObject object if it isn't already.
58
+ """Wrap ``obj`` in a ProxyObject object if it isn't already.
56
59
 
57
60
  Parameters
58
61
  ----------
59
62
  obj: object
60
63
  Object to wrap in a ProxyObject object.
61
64
  serializers: Iterable[str], optional
62
- Serializers to use to serialize `obj`. If None, no serialization is done.
65
+ Serializers to use to serialize ``obj``. If None, no serialization is done.
63
66
  subclass: class, optional
64
67
  Specify a subclass of ProxyObject to create instead of ProxyObject.
65
- `subclass` must be pickable.
68
+ ``subclass`` must be pickable.
66
69
 
67
70
  Returns
68
71
  -------
69
- The ProxyObject proxying `obj`
72
+ The ProxyObject proxying ``obj``
70
73
  """
71
74
  if isinstance(obj, ProxyObject): # Already a proxy object
72
75
  ret = obj
@@ -119,7 +122,7 @@ def unproxy(obj):
119
122
 
120
123
  Returns
121
124
  -------
122
- The proxied object or `obj` itself if it isn't a ProxyObject
125
+ The proxied object or ``obj`` itself if it isn't a ProxyObject
123
126
  """
124
127
  try:
125
128
  obj = obj._pxy_deserialize()
@@ -185,16 +188,16 @@ class ProxyDetail:
185
188
  Dictionary of attributes that are accessible without deserializing
186
189
  the proxied object.
187
190
  type_serialized: bytes
188
- Pickled type of `obj`.
191
+ Pickled type of ``obj``.
189
192
  typename: str
190
- Name of the type of `obj`.
193
+ Name of the type of ``obj``.
191
194
  is_cuda_object: boolean
192
- Whether `obj` is a CUDA object or not.
195
+ Whether ``obj`` is a CUDA object or not.
193
196
  subclass: bytes
194
197
  Pickled type to use instead of ProxyObject when deserializing. The type
195
198
  must inherit from ProxyObject.
196
199
  serializers: str, optional
197
- Serializers to use to serialize `obj`. If None, no serialization is done.
200
+ Serializers to use to serialize ``obj``. If None, no serialization is done.
198
201
  explicit_proxy: bool
199
202
  Mark the proxy object as "explicit", which means that the user allows it
200
203
  as input argument to dask tasks even in compatibility-mode.
@@ -258,7 +261,7 @@ class ProxyDetail:
258
261
  return self.serializer is not None
259
262
 
260
263
  def serialize(self, serializers: Iterable[str]) -> Tuple[dict, list]:
261
- """Inplace serialization of the proxied object using the `serializers`
264
+ """Inplace serialization of the proxied object using the ``serializers``
262
265
 
263
266
  Parameters
264
267
  ----------
@@ -333,7 +336,7 @@ class ProxyObject:
333
336
  ProxyObject has some limitations and doesn't mimic the proxied object perfectly.
334
337
  Thus, if encountering problems remember that it is always possible to use unproxy()
335
338
  to access the proxied object directly or disable JIT deserialization completely
336
- with `jit_unspill=False`.
339
+ with ``jit_unspill=False``.
337
340
 
338
341
  Type checking using instance() works as expected but direct type checking
339
342
  doesn't:
@@ -386,7 +389,7 @@ class ProxyObject:
386
389
  serializers: Iterable[str],
387
390
  proxy_detail: Optional[ProxyDetail] = None,
388
391
  ) -> None:
389
- """Inplace serialization of the proxied object using the `serializers`
392
+ """Inplace serialization of the proxied object using the ``serializers``
390
393
 
391
394
  Parameters
392
395
  ----------
@@ -787,8 +790,8 @@ class ProxyObject:
787
790
  def obj_pxy_is_device_object(obj: ProxyObject):
788
791
  """
789
792
  In order to avoid de-serializing the proxied object,
790
- we check `is_cuda_object` instead of the default
791
- `hasattr(o, "__cuda_array_interface__")` check.
793
+ we check ``is_cuda_object`` instead of the default
794
+ ``hasattr(o, "__cuda_array_interface__")`` check.
792
795
  """
793
796
  return obj._pxy_get().is_cuda_object
794
797
 
@@ -830,7 +833,7 @@ def obj_pxy_dask_serialize(obj: ProxyObject):
830
833
 
831
834
  As serializers, it uses "dask" or "pickle", which means that proxied CUDA objects
832
835
  are spilled to main memory before communicated. Deserialization is needed, unless
833
- obj is serialized to disk on a shared filesystem see `handle_disk_serialized()`.
836
+ obj is serialized to disk on a shared filesystem see ``handle_disk_serialized()``.
834
837
  """
835
838
  pxy = obj._pxy_get(copy=True)
836
839
  if pxy.serializer == "disk":
@@ -851,7 +854,7 @@ def obj_pxy_cuda_serialize(obj: ProxyObject):
851
854
 
852
855
  As serializers, it uses "cuda", which means that proxied CUDA objects are _not_
853
856
  spilled to main memory before communicated. However, we still have to handle disk
854
- serialized proxied like in `obj_pxy_dask_serialize()`
857
+ serialized proxied like in ``obj_pxy_dask_serialize()``
855
858
  """
856
859
  pxy = obj._pxy_get(copy=True)
857
860
  if pxy.serializer in ("dask", "pickle"):
@@ -897,7 +900,7 @@ def obj_pxy_dask_deserialize(header, frames):
897
900
 
898
901
 
899
902
  def unproxify_input_wrapper(func):
900
- """Unproxify the input of `func`"""
903
+ """Unproxify the input of ``func``"""
901
904
 
902
905
  @functools.wraps(func)
903
906
  def wrapper(*args, **kwargs):
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import pytest
5
+
6
+ from dask_cuda.utils import has_device_memory_resource
7
+
8
+
9
+ def pytest_configure(config):
10
+ """Register custom markers."""
11
+ config.addinivalue_line(
12
+ "markers",
13
+ "skip_if_no_device_memory: mark test to skip if device has no dedicated memory "
14
+ "resource",
15
+ )
16
+ config.addinivalue_line(
17
+ "markers",
18
+ "skip_if_device_memory: mark test to skip if device has dedicated memory "
19
+ "resource",
20
+ )
21
+
22
+
23
+ def pytest_collection_modifyitems(items):
24
+ """Handle skip_if_no_device_memory marker."""
25
+ for item in items:
26
+ if item.get_closest_marker("skip_if_no_device_memory"):
27
+ skip_marker = item.get_closest_marker("skip_if_no_device_memory")
28
+ reason = skip_marker.kwargs.get(
29
+ "reason", "Test requires device with dedicated memory resource"
30
+ )
31
+ item.add_marker(
32
+ pytest.mark.skipif(not has_device_memory_resource(), reason=reason)
33
+ )
34
+ if item.get_closest_marker("skip_if_device_memory"):
35
+ skip_marker = item.get_closest_marker("skip_if_device_memory")
36
+ reason = skip_marker.kwargs.get(
37
+ "reason", "Test requires device without dedicated memory resource"
38
+ )
39
+ item.add_marker(
40
+ pytest.mark.skipif(has_device_memory_resource(), reason=reason)
41
+ )
@@ -1,3 +1,6 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
1
4
  from __future__ import absolute_import, division, print_function
2
5
 
3
6
  import os
@@ -16,15 +19,18 @@ from dask_cuda.utils import (
16
19
  get_cluster_configuration,
17
20
  get_device_total_memory,
18
21
  get_gpu_count_mig,
19
- get_gpu_uuid_from_index,
22
+ get_gpu_uuid,
20
23
  get_n_gpus,
24
+ has_device_memory_resource,
21
25
  wait_workers,
22
26
  )
23
27
 
24
28
 
25
- @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,3,7,8"})
26
- def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811
27
- nthreads = 4
29
+ @patch.dict(
30
+ os.environ,
31
+ {"CUDA_VISIBLE_DEVICES": "0,3,7,8", "DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC": "1"},
32
+ )
33
+ def test_cuda_visible_devices(loop): # noqa: F811
28
34
  with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
29
35
  with popen(
30
36
  [
@@ -34,14 +40,10 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811
34
40
  "127.0.0.1:9359",
35
41
  "--host",
36
42
  "127.0.0.1",
37
- "--device-memory-limit",
38
- "1 MB",
39
- "--nthreads",
40
- str(nthreads),
41
43
  "--no-dashboard",
42
44
  "--worker-class",
43
45
  "dask_cuda.utils_test.MockWorker",
44
- ]
46
+ ],
45
47
  ):
46
48
  with Client("127.0.0.1:9359", loop=loop) as client:
47
49
  assert wait_workers(client, n_gpus=4)
@@ -55,12 +57,43 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811
55
57
  for v in result.values():
56
58
  del expected[v]
57
59
 
58
- workers = client.scheduler_info()["workers"]
60
+ assert len(expected) == 0
61
+
62
+
63
+ def test_memory_limit_and_nthreads(loop): # noqa: F811
64
+ nthreads = 4
65
+
66
+ device_memory_limit_args = []
67
+ if has_device_memory_resource():
68
+ device_memory_limit_args = ["--device-memory-limit", "1 MB"]
69
+
70
+ with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
71
+ with popen(
72
+ [
73
+ "dask",
74
+ "cuda",
75
+ "worker",
76
+ "127.0.0.1:9359",
77
+ "--host",
78
+ "127.0.0.1",
79
+ *device_memory_limit_args,
80
+ "--nthreads",
81
+ str(nthreads),
82
+ "--no-dashboard",
83
+ "--worker-class",
84
+ "dask_cuda.utils_test.MockWorker",
85
+ ],
86
+ ):
87
+ with Client("127.0.0.1:9359", loop=loop) as client:
88
+ assert wait_workers(client, n_gpus=get_n_gpus())
89
+
90
+ def get_visible_devices():
91
+ return os.environ["CUDA_VISIBLE_DEVICES"]
92
+
93
+ workers = client.scheduler_info(n_workers=-1)["workers"]
59
94
  for w in workers.values():
60
95
  assert w["memory_limit"] == MEMORY_LIMIT // len(workers)
61
96
 
62
- assert len(expected) == 0
63
-
64
97
 
65
98
  def test_rmm_pool(loop): # noqa: F811
66
99
  rmm = pytest.importorskip("rmm")
@@ -116,11 +149,6 @@ def test_rmm_managed(loop): # noqa: F811
116
149
  def test_rmm_async(loop): # noqa: F811
117
150
  rmm = pytest.importorskip("rmm")
118
151
 
119
- driver_version = rmm._cuda.gpu.driverGetVersion()
120
- runtime_version = rmm._cuda.gpu.runtimeGetVersion()
121
- if driver_version < 11020 or runtime_version < 11020:
122
- pytest.skip("cudaMallocAsync not supported")
123
-
124
152
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
125
153
  with popen(
126
154
  [
@@ -156,11 +184,6 @@ def test_rmm_async(loop): # noqa: F811
156
184
  def test_rmm_async_with_maximum_pool_size(loop): # noqa: F811
157
185
  rmm = pytest.importorskip("rmm")
158
186
 
159
- driver_version = rmm._cuda.gpu.driverGetVersion()
160
- runtime_version = rmm._cuda.gpu.runtimeGetVersion()
161
- if driver_version < 11020 or runtime_version < 11020:
162
- pytest.skip("cudaMallocAsync not supported")
163
-
164
187
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
165
188
  with popen(
166
189
  [
@@ -260,8 +283,12 @@ def test_cudf_spill_disabled(loop): # noqa: F811
260
283
  assert v == 0
261
284
 
262
285
 
286
+ @pytest.mark.skip_if_no_device_memory(
287
+ "Devices without dedicated memory resources cannot enable cuDF spill"
288
+ )
263
289
  def test_cudf_spill(loop): # noqa: F811
264
290
  cudf = pytest.importorskip("cudf")
291
+
265
292
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
266
293
  with popen(
267
294
  [
@@ -289,6 +316,24 @@ def test_cudf_spill(loop): # noqa: F811
289
316
  assert v == 2
290
317
 
291
318
 
319
+ @pytest.mark.skip_if_device_memory(
320
+ "Devices with dedicated memory resources cannot test error"
321
+ )
322
+ def test_cudf_spill_no_dedicated_memory_error():
323
+ pytest.importorskip("cudf")
324
+
325
+ ret = subprocess.run(
326
+ ["dask", "cuda", "worker", "127.0.0.1:9369", "--enable-cudf-spill"],
327
+ capture_output=True,
328
+ )
329
+
330
+ assert ret.returncode != 0
331
+ assert (
332
+ b"cuDF spilling is not supported on devices without dedicated memory"
333
+ in ret.stderr
334
+ )
335
+
336
+
292
337
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
293
338
  def test_dashboard_address(loop): # noqa: F811
294
339
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
@@ -409,7 +454,7 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop): # noqa:
409
454
 
410
455
 
411
456
  def test_cuda_visible_devices_uuid(loop): # noqa: F811
412
- gpu_uuid = get_gpu_uuid_from_index(0)
457
+ gpu_uuid = get_gpu_uuid(0)
413
458
 
414
459
  with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": gpu_uuid}):
415
460
  with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
@@ -469,6 +514,11 @@ def test_rmm_track_allocations(loop): # noqa: F811
469
514
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
470
515
  def test_get_cluster_configuration(loop): # noqa: F811
471
516
  pytest.importorskip("rmm")
517
+
518
+ device_memory_limit_args = []
519
+ if has_device_memory_resource():
520
+ device_memory_limit_args += ["--device-memory-limit", "30 B"]
521
+
472
522
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
473
523
  with popen(
474
524
  [
@@ -478,8 +528,7 @@ def test_get_cluster_configuration(loop): # noqa: F811
478
528
  "127.0.0.1:9369",
479
529
  "--host",
480
530
  "127.0.0.1",
481
- "--device-memory-limit",
482
- "30 B",
531
+ *device_memory_limit_args,
483
532
  "--rmm-pool-size",
484
533
  "2 GB",
485
534
  "--rmm-maximum-pool-size",
@@ -496,12 +545,17 @@ def test_get_cluster_configuration(loop): # noqa: F811
496
545
  assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
497
546
  assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
498
547
  assert ret["jit-unspill"] is False
499
- assert ret["device-memory-limit"] == 30
548
+ if has_device_memory_resource():
549
+ assert ret["device-memory-limit"] == 30
500
550
 
501
551
 
502
552
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
553
+ @pytest.mark.skip_if_no_device_memory(
554
+ "Devices without dedicated memory resources do not support fractional limits"
555
+ )
503
556
  def test_worker_fraction_limits(loop): # noqa: F811
504
557
  pytest.importorskip("rmm")
558
+
505
559
  with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
506
560
  with popen(
507
561
  [
@@ -542,6 +596,33 @@ def test_worker_fraction_limits(loop): # noqa: F811
542
596
  )
543
597
 
544
598
 
599
+ @pytest.mark.parametrize(
600
+ "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
601
+ )
602
+ @pytest.mark.skip_if_device_memory(
603
+ "Devices with dedicated memory resources cannot test error"
604
+ )
605
+ def test_worker_fraction_limits_no_dedicated_memory(argument):
606
+ if argument == "pool_size":
607
+ argument_list = ["--rmm-pool-size", "0.1"]
608
+ elif argument == "maximum_pool_size":
609
+ argument_list = ["--rmm-pool-size", "1 GiB", "--rmm-maximum-pool-size", "0.1"]
610
+ else:
611
+ argument_list = ["--rmm-async", "--rmm-release-threshold", "0.1"]
612
+
613
+ with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
614
+ ret = subprocess.run(
615
+ ["dask", "cuda", "worker", "127.0.0.1:9369", *argument_list],
616
+ capture_output=True,
617
+ )
618
+
619
+ assert ret.returncode != 0
620
+ assert (
621
+ b"Fractional of total device memory not supported in devices without a "
622
+ b"dedicated memory resource" in ret.stderr
623
+ )
624
+
625
+
545
626
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
546
627
  def test_worker_timeout():
547
628
  ret = subprocess.run(
@@ -592,6 +673,12 @@ def test_worker_cudf_spill_warning(enable_cudf_spill_warning): # noqa: F811
592
673
  capture_output=True,
593
674
  )
594
675
  if enable_cudf_spill_warning:
595
- assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
676
+ if has_device_memory_resource():
677
+ assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
678
+ else:
679
+ assert (
680
+ b"cuDF spilling is not supported on devices without dedicated "
681
+ b"memory" in ret.stderr
682
+ )
596
683
  else:
597
684
  assert b"UserWarning: cuDF spilling is enabled" not in ret.stderr
@@ -1,3 +1,6 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
1
4
  import multiprocessing as mp
2
5
  import os
3
6
  from enum import Enum, auto
@@ -10,6 +13,7 @@ from distributed import Client
10
13
 
11
14
  from dask_cuda import LocalCUDACluster
12
15
  from dask_cuda.initialize import initialize
16
+ from dask_cuda.utils_test import get_ucx_implementation
13
17
 
14
18
  mp = mp.get_context("spawn") # type: ignore
15
19
  psutil = pytest.importorskip("psutil")
@@ -78,10 +82,7 @@ def test_default():
78
82
 
79
83
 
80
84
  def _test_tcp_over_ucx(protocol):
81
- if protocol == "ucx":
82
- ucp = pytest.importorskip("ucp")
83
- elif protocol == "ucxx":
84
- ucp = pytest.importorskip("ucxx")
85
+ ucp = get_ucx_implementation(protocol)
85
86
 
86
87
  with LocalCUDACluster(protocol=protocol, enable_tcp_over_ucx=True) as cluster:
87
88
  with Client(cluster) as client:
@@ -102,13 +103,10 @@ def _test_tcp_over_ucx(protocol):
102
103
 
103
104
  @pytest.mark.parametrize(
104
105
  "protocol",
105
- ["ucx", "ucxx"],
106
+ ["ucx", "ucx-old"],
106
107
  )
107
108
  def test_tcp_over_ucx(protocol):
108
- if protocol == "ucx":
109
- ucp = pytest.importorskip("ucp")
110
- elif protocol == "ucxx":
111
- ucp = pytest.importorskip("ucxx")
109
+ ucp = get_ucx_implementation(protocol)
112
110
  if _is_ucx_116(ucp):
113
111
  pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
114
112
 
@@ -137,10 +135,7 @@ def _test_ucx_infiniband_nvlink(
137
135
  skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm
138
136
  ):
139
137
  cupy = pytest.importorskip("cupy")
140
- if protocol == "ucx":
141
- ucp = pytest.importorskip("ucp")
142
- elif protocol == "ucxx":
143
- ucp = pytest.importorskip("ucxx")
138
+ ucp = get_ucx_implementation(protocol)
144
139
 
145
140
  if enable_infiniband and not any(
146
141
  [at.startswith("rc") for at in ucp.get_active_transports()]
@@ -206,7 +201,7 @@ def _test_ucx_infiniband_nvlink(
206
201
  assert all(client.run(check_ucx_options).values())
207
202
 
208
203
 
209
- @pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
204
+ @pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
210
205
  @pytest.mark.parametrize(
211
206
  "params",
212
207
  [
@@ -222,10 +217,7 @@ def _test_ucx_infiniband_nvlink(
222
217
  reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(),
223
218
  )
224
219
  def test_ucx_infiniband_nvlink(protocol, params):
225
- if protocol == "ucx":
226
- ucp = pytest.importorskip("ucp")
227
- elif protocol == "ucxx":
228
- ucp = pytest.importorskip("ucxx")
220
+ ucp = get_ucx_implementation(protocol)
229
221
  if _is_ucx_116(ucp) and params["enable_infiniband"] is False:
230
222
  pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
231
223
 
@@ -1,4 +1,5 @@
1
- # Copyright (c) 2021-2025 NVIDIA CORPORATION.
1
+ # SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
2
3
 
3
4
  import asyncio
4
5
  import multiprocessing as mp
@@ -21,18 +22,16 @@ from distributed.deploy.local import LocalCluster
21
22
 
22
23
  import dask_cuda
23
24
  from dask_cuda.explicit_comms import comms
24
- from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle
25
- from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
25
+ from dask_cuda.explicit_comms.dataframe.shuffle import (
26
+ _contains_shuffle_expr,
27
+ shuffle as explicit_comms_shuffle,
28
+ )
29
+ from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
26
30
 
27
31
  mp = mp.get_context("spawn") # type: ignore
28
32
  ucp = pytest.importorskip("ucp")
29
33
 
30
34
 
31
- # Set default shuffle method to "tasks"
32
- if dask.config.get("dataframe.shuffle.method", None) is None:
33
- dask.config.set({"dataframe.shuffle.method": "tasks"})
34
-
35
-
36
35
  # Notice, all of the following tests is executed in a new process such
37
36
  # that UCX options of the different tests doesn't conflict.
38
37
 
@@ -55,8 +54,10 @@ def _test_local_cluster(protocol):
55
54
  assert sum(c.run(my_rank, 0)) == sum(range(4))
56
55
 
57
56
 
58
- @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
57
+ @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
59
58
  def test_local_cluster(protocol):
59
+ if protocol.startswith("ucx"):
60
+ get_ucx_implementation(protocol)
60
61
  p = mp.Process(target=_test_local_cluster, args=(protocol,))
61
62
  p.start()
62
63
  p.join()
@@ -99,7 +100,7 @@ def test_dataframe_merge_empty_partitions():
99
100
 
100
101
 
101
102
  def check_partitions(df, npartitions):
102
- """Check that all values in `df` hashes to the same"""
103
+ """Check that all values in ``df`` hashes to the same"""
103
104
  dtypes = {}
104
105
  for col, dtype in df.dtypes.items():
105
106
  if pd.api.types.is_numeric_dtype(dtype):
@@ -201,11 +202,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
201
202
 
202
203
  @pytest.mark.parametrize("nworkers", [1, 2, 3])
203
204
  @pytest.mark.parametrize("backend", ["pandas", "cudf"])
204
- @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
205
+ @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
205
206
  @pytest.mark.parametrize("_partitions", [True, False])
206
207
  def test_dataframe_shuffle(backend, protocol, nworkers, _partitions):
207
208
  if backend == "cudf":
208
209
  pytest.importorskip("cudf")
210
+ if protocol.startswith("ucx"):
211
+ get_ucx_implementation(protocol)
209
212
 
210
213
  p = mp.Process(
211
214
  target=_test_dataframe_shuffle, args=(backend, protocol, nworkers, _partitions)
@@ -322,10 +325,12 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
322
325
 
323
326
  @pytest.mark.parametrize("nworkers", [1, 2, 4])
324
327
  @pytest.mark.parametrize("backend", ["pandas", "cudf"])
325
- @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
328
+ @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
326
329
  def test_dataframe_shuffle_merge(backend, protocol, nworkers):
327
330
  if backend == "cudf":
328
331
  pytest.importorskip("cudf")
332
+ if protocol.startswith("ucx"):
333
+ get_ucx_implementation(protocol)
329
334
  p = mp.Process(
330
335
  target=_test_dataframe_shuffle_merge, args=(backend, protocol, nworkers)
331
336
  )
@@ -359,9 +364,14 @@ def _test_jit_unspill(protocol):
359
364
  assert_eq(got, expected)
360
365
 
361
366
 
362
- @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
367
+ @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
368
+ @pytest.mark.skip_if_no_device_memory(
369
+ "JIT-Unspill not supported in devices without dedicated memory resource"
370
+ )
363
371
  def test_jit_unspill(protocol):
364
372
  pytest.importorskip("cudf")
373
+ if protocol.startswith("ucx"):
374
+ get_ucx_implementation(protocol)
365
375
 
366
376
  p = mp.Process(target=_test_jit_unspill, args=(protocol,))
367
377
  p.start()
@@ -386,7 +396,7 @@ def _test_lock_workers(scheduler_address, ranks):
386
396
 
387
397
  def test_lock_workers():
388
398
  """
389
- Testing `run(...,lock_workers=True)` by spawning 30 runs with overlapping
399
+ Testing ``run(...,lock_workers=True)`` by spawning 30 runs with overlapping
390
400
  and non-overlapping worker sets.
391
401
  """
392
402
  try:
@@ -425,7 +435,9 @@ def test_create_destroy_create():
425
435
  with LocalCluster(n_workers=1) as cluster:
426
436
  with Client(cluster) as client:
427
437
  context = comms.default_comms()
428
- scheduler_addresses_old = list(client.scheduler_info()["workers"].keys())
438
+ scheduler_addresses_old = list(
439
+ client.scheduler_info(n_workers=-1)["workers"].keys()
440
+ )
429
441
  comms_addresses_old = list(comms.default_comms().worker_addresses)
430
442
  assert comms.default_comms() is context
431
443
  assert len(comms._comms_cache) == 1
@@ -446,7 +458,9 @@ def test_create_destroy_create():
446
458
  # because we referenced the old cluster's addresses.
447
459
  with LocalCluster(n_workers=1) as cluster:
448
460
  with Client(cluster) as client:
449
- scheduler_addresses_new = list(client.scheduler_info()["workers"].keys())
461
+ scheduler_addresses_new = list(
462
+ client.scheduler_info(n_workers=-1)["workers"].keys()
463
+ )
450
464
  comms_addresses_new = list(comms.default_comms().worker_addresses)
451
465
 
452
466
  assert scheduler_addresses_new == comms_addresses_new
@@ -487,7 +501,8 @@ def test_scaled_cluster_gets_new_comms_context():
487
501
  "n_workers": 2,
488
502
  }
489
503
  expected_1 = {
490
- k: expected_values for k in client.scheduler_info()["workers"]
504
+ k: expected_values
505
+ for k in client.scheduler_info(n_workers=-1)["workers"]
491
506
  }
492
507
  assert result_1 == expected_1
493
508
 
@@ -515,7 +530,8 @@ def test_scaled_cluster_gets_new_comms_context():
515
530
  "n_workers": 3,
516
531
  }
517
532
  expected_2 = {
518
- k: expected_values for k in client.scheduler_info()["workers"]
533
+ k: expected_values
534
+ for k in client.scheduler_info(n_workers=-1)["workers"]
519
535
  }
520
536
  assert result_2 == expected_2
521
537
 
@@ -530,3 +546,20 @@ def test_scaled_cluster_gets_new_comms_context():
530
546
  expected = shuffled.compute()
531
547
 
532
548
  assert_eq(result, expected)
549
+
550
+
551
+ def test_contains_shuffle_expr():
552
+ df = dd.from_pandas(pd.DataFrame({"key": np.arange(10)}), npartitions=2)
553
+ assert not _contains_shuffle_expr(df)
554
+
555
+ with dask.config.set(explicit_comms=True):
556
+ shuffled = df.shuffle(on="key")
557
+
558
+ assert _contains_shuffle_expr(shuffled)
559
+ assert not _contains_shuffle_expr(df)
560
+
561
+ # this requires an active client.
562
+ with LocalCluster(n_workers=1) as cluster:
563
+ with Client(cluster):
564
+ explict_shuffled = explicit_comms_shuffle(df, ["key"])
565
+ assert not _contains_shuffle_expr(explict_shuffled)
@@ -1,19 +1,21 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
1
4
  import pytest
2
5
 
3
6
  import dask.array as da
4
7
  from distributed import Client
5
8
 
6
9
  from dask_cuda import LocalCUDACluster
10
+ from dask_cuda.utils_test import get_ucx_implementation
7
11
 
8
12
  cupy = pytest.importorskip("cupy")
9
13
 
10
14
 
11
- @pytest.mark.parametrize("protocol", ["ucx", "ucxx", "tcp"])
15
+ @pytest.mark.parametrize("protocol", ["ucx", "ucx-old", "tcp"])
12
16
  def test_ucx_from_array(protocol):
13
- if protocol == "ucx":
14
- pytest.importorskip("ucp")
15
- elif protocol == "ucxx":
16
- pytest.importorskip("ucxx")
17
+ if protocol.startswith("ucx"):
18
+ get_ucx_implementation(protocol)
17
19
 
18
20
  N = 10_000
19
21
  with LocalCUDACluster(protocol=protocol) as cluster: