dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/_compat.py +18 -0
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
- dask_cuda/get_device_memory_objects.py +18 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/is_device_object.py +4 -1
- dask_cuda/is_spillable_object.py +4 -1
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +15 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +114 -27
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +51 -18
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +164 -54
- dask_cuda/tests/test_proxify_host_file.py +33 -4
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +107 -27
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +319 -68
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.4.0.dist-info/RECORD +0 -56
- dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/tests/test_initialize.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
 import sys
 
@@ -11,7 +14,7 @@ from distributed.deploy.local import LocalCluster
 
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import get_ucx_config
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 mp = mp.get_context("spawn")  # type: ignore
 
@@ -22,10 +25,7 @@ mp = mp.get_context("spawn")  # type: ignore
 
 
 def _test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_tcp_over_ucx": True}
     initialize(protocol=protocol, **kwargs)
@@ -55,12 +55,9 @@ def _test_initialize_ucx_tcp(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,))
     p.start()
@@ -69,10 +66,7 @@ def test_initialize_ucx_tcp(protocol):
 
 
 def _test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_nvlink": True}
     initialize(protocol=protocol, **kwargs)
@@ -103,12 +97,9 @@ def _test_initialize_ucx_nvlink(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,))
     p.start()
@@ -117,10 +108,7 @@ def test_initialize_ucx_nvlink(protocol):
 
 
 def _test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_infiniband": True}
     initialize(protocol=protocol, **kwargs)
@@ -154,12 +142,9 @@ def _test_initialize_ucx_infiniband(protocol):
 @pytest.mark.skipif(
     "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found"
 )
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,))
     p.start()
@@ -168,10 +153,7 @@ def test_initialize_ucx_infiniband(protocol):
 
 
 def _test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     initialize(protocol=protocol)
     with LocalCluster(
@@ -204,12 +186,9 @@ def _test_initialize_ucx_all(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,))
     p.start()
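Throughout this file (and the other test modules below), the per-protocol `pytest.importorskip("ucp")` / `pytest.importorskip("ucxx")` chains are collapsed into a single `get_ucx_implementation(protocol)` call imported from `dask_cuda.utils_test`. The helper's body is not shown in this diff (see `dask_cuda/utils_test.py +23 -7` in the file list); the following is only a minimal sketch of what it presumably does, assuming "ucx" now selects the UCXX package and "ucx-old" the legacy UCX-Py package:

import pytest


def get_ucx_implementation(protocol):
    """Hypothetical sketch; the real helper lives in dask_cuda/utils_test.py."""
    # Assumption: "ucx" maps to the UCXX-based implementation, "ucx-old" to
    # the legacy UCX-Py backend. Skip the calling test if the matching
    # Python package is not installed, otherwise return the module.
    if protocol == "ucx":
        return pytest.importorskip("ucxx")
    elif protocol == "ucx-old":
        return pytest.importorskip("ucp")
    raise ValueError(f"Unknown UCX protocol: {protocol!r}")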
dask_cuda/tests/test_local_cuda_cluster.py
CHANGED

@@ -1,4 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
+import contextlib
 import os
 import pkgutil
 import sys
@@ -16,16 +20,18 @@ from dask_cuda.utils import (
     get_cluster_configuration,
     get_device_total_memory,
     get_gpu_count_mig,
-    get_gpu_uuid_from_index,
+    get_gpu_uuid,
+    has_device_memory_resource,
     print_cluster_config,
 )
-from dask_cuda.utils_test import MockWorker
+from dask_cuda.utils_test import MockWorker, get_ucx_implementation
 
 
 @gen_test(timeout=20)
 async def test_local_cuda_cluster():
     async with LocalCUDACluster(
-        scheduler_port=0,
+        scheduler_port=0,
+        asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == utils.get_n_gpus()
@@ -65,8 +71,8 @@ async def test_with_subset_of_cuda_visible_devices():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         worker_class=MockWorker,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 4
@@ -89,14 +95,11 @@ async def test_with_subset_of_cuda_visible_devices():
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     async with LocalCUDACluster(
         protocol=protocol, asynchronous=True, data=dict
@@ -109,35 +112,32 @@ async def test_ucx_protocol(protocol):
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_explicit_ucx_with_protocol_none(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     async with LocalCUDACluster(
-        protocol=None,
+        protocol=None,
+        enable_tcp_over_ucx=True,
+        asynchronous=True,
     ) as cluster:
         assert all(
-            ws.address.startswith("
+            ws.address.startswith(f"{protocol}://")
+            for ws in cluster.scheduler.workers.values()
         )
 
 
 @pytest.mark.filterwarnings("ignore:Exception ignored in")
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol_type_error(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     with pytest.raises(TypeError):
@@ -150,7 +150,10 @@ async def test_ucx_protocol_type_error(protocol):
 @gen_test(timeout=20)
 async def test_n_workers():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         assert len(cluster.workers) == 2
         assert len(cluster.worker_spec) == 2
@@ -205,10 +208,13 @@ async def test_no_memory_limits_cudaworker():
 @gen_test(timeout=20)
 async def test_all_to_all():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
-            workers = list(client.scheduler_info()["workers"])
+            workers = list(client.scheduler_info(n_workers=-1)["workers"])
             n_workers = len(workers)
             await utils.all_to_all(client)
             # assert all to all has resulted in all data on every worker
@@ -260,11 +266,6 @@ async def test_rmm_managed():
 async def test_rmm_async():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -287,11 +288,6 @@ async def test_rmm_async():
 async def test_rmm_async_with_maximum_pool_size():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -378,7 +374,6 @@ async def test_cluster_worker():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         n_workers=1,
     ) as cluster:
         assert len(cluster.workers) == 1
@@ -419,7 +414,7 @@ async def test_available_mig_workers():
 
 @gen_test(timeout=20)
 async def test_gpu_uuid():
-    gpu_uuid = get_gpu_uuid_from_index(0)
+    gpu_uuid = get_gpu_uuid(0)
 
     async with LocalCUDACluster(
         CUDA_VISIBLE_DEVICES=gpu_uuid,
@@ -461,7 +456,7 @@ async def test_get_cluster_configuration():
     async with LocalCUDACluster(
         rmm_pool_size="2GB",
         rmm_maximum_pool_size="3GB",
-        device_memory_limit="30B",
+        device_memory_limit="30B" if has_device_memory_resource() else None,
         CUDA_VISIBLE_DEVICES="0",
         scheduler_port=0,
         asynchronous=True,
@@ -471,10 +466,14 @@ async def test_get_cluster_configuration():
     assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
     assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
     assert ret["jit-unspill"] is False
-    assert ret["device-memory-limit"] == 30
+    if has_device_memory_resource():
+        assert ret["device-memory-limit"] == 30
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 async def test_worker_fraction_limits():
     async with LocalCUDACluster(
         dashboard_address=None,
@@ -500,6 +499,40 @@ async def test_worker_fraction_limits():
     )
 
 
+# Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    async def _test_worker_fraction_limits_no_dedicated_memory():
+        if argument == "pool_size":
+            kwargs = {"rmm_pool_size": "0.1"}
+        elif argument == "maximum_pool_size":
+            kwargs = {"rmm_pool_size": "1 GiB", "rmm_maximum_pool_size": "0.1"}
+        else:
+            kwargs = {"rmm_async": True, "rmm_release_threshold": "0.1"}
+
+        with raises_with_cause(
+            RuntimeError,
+            "Nanny failed to start",
+            RuntimeError,
+            "Worker failed to start",
+            ValueError,
+            "Fractional of total device memory not supported in devices without a "
+            "dedicated memory resource",
+        ):
+            await LocalCUDACluster(
+                asynchronous=True,
+                **kwargs,
+            )
+
+    asyncio.run(_test_worker_fraction_limits_no_dedicated_memory())
+
+
 @gen_test(timeout=20)
 async def test_cudf_spill_disabled():
     cudf = pytest.importorskip("cudf")
@@ -524,6 +557,9 @@ async def test_cudf_spill_disabled():
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 async def test_cudf_spill():
     cudf = pytest.importorskip("cudf")
 
@@ -548,27 +584,101 @@ async def test_cudf_spill():
             assert v == 2
 
 
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+@gen_test(timeout=20)
+async def test_cudf_spill_no_dedicated_memory():
+    cudf = pytest.importorskip("cudf")  # noqa: F841
+
+    with pytest.raises(
+        ValueError,
+        match="cuDF spilling is not supported on devices without dedicated memory",
+    ):
+        await LocalCUDACluster(
+            enable_cudf_spill=True,
+            cudf_spill_stats=2,
+            asynchronous=True,
+        )
+
+
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
+)
+@pytest.mark.parametrize(
+    "jit_unspill",
+    [False, True],
 )
-def test_print_cluster_config(capsys, protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+@pytest.mark.parametrize(
+    "device_memory_limit",
+    [None, "1B"],
+)
+def test_print_cluster_config(capsys, protocol, jit_unspill, device_memory_limit):
+    get_ucx_implementation(protocol)
 
     pytest.importorskip("rich")
-    with LocalCUDACluster(
-        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol
-    ) as cluster:
-        with Client(cluster) as client:
-            print_cluster_config(client)
-            captured = capsys.readouterr()
-            assert "Dask Cluster Configuration" in captured.out
-            assert protocol in captured.out
-            assert "1 B" in captured.out
-            assert "[plugin]" in captured.out
+
+    ctx = contextlib.nullcontext()
+    if not has_device_memory_resource():
+        if device_memory_limit:
+            ctx = pytest.raises(
+                ValueError,
+                match="device_memory_limit is set but device has no dedicated memory.",
+            )
+        if jit_unspill:
+            # JIT-Unspill exception has precedence, thus overwrite ctx if both are
+            # enabled
+            ctx = pytest.raises(
+                ValueError,
+                match="JIT-Unspill is not supported on devices without dedicated "
+                "memory",
+            )
+
+    with ctx:
+        with LocalCUDACluster(
+            n_workers=1,
+            device_memory_limit=device_memory_limit,
+            jit_unspill=jit_unspill,
+            protocol=protocol,
+        ) as cluster:
+            with Client(cluster) as client:
+                print_cluster_config(client)
+                captured = capsys.readouterr()
+                assert "Dask Cluster Configuration" in captured.out
+                assert protocol in captured.out
+                if device_memory_limit == "1B":
+                    assert "1 B" in captured.out
+                assert "[plugin]" in captured.out
+                client.shutdown()
+
+    def ucxpy_reset(timeout=20):
+        """Reset UCX-Py with a timeout.
+
+        Attempt to reset UCX-Py, not doing so may cause a deadlock because UCX-Py is
+        not thread-safe and the Dask cluster may still be alive while a new cluster
+        and UCX-Py instances are initalized.
+        """
+        import time
+
+        import ucp
+
+        start = time.monotonic()
+        while True:
+            try:
+                ucp.reset()
+            except ucp._libs.exceptions.UCXError as e:
+                if time.monotonic() - start > timeout:
+                    raise RuntimeError(
+                        f"Could not reset UCX-Py in {timeout} seconds, this may result "
+                        f"in a deadlock. Failure:\n{e}"
+                    )
                continue
+            else:
+                break
+
+    if protocol == "ucx-old":
+        ucxpy_reset()
 
 
 @pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
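The `skip_if_no_device_memory` and `skip_if_device_memory` markers used above are new in this release and are presumably registered by the new `dask_cuda/tests/conftest.py` (+41 in the file list), pairing with `has_device_memory_resource()` from `dask_cuda.utils`. The hook below is a hypothetical sketch of how such markers could be wired up, not the package's actual conftest:

import pytest

from dask_cuda.utils import has_device_memory_resource


def pytest_configure(config):
    # Register the markers so pytest does not warn about unknown marks.
    config.addinivalue_line(
        "markers",
        "skip_if_no_device_memory(reason): skip unless the device has dedicated memory",
    )
    config.addinivalue_line(
        "markers",
        "skip_if_device_memory(reason): skip when the device has dedicated memory",
    )


def pytest_runtest_setup(item):
    # Skip according to whether the GPU exposes a dedicated memory resource;
    # integrated devices sharing system memory would not.
    marker = item.get_closest_marker("skip_if_no_device_memory")
    if marker is not None and not has_device_memory_resource():
        pytest.skip(*marker.args)
    marker = item.get_closest_marker("skip_if_device_memory")
    if marker is not None and has_device_memory_resource():
        pytest.skip(*marker.args)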
dask_cuda/tests/test_proxify_host_file.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable
 from unittest.mock import patch
 
@@ -217,6 +220,9 @@ def test_one_item_host_limit(capsys, root_dir):
     assert len(dhf.manager) == 0
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 def test_spill_on_demand(root_dir):
     """
     Test spilling on demand by disabling the device_memory_limit
@@ -239,6 +245,9 @@ def test_spill_on_demand(root_dir):
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 @gen_test(timeout=20)
 async def test_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -375,9 +384,9 @@ def test_externals(root_dir):
 
 @patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,))
 def test_incompatible_types(root_dir):
-    """Check that ProxifyHostFile unproxifies
+    """Check that ProxifyHostFile unproxifies ``cupy.ndarray`` on retrieval
 
-    Notice, in this test we add
+    Notice, in this test we add ``cupy.ndarray`` to the incompatible_types temporarily.
     """
     cupy = pytest.importorskip("cupy")
     cudf = pytest.importorskip("cudf")
@@ -396,6 +405,9 @@ def test_incompatible_types(root_dir):
 
 @pytest.mark.parametrize("npartitions", [1, 2, 3])
 @pytest.mark.parametrize("compatibility_mode", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=30)
 async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions):
     cudf = pytest.importorskip("cudf")
@@ -414,7 +426,7 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
         ddf = dask.dataframe.from_pandas(
             cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions
         )
-        res = ddf.shuffle(on="key", shuffle_method="tasks")
+        [res] = client.persist([ddf.shuffle(on="key", shuffle_method="tasks")])
 
         # With compatibility mode on, we shouldn't encounter any proxy objects
         if compatibility_mode:
@@ -428,6 +440,9 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
             assert all(res)  # Only proxy objects
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=60)
 async def test_worker_force_spill_to_disk():
     """Test Dask triggering CPU-to-Disk spilling"""
@@ -440,7 +455,7 @@ async def test_worker_force_spill_to_disk():
     async with Client(cluster, asynchronous=True) as client:
         # Create a df that are spilled to host memory immediately
         df = cudf.DataFrame({"key": np.arange(10**8)})
-        ddf = dask.dataframe.from_pandas(df, npartitions=1)
+        [ddf] = client.persist([dask.dataframe.from_pandas(df, npartitions=1)])
         await ddf
 
         async def f(dask_worker):
@@ -463,6 +478,9 @@ async def test_worker_force_spill_to_disk():
             assert "Unmanaged memory use is high" not in log
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 def test_on_demand_debug_info():
     """Test worker logging when on-demand-spilling fails"""
     rmm = pytest.importorskip("rmm")
@@ -498,3 +516,14 @@ def test_on_demand_debug_info():
     assert f"WARNING - RMM allocation of {size} failed" in log
     assert f"RMM allocs: {size}" in log
     assert "traceback:" in log
+
+
+def test_sizeof_owner_with_cai():
+    cudf = pytest.importorskip("cudf")
+    s = cudf.Series([1, 2, 3])
+
+    items = dask_cuda.get_device_memory_objects.dispatch(s)
+    assert len(items) == 1
+    item = items[0]
+    result = dask.sizeof.sizeof(item)
+    assert result == 24
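Two of the changes above replace bare collection expressions with `[res] = client.persist([...])`. This relies on standard distributed behavior: `Client.persist`, given a list of Dask collections, returns a list of persisted collections in the same order, so the graph is pinned on the cluster before the assertions inspect worker state. A small standalone illustration (CPU-only, using plain pandas rather than the cuDF used by these tests):

import pandas as pd
import dask.dataframe as dd
from distributed import Client

client = Client()  # spins up a local cluster; the tests above use LocalCUDACluster
ddf = dd.from_pandas(pd.DataFrame({"key": range(10)}), npartitions=2)

# persist() on a list returns a matching list, so one-element unpacking works.
[res] = client.persist([ddf.shuffle(on="key", shuffle_method="tasks")])
print(res.npartitions)

client.close()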
dask_cuda/tests/test_proxy.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 import os
 import pickle
@@ -23,7 +26,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -242,7 +245,7 @@ def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
 
 @pytest.mark.parametrize("backend", ["numpy", "cupy"])
 def test_fixed_attribute_length(backend):
-    """Test fixed attribute
+    """Test fixed attribute ``x.__len__`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -263,7 +266,7 @@ def test_fixed_attribute_length(backend):
 
 
 def test_fixed_attribute_name():
-    """Test fixed attribute
+    """Test fixed attribute ``x.name`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -284,6 +287,9 @@ def test_fixed_attribute_name():
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Spilling not supported in devices without dedicated memory resource"
+)
 @gen_test(timeout=20)
 async def test_spilling_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -386,8 +392,8 @@ def test_serializing_array_to_disk(backend, serializers, size):
 class _PxyObjTest(proxy_object.ProxyObject):
     """
     A class that:
-    - defines
-      calling
+    - defines ``__dask_tokenize__`` in order to avoid deserialization when
+      calling ``client.scatter()``
     - Asserts that no deserialization is performaned when communicating.
     """
 
@@ -401,14 +407,12 @@ class _PxyObjTest(proxy_object.ProxyObject):
 
 
 @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @gen_test(timeout=120)
 async def test_communicating_proxy_objects(protocol, send_serializers):
     """Testing serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
 
     def task(x):
@@ -417,7 +421,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         serializers_used = x._pxy_get().serializer
 
         # Check that `x` is serialized with the expected serializers
-        if protocol in ["ucx", "ucxx"]:
+        if protocol in ["ucx", "ucx-old"]:
             if send_serializers is None:
                 assert serializers_used == "cuda"
             else:
@@ -448,15 +452,13 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         await client.submit(task, df)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @pytest.mark.parametrize("shared_fs", [True, False])
 @gen_test(timeout=20)
 async def test_communicating_disk_objects(protocol, shared_fs):
     """Testing disk serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
     ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
 
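Taken together, the recurring parametrization change from `["ucx", "ucxx"]` to `["ucx", "ucx-old"]` reflects the protocol naming in 25.8.0: `"ucx"` now selects the current UCXX-based communication backend, while `"ucx-old"` keeps the legacy UCX-Py one (note that the new `ucxpy_reset` helper above runs only for `"ucx-old"` and resets the `ucp` module). A minimal usage sketch, assuming a CUDA-capable machine with the matching UCX packages installed:

from distributed import Client
from dask_cuda import LocalCUDACluster

# protocol="ucx" uses the UCXX backend; pass "ucx-old" for legacy UCX-Py.
with LocalCUDACluster(protocol="ucx") as cluster:
    with Client(cluster) as client:
        print(sorted(client.scheduler_info()["workers"]))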