dask-cuda 25.6.0__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +3 -0
- dask_cuda/benchmarks/local_cudf_merge.py +4 -1
- dask_cuda/benchmarks/local_cudf_shuffle.py +4 -1
- dask_cuda/benchmarks/local_cupy.py +3 -0
- dask_cuda/benchmarks/local_cupy_map_overlap.py +3 -0
- dask_cuda/benchmarks/utils.py +6 -3
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +28 -58
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
- dask_cuda/get_device_memory_objects.py +4 -7
- dask_cuda/initialize.py +149 -94
- dask_cuda/local_cuda_cluster.py +52 -70
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +12 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_cudf_builtin_spilling.py +3 -1
- dask_cuda/tests/test_dask_cuda_worker.py +109 -25
- dask_cuda/tests/test_dask_setup.py +193 -0
- dask_cuda/tests/test_dgx.py +20 -44
- dask_cuda/tests/test_explicit_comms.py +31 -12
- dask_cuda/tests/test_from_array.py +4 -6
- dask_cuda/tests/test_initialize.py +233 -65
- dask_cuda/tests/test_local_cuda_cluster.py +129 -68
- dask_cuda/tests/test_proxify_host_file.py +28 -7
- dask_cuda/tests/test_proxy.py +15 -13
- dask_cuda/tests/test_spill.py +10 -3
- dask_cuda/tests/test_utils.py +100 -29
- dask_cuda/tests/test_worker_spec.py +6 -0
- dask_cuda/utils.py +211 -42
- dask_cuda/utils_test.py +10 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +6 -1
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/METADATA +11 -4
- dask_cuda-25.10.0.dist-info/RECORD +63 -0
- dask_cuda-25.10.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.6.0.dist-info/RECORD +0 -57
- dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/tests/test_dask_setup.py
ADDED

@@ -0,0 +1,193 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import time
+from contextlib import contextmanager
+from unittest.mock import Mock, patch
+
+import pytest
+
+from distributed import Client
+from distributed.utils import open_port
+from distributed.utils_test import popen
+
+from dask_cuda.initialize import dask_setup
+from dask_cuda.utils import wait_workers
+
+
+def test_dask_setup_function_with_mock_worker():
+    """Test the dask_setup function directly with mock worker."""
+    # Create a mock worker object
+    mock_worker = Mock()
+
+    with patch("dask_cuda.initialize._create_cuda_context") as mock_create_context:
+        # Test with create_cuda_context=True
+        # Call the underlying function directly (the Click decorator wraps the real
+        # function)
+        dask_setup.callback(
+            worker=mock_worker,
+            create_cuda_context=True,
+        )
+
+        mock_create_context.assert_called_once_with()
+
+        mock_create_context.reset_mock()
+
+        # Test with create_cuda_context=False
+        dask_setup.callback(
+            worker=mock_worker,
+            create_cuda_context=False,
+        )
+
+        mock_create_context.assert_not_called()
+
+
+@contextmanager
+def start_dask_scheduler(protocol: str, max_attempts: int = 5, timeout: int = 10):
+    """Start Dask scheduler in subprocess.
+
+    Attempts to start a Dask scheduler in subprocess, if the port is not available
+    retry on a different port up to a maximum of `max_attempts` attempts. The stdout
+    and stderr of the process is read to determine whether the scheduler failed to
+    bind to port or succeeded, and ensures no more than `timeout` seconds are awaited
+    for between reads.
+
+    This is primarily useful because UCX does not release TCP ports immediately. A
+    workaround without the need for this function is setting `UCX_TCP_CM_REUSEADDR=y`,
+    but that requires to be explicitly set when running tests, and that is not very
+    friendly.
+
+    Parameters
+    ----------
+    protocol: str
+        Communication protocol to use.
+    max_attempts: int
+        Maximum attempts to try to open scheduler.
+    timeout: int
+        Time to wait while reading stdout/stderr of subprocess.
+    """
+    port = open_port()
+    for _ in range(max_attempts):
+        with popen(
+            [
+                "dask",
+                "scheduler",
+                "--no-dashboard",
+                "--protocol",
+                protocol,
+                "--port",
+                str(port),
+            ],
+            capture_output=True,  # Capture stdout and stderr
+        ) as scheduler_process:
+            # Check if the scheduler process started successfully by streaming output
+            try:
+                start_time = time.monotonic()
+                while True:
+                    if time.monotonic() - start_time > timeout:
+                        raise TimeoutError("Timeout while waiting for scheduler output")
+
+                    line = scheduler_process.stdout.readline()
+                    if not line:
+                        break  # End of output
+                    print(
+                        line.decode(), end=""
+                    )  # Since capture_output=True, print the line here
+                    if b"Scheduler at:" in line:
+                        # Scheduler is now listening
+                        break
+                    elif b"UCXXBusyError" in line:
+                        raise Exception("UCXXBusyError detected in scheduler output")
+            except Exception:
+                port += 1
+            else:
+                yield scheduler_process, port
+                return
+    else:
+        pytest.fail(f"Failed to start dask scheduler after {max_attempts} attempts.")
+
+
+@pytest.mark.timeout(30)
+@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+def test_dask_cuda_worker_cli_integration(protocol, tmp_path):
+    """Test that dask cuda worker CLI correctly passes arguments to dask_setup.
+
+    Verifies the end-to-end integration where the CLI tool actually launches and calls
+    dask_setup with correct args.
+    """
+
+    # Use pytest's tmp_path for file management
+    capture_file_path = tmp_path / "dask_setup_integration_test.json"
+    preload_file = tmp_path / "preload_capture.py"
+
+    # Write the preload script to tmp_path
+    preload_file.write_text(
+        f'''
+import json
+import os
+
+def capture_dask_setup_call(worker, create_cuda_context):
+    """Capture dask_setup arguments and write to file."""
+    result = {{
+        'worker_protocol': getattr(worker, '_protocol', 'unknown'),
+        'create_cuda_context': create_cuda_context,
+        'test_success': True
+    }}
+
+    # Write immediately to ensure it gets captured
+    with open(r"{capture_file_path}", 'w') as f:
+        json.dump(result, f)
+
+# Patch dask_setup callback
+from dask_cuda.initialize import dask_setup
+dask_setup.callback = capture_dask_setup_call
+'''
+    )
+
+    with start_dask_scheduler(protocol=protocol) as scheduler_process_port:
+        scheduler_process, scheduler_port = scheduler_process_port
+        sched_addr = f"{protocol}://127.0.0.1:{scheduler_port}"
+        print(f"{sched_addr=}", flush=True)
+
+        # Build dask cuda worker args
+        dask_cuda_worker_args = [
+            "dask",
+            "cuda",
+            "worker",
+            sched_addr,
+            "--host",
+            "127.0.0.1",
+            "--no-dashboard",
+            "--preload",
+            str(preload_file),
+            "--death-timeout",
+            "10",
+        ]
+
+        with popen(dask_cuda_worker_args):
+            # Wait and check for worker connection
+            with Client(sched_addr) as client:
+                assert wait_workers(client, n_gpus=1)
+
+        # Check if dask_setup was called and captured correctly
+        if capture_file_path.exists():
+            with open(capture_file_path, "r") as cf:
+                captured_args = json.load(cf)
+
+            # Verify the critical arguments were passed correctly
+            assert (
+                captured_args["create_cuda_context"] is True
+            ), "create_cuda_context should be True"
+
+            # Verify worker has a protocol set
+            assert (
+                captured_args["worker_protocol"] == protocol
+            ), "Worker should have a protocol"
+        else:
+            pytest.fail(
+                "capture file not found: dask_setup was not called or "
+                "failed to write to file"
+            )
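The new test module above hinges on Distributed's `--preload` hook: the worker executes the given script at startup, so reassigning `dask_cuda.initialize.dask_setup.callback` there lets a test observe exactly what arguments the CLI passes in. A minimal standalone sketch of the same pattern (the file name and output path here are illustrative, not from the diff):

# preload_log.py -- illustrative preload script; start a worker with:
#   dask cuda worker <scheduler-address> --preload preload_log.py
import json

from dask_cuda.initialize import dask_setup

_original_callback = dask_setup.callback


def _logging_dask_setup(worker, create_cuda_context):
    # Record the arguments, then delegate to the real setup logic.
    with open("/tmp/dask_setup_args.json", "w") as f:
        json.dump({"create_cuda_context": create_cuda_context}, f)
    return _original_callback(worker=worker, create_cuda_context=create_cuda_context)


dask_setup.callback = _logging_dask_setup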
dask_cuda/tests/test_dgx.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
 import os
 from enum import Enum, auto
@@ -15,10 +18,6 @@ mp = mp.get_context("spawn")  # type: ignore
 psutil = pytest.importorskip("psutil")
 
 
-def _is_ucx_116(ucp):
-    return ucp.get_ucx_version()[:2] == (1, 16)
-
-
 class DGXVersion(Enum):
     DGX_1 = auto()
     DGX_2 = auto()
@@ -77,20 +76,17 @@ def test_default():
     assert not p.exitcode
 
 
-def _test_tcp_over_ucx(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+def _test_tcp_over_ucx():
+    ucxx = pytest.importorskip("ucxx")
 
-    with LocalCUDACluster(protocol=protocol, enable_tcp_over_ucx=True) as cluster:
+    with LocalCUDACluster(protocol="ucx", enable_tcp_over_ucx=True) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
             res = res.sum().compute()
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert "tcp" in conf["TLS"]
                 assert "cuda_copy" in conf["TLS"]
@@ -100,19 +96,10 @@ def _test_tcp_over_ucx(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize(
-    "protocol",
-    ["ucx", "ucxx"],
-)
-def test_tcp_over_ucx(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
-    if _is_ucx_116(ucp):
-        pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
-
-    p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,))
+def test_tcp_over_ucx():
+    pytest.importorskip("distributed_ucxx")
+
+    p = mp.Process(target=_test_tcp_over_ucx)
     p.start()
     p.join()
     assert not p.exitcode
@@ -134,25 +121,22 @@ def test_tcp_only():
 
 
 def _test_ucx_infiniband_nvlink(
-    skip_queue, protocol, enable_infiniband, enable_nvlink, enable_rdmacm
+    skip_queue, enable_infiniband, enable_nvlink, enable_rdmacm
 ):
+    ucxx = pytest.importorskip("ucxx")
     cupy = pytest.importorskip("cupy")
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
 
     if enable_infiniband and not any(
-        [at.startswith("rc") for at in ucp.get_active_transports()]
+        [at.startswith("rc") for at in ucxx.get_active_transports()]
     ):
         skip_queue.put("No support available for 'rc' transport in UCX")
         return
     else:
         skip_queue.put("ok")
 
-    # `ucp.get_active_transports()` call above initializes UCX, we must reset it
+    # `ucxx.get_active_transports()` call above initializes UCX, we must reset it
     # so that Dask doesn't try to initialize it again and raise an exception.
-    ucp.reset()
+    ucxx.reset()
 
     if enable_infiniband is None and enable_nvlink is None and enable_rdmacm is None:
         enable_tcp_over_ucx = None
@@ -168,7 +152,6 @@ def _test_ucx_infiniband_nvlink(
         cm_tls_priority = ["tcp"]
 
     initialize(
-        protocol=protocol,
         enable_tcp_over_ucx=enable_tcp_over_ucx,
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
@@ -176,7 +159,7 @@ def _test_ucx_infiniband_nvlink(
     )
 
     with LocalCUDACluster(
-        protocol=protocol,
+        protocol="ucx",
         interface="ib0",
         enable_tcp_over_ucx=enable_tcp_over_ucx,
         enable_infiniband=enable_infiniband,
@@ -190,7 +173,7 @@ def _test_ucx_infiniband_nvlink(
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert all(t in conf["TLS"] for t in cm_tls)
                 assert all(p in conf["SOCKADDR_TLS_PRIORITY"] for p in cm_tls_priority)
@@ -206,7 +189,6 @@ def _test_ucx_infiniband_nvlink(
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
 @pytest.mark.parametrize(
     "params",
     [
@@ -221,13 +203,8 @@ def _test_ucx_infiniband_nvlink(
     _get_dgx_version() == DGXVersion.DGX_A100,
     reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(),
 )
-def test_ucx_infiniband_nvlink(protocol, params):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
-    if _is_ucx_116(ucp) and params["enable_infiniband"] is False:
-        pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
+def test_ucx_infiniband_nvlink(params):
+    pytest.importorskip("distributed_ucxx")
 
     skip_queue = mp.Queue()
 
@@ -235,7 +212,6 @@ def test_ucx_infiniband_nvlink(protocol, params):
         target=_test_ucx_infiniband_nvlink,
         args=(
            skip_queue,
-            protocol,
             params["enable_infiniband"],
             params["enable_nvlink"],
             params["enable_rdmacm"],
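The through-line of the test_dgx.py changes is the migration from ucx-py (`ucp`) to UCXX: the "ucx"/"ucxx" protocol parametrization is gone, `pytest.importorskip` now targets `ucxx` and `distributed_ucxx`, and the ucx-py 1.16 skip logic (`_is_ucx_116`) is dropped along with the `protocol` argument to `initialize()`. A minimal sketch of the cluster setup the rewritten tests exercise, assuming a GPU host with ucxx and distributed-ucxx installed:

import dask.array as da
import numpy
from distributed import Client

from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    # protocol="ucx" now routes through distributed-ucxx rather than ucx-py
    with LocalCUDACluster(protocol="ucx", enable_tcp_over_ucx=True) as cluster:
        with Client(cluster) as client:
            x = da.from_array(numpy.arange(10000), chunks=(1000,))
            assert x.sum().compute() == 49995000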
dask_cuda/tests/test_explicit_comms.py
CHANGED

@@ -1,4 +1,5 @@
-# Copyright (c) 2021-2025 NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
 
 import asyncio
 import multiprocessing as mp
@@ -28,7 +29,6 @@ from dask_cuda.explicit_comms.dataframe.shuffle import (
 from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
 mp = mp.get_context("spawn")  # type: ignore
-ucp = pytest.importorskip("ucp")
 
 
 # Notice, all of the following tests is executed in a new process such
@@ -53,8 +53,11 @@ def _test_local_cluster(protocol):
     assert sum(c.run(my_rank, 0)) == sum(range(4))
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
 def test_local_cluster(protocol):
+    if protocol.startswith("ucx"):
+        pytest.importorskip("distributed_ucxx")
+
     p = mp.Process(target=_test_local_cluster, args=(protocol,))
     p.start()
     p.join()
@@ -97,7 +100,7 @@ def test_dataframe_merge_empty_partitions():
 
 
 def check_partitions(df, npartitions):
-    """Check that all values in `df` hashes to the same"""
+    """Check that all values in ``df`` hashes to the same"""
     dtypes = {}
     for col, dtype in df.dtypes.items():
         if pd.api.types.is_numeric_dtype(dtype):
@@ -199,11 +202,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
 
 @pytest.mark.parametrize("nworkers", [1, 2, 3])
 @pytest.mark.parametrize("backend", ["pandas", "cudf"])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
 @pytest.mark.parametrize("_partitions", [True, False])
 def test_dataframe_shuffle(backend, protocol, nworkers, _partitions):
     if backend == "cudf":
         pytest.importorskip("cudf")
+    if protocol.startswith("ucx"):
+        pytest.importorskip("distributed_ucxx")
 
     p = mp.Process(
         target=_test_dataframe_shuffle, args=(backend, protocol, nworkers, _partitions)
@@ -320,10 +325,13 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
 
 @pytest.mark.parametrize("nworkers", [1, 2, 4])
 @pytest.mark.parametrize("backend", ["pandas", "cudf"])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
 def test_dataframe_shuffle_merge(backend, protocol, nworkers):
     if backend == "cudf":
         pytest.importorskip("cudf")
+    if protocol.startswith("ucx"):
+        pytest.importorskip("distributed_ucxx")
+
     p = mp.Process(
         target=_test_dataframe_shuffle_merge, args=(backend, protocol, nworkers)
     )
@@ -357,9 +365,14 @@ def _test_jit_unspill(protocol):
     assert_eq(got, expected)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+@pytest.mark.skip_if_no_device_memory(
+    "JIT-Unspill not supported in devices without dedicated memory resource"
+)
 def test_jit_unspill(protocol):
     pytest.importorskip("cudf")
+    if protocol.startswith("ucx"):
+        pytest.importorskip("distributed_ucxx")
 
     p = mp.Process(target=_test_jit_unspill, args=(protocol,))
     p.start()
@@ -384,7 +397,7 @@ def _test_lock_workers(scheduler_address, ranks):
 
 def test_lock_workers():
     """
-    Testing `run(...,lock_workers=True)` by spawning 30 runs with overlapping
+    Testing ``run(...,lock_workers=True)`` by spawning 30 runs with overlapping
     and non-overlapping worker sets.
     """
     try:
@@ -423,7 +436,9 @@ def test_create_destroy_create():
     with LocalCluster(n_workers=1) as cluster:
         with Client(cluster) as client:
             context = comms.default_comms()
-            scheduler_addresses_old = list(client.scheduler_info()["workers"].keys())
+            scheduler_addresses_old = list(
+                client.scheduler_info(n_workers=-1)["workers"].keys()
+            )
             comms_addresses_old = list(comms.default_comms().worker_addresses)
             assert comms.default_comms() is context
             assert len(comms._comms_cache) == 1
@@ -444,7 +459,9 @@ def test_create_destroy_create():
     # because we referenced the old cluster's addresses.
     with LocalCluster(n_workers=1) as cluster:
         with Client(cluster) as client:
-            scheduler_addresses_new = list(client.scheduler_info()["workers"].keys())
+            scheduler_addresses_new = list(
+                client.scheduler_info(n_workers=-1)["workers"].keys()
+            )
             comms_addresses_new = list(comms.default_comms().worker_addresses)
 
             assert scheduler_addresses_new == comms_addresses_new
@@ -485,7 +502,8 @@ def test_scaled_cluster_gets_new_comms_context():
             "n_workers": 2,
         }
         expected_1 = {
-            k: expected_values for k in client.scheduler_info()["workers"]
+            k: expected_values
+            for k in client.scheduler_info(n_workers=-1)["workers"]
         }
         assert result_1 == expected_1
@@ -513,7 +531,8 @@ def test_scaled_cluster_gets_new_comms_context():
             "n_workers": 3,
         }
         expected_2 = {
-            k: expected_values for k in client.scheduler_info()["workers"]
+            k: expected_values
+            for k in client.scheduler_info(n_workers=-1)["workers"]
         }
         assert result_2 == expected_2
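Besides the same ucx-py to UCXX migration, test_explicit_comms.py switches every `client.scheduler_info()` call to `client.scheduler_info(n_workers=-1)`: recent Distributed versions cap how many workers `scheduler_info()` reports by default, so enumerating all worker addresses requires asking for them explicitly. A small illustrative sketch (not code from the diff):

from distributed import Client, LocalCluster

if __name__ == "__main__":
    with LocalCluster(n_workers=3) as cluster, Client(cluster) as client:
        # n_workers=-1 asks the scheduler for every worker, not a capped sample
        workers = list(client.scheduler_info(n_workers=-1)["workers"].keys())
        assert len(workers) == 3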
dask_cuda/tests/test_from_array.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import pytest
 
 import dask.array as da
@@ -8,13 +11,8 @@ from dask_cuda import LocalCUDACluster
 cupy = pytest.importorskip("cupy")
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "tcp"])
+@pytest.mark.parametrize("protocol", ["ucx", "tcp"])
 def test_ucx_from_array(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
-
     N = 10_000
     with LocalCUDACluster(protocol=protocol) as cluster:
         with Client(cluster):