dask-cuda 25.6.0__py3-none-any.whl → 25.10.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (51)
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/benchmarks/common.py +4 -1
  4. dask_cuda/benchmarks/local_cudf_groupby.py +3 -0
  5. dask_cuda/benchmarks/local_cudf_merge.py +4 -1
  6. dask_cuda/benchmarks/local_cudf_shuffle.py +4 -1
  7. dask_cuda/benchmarks/local_cupy.py +3 -0
  8. dask_cuda/benchmarks/local_cupy_map_overlap.py +3 -0
  9. dask_cuda/benchmarks/utils.py +6 -3
  10. dask_cuda/cli.py +21 -15
  11. dask_cuda/cuda_worker.py +28 -58
  12. dask_cuda/device_host_file.py +31 -15
  13. dask_cuda/disk_io.py +7 -4
  14. dask_cuda/explicit_comms/comms.py +11 -7
  15. dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
  16. dask_cuda/get_device_memory_objects.py +4 -7
  17. dask_cuda/initialize.py +149 -94
  18. dask_cuda/local_cuda_cluster.py +52 -70
  19. dask_cuda/plugins.py +17 -16
  20. dask_cuda/proxify_device_objects.py +12 -10
  21. dask_cuda/proxify_host_file.py +30 -27
  22. dask_cuda/proxy_object.py +20 -17
  23. dask_cuda/tests/conftest.py +41 -0
  24. dask_cuda/tests/test_cudf_builtin_spilling.py +3 -1
  25. dask_cuda/tests/test_dask_cuda_worker.py +109 -25
  26. dask_cuda/tests/test_dask_setup.py +193 -0
  27. dask_cuda/tests/test_dgx.py +20 -44
  28. dask_cuda/tests/test_explicit_comms.py +31 -12
  29. dask_cuda/tests/test_from_array.py +4 -6
  30. dask_cuda/tests/test_initialize.py +233 -65
  31. dask_cuda/tests/test_local_cuda_cluster.py +129 -68
  32. dask_cuda/tests/test_proxify_host_file.py +28 -7
  33. dask_cuda/tests/test_proxy.py +15 -13
  34. dask_cuda/tests/test_spill.py +10 -3
  35. dask_cuda/tests/test_utils.py +100 -29
  36. dask_cuda/tests/test_worker_spec.py +6 -0
  37. dask_cuda/utils.py +211 -42
  38. dask_cuda/utils_test.py +10 -7
  39. dask_cuda/worker_common.py +196 -0
  40. dask_cuda/worker_spec.py +6 -1
  41. {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/METADATA +11 -4
  42. dask_cuda-25.10.0.dist-info/RECORD +63 -0
  43. dask_cuda-25.10.0.dist-info/top_level.txt +6 -0
  44. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  45. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  46. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  47. dask_cuda-25.6.0.dist-info/RECORD +0 -57
  48. dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
  49. {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/WHEEL +0 -0
  50. {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/entry_points.txt +0 -0
  51. {dask_cuda-25.6.0.dist-info → dask_cuda-25.10.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/tests/test_initialize.py

@@ -1,6 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
+import os
+import shutil
+import subprocess
 import sys
+import tempfile
+import textwrap
 
+import cuda.core.experimental
 import numpy
 import psutil
 import pytest
@@ -21,22 +30,19 @@ mp = mp.get_context("spawn")  # type: ignore
 # of UCX before retrieving the current config.
 
 
-def _test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+def _test_initialize_ucx_tcp():
+    ucxx = pytest.importorskip("ucxx")
 
     kwargs = {"enable_tcp_over_ucx": True}
-    initialize(protocol=protocol, **kwargs)
+    initialize(**kwargs)
     with LocalCluster(
-        protocol=protocol,
+        protocol="ucx",
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
         worker_class=IncreasedCloseTimeoutNanny,
-        config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
+        config={"distributed-ucxx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
@@ -44,7 +50,7 @@ def _test_initialize_ucx_tcp(protocol):
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert "tcp" in conf["TLS"]
                 assert "cuda_copy" in conf["TLS"]
@@ -55,35 +61,28 @@ def _test_initialize_ucx_tcp(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
-def test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+def test_initialize_ucx_tcp():
+    pytest.importorskip("distributed_ucxx")
 
-    p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,))
+    p = mp.Process(target=_test_initialize_ucx_tcp)
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+def _test_initialize_ucx_nvlink():
+    ucxx = pytest.importorskip("ucxx")
 
     kwargs = {"enable_nvlink": True}
-    initialize(protocol=protocol, **kwargs)
+    initialize(**kwargs)
     with LocalCluster(
-        protocol=protocol,
+        protocol="ucx",
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
         worker_class=IncreasedCloseTimeoutNanny,
-        config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
+        config={"distributed-ucxx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
@@ -91,7 +90,7 @@ def _test_initialize_ucx_nvlink(protocol):
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert "cuda_ipc" in conf["TLS"]
                 assert "tcp" in conf["TLS"]
@@ -103,35 +102,28 @@ def _test_initialize_ucx_nvlink(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
-def test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+def test_initialize_ucx_nvlink():
+    pytest.importorskip("distributed_ucxx")
 
-    p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,))
+    p = mp.Process(target=_test_initialize_ucx_nvlink)
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+def _test_initialize_ucx_infiniband():
+    ucxx = pytest.importorskip("ucxx")
 
     kwargs = {"enable_infiniband": True}
-    initialize(protocol=protocol, **kwargs)
+    initialize(**kwargs)
     with LocalCluster(
-        protocol=protocol,
+        protocol="ucx",
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
         worker_class=IncreasedCloseTimeoutNanny,
-        config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
+        config={"distributed-ucxx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
@@ -139,7 +131,7 @@ def _test_initialize_ucx_infiniband(protocol):
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert "rc" in conf["TLS"]
                 assert "tcp" in conf["TLS"]
@@ -154,34 +146,27 @@ def _test_initialize_ucx_infiniband(protocol):
 @pytest.mark.skipif(
     "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found"
 )
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
-def test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
-
-    p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,))
+def test_initialize_ucx_infiniband():
+    pytest.importorskip("distributed_ucxx")
+
+    p = mp.Process(target=_test_initialize_ucx_infiniband)
     p.start()
     p.join()
     assert not p.exitcode
 
 
-def _test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+def _test_initialize_ucx_all():
+    ucxx = pytest.importorskip("ucxx")
 
-    initialize(protocol=protocol)
+    initialize()
     with LocalCluster(
-        protocol=protocol,
+        protocol="ucx",
         dashboard_address=None,
         n_workers=1,
         threads_per_worker=1,
         processes=True,
         worker_class=IncreasedCloseTimeoutNanny,
-        config={"distributed.comm.ucx": get_ucx_config()},
+        config={"distributed-ucxx": get_ucx_config()},
     ) as cluster:
         with Client(cluster) as client:
             res = da.from_array(numpy.arange(10000), chunks=(1000,))
@@ -189,7 +174,7 @@ def _test_initialize_ucx_all(protocol):
             assert res == 49995000
 
             def check_ucx_options():
-                conf = ucp.get_config()
+                conf = ucxx.get_config()
                 assert "TLS" in conf
                 assert conf["TLS"] == "all"
                 assert all(
@@ -204,14 +189,10 @@ def _test_initialize_ucx_all(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
-def test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+def test_initialize_ucx_all():
+    pytest.importorskip("distributed_ucxx")
 
-    p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,))
+    p = mp.Process(target=_test_initialize_ucx_all)
     p.start()
     p.join()
     assert not p.exitcode
@@ -250,3 +231,190 @@ def test_dask_cuda_import():
     p.start()
     p.join()
     assert not p.exitcode
+
+
+def _test_cuda_context_warning_with_subprocess_warnings(protocol):
+    """Test CUDA context warnings from both parent and worker subprocesses.
+
+    This test creates a standalone script that imports a problematic library
+    and creates LocalCUDACluster with processes=True. This should generate
+    warnings from both the parent process and each worker subprocess, since
+    they all inherit the CUDA context created at import time.
+    """
+    # Create temporary directory for our test files
+    temp_dir = tempfile.mkdtemp()
+
+    # Create the problematic library that creates CUDA context at import
+    problematic_library_code = textwrap.dedent(
+        """
+        # Problematic library that creates CUDA context at import time
+        import os
+
+        import cuda.core.experimental
+
+        try:
+            # Create CUDA context at import time, this will be inherited by subprocesses
+            cuda.core.experimental.Device().set_current()
+            print("Problematic library: Created CUDA context at import time")
+            os.environ['SUBPROCESS_CUDA_CONTEXT_CREATED'] = '1'
+        except Exception as e:
+            raise RuntimeError(
+                f"Problematic library: Failed to create CUDA context({e})"
+            )
+            os.environ['SUBPROCESS_CUDA_CONTEXT_CREATED'] = '0'
+        """
+    )
+
+    problematic_lib_path = os.path.join(temp_dir, "problematic_cuda_library.py")
+    with open(problematic_lib_path, "w") as f:
+        f.write(problematic_library_code)
+
+    # Create the main test script that imports the problematic library
+    # and creates LocalCUDACluster - this will run in a subprocess
+    main_script_code = textwrap.dedent(
+        f"""
+        # Main script that demonstrates the real-world problematic scenario
+        import os
+        import sys
+        import logging
+
+        # Add the temp directory to path so we can import our problematic library
+        sys.path.insert(0, '{temp_dir}')
+
+        print("=== Starting subprocess warnings test ===")
+
+        # This is the key part: import the problematic library BEFORE creating
+        # LocalCUDACluster. This creates a CUDA context that will be inherited
+        # by all worker subprocesses
+        print("Importing problematic library...")
+        import problematic_cuda_library
+
+        context_mode = os.environ.get('SUBPROCESS_CUDA_CONTEXT_CREATED', None)
+        if context_mode == "1":
+            print(f"Context creation successful")
+        else:
+            raise RuntimeError("Context creation failed")
+
+        if __name__ == "__main__":
+            try:
+                from dask_cuda import LocalCUDACluster
+                from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+
+                cluster = LocalCUDACluster(
+                    dashboard_address=None,
+                    worker_class=IncreasedCloseTimeoutNanny,
+                    protocol=f"{protocol}",
+                )
+                print("LocalCUDACluster created successfully!")
+
+                cluster.close()
+                print("Cluster closed successfully")
+
+            except Exception as e:
+                raise RuntimeError(f"Cluster setup error: {{e}}")
+
+        print("=== Subprocess warnings test completed ===")
+        """
+    )
+
+    main_script_path = os.path.join(temp_dir, "test_subprocess_warnings.py")
+    with open(main_script_path, "w") as f:
+        f.write(main_script_code)
+
+    try:
+        # Run the main script in a subprocess
+        result = subprocess.run(
+            [sys.executable, main_script_path],
+            capture_output=True,
+            text=True,
+            timeout=30,  # Reduced timeout for simpler test
+            cwd=os.getcwd(),
+        )
+
+        # Check for successful test execution regardless of warnings
+        assert (
+            "Context creation successful" in result.stdout
+        ), "Test did not create a CUDA context"
+        assert (
+            "Creating LocalCUDACluster" in result.stdout
+            or "LocalCUDACluster created successfully" in result.stdout
+        ), "LocalCUDACluster was not created"
+
+        # Check the log file for warnings from multiple processes
+        warnings_found = []
+        warnings_assigned_device_found = []
+
+        # Look for CUDA context warnings from different processes
+        lines = result.stderr.split("\n")
+        for line in lines:
+            if "A CUDA context for device" in line and "already exists" in line:
+                warnings_found.append(line)
+            if (
+                "should have a CUDA context assigned to device" in line
+                and "but instead the CUDA context is on device" in line
+            ):
+                warnings_assigned_device_found.append(line)
+
+        num_devices = cuda.core.experimental.system.num_devices
+
+        # Every worker raises the warning once. With protocol="ucx" the warning is
+        # raised once more by the parent process.
+        expected_warnings = num_devices if protocol == "tcp" else num_devices + 1
+        assert len(warnings_found) == expected_warnings, (
+            f"Expected {expected_warnings} CUDA context warnings, "
+            f"but found {len(warnings_assigned_device_found)}"
+        )
+
+        # Can only be tested in multi-GPU test environment, device 0 can never raise
+        # this warning (because it's where all CUDA contexts are created), thus one
+        # warning is raised by every device except 0.
+        expected_assigned_device_warnings = num_devices - 1
+        assert (
+            len(warnings_assigned_device_found) == expected_assigned_device_warnings
+        ), (
+            f"Expected {expected_assigned_device_warnings} warnings assigned to "
+            f"device, but found {len(warnings_assigned_device_found)}"
+        )
+
+        # Verify warnings contents
+        for warning in warnings_found:
+            assert (
+                "This is often the result of a CUDA-enabled library calling a "
+                "CUDA runtime function before Dask-CUDA" in warning
+            ), f"Warning missing explanatory text: {warning}"
+        for warning in warnings_assigned_device_found:
+            assert (
+                "This is often the result of a CUDA-enabled library calling a "
+                "CUDA runtime function before Dask-CUDA" in warning
+            ), f"Warning missing explanatory text: {warning}"
+    finally:
+        # Clean up temporary files
+        try:
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+        except Exception as e:
+            print(f"Cleanup error: {e}")
+
+
+@pytest.mark.parametrize("protocol", ["tcp", "ucx"])
+def test_cuda_context_warning_with_subprocess_warnings(protocol):
+    """Test CUDA context warnings from parent and worker subprocesses.
+
+    This test creates a standalone script that imports a problematic library at the top
+    level and then creates LocalCUDACluster with processes=True. This replicates the
+    exact real-world scenario where:
+
+    1. User imports a problematic library that creates CUDA context at import time
+    2. User creates LocalCUDACluster with multiple workers
+    3. Each worker subprocess inherits the CUDA context and emits warnings
+    4. Multiple warnings are generated (parent process + each worker subprocess)
+
+    This is the ultimate test as it demonstrates the distributed warning scenario
+    that users actually encounter in production.
+    """
+    p = mp.Process(
+        target=_test_cuda_context_warning_with_subprocess_warnings, args=(protocol,)
+    )
+    p.start()
+    p.join()
+    assert not p.exitcode
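
The rewritten tests above drop the old ucx-py (`ucp`) code path: UCX is now exercised only through UCXX (`ucxx` / `distributed_ucxx`), the tests call `initialize()` without a `protocol` argument, and worker-side UCX settings are passed under the `distributed-ucxx` config key. A minimal sketch of that pattern follows; it assumes `initialize` and `get_ucx_config` are imported from `dask_cuda.initialize` and `dask_cuda.utils` (import paths not shown in this diff), and it requires a CUDA-capable environment with `ucxx` and `distributed_ucxx` installed.

import numpy
import dask.array as da
from distributed import Client, LocalCluster

from dask_cuda.initialize import initialize  # assumed import path
from dask_cuda.utils import get_ucx_config   # assumed import path

# Enable TCP-over-UCX in the client process, then start a UCX-protocol cluster
# whose workers receive the same settings via the "distributed-ucxx" namespace.
ucx_kwargs = {"enable_tcp_over_ucx": True}
initialize(**ucx_kwargs)

with LocalCluster(
    protocol="ucx",
    n_workers=1,
    threads_per_worker=1,
    processes=True,
    config={"distributed-ucxx": get_ucx_config(**ucx_kwargs)},
) as cluster:
    with Client(cluster) as client:
        # Same smoke test the suite runs: a small reduction over the UCX transport.
        arr = da.from_array(numpy.arange(10000), chunks=(1000,))
        assert arr.sum().compute() == 49995000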