PyPI - dask-cuda - Versions diffs - 23.10.0a231015__py3-none-any.whl → 23.12.0a24__py3-none-any.whl - Mend

dask-cuda 23.10.0a231015__py3-none-any.whl → 23.12.0a24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

dask_cuda/VERSION +1 -0
dask_cuda/__init__.py +1 -3
dask_cuda/_version.py +20 -0
dask_cuda/cuda_worker.py +1 -3
dask_cuda/device_host_file.py +1 -1
dask_cuda/local_cuda_cluster.py +12 -9
dask_cuda/plugins.py +122 -0
dask_cuda/tests/test_dask_cuda_worker.py +3 -3
dask_cuda/tests/test_explicit_comms.py +7 -0
dask_cuda/tests/test_initialize.py +5 -0
dask_cuda/tests/test_local_cuda_cluster.py +3 -1
dask_cuda/tests/test_proxify_host_file.py +5 -1
dask_cuda/tests/test_proxy.py +2 -0
dask_cuda/tests/test_spill.py +67 -30
dask_cuda/utils.py +1 -139
dask_cuda/utils_test.py +45 -0
dask_cuda/worker_spec.py +2 -1
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/METADATA +3 -3
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/RECORD +23 -20
dask_cuda/compat.py +0 -118
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/LICENSE +0 -0
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/WHEEL +0 -0
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/entry_points.txt +0 -0
{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/top_level.txt +0 -0

dask_cuda/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 23.12.00a24

dask_cuda/__init__.py CHANGED Viewed

@@ -11,6 +11,7 @@ import dask.dataframe.shuffle
 import dask.dataframe.multi
 import dask.bag.core
+from ._version import __git_commit__, __version__
 from .cuda_worker import CUDAWorker
 from .explicit_comms.dataframe.shuffle import (
     get_rearrange_by_column_wrapper,
@@ -19,9 +20,6 @@ from .explicit_comms.dataframe.shuffle import (
 from .local_cuda_cluster import LocalCUDACluster
 from .proxify_device_objects import proxify_decorator, unproxify_decorator
-__version__ = "23.10.00"
-from . import compat
 # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True`
 dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper(

dask_cuda/_version.py ADDED Viewed

@@ -0,0 +1,20 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib.resources
+__version__ = (
+    importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip()
+)
+__git_commit__ = ""

dask_cuda/cuda_worker.py CHANGED Viewed

@@ -20,11 +20,9 @@ from distributed.worker_memory import parse_memory_limit
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
+from .plugins import CPUAffinity, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
-    CPUAffinity,
-    PreImport,
-    RMMSetup,
     cuda_visible_devices,
     get_cpu_affinity,
     get_n_gpus,

dask_cuda/device_host_file.py CHANGED Viewed

@@ -17,7 +17,7 @@ from distributed.protocol import (
     serialize_bytelist,
 )
 from distributed.sizeof import safe_sizeof
-from distributed.spill import CustomFile as KeyAsStringFile
+from distributed.spill import AnyKeyFile as KeyAsStringFile
 from distributed.utils import nbytes
 from .is_device_object import is_device_object

dask_cuda/local_cuda_cluster.py CHANGED Viewed

@@ -2,6 +2,7 @@ import copy
 import logging
 import os
 import warnings
+from functools import partial
 import dask
 from distributed import LocalCluster, Nanny, Worker
@@ -9,11 +10,9 @@ from distributed.worker_memory import parse_memory_limit
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
+from .plugins import CPUAffinity, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
-    CPUAffinity,
-    PreImport,
-    RMMSetup,
     cuda_visible_devices,
     get_cpu_affinity,
     get_ucx_config,
@@ -334,12 +333,16 @@ class LocalCUDACluster(LocalCluster):
         )
         if worker_class is not None:
-            from functools import partial
-            worker_class = partial(
-                LoggedNanny if log_spilling is True else Nanny,
-                worker_class=worker_class,
-            )
+            if log_spilling is True:
+                raise ValueError(
+                    "Cannot enable `log_spilling` when `worker_class` is specified. If "
+                    "logging is needed, ensure `worker_class` is a subclass of "
+                    "`distributed.local_cuda_cluster.LoggedNanny` or a subclass of "
+                    "`distributed.local_cuda_cluster.LoggedWorker`, and specify "
+                    "`log_spilling=False`."
+                )
+            if not issubclass(worker_class, Nanny):
+                worker_class = partial(Nanny, worker_class=worker_class)
         self.pre_import = pre_import

dask_cuda/plugins.py ADDED Viewed

@@ -0,0 +1,122 @@
+import importlib
+import os
+from distributed import WorkerPlugin
+from .utils import get_rmm_log_file_name, parse_device_memory_limit
+class CPUAffinity(WorkerPlugin):
+    def __init__(self, cores):
+        self.cores = cores
+    def setup(self, worker=None):
+        os.sched_setaffinity(0, self.cores)
+class RMMSetup(WorkerPlugin):
+    def __init__(
+        self,
+        initial_pool_size,
+        maximum_pool_size,
+        managed_memory,
+        async_alloc,
+        release_threshold,
+        log_directory,
+        track_allocations,
+    ):
+        if initial_pool_size is None and maximum_pool_size is not None:
+            raise ValueError(
+                "`rmm_maximum_pool_size` was specified without specifying "
+                "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool."
+            )
+        if async_alloc is True:
+            if managed_memory is True:
+                raise ValueError(
+                    "`rmm_managed_memory` is incompatible with the `rmm_async`."
+                )
+        if async_alloc is False and release_threshold is not None:
+            raise ValueError("`rmm_release_threshold` requires `rmm_async`.")
+        self.initial_pool_size = initial_pool_size
+        self.maximum_pool_size = maximum_pool_size
+        self.managed_memory = managed_memory
+        self.async_alloc = async_alloc
+        self.release_threshold = release_threshold
+        self.logging = log_directory is not None
+        self.log_directory = log_directory
+        self.rmm_track_allocations = track_allocations
+    def setup(self, worker=None):
+        if self.initial_pool_size is not None:
+            self.initial_pool_size = parse_device_memory_limit(
+                self.initial_pool_size, alignment_size=256
+            )
+        if self.async_alloc:
+            import rmm
+            if self.release_threshold is not None:
+                self.release_threshold = parse_device_memory_limit(
+                    self.release_threshold, alignment_size=256
+                )
+            mr = rmm.mr.CudaAsyncMemoryResource(
+                initial_pool_size=self.initial_pool_size,
+                release_threshold=self.release_threshold,
+            )
+            if self.maximum_pool_size is not None:
+                self.maximum_pool_size = parse_device_memory_limit(
+                    self.maximum_pool_size, alignment_size=256
+                )
+                mr = rmm.mr.LimitingResourceAdaptor(
+                    mr, allocation_limit=self.maximum_pool_size
+                )
+            rmm.mr.set_current_device_resource(mr)
+            if self.logging:
+                rmm.enable_logging(
+                    log_file_name=get_rmm_log_file_name(
+                        worker, self.logging, self.log_directory
+                    )
+                )
+        elif self.initial_pool_size is not None or self.managed_memory:
+            import rmm
+            pool_allocator = False if self.initial_pool_size is None else True
+            if self.initial_pool_size is not None:
+                if self.maximum_pool_size is not None:
+                    self.maximum_pool_size = parse_device_memory_limit(
+                        self.maximum_pool_size, alignment_size=256
+                    )
+            rmm.reinitialize(
+                pool_allocator=pool_allocator,
+                managed_memory=self.managed_memory,
+                initial_pool_size=self.initial_pool_size,
+                maximum_pool_size=self.maximum_pool_size,
+                logging=self.logging,
+                log_file_name=get_rmm_log_file_name(
+                    worker, self.logging, self.log_directory
+                ),
+            )
+        if self.rmm_track_allocations:
+            import rmm
+            mr = rmm.mr.get_current_device_resource()
+            rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
+class PreImport(WorkerPlugin):
+    def __init__(self, libraries):
+        if libraries is None:
+            libraries = []
+        elif isinstance(libraries, str):
+            libraries = libraries.split(",")
+        self.libraries = libraries
+    def setup(self, worker=None):
+        for l in self.libraries:
+            importlib.import_module(l)

dask_cuda/tests/test_dask_cuda_worker.py CHANGED Viewed

@@ -40,7 +40,7 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
                 str(nthreads),
                 "--no-dashboard",
                 "--worker-class",
-                "dask_cuda.utils.MockWorker",
+                "dask_cuda.utils_test.MockWorker",
             ]
         ):
             with Client("127.0.0.1:9359", loop=loop) as client:
@@ -329,7 +329,7 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa:
                     str(nthreads),
                     "--no-dashboard",
                     "--worker-class",
-                    "dask_cuda.utils.MockWorker",
+                    "dask_cuda.utils_test.MockWorker",
                 ]
             ):
                 with Client("127.0.0.1:9359", loop=loop) as client:
@@ -364,7 +364,7 @@ def test_cuda_visible_devices_uuid(loop):  # noqa: F811
                     "127.0.0.1",
                     "--no-dashboard",
                     "--worker-class",
-                    "dask_cuda.utils.MockWorker",
+                    "dask_cuda.utils_test.MockWorker",
                 ]
             ):
                 with Client("127.0.0.1:9359", loop=loop) as client:

dask_cuda/tests/test_explicit_comms.py CHANGED Viewed

@@ -17,6 +17,7 @@ from distributed.deploy.local import LocalCluster
 import dask_cuda
 from dask_cuda.explicit_comms import comms
 from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 mp = mp.get_context("spawn")  # type: ignore
 ucp = pytest.importorskip("ucp")
@@ -35,6 +36,7 @@ def _test_local_cluster(protocol):
         dashboard_address=None,
         n_workers=4,
         threads_per_worker=1,
+        worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
         with Client(cluster) as client:
@@ -56,6 +58,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions):
         dashboard_address=None,
         n_workers=npartitions,
         threads_per_worker=1,
+        worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
         with Client(cluster):
@@ -102,6 +105,7 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
         dashboard_address=None,
         n_workers=n_workers,
         threads_per_worker=1,
+        worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
         with Client(cluster) as client:
@@ -204,6 +208,7 @@ def test_dask_use_explicit_comms(in_cluster):
             dashboard_address=None,
             n_workers=2,
             threads_per_worker=1,
+            worker_class=IncreasedCloseTimeoutNanny,
             processes=True,
         ) as cluster:
             with Client(cluster):
@@ -221,6 +226,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
         dashboard_address=None,
         n_workers=n_workers,
         threads_per_worker=1,
+        worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
         with Client(cluster):
@@ -327,6 +333,7 @@ def test_lock_workers():
         dashboard_address=None,
         n_workers=4,
         threads_per_worker=5,
+        worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
         ps = []

dask_cuda/tests/test_initialize.py CHANGED Viewed

@@ -10,6 +10,7 @@ from distributed.deploy.local import LocalCluster
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import get_ucx_config
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 mp = mp.get_context("spawn")  # type: ignore
 ucp = pytest.importorskip("ucp")
@@ -29,6 +30,7 @@ def _test_initialize_ucx_tcp():
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -64,6 +66,7 @@ def _test_initialize_ucx_nvlink():
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -100,6 +103,7 @@ def _test_initialize_ucx_infiniband():
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config(**kwargs)},
     ) as cluster:
         with Client(cluster) as client:
@@ -138,6 +142,7 @@ def _test_initialize_ucx_all():
         n_workers=1,
         threads_per_worker=1,
         processes=True,
+        worker_class=IncreasedCloseTimeoutNanny,
         config={"distributed.comm.ucx": get_ucx_config()},
     ) as cluster:
         with Client(cluster) as client:

dask_cuda/tests/test_local_cuda_cluster.py CHANGED Viewed

@@ -13,13 +13,13 @@ from distributed.utils_test import gen_test, raises_with_cause
 from dask_cuda import CUDAWorker, LocalCUDACluster, utils
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import (
-    MockWorker,
     get_cluster_configuration,
     get_device_total_memory,
     get_gpu_count_mig,
     get_gpu_uuid_from_index,
     print_cluster_config,
 )
+from dask_cuda.utils_test import MockWorker
 @gen_test(timeout=20)
@@ -337,6 +337,7 @@ async def test_pre_import():
 # Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
 def test_pre_import_not_found():
     async def _test_pre_import_not_found():
         with raises_with_cause(RuntimeError, None, ImportError, None):
@@ -491,6 +492,7 @@ def test_print_cluster_config(capsys):
             assert "[plugin]" in captured.out
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
 def test_death_timeout_raises():
     with pytest.raises(asyncio.exceptions.TimeoutError):
         with LocalCUDACluster(

dask_cuda/tests/test_proxify_host_file.py CHANGED Viewed

@@ -19,6 +19,7 @@ from dask_cuda.get_device_memory_objects import get_device_memory_ids
 from dask_cuda.proxify_host_file import ProxifyHostFile
 from dask_cuda.proxy_object import ProxyObject, asproxy, unproxy
 from dask_cuda.utils import get_device_total_memory
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 cupy = pytest.importorskip("cupy")
 cupy.cuda.set_allocator(None)
@@ -393,7 +394,10 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
     with dask.config.set(jit_unspill_compatibility_mode=compatibility_mode):
         async with dask_cuda.LocalCUDACluster(
-            n_workers=1, jit_unspill=True, asynchronous=True
+            n_workers=1,
+            jit_unspill=True,
+            worker_class=IncreasedCloseTimeoutNanny,
+            asynchronous=True,
         ) as cluster:
             async with Client(cluster, asynchronous=True) as client:
                 ddf = dask.dataframe.from_pandas(

dask_cuda/tests/test_proxy.py CHANGED Viewed

@@ -23,6 +23,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -422,6 +423,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
     async with dask_cuda.LocalCUDACluster(
         n_workers=1,
         protocol=protocol,
+        worker_class=IncreasedCloseTimeoutNanny,
         asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:

dask_cuda/tests/test_spill.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import gc
 import os
 from time import sleep
@@ -11,6 +12,7 @@ from distributed.sizeof import sizeof
 from distributed.utils_test import gen_cluster, gen_test, loop  # noqa: F401
 from dask_cuda import LocalCUDACluster, utils
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 if utils.get_device_total_memory() < 1e10:
     pytest.skip("Not enough GPU memory", allow_module_level=True)
@@ -58,7 +60,10 @@ def assert_device_host_file_size(
 def worker_assert(
-    dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead
+    total_size,
+    device_chunk_overhead,
+    serialized_chunk_overhead,
+    dask_worker=None,
 ):
     assert_device_host_file_size(
         dask_worker.data, total_size, device_chunk_overhead, serialized_chunk_overhead
@@ -66,7 +71,10 @@ def worker_assert(
 def delayed_worker_assert(
-    dask_worker, total_size, device_chunk_overhead, serialized_chunk_overhead
+    total_size,
+    device_chunk_overhead,
+    serialized_chunk_overhead,
+    dask_worker=None,
 ):
     start = time()
     while not device_host_file_size_matches(
@@ -82,6 +90,18 @@ def delayed_worker_assert(
             )
+def assert_host_chunks(spills_to_disk, dask_worker=None):
+    if spills_to_disk is False:
+        assert len(dask_worker.data.host)
+def assert_disk_chunks(spills_to_disk, dask_worker=None):
+    if spills_to_disk is True:
+        assert len(dask_worker.data.disk or list()) > 0
+    else:
+        assert len(dask_worker.data.disk or list()) == 0
 @pytest.mark.parametrize(
     "params",
     [
@@ -122,7 +142,7 @@ def delayed_worker_assert(
         },
     ],
 )
-@gen_test(timeout=120)
+@gen_test(timeout=30)
 async def test_cupy_cluster_device_spill(params):
     cupy = pytest.importorskip("cupy")
     with dask.config.set(
@@ -141,9 +161,12 @@ async def test_cupy_cluster_device_spill(params):
             asynchronous=True,
             device_memory_limit=params["device_memory_limit"],
             memory_limit=params["memory_limit"],
+            worker_class=IncreasedCloseTimeoutNanny,
         ) as cluster:
             async with Client(cluster, asynchronous=True) as client:
+                await client.wait_for_workers(1)
                 rs = da.random.RandomState(RandomState=cupy.random.RandomState)
                 x = rs.random(int(50e6), chunks=2e6)
                 await wait(x)
@@ -153,7 +176,10 @@ async def test_cupy_cluster_device_spill(params):
                 # Allow up to 1024 bytes overhead per chunk serialized
                 await client.run(
-                    lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024)
+                    worker_assert,
+                    x.nbytes,
+                    1024,
+                    1024,
                 )
                 y = client.compute(x.sum())
@@ -162,20 +188,19 @@ async def test_cupy_cluster_device_spill(params):
                 assert (abs(res / x.size) - 0.5) < 1e-3
                 await client.run(
-                    lambda dask_worker: worker_assert(dask_worker, x.nbytes, 1024, 1024)
+                    worker_assert,
+                    x.nbytes,
+                    1024,
+                    1024,
                 )
-                host_chunks = await client.run(
-                    lambda dask_worker: len(dask_worker.data.host)
+                await client.run(
+                    assert_host_chunks,
+                    params["spills_to_disk"],
                 )
-                disk_chunks = await client.run(
-                    lambda dask_worker: len(dask_worker.data.disk or list())
+                await client.run(
+                    assert_disk_chunks,
+                    params["spills_to_disk"],
                 )
-                for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
-                    if params["spills_to_disk"]:
-                        assert dc > 0
-                    else:
-                        assert hc > 0
-                        assert dc == 0
 @pytest.mark.parametrize(
@@ -218,7 +243,7 @@ async def test_cupy_cluster_device_spill(params):
         },
     ],
 )
-@gen_test(timeout=120)
+@gen_test(timeout=30)
 async def test_cudf_cluster_device_spill(params):
     cudf = pytest.importorskip("cudf")
@@ -240,9 +265,12 @@ async def test_cudf_cluster_device_spill(params):
             asynchronous=True,
             device_memory_limit=params["device_memory_limit"],
             memory_limit=params["memory_limit"],
+            worker_class=IncreasedCloseTimeoutNanny,
         ) as cluster:
             async with Client(cluster, asynchronous=True) as client:
+                await client.wait_for_workers(1)
                 # There's a known issue with datetime64:
                 # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940
                 # The same error above happens when spilling datetime64 to disk
@@ -264,26 +292,35 @@ async def test_cudf_cluster_device_spill(params):
                 await wait(cdf2)
                 del cdf
+                gc.collect()
-                host_chunks = await client.run(
-                    lambda dask_worker: len(dask_worker.data.host)
+                await client.run(
+                    assert_host_chunks,
+                    params["spills_to_disk"],
                 )
-                disk_chunks = await client.run(
-                    lambda dask_worker: len(dask_worker.data.disk or list())
+                await client.run(
+                    assert_disk_chunks,
+                    params["spills_to_disk"],
                 )
-                for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
-                    if params["spills_to_disk"]:
-                        assert dc > 0
-                    else:
-                        assert hc > 0
-                        assert dc == 0
                 await client.run(
-                    lambda dask_worker: worker_assert(dask_worker, nbytes, 32, 2048)
+                    worker_assert,
+                    nbytes,
+                    32,
+                    2048,
                 )
                 del cdf2
-                await client.run(
-                    lambda dask_worker: delayed_worker_assert(dask_worker, 0, 0, 0)
-                )
+                while True:
+                    try:
+                        await client.run(
+                            delayed_worker_assert,
+                            0,
+                            0,
+                            0,
+                        )
+                    except AssertionError:
+                        gc.collect()
+                    else:
+                        break

dask_cuda/utils.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import importlib
 import math
 import operator
 import os
@@ -18,7 +17,7 @@ import dask
 import distributed  # noqa: required for dask.config.get("distributed.comm.ucx")
 from dask.config import canonical_name
 from dask.utils import format_bytes, parse_bytes
-from distributed import Worker, wait
+from distributed import wait
 from distributed.comm import parse_address
 try:
@@ -32,122 +31,6 @@ except ImportError:
         yield
-class CPUAffinity:
-    def __init__(self, cores):
-        self.cores = cores
-    def setup(self, worker=None):
-        os.sched_setaffinity(0, self.cores)
-class RMMSetup:
-    def __init__(
-        self,
-        initial_pool_size,
-        maximum_pool_size,
-        managed_memory,
-        async_alloc,
-        release_threshold,
-        log_directory,
-        track_allocations,
-    ):
-        if initial_pool_size is None and maximum_pool_size is not None:
-            raise ValueError(
-                "`rmm_maximum_pool_size` was specified without specifying "
-                "`rmm_pool_size`.`rmm_pool_size` must be specified to use RMM pool."
-            )
-        if async_alloc is True:
-            if managed_memory is True:
-                raise ValueError(
-                    "`rmm_managed_memory` is incompatible with the `rmm_async`."
-                )
-        if async_alloc is False and release_threshold is not None:
-            raise ValueError("`rmm_release_threshold` requires `rmm_async`.")
-        self.initial_pool_size = initial_pool_size
-        self.maximum_pool_size = maximum_pool_size
-        self.managed_memory = managed_memory
-        self.async_alloc = async_alloc
-        self.release_threshold = release_threshold
-        self.logging = log_directory is not None
-        self.log_directory = log_directory
-        self.rmm_track_allocations = track_allocations
-    def setup(self, worker=None):
-        if self.initial_pool_size is not None:
-            self.initial_pool_size = parse_device_memory_limit(
-                self.initial_pool_size, alignment_size=256
-            )
-        if self.async_alloc:
-            import rmm
-            if self.release_threshold is not None:
-                self.release_threshold = parse_device_memory_limit(
-                    self.release_threshold, alignment_size=256
-                )
-            mr = rmm.mr.CudaAsyncMemoryResource(
-                initial_pool_size=self.initial_pool_size,
-                release_threshold=self.release_threshold,
-            )
-            if self.maximum_pool_size is not None:
-                self.maximum_pool_size = parse_device_memory_limit(
-                    self.maximum_pool_size, alignment_size=256
-                )
-                mr = rmm.mr.LimitingResourceAdaptor(
-                    mr, allocation_limit=self.maximum_pool_size
-                )
-            rmm.mr.set_current_device_resource(mr)
-            if self.logging:
-                rmm.enable_logging(
-                    log_file_name=get_rmm_log_file_name(
-                        worker, self.logging, self.log_directory
-                    )
-                )
-        elif self.initial_pool_size is not None or self.managed_memory:
-            import rmm
-            pool_allocator = False if self.initial_pool_size is None else True
-            if self.initial_pool_size is not None:
-                if self.maximum_pool_size is not None:
-                    self.maximum_pool_size = parse_device_memory_limit(
-                        self.maximum_pool_size, alignment_size=256
-                    )
-            rmm.reinitialize(
-                pool_allocator=pool_allocator,
-                managed_memory=self.managed_memory,
-                initial_pool_size=self.initial_pool_size,
-                maximum_pool_size=self.maximum_pool_size,
-                logging=self.logging,
-                log_file_name=get_rmm_log_file_name(
-                    worker, self.logging, self.log_directory
-                ),
-            )
-        if self.rmm_track_allocations:
-            import rmm
-            mr = rmm.mr.get_current_device_resource()
-            rmm.mr.set_current_device_resource(rmm.mr.TrackingResourceAdaptor(mr))
-class PreImport:
-    def __init__(self, libraries):
-        if libraries is None:
-            libraries = []
-        elif isinstance(libraries, str):
-            libraries = libraries.split(",")
-        self.libraries = libraries
-    def setup(self, worker=None):
-        for l in self.libraries:
-            importlib.import_module(l)
 def unpack_bitmask(x, mask_bits=64):
     """Unpack a list of integers containing bitmasks.
@@ -669,27 +552,6 @@ def parse_device_memory_limit(device_memory_limit, device_index=0, alignment_siz
         return _align(int(device_memory_limit), alignment_size)
-class MockWorker(Worker):
-    """Mock Worker class preventing NVML from getting used by SystemMonitor.
-    By preventing the Worker from initializing NVML in the SystemMonitor, we can
-    mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU
-    machines.
-    """
-    def __init__(self, *args, **kwargs):
-        distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count
-        self._device_get_count = distributed.diagnostics.nvml.device_get_count
-        super().__init__(*args, **kwargs)
-    def __del__(self):
-        distributed.diagnostics.nvml.device_get_count = self._device_get_count
-    @staticmethod
-    def device_get_count():
-        return 0
 def get_gpu_uuid_from_index(device_index=0):
     """Get GPU UUID from CUDA device index.

dask_cuda/utils_test.py ADDED Viewed

@@ -0,0 +1,45 @@
+from typing import Literal
+import distributed
+from distributed import Nanny, Worker
+class MockWorker(Worker):
+    """Mock Worker class preventing NVML from getting used by SystemMonitor.
+    By preventing the Worker from initializing NVML in the SystemMonitor, we can
+    mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU
+    machines.
+    """
+    def __init__(self, *args, **kwargs):
+        distributed.diagnostics.nvml.device_get_count = MockWorker.device_get_count
+        self._device_get_count = distributed.diagnostics.nvml.device_get_count
+        super().__init__(*args, **kwargs)
+    def __del__(self):
+        distributed.diagnostics.nvml.device_get_count = self._device_get_count
+    @staticmethod
+    def device_get_count():
+        return 0
+class IncreasedCloseTimeoutNanny(Nanny):
+    """Increase `Nanny`'s close timeout.
+    The internal close timeout mechanism of `Nanny` recomputes the time left to kill
+    the `Worker` process based on elapsed time of the close task, which may leave
+    very little time for the subprocess to shutdown cleanly, which may cause tests
+    to fail when the system is under higher load. This class increases the default
+    close timeout of 5.0 seconds that `Nanny` sets by default, which can be overridden
+    via Distributed's public API.
+    This class can be used with the `worker_class` argument of `LocalCluster` or
+    `LocalCUDACluster` to provide a much higher default of 30.0 seconds.
+    """
+    async def close(  # type:ignore[override]
+        self, timeout: float = 30.0, reason: str = "nanny-close"
+    ) -> Literal["OK"]:
+        return await super().close(timeout=timeout, reason=reason)

dask_cuda/worker_spec.py CHANGED Viewed

@@ -5,7 +5,8 @@ from distributed.system import MEMORY_LIMIT
 from .initialize import initialize
 from .local_cuda_cluster import cuda_visible_devices
-from .utils import CPUAffinity, get_cpu_affinity, get_gpu_count
+from .plugins import CPUAffinity
+from .utils import get_cpu_affinity, get_gpu_count
 def worker_spec(

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dask-cuda
-Version: 23.10.0a231015
+Version: 23.12.0a24
 Summary: Utilities for Dask and CUDA interactions
 Author: NVIDIA Corporation
 License: Apache-2.0
@@ -17,8 +17,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: dask ==2023.9.2
-Requires-Dist: distributed ==2023.9.2
+Requires-Dist: dask >=2023.9.2
+Requires-Dist: distributed >=2023.9.2
 Requires-Dist: pynvml <11.5,>=11.0.0
 Requires-Dist: numpy >=1.21
 Requires-Dist: numba >=0.57

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/RECORD RENAMED Viewed

@@ -1,19 +1,22 @@
-dask_cuda/__init__.py,sha256=2oMXKPmTjhzvy2sCRD8O88sV0cqVzAnlxLn9-3j3_os,1452
+dask_cuda/VERSION,sha256=B3lEoXnOJhj1jdnSXB9iPziZLRAQGvXofHbxd6cA664,12
+dask_cuda/__init__.py,sha256=XnMTUi-SvoGn7g1Dj6XW97HnQzGQv0G3EnvSjcZ7vU4,1455
+dask_cuda/_version.py,sha256=FgBzL-H3uFWUDb0QvqJw3AytPr1PG8LbMnHxQEX8Vx4,738
 dask_cuda/cli.py,sha256=XNRH0bu-6jzRoyWJB5qSWuzePJSh3z_5Ng6rDCnz7lg,15970
-dask_cuda/compat.py,sha256=BLXv9IHUtD3h6-T_8MX-uGt-UDMG6EuGuyN-zw3XndU,4084
-dask_cuda/cuda_worker.py,sha256=hUJ3dCdeF1GxL0Oio-d-clQ5tLxQ9xjwU6Bse5JW54g,8571
-dask_cuda/device_host_file.py,sha256=D0rHOFz1TRfvaecoP30x3JRWe1TiHUaq45Dg-v0DfoY,10272
+dask_cuda/cuda_worker.py,sha256=bIu-ESeIpJG_WaTYrv0z9z5juJ1qR5i_5Ng3CN1WK8s,8579
+dask_cuda/device_host_file.py,sha256=yS31LGtt9VFAG78uBBlTDr7HGIng2XymV1OxXIuEMtM,10272
 dask_cuda/disk_io.py,sha256=urSLKiPvJvYmKCzDPOUDCYuLI3r1RUiyVh3UZGRoF_Y,6626
 dask_cuda/get_device_memory_objects.py,sha256=zMSqWzm5rflRInbNMz7U2Ewv5nMcE-H8stMJeWHVWyc,3890
 dask_cuda/initialize.py,sha256=mzPgKhs8oLgUWpqd4ckvLNKvhLoHjt96RrBPeVneenI,5231
 dask_cuda/is_device_object.py,sha256=CnajvbQiX0FzFzwft0MqK1OPomx3ZGDnDxT56wNjixw,1046
 dask_cuda/is_spillable_object.py,sha256=CddGmg0tuSpXh2m_TJSY6GRpnl1WRHt1CRcdWgHPzWA,1457
-dask_cuda/local_cuda_cluster.py,sha256=hjjgqFkGyuEqYMIYbxBV4xW2b7M6UPw9TnYM1Tf5r_4,17377
+dask_cuda/local_cuda_cluster.py,sha256=w2HXMZtEukwklkB3J6l6DqZstNA5uvGEdFkdzpyUJ6k,17810
+dask_cuda/plugins.py,sha256=cnHsdrXx7PBPmrzHX6YEkCH5byCsUk8LE2FeTeu8ZLU,4259
 dask_cuda/proxify_device_objects.py,sha256=99CD7LOE79YiQGJ12sYl_XImVhJXpFR4vG5utdkjTQo,8108
 dask_cuda/proxify_host_file.py,sha256=Wf5CFCC1JN5zmfvND3ls0M5FL01Y8VhHrk0xV3UQ9kk,30850
 dask_cuda/proxy_object.py,sha256=bZq92kjgFB-ad_luSAFT_RItV3nssmiEk4OOSp34laU,29812
-dask_cuda/utils.py,sha256=IGlr6SZAhULIo4WJrhJxyHAy7l6mp9vN_U1QZjqUYJY,29815
-dask_cuda/worker_spec.py,sha256=EQffH_fuBBaghmO8o9kxJ7EAQiB4gaW-uPRYesPknSs,4356
+dask_cuda/utils.py,sha256=wNRItbIXrOpH77AUUrZNrGqgIiGNzpClXYl0QmQqfxs,25002
+dask_cuda/utils_test.py,sha256=WNMR0gic2tuP3pgygcR9g52NfyX8iGMOan6juXhpkCE,1694
+dask_cuda/worker_spec.py,sha256=7-Uq_e5q2SkTlsmctMcYLCa9_3RiiVHZLIN7ctfaFmE,4376
 dask_cuda/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dask_cuda/benchmarks/common.py,sha256=sEIFnRZS6wbyKCQyB4fDclYLc2YqC0PolurR5qzuRxw,6393
 dask_cuda/benchmarks/local_cudf_groupby.py,sha256=2iHk-a-GvLmAgajwQJNrqmZ-WJeiyMFEyflcxh7SPO8,8894
@@ -27,24 +30,24 @@ dask_cuda/explicit_comms/comms.py,sha256=Su6PuNo68IyS-AwoqU4S9TmqWsLvUdNa0jot2hx
 dask_cuda/explicit_comms/dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=2f2wlPyqXpryIHgMpsZzs3pDE7eyslYam-jQh3ujszQ,20124
 dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=u3kW91YRLdHFycvpGfSQKrEucu5khMJ1k4sjmddO490,4910
-dask_cuda/tests/test_dask_cuda_worker.py,sha256=VgybyylO7eaSk9yVBj1snp3vM7ZTG-VPEcE8agTmaWI,17714
+dask_cuda/tests/test_dask_cuda_worker.py,sha256=gViHaMCSfB6ip125OEi9D0nfKC-qBXRoHz6BRodEdb4,17729
 dask_cuda/tests/test_device_host_file.py,sha256=79ssUISo1YhsW_7HdwqPfsH2LRzS2bi5BjPym1Sdgqw,5882
 dask_cuda/tests/test_dgx.py,sha256=bKX-GvkYjWlmcEIK15aGErxmc0qPqIWOG1CeDFGoXFU,6381
-dask_cuda/tests/test_explicit_comms.py,sha256=WJDQQkqaYT9tRiz0zgC9_udzRG3DuhPlbH7X-wshC7w,11925
+dask_cuda/tests/test_explicit_comms.py,sha256=3Q3o9BX4ksCgz11o38o5QhKg3Rv-EtTsGnVG83wwyyo,12283
 dask_cuda/tests/test_from_array.py,sha256=i2Vha4mchB0BopTlEdXV7CxY7qyTzFYdgYQTmukZX38,493
 dask_cuda/tests/test_gds.py,sha256=6jf0HPTHAIG8Mp_FC4Ai4zpn-U1K7yk0fSXg8He8-r8,1513
-dask_cuda/tests/test_initialize.py,sha256=EV3FTqBRX_kxHJ0ZEij34JpLyOJvGIYB_hQc-0afoG8,5235
-dask_cuda/tests/test_local_cuda_cluster.py,sha256=5-55CSMDJqBXqQzFQibmbWwvVOFC5iq7F1KtvtUx0kE,17417
-dask_cuda/tests/test_proxify_host_file.py,sha256=vnmUuU9w9hO4Et-qwnvY5VMkoohRt62cKhyP-wi7zKM,18492
-dask_cuda/tests/test_proxy.py,sha256=eJuXU0KRQC36R8g0WN9gyIeZ3tbKFlqMxybEzmaT1LA,23371
-dask_cuda/tests/test_spill.py,sha256=RfgIDWUkTbe7XqdDVJNnRuB_2U-IUvV_rwtZhY8OofE,9741
+dask_cuda/tests/test_initialize.py,sha256=-Vo8SVBrVEKB0V1C6ia8khvbHJt4BC0xEjMNLhNbFxI,5491
+dask_cuda/tests/test_local_cuda_cluster.py,sha256=1zlbRLn8ukopl5u8wBEfyyEhWpUblHYnwcPiPJO5bAU,17603
+dask_cuda/tests/test_proxify_host_file.py,sha256=cp-U1uNPhesQaHbftKV8ir_dt5fbs0ZXSIsL39oI0fE,18630
+dask_cuda/tests/test_proxy.py,sha256=Nu9vLx-dALINcF_wsxuFYUryRE0Jq43w7bAYAchK8RY,23480
+dask_cuda/tests/test_spill.py,sha256=xN9PbVERBYMuZxvscSO0mAM22loq9WT3ltZVBFxlmM4,10239
 dask_cuda/tests/test_utils.py,sha256=wgYPvu7Sk61C64pah9ZbK8cnBXK5RyUCpu3G2ny6OZQ,8832
 dask_cuda/tests/test_worker_spec.py,sha256=Bvu85vkqm6ZDAYPXKMJlI2pm9Uc5tiYKNtO4goXSw-I,2399
 examples/ucx/client_initialize.py,sha256=YN3AXHF8btcMd6NicKKhKR9SXouAsK1foJhFspbOn70,1262
 examples/ucx/local_cuda_cluster.py,sha256=7xVY3EhwhkY2L4VZin_BiMCbrjhirDNChoC86KiETNc,1983
-dask_cuda-23.10.0a231015.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
-dask_cuda-23.10.0a231015.dist-info/METADATA,sha256=CxnTtTdisHQZtHiF3hS16hC-X_fhRJ21oIVEga974JM,2285
-dask_cuda-23.10.0a231015.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
-dask_cuda-23.10.0a231015.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
-dask_cuda-23.10.0a231015.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
-dask_cuda-23.10.0a231015.dist-info/RECORD,,
+dask_cuda-23.12.0a24.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
+dask_cuda-23.12.0a24.dist-info/METADATA,sha256=1MrxlpZ1ah-mzK-LqOVqkoJD8M5pRC_WH-j7nwdOya8,2281
+dask_cuda-23.12.0a24.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
+dask_cuda-23.12.0a24.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
+dask_cuda-23.12.0a24.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
+dask_cuda-23.12.0a24.dist-info/RECORD,,

dask_cuda/compat.py DELETED Viewed

@@ -1,118 +0,0 @@
-import pickle
-import msgpack
-from packaging.version import Version
-import dask
-import distributed
-import distributed.comm.utils
-import distributed.protocol
-from distributed.comm.utils import OFFLOAD_THRESHOLD, nbytes, offload
-from distributed.protocol.core import (
-    Serialized,
-    decompress,
-    logger,
-    merge_and_deserialize,
-    msgpack_decode_default,
-    msgpack_opts,
-)
-if Version(distributed.__version__) >= Version("2023.8.1"):
-    # Monkey-patch protocol.core.loads (and its users)
-    async def from_frames(
-        frames, deserialize=True, deserializers=None, allow_offload=True
-    ):
-        """
-        Unserialize a list of Distributed protocol frames.
-        """
-        size = False
-        def _from_frames():
-            try:
-                # Patched code
-                return loads(
-                    frames, deserialize=deserialize, deserializers=deserializers
-                )
-                # end patched code
-            except EOFError:
-                if size > 1000:
-                    datastr = "[too large to display]"
-                else:
-                    datastr = frames
-                # Aid diagnosing
-                logger.error("truncated data stream (%d bytes): %s", size, datastr)
-                raise
-        if allow_offload and deserialize and OFFLOAD_THRESHOLD:
-            size = sum(map(nbytes, frames))
-        if (
-            allow_offload
-            and deserialize
-            and OFFLOAD_THRESHOLD
-            and size > OFFLOAD_THRESHOLD
-        ):
-            res = await offload(_from_frames)
-        else:
-            res = _from_frames()
-        return res
-    def loads(frames, deserialize=True, deserializers=None):
-        """Transform bytestream back into Python value"""
-        allow_pickle = dask.config.get("distributed.scheduler.pickle")
-        try:
-            def _decode_default(obj):
-                offset = obj.get("__Serialized__", 0)
-                if offset > 0:
-                    sub_header = msgpack.loads(
-                        frames[offset],
-                        object_hook=msgpack_decode_default,
-                        use_list=False,
-                        **msgpack_opts,
-                    )
-                    offset += 1
-                    sub_frames = frames[offset : offset + sub_header["num-sub-frames"]]
-                    if deserialize:
-                        if "compression" in sub_header:
-                            sub_frames = decompress(sub_header, sub_frames)
-                        return merge_and_deserialize(
-                            sub_header, sub_frames, deserializers=deserializers
-                        )
-                    else:
-                        return Serialized(sub_header, sub_frames)
-                offset = obj.get("__Pickled__", 0)
-                if offset > 0:
-                    sub_header = msgpack.loads(frames[offset])
-                    offset += 1
-                    sub_frames = frames[offset : offset + sub_header["num-sub-frames"]]
-                    # Patched code
-                    if "compression" in sub_header:
-                        sub_frames = decompress(sub_header, sub_frames)
-                    # end patched code
-                    if allow_pickle:
-                        return pickle.loads(
-                            sub_header["pickled-obj"], buffers=sub_frames
-                        )
-                    else:
-                        raise ValueError(
-                            "Unpickle on the Scheduler isn't allowed, "
-                            "set `distributed.scheduler.pickle=true`"
-                        )
-                return msgpack_decode_default(obj)
-            return msgpack.loads(
-                frames[0], object_hook=_decode_default, use_list=False, **msgpack_opts
-            )
-        except Exception:
-            logger.critical("Failed to deserialize", exc_info=True)
-            raise
-    distributed.protocol.loads = loads
-    distributed.protocol.core.loads = loads
-    distributed.comm.utils.from_frames = from_frames

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/LICENSE RENAMED Viewed

File without changes

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/WHEEL RENAMED Viewed

File without changes

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{dask_cuda-23.10.0a231015.dist-info → dask_cuda-23.12.0a24.dist-info}/top_level.txt RENAMED Viewed

File without changes

dask-cuda 23.10.0a231015__py3-none-any.whl → 23.12.0a24__py3-none-any.whl

dask-cuda 23.10.0a231015py3-none-any.whl → 23.12.0a24py3-none-any.whl