dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/_compat.py +18 -0
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
- dask_cuda/get_device_memory_objects.py +18 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/is_device_object.py +4 -1
- dask_cuda/is_spillable_object.py +4 -1
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +15 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +114 -27
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +51 -18
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +164 -54
- dask_cuda/tests/test_proxify_host_file.py +33 -4
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +107 -27
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +319 -68
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.4.0.dist-info/RECORD +0 -56
- dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/explicit_comms/dataframe/shuffle.py
CHANGED
@@ -1,6 +1,9 @@
+# Copyright (c) 2021-2025 NVIDIA CORPORATION.
+
 from __future__ import annotations
 
 import asyncio
+import functools
 from collections import defaultdict
 from math import ceil
 from operator import getitem
@@ -23,6 +26,7 @@ from distributed import wait
 from distributed.protocol import nested_deserialize, to_serialize
 from distributed.worker import Worker
 
+from ..._compat import DASK_2025_4_0
 from .. import comms
 
 T = TypeVar("T")
@@ -61,13 +65,13 @@ def get_no_comm_postprocess(
 ) -> Callable[[DataFrame], DataFrame]:
     """Get function for post-processing partitions not communicated
 
-    In cuDF, the
+    In cuDF, the ``group_split_dispatch`` uses ``scatter_by_map`` to create
     the partitions, which is implemented by splitting a single base dataframe
     into multiple partitions. This means that memory are not freed until
     ALL partitions are deleted.
 
     In order to free memory ASAP, we can deep copy partitions NOT being
-    communicated. We do this when
+    communicated. We do this when ``num_rounds != batchsize``.
 
     Parameters
     ----------
@@ -112,7 +116,7 @@ async def send(
     rank_to_out_part_ids: Dict[int, Set[int]],
     out_part_id_to_dataframe: Dict[int, DataFrame],
 ) -> None:
-    """Notice, items sent are removed from
+    """Notice, items sent are removed from ``out_part_id_to_dataframe``"""
     futures = []
     for rank, out_part_ids in rank_to_out_part_ids.items():
         if rank != myrank:
@@ -131,7 +135,7 @@ async def recv(
     out_part_id_to_dataframe_list: Dict[int, List[DataFrame]],
     proxify: Proxify,
 ) -> None:
-    """Notice, received items are appended to
+    """Notice, received items are appended to ``out_parts_list``"""
 
     async def read_msg(rank: int) -> None:
         msg: Dict[int, DataFrame] = nested_deserialize(await eps[rank].read())
@@ -146,11 +150,11 @@ async def recv(
 def compute_map_index(
     df: DataFrame, column_names: List[str], npartitions: int
 ) -> Series:
-    """Return a Series that maps each row
+    """Return a Series that maps each row ``df`` to a partition ID
 
     The partitions are determined by hashing the columns given by column_names
-    unless if
-
+    unless if ``column_names[0] == "_partitions"``, in which case the values of
+    ``column_names[0]`` are used as index.
 
     Parameters
     ----------
@@ -164,7 +168,7 @@ def compute_map_index(
     Returns
     -------
     Series
-        Series that maps each row
+        Series that maps each row ``df`` to a partition ID
     """
 
     if column_names[0] == "_partitions":
@@ -189,8 +193,8 @@ def partition_dataframe(
     """Partition dataframe to a dict of dataframes
 
     The partitions are determined by hashing the columns given by column_names
-    unless
-
+    unless ``column_names[0] == "_partitions"``, in which case the values of
+    ``column_names[0]`` are used as index.
 
     Parameters
     ----------
@@ -297,13 +301,13 @@ async def send_recv_partitions(
     rank_to_out_part_ids
         dict that for each worker rank specifies a set of output partition IDs.
         If the worker shouldn't return any partitions, it is excluded from the
-        dict. Partition IDs are global integers
-        to the dict keys returned by
+        dict. Partition IDs are global integers ``0..npartitions`` and corresponds
+        to the dict keys returned by ``group_split_dispatch``.
     out_part_id_to_dataframe
         Mapping from partition ID to dataframe. This dict is cleared on return.
     no_comm_postprocess
         Function to post-process partitions not communicated.
-        See
+        See ``get_no_comm_postprocess``
     proxify
         Function to proxify object.
     out_part_id_to_dataframe_list
@@ -361,8 +365,8 @@ async def shuffle_task(
     rank_to_out_part_ids: dict
         dict that for each worker rank specifies a set of output partition IDs.
         If the worker shouldn't return any partitions, it is excluded from the
-        dict. Partition IDs are global integers
-        to the dict keys returned by
+        dict. Partition IDs are global integers ``0..npartitions`` and corresponds
+        to the dict keys returned by ``group_split_dispatch``.
     column_names: list of strings
         List of column names on which we want to split.
     npartitions: int
@@ -445,7 +449,7 @@ def shuffle(
         List of column names on which we want to split.
     npartitions: int or None
         The desired number of output partitions. If None, the number of output
-        partitions equals
+        partitions equals ``df.npartitions``
     ignore_index: bool
         Ignore index during shuffle. If True, performance may improve,
         but index values will not be preserved.
@@ -456,7 +460,7 @@ def shuffle(
         If -1, each worker will handle all its partitions in a single round and
        all techniques to reduce memory usage are disabled, which might be faster
        when memory pressure isn't an issue.
-        If None, the value of
+        If None, the value of ``DASK_EXPLICIT_COMMS_BATCHSIZE`` is used or 1 if not
        set thus by default, we prioritize robustness over performance.
 
     Returns
@@ -467,12 +471,12 @@ def shuffle(
     Developer Notes
     ---------------
     The implementation consist of three steps:
-      (a) Stage the partitions of
+      (a) Stage the partitions of ``df`` on all workers and then cancel them
           thus at this point the Dask Scheduler doesn't know about any of the
           the partitions.
       (b) Submit a task on each worker that shuffle (all-to-all communicate)
          the staged partitions and return a list of dataframe-partitions.
-      (c) Submit a dask graph that extract (using
+      (c) Submit a dask graph that extract (using ``getitem()``) individual
          dataframe-partitions from (b).
     """
     c = comms.default_comms()
@@ -582,48 +586,136 @@ def _use_explicit_comms() -> bool:
     return False
 
 
-
-
+_base_lower = dask_expr._shuffle.Shuffle._lower
+_base_compute = dask.base.compute
 
-
-
-    an `ECShuffle` expression when the 'explicit-comms'
-    config is set to `True`.
+
+def _contains_shuffle_expr(*args) -> bool:
     """
+    Check whether any of the arguments is a Shuffle expression.
 
-
-
-
-
-
-
-
-
-
-
-
-
+    This is called by ``compute``, which is given a sequence of Dask Collections
+    to process. For each of those, we'll check whether the expresion contains a
+    Shuffle operation.
+    """
+    for collection in args:
+        if isinstance(collection, dask.dataframe.DataFrame):
+            shuffle_ops = list(
+                collection.expr.find_operations(
+                    (
+                        dask_expr._shuffle.RearrangeByColumn,
+                        dask_expr.SetIndex,
+                        dask_expr._shuffle.Shuffle,
+                    )
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-
+            )
+            if len(shuffle_ops) > 0:
+                return True
+    return False
+
+
+@functools.wraps(_base_compute)
+def _patched_compute(
+    *args,
+    traverse=True,
+    optimize_graph=True,
+    scheduler=None,
+    get=None,
+    **kwargs,
+):
+    # A patched version of dask.compute that explicitly materializes the task
+    # graph when we're using explicit-comms and the expression contains a
+    # Shuffle operation.
+    # https://github.com/rapidsai/dask-upstream-testing/issues/37#issuecomment-2779798670
+    # contains more details on the issue.
+    if DASK_2025_4_0() and _use_explicit_comms() and _contains_shuffle_expr(*args):
+        from dask.base import (
+            collections_to_expr,
+            flatten,
+            get_scheduler,
+            shorten_traceback,
+            unpack_collections,
+        )
+
+        collections, repack = unpack_collections(*args, traverse=traverse)
+        if not collections:
+            return args
+
+        schedule = get_scheduler(
+            scheduler=scheduler,
+            collections=collections,
+            get=get,
+        )
+        from dask._expr import FinalizeCompute
+
+        expr = collections_to_expr(collections, optimize_graph)
+        expr = FinalizeCompute(expr)
+
+        with shorten_traceback():
+            expr = expr.optimize()
+            keys = list(flatten(expr.__dask_keys__()))
+
+            # materialize the HLG here
+            expr = dict(expr.__dask_graph__())
+
+            results = schedule(expr, keys, **kwargs)
+        return repack(results)
+
+    else:
+        return _base_compute(
+            *args,
+            traverse=traverse,
+            optimize_graph=optimize_graph,
+            scheduler=scheduler,
+            get=get,
+            **kwargs,
+        )
+
+
+class ECShuffle(dask_expr._shuffle.TaskShuffle):
+    """Explicit-Comms Shuffle Expression."""
+
+    def _layer(self):
+        # Execute an explicit-comms shuffle
+        if not hasattr(self, "_ec_shuffled"):
+            on = self.partitioning_index
+            df = dask_expr.new_collection(self.frame)
+            ec_shuffled = shuffle(
+                df,
+                [on] if isinstance(on, str) else on,
                 self.npartitions_out,
                 self.ignore_index,
-                self.options,
-                self.original_partitioning_index,
             )
-
-
+            object.__setattr__(self, "_ec_shuffled", ec_shuffled)
+        graph = self._ec_shuffled.dask.copy()
+        shuffled_name = self._ec_shuffled._name
+        for i in range(self.npartitions_out):
+            graph[(self._name, i)] = graph[(shuffled_name, i)]
+        return graph
+
+
+def _patched_lower(self):
+    if self.method in (None, "tasks") and _use_explicit_comms():
+        return ECShuffle(
+            self.frame,
+            self.partitioning_index,
+            self.npartitions_out,
+            self.ignore_index,
+            self.options,
+            self.original_partitioning_index,
+        )
+    else:
+        return _base_lower(self)
+
+
+def patch_shuffle_expression() -> None:
+    """Patch Dasks Shuffle expression.
+
+    Notice, this is monkey patched into Dask at dask_cuda
+    import, and it changes ``Shuffle._layer`` to lower into
+    an ``ECShuffle`` expression when the 'explicit-comms'
+    config is set to ``True``.
+    """
+    dask.base.compute = _patched_compute
 
     dask_expr._shuffle.Shuffle._lower = _patched_lower
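The patched ``compute`` and ``Shuffle._lower`` above only take effect when explicit-comms is enabled in the Dask config. The sketch below shows how a user might exercise that path; it assumes the ``"explicit-comms"`` config key checked by ``_use_explicit_comms`` and a standard ``LocalCUDACluster`` setup, so treat it as illustrative rather than part of this release's documented API.

    # Hedged sketch: enable explicit-comms so shuffles lower into ECShuffle and
    # compute() goes through _patched_compute. The config key is assumed from
    # _use_explicit_comms(); verify against your dask-cuda version.
    import dask
    import dask.dataframe as dd
    from distributed import Client
    from dask_cuda import LocalCUDACluster

    if __name__ == "__main__":
        cluster = LocalCUDACluster()  # one worker per visible GPU
        client = Client(cluster)

        with dask.config.set({"explicit-comms": True}):
            df = dd.from_dict({"key": [1, 2, 3, 4] * 25, "x": range(100)}, npartitions=4)
            shuffled = df.shuffle("key")      # candidate for the ECShuffle lowering
            print(shuffled.compute().head())  # compute() dispatches via the patch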
dask_cuda/get_device_memory_objects.py
CHANGED
@@ -1,3 +1,5 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
 from typing import Set
 
 from dask.sizeof import sizeof
@@ -25,10 +27,10 @@ class DeviceMemoryId:
 
 
 def get_device_memory_ids(obj) -> Set[DeviceMemoryId]:
-    """Find all CUDA device objects in
+    """Find all CUDA device objects in ``obj``
 
-    Search through
-    that either are known to
+    Search through ``obj`` and find all CUDA device objects, which are objects
+    that either are known to ``dispatch`` or implement ``__cuda_array_interface__``.
 
     Parameters
     ----------
@@ -140,3 +142,16 @@ def register_cupy():  # NB: this overwrites dask.sizeof.register_cupy()
     @sizeof.register(cupy.ndarray)
     def sizeof_cupy_ndarray(x):
         return int(x.nbytes)
+
+
+@sizeof.register_lazy("pylibcudf")
+def register_pylibcudf():
+    import pylibcudf
+
+    @sizeof.register(pylibcudf.column.OwnerWithCAI)
+    def sizeof_owner_with_cai(x):
+        # OwnerWithCAI implements __cuda_array_interface__ so this should always
+        # be zero-copy
+        col = pylibcudf.column.Column.from_cuda_array_interface(x)
+        # col.data() returns a gpumemoryview, which knows the size in bytes
+        return col.data().nbytes
dask_cuda/initialize.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import os
 
@@ -7,7 +10,7 @@ import numba.cuda
 import dask
 from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context
 
-from .utils import get_ucx_config
+from .utils import _get_active_ucx_implementation_name, get_ucx_config
 
 logger = logging.getLogger(__name__)
 
@@ -22,65 +25,97 @@ def _create_cuda_context_handler():
         numba.cuda.current_context()
 
 
-def
-    if protocol not in ["ucx", "ucxx"]:
-        return
+def _warn_generic():
     try:
+        # TODO: update when UCX-Py is removed, see
+        # https://github.com/rapidsai/dask-cuda/issues/1517
+        import distributed.comm.ucx
+
         # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
         # context directly from the UCX module, thus avoiding a similar warning there.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        cuda_visible_device = get_device_index_and_uuid(
+            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+        )
+        ctx = has_cuda_context()
+        if (
+            ctx.has_context
+            and not distributed.comm.ucx.cuda_context_created.has_context
+        ):
+            distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
+
+        _create_cuda_context_handler()
+
+        if not distributed.comm.ucx.cuda_context_created.has_context:
+            ctx = has_cuda_context()
+            if ctx.has_context and ctx.device_info != cuda_visible_device:
+                distributed.comm.ucx._warn_cuda_context_wrong_device(
+                    cuda_visible_device, ctx.device_info, os.getpid()
+                )
+
+    except Exception:
+        logger.error("Unable to start CUDA Context", exc_info=True)
+
+
+def _initialize_ucx():
+    try:
+        import distributed.comm.ucx
+
+        distributed.comm.ucx.init_once()
+    except ModuleNotFoundError:
+        # UCX initialization has to be delegated to Distributed, it will take care
+        # of setting correct environment variables and importing `ucp` after that.
+        # Therefore if ``import ucp`` fails we can just continue here.
+        pass
+
+
+def _initialize_ucxx():
+    try:
+        # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
+        # context directly from the UCX module, thus avoiding a similar warning there.
+        import distributed_ucxx.ucxx
+
+        distributed_ucxx.ucxx.init_once()
 
         cuda_visible_device = get_device_index_and_uuid(
             os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
         )
         ctx = has_cuda_context()
-        if
-
-
-
-        )
-            distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
-        elif protocol == "ucxx":
-            if (
-                ctx.has_context
-                and not distributed_ucxx.ucxx.cuda_context_created.has_context
-            ):
-                distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())
+        if (
+            ctx.has_context
+            and not distributed_ucxx.ucxx.cuda_context_created.has_context
+        ):
+            distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())
 
         _create_cuda_context_handler()
 
-        if
-
-
-
-
-
-        )
-        elif protocol == "ucxx":
-            if not distributed_ucxx.ucxx.cuda_context_created.has_context:
-                ctx = has_cuda_context()
-                if ctx.has_context and ctx.device_info != cuda_visible_device:
-                    distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
-                        cuda_visible_device, ctx.device_info, os.getpid()
-                    )
+        if not distributed_ucxx.ucxx.cuda_context_created.has_context:
+            ctx = has_cuda_context()
+            if ctx.has_context and ctx.device_info != cuda_visible_device:
+                distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
+                    cuda_visible_device, ctx.device_info, os.getpid()
+                )
 
     except Exception:
         logger.error("Unable to start CUDA Context", exc_info=True)
 
 
+def _create_cuda_context(protocol="ucx"):
+    if protocol not in ["ucx", "ucxx", "ucx-old"]:
+        return
+
+    try:
+        ucx_implementation = _get_active_ucx_implementation_name(protocol)
+    except ValueError:
+        # Not a UCX protocol, just raise CUDA context warnings if needed.
+        _warn_generic()
+    else:
+        if ucx_implementation == "ucxx":
+            _initialize_ucxx()
+        else:
+            _initialize_ucx()
+        _warn_generic()
+
+
 def initialize(
     create_cuda_context=True,
     enable_tcp_over_ucx=None,
@@ -138,6 +173,7 @@ def initialize(
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
         enable_rdmacm=enable_rdmacm,
+        protocol=protocol,
    )
    dask.config.set({"distributed.comm.ucx": ucx_config})
 
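The refactor above splits UCX bring-up into ``_initialize_ucx``/``_initialize_ucxx``, selected via ``_get_active_ucx_implementation_name``, while the public entry point remains ``initialize()``. A hedged client-side sketch follows; the flag values are illustrative, and whether ``protocol="ucx"`` resolves to UCXX or UCX-Py depends on which packages are installed.

    # Hedged sketch of client-side UCX setup mirroring what the workers do.
    from dask_cuda.initialize import initialize

    initialize(
        protocol="ucx",
        enable_tcp_over_ucx=True,
        enable_nvlink=True,
        enable_infiniband=False,
    )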
dask_cuda/is_device_object.py
CHANGED
@@ -1,3 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION.
 from __future__ import absolute_import, division, print_function
 
 from dask.utils import Dispatch
@@ -35,6 +36,8 @@ def register_cudf():
     def is_device_object_cudf_series(s):
         return True
 
-    @is_device_object.register(cudf.
+    @is_device_object.register(cudf.Index)
+    @is_device_object.register(cudf.RangeIndex)
+    @is_device_object.register(cudf.MultiIndex)
     def is_device_object_cudf_index(s):
         return True
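Both this file and is_spillable_object.py below rely on ``dask.utils.Dispatch``, which routes on the argument's type (walking its MRO) and lets the cuDF handlers be registered lazily. A minimal sketch of that mechanism with stand-in types (nothing here is a dask-cuda or cuDF class):

    # Sketch of the Dispatch pattern; FakeDeviceBuffer is a stand-in type.
    from dask.utils import Dispatch

    is_on_gpu = Dispatch(name="is_on_gpu")

    @is_on_gpu.register(object)
    def _default(obj):
        # Fallback handler: anything exposing __cuda_array_interface__ counts.
        return hasattr(obj, "__cuda_array_interface__")

    class FakeDeviceBuffer:
        __cuda_array_interface__ = {}

    print(is_on_gpu(FakeDeviceBuffer()))  # True
    print(is_on_gpu([1, 2, 3]))           # False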
dask_cuda/is_spillable_object.py
CHANGED
@@ -1,3 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION.
 from __future__ import absolute_import, division, print_function
 
 from typing import Optional
@@ -34,7 +35,9 @@ def register_cudf():
     def is_device_object_cudf_dataframe(df):
         return cudf_spilling_status()
 
-    @is_spillable_object.register(cudf.
+    @is_spillable_object.register(cudf.Index)
+    @is_spillable_object.register(cudf.RangeIndex)
+    @is_spillable_object.register(cudf.MultiIndex)
     def is_device_object_cudf_index(s):
         return cudf_spilling_status()
 
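The registrations above return ``cudf_spilling_status()``, i.e. these objects are only reported as spillable when cuDF's own spilling is active. A hedged sketch of turning that on from user code (the ``"spill"`` option name follows cuDF's documented spilling support; verify against your cuDF version):

    # Sketch: with cuDF-managed spilling enabled, dask-cuda defers to cuDF's
    # spill manager for these objects instead of its own device-host-disk path.
    import cudf

    cudf.set_option("spill", True)  # alternatively: export CUDF_SPILL=on

    ser = cudf.Series(range(1_000_000))
    idx = cudf.RangeIndex(10)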