dask-cuda 24.4.0__py3-none-any.whl → 24.8.2__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
dask_cuda/VERSION CHANGED
@@ -1 +1 @@
- 24.04.00
+ 24.08.02
dask_cuda/__init__.py CHANGED
@@ -20,6 +20,18 @@ from .local_cuda_cluster import LocalCUDACluster
  from .proxify_device_objects import proxify_decorator, unproxify_decorator


+ if dask.config.get("dataframe.query-planning", None) is not False and dask.config.get(
+     "explicit-comms", False
+ ):
+     raise NotImplementedError(
+         "The 'explicit-comms' config is not yet supported when "
+         "query-planning is enabled in dask. Please use the shuffle "
+         "API directly, or use the legacy dask-dataframe API "
+         "(set the 'dataframe.query-planning' config to `False`"
+         "before importing `dask.dataframe`).",
+     )
+
+
  # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True`
  dask.dataframe.shuffle.rearrange_by_column = get_rearrange_by_column_wrapper(
      dask.dataframe.shuffle.rearrange_by_column
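
The new guard refuses to combine explicit-comms with dask's query-planning (dask-expr) backend. A minimal sketch of the workaround the error message describes, assuming dask and dask-cuda are installed: disable query planning before `dask.dataframe` is imported, then enable explicit-comms as before.

```python
import dask

# Must run before dask.dataframe is imported anywhere in the process.
dask.config.set({"dataframe.query-planning": False})

import dask.dataframe as dd  # noqa: E402
import dask_cuda  # noqa: E402,F401

# The monkey-patched explicit-comms shuffle path can then be enabled as before.
dask.config.set({"explicit-comms": True})
```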
dask_cuda/_version.py CHANGED
@@ -15,6 +15,16 @@
  import importlib.resources

  __version__ = (
-     importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip()
+     importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
  )
- __git_commit__ = "7ed67359e2264393e97ad57fc39829c3a69aa1f3"
+ try:
+     __git_commit__ = (
+         importlib.resources.files(__package__)
+         .joinpath("GIT_COMMIT")
+         .read_text()
+         .strip()
+     )
+ except FileNotFoundError:
+     __git_commit__ = ""
+
+ __all__ = ["__git_commit__", "__version__"]
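
The version metadata is now read from packaged data files rather than hard-coded. A small sketch of how the rewritten module behaves at runtime, assuming the wheel is installed:

```python
import dask_cuda

print(dask_cuda.__version__)     # contents of the packaged VERSION file, e.g. "24.08.02"
print(dask_cuda.__git_commit__)  # contents of GIT_COMMIT, or "" when that file is absent
```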
dask_cuda/benchmarks/common.py CHANGED
@@ -117,16 +117,18 @@ def run(client: Client, args: Namespace, config: Config):
      wait_for_cluster(client, shutdown_on_failure=True)
      assert len(client.scheduler_info()["workers"]) > 0
      setup_memory_pools(
-         client,
-         args.type == "gpu",
-         args.rmm_pool_size,
-         args.disable_rmm_pool,
-         args.enable_rmm_async,
-         args.enable_rmm_managed,
-         args.rmm_release_threshold,
-         args.rmm_log_directory,
-         args.enable_rmm_statistics,
-         args.enable_rmm_track_allocations,
+         client=client,
+         is_gpu=args.type == "gpu",
+         disable_rmm=args.disable_rmm,
+         disable_rmm_pool=args.disable_rmm_pool,
+         pool_size=args.rmm_pool_size,
+         maximum_pool_size=args.rmm_maximum_pool_size,
+         rmm_async=args.enable_rmm_async,
+         rmm_managed=args.enable_rmm_managed,
+         release_threshold=args.rmm_release_threshold,
+         log_directory=args.rmm_log_directory,
+         statistics=args.enable_rmm_statistics,
+         rmm_track_allocations=args.enable_rmm_track_allocations,
      )
      address_to_index, results, message_data = gather_bench_results(client, args, config)
      p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index)
dask_cuda/benchmarks/local_cudf_merge.py CHANGED
@@ -7,8 +7,7 @@ import numpy as np
  import pandas as pd

  import dask
- from dask.base import tokenize
- from dask.dataframe.core import new_dd_object
+ import dask.dataframe as dd
  from dask.distributed import performance_report, wait
  from dask.utils import format_bytes, parse_bytes

@@ -25,12 +24,20 @@ from dask_cuda.benchmarks.utils import (
  # <https://gist.github.com/rjzamora/0ffc35c19b5180ab04bbf7c793c45955>


- def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu):
+ # Set default shuffle method to "tasks"
+ if dask.config.get("dataframe.shuffle.method", None) is None:
+     dask.config.set({"dataframe.shuffle.method": "tasks"})
+
+
+ def generate_chunk(input):
+     i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu = input
+
      # Setting a seed that triggers max amount of comm in the two-GPU case.
      if gpu:
          import cupy as xp

          import cudf as xdf
+         import dask_cudf  # noqa: F401
      else:
          import numpy as xp
          import pandas as xdf
@@ -105,25 +112,25 @@ def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args):

      parts = [chunk_size for _ in range(num_chunks)]
      device_type = True if args.type == "gpu" else False
-     meta = generate_chunk(0, 4, 1, chunk_type, None, device_type)
+     meta = generate_chunk((0, 4, 1, chunk_type, None, device_type))
      divisions = [None] * (len(parts) + 1)

-     name = "generate-data-" + tokenize(chunk_size, num_chunks, frac_match, chunk_type)
-
-     graph = {
-         (name, i): (
-             generate_chunk,
-             i,
-             part,
-             len(parts),
-             chunk_type,
-             frac_match,
-             device_type,
-         )
-         for i, part in enumerate(parts)
-     }
-
-     ddf = new_dd_object(graph, name, meta, divisions)
+     ddf = dd.from_map(
+         generate_chunk,
+         [
+             (
+                 i,
+                 part,
+                 len(parts),
+                 chunk_type,
+                 frac_match,
+                 device_type,
+             )
+             for i, part in enumerate(parts)
+         ],
+         meta=meta,
+         divisions=divisions,
+     )

      if chunk_type == "build":
          if not args.no_shuffle:
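
The hand-built task graph is replaced with `dask.dataframe.from_map`, which creates one partition per element of the input list. A self-contained sketch of the same pattern (the `make_part` helper here is hypothetical, standing in for `generate_chunk`):

```python
import pandas as pd

import dask.dataframe as dd


def make_part(args):
    # One tuple of parameters in, one partition out.
    start, length = args
    return pd.DataFrame({"key": range(start, start + length)})


ddf = dd.from_map(
    make_part,
    [(0, 4), (4, 4), (8, 4)],  # one element per output partition
    meta=make_part((0, 4)),    # schema shared by all partitions
    divisions=[None] * 4,      # unknown divisions: len(parts) + 1 entries
)
```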
dask_cuda/benchmarks/local_cudf_shuffle.py CHANGED
@@ -8,8 +8,6 @@ import pandas as pd

  import dask
  import dask.dataframe
- from dask.dataframe.core import new_dd_object
- from dask.dataframe.shuffle import shuffle
  from dask.distributed import Client, performance_report, wait
  from dask.utils import format_bytes, parse_bytes

@@ -33,7 +31,7 @@ except ImportError:


  def shuffle_dask(df, args):
-     result = shuffle(df, index="data", shuffle="tasks", ignore_index=args.ignore_index)
+     result = df.shuffle("data", shuffle_method="tasks", ignore_index=args.ignore_index)
      if args.backend == "dask-noop":
          result = as_noop(result)
      t1 = perf_counter()
@@ -94,18 +92,24 @@ def create_data(
      )

      # Create partition based to the specified partition distribution
-     dsk = {}
+     futures = []
      for i, part_size in enumerate(dist):
          for _ in range(part_size):
              # We use `client.submit` to control placement of the partition.
-             dsk[(name, len(dsk))] = client.submit(
-                 create_df, chunksize, args.type, workers=[workers[i]], pure=False
+             futures.append(
+                 client.submit(
+                     create_df, chunksize, args.type, workers=[workers[i]], pure=False
+                 )
              )
-     wait(dsk.values())
+     wait(futures)

      df_meta = create_df(0, args.type)
-     divs = [None] * (len(dsk) + 1)
-     ret = new_dd_object(dsk, name, df_meta, divs).persist()
+     divs = [None] * (len(futures) + 1)
+     ret = dask.dataframe.from_delayed(
+         futures,
+         meta=df_meta,
+         divisions=divs,
+     ).persist()
      wait(ret)

      data_processed = args.in_parts * args.partition_size
@@ -254,7 +258,9 @@ def parse_args():
      ]

      return parse_benchmark_args(
-         description="Distributed shuffle (dask/cudf) benchmark", args_list=special_args
+         description="Distributed shuffle (dask/cudf) benchmark",
+         args_list=special_args,
+         check_explicit_comms=False,
      )


dask_cuda/benchmarks/utils.py CHANGED
@@ -11,11 +11,13 @@ from typing import Any, Callable, Mapping, NamedTuple, Optional, Tuple
  import numpy as np
  import pandas as pd

+ from dask import config
  from dask.distributed import Client, SSHCluster
  from dask.utils import format_bytes, format_time, parse_bytes
  from distributed.comm.addressing import get_address_host

  from dask_cuda.local_cuda_cluster import LocalCUDACluster
+ from dask_cuda.utils import parse_device_memory_limit


  def as_noop(dsk):
@@ -47,7 +49,11 @@ def as_noop(dsk):
      raise RuntimeError("Requested noop computation but dask-noop not installed.")


- def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]):
+ def parse_benchmark_args(
+     description="Generic dask-cuda Benchmark",
+     args_list=[],
+     check_explicit_comms=True,
+ ):
      parser = argparse.ArgumentParser(description=description)
      worker_args = parser.add_argument_group(description="Worker configuration")
      worker_args.add_argument(
@@ -88,15 +94,41 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]
          "'forkserver' can be used to avoid issues with fork not being allowed "
          "after the networking stack has been initialised.",
      )
+     cluster_args.add_argument(
+         "--disable-rmm",
+         action="store_true",
+         help="Disable RMM.",
+     )
+     cluster_args.add_argument(
+         "--disable-rmm-pool",
+         action="store_true",
+         help="Uses RMM for allocations but without a memory pool.",
+     )
      cluster_args.add_argument(
          "--rmm-pool-size",
          default=None,
          type=parse_bytes,
          help="The size of the RMM memory pool. Can be an integer (bytes) or a string "
-         "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.",
+         "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used."
+         ""
+         ".. note::"
+         " This size is a per-worker configuration, and not cluster-wide.",
      )
      cluster_args.add_argument(
-         "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool"
+         "--rmm-maximum-pool-size",
+         default=None,
+         help="When ``--rmm-pool-size`` is specified, this argument indicates the "
+         "maximum pool size. Can be an integer (bytes), or a string (like '4GB' or "
+         "'5000M'). By default, the total available memory on the GPU is used. "
+         "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum "
+         "pool size."
+         ""
+         ".. note::"
+         " When paired with `--enable-rmm-async` the maximum size cannot be "
+         " guaranteed due to fragmentation."
+         ""
+         ".. note::"
+         " This size is a per-worker configuration, and not cluster-wide.",
      )
      cluster_args.add_argument(
          "--enable-rmm-managed",
@@ -317,6 +349,24 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]
      if args.multi_node and len(args.hosts.split(",")) < 2:
          raise ValueError("--multi-node requires at least 2 hosts")

+     # Raise error early if "explicit-comms" is not allowed
+     if (
+         check_explicit_comms
+         and args.backend == "explicit-comms"
+         and config.get(
+             "dataframe.query-planning",
+             None,
+         )
+         is not False
+     ):
+         raise NotImplementedError(
+             "The 'explicit-comms' config is not yet supported when "
+             "query-planning is enabled in dask. Please use the legacy "
+             "dask-dataframe API by setting the following environment "
+             "variable before executing:",
+             " DASK_DATAFRAME__QUERY_PLANNING=False",
+         )
+
      return args


@@ -384,10 +434,29 @@ def get_worker_device():
      return -1


+ def setup_rmm_resources(statistics=False, rmm_track_allocations=False):
+     import cupy
+
+     import rmm
+     from rmm.allocators.cupy import rmm_cupy_allocator
+
+     cupy.cuda.set_allocator(rmm_cupy_allocator)
+     if statistics:
+         rmm.mr.set_current_device_resource(
+             rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+         )
+     if rmm_track_allocations:
+         rmm.mr.set_current_device_resource(
+             rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+         )
+
+
  def setup_memory_pool(
      dask_worker=None,
+     disable_rmm=None,
+     disable_rmm_pool=None,
      pool_size=None,
-     disable_pool=False,
+     maximum_pool_size=None,
      rmm_async=False,
      rmm_managed=False,
      release_threshold=None,
@@ -395,45 +464,66 @@ def setup_memory_pool(
      statistics=False,
      rmm_track_allocations=False,
  ):
-     import cupy
-
      import rmm
-     from rmm.allocators.cupy import rmm_cupy_allocator

      from dask_cuda.utils import get_rmm_log_file_name

      logging = log_directory is not None

-     if rmm_async:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.CudaAsyncMemoryResource(
-                 initial_pool_size=pool_size, release_threshold=release_threshold
-             )
-         )
-     else:
-         rmm.reinitialize(
-             pool_allocator=not disable_pool,
-             managed_memory=rmm_managed,
-             initial_pool_size=pool_size,
-             logging=logging,
-             log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory),
-         )
-     cupy.cuda.set_allocator(rmm_cupy_allocator)
-     if statistics:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+     if pool_size is not None:
+         pool_size = parse_device_memory_limit(pool_size, alignment_size=256)
+
+     if maximum_pool_size is not None:
+         maximum_pool_size = parse_device_memory_limit(
+             maximum_pool_size, alignment_size=256
          )
-     if rmm_track_allocations:
-         rmm.mr.set_current_device_resource(
-             rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+
+     if release_threshold is not None:
+         release_threshold = parse_device_memory_limit(
+             release_threshold, alignment_size=256
          )

+     if not disable_rmm:
+         if rmm_async:
+             mr = rmm.mr.CudaAsyncMemoryResource(
+                 initial_pool_size=pool_size,
+                 release_threshold=release_threshold,
+             )
+
+             if maximum_pool_size is not None:
+                 mr = rmm.mr.LimitingResourceAdaptor(
+                     mr, allocation_limit=maximum_pool_size
+                 )
+
+             rmm.mr.set_current_device_resource(mr)
+
+             setup_rmm_resources(
+                 statistics=statistics, rmm_track_allocations=rmm_track_allocations
+             )
+         else:
+             rmm.reinitialize(
+                 pool_allocator=not disable_rmm_pool,
+                 managed_memory=rmm_managed,
+                 initial_pool_size=pool_size,
+                 maximum_pool_size=maximum_pool_size,
+                 logging=logging,
+                 log_file_name=get_rmm_log_file_name(
+                     dask_worker, logging, log_directory
+                 ),
+             )
+
+             setup_rmm_resources(
+                 statistics=statistics, rmm_track_allocations=rmm_track_allocations
+             )
+

  def setup_memory_pools(
      client,
      is_gpu,
+     disable_rmm,
+     disable_rmm_pool,
      pool_size,
-     disable_pool,
+     maximum_pool_size,
      rmm_async,
      rmm_managed,
      release_threshold,
@@ -445,8 +535,10 @@ def setup_memory_pools(
          return
      client.run(
          setup_memory_pool,
+         disable_rmm=disable_rmm,
+         disable_rmm_pool=disable_rmm_pool,
          pool_size=pool_size,
-         disable_pool=disable_pool,
+         maximum_pool_size=maximum_pool_size,
          rmm_async=rmm_async,
          rmm_managed=rmm_managed,
          release_threshold=release_threshold,
@@ -459,7 +551,9 @@ def setup_memory_pools(
      client.run_on_scheduler(
          setup_memory_pool,
          pool_size=1e9,
-         disable_pool=disable_pool,
+         disable_rmm=disable_rmm,
+         disable_rmm_pool=disable_rmm_pool,
+         maximum_pool_size=maximum_pool_size,
          rmm_async=rmm_async,
          rmm_managed=rmm_managed,
          release_threshold=release_threshold,
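
In the rewritten `setup_memory_pool`, the async path builds the memory resource explicitly and only caps it when a maximum pool size was requested. A hedged sketch of that pattern using RMM directly (the sizes are illustrative):

```python
import rmm

# Async pool with an initial size and a release threshold...
mr = rmm.mr.CudaAsyncMemoryResource(
    initial_pool_size=2 * 1024**3,
    release_threshold=4 * 1024**3,
)
# ...optionally capped by a LimitingResourceAdaptor, as done above.
mr = rmm.mr.LimitingResourceAdaptor(mr, allocation_limit=8 * 1024**3)
rmm.mr.set_current_device_resource(mr)
```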
dask_cuda/cli.py CHANGED
@@ -101,6 +101,20 @@ def cuda():
      total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
      disable spilling to host (i.e. allow full device memory usage).""",
  )
+ @click.option(
+     "--enable-cudf-spill/--disable-cudf-spill",
+     default=False,
+     show_default=True,
+     help="""Enable automatic cuDF spilling. WARNING: This should NOT be used with
+     JIT-Unspill.""",
+ )
+ @click.option(
+     "--cudf-spill-stats",
+     type=int,
+     default=0,
+     help="""Set the cuDF spilling statistics level. This option has no effect if
+     `--enable-cudf-spill` is not specified.""",
+ )
  @click.option(
      "--rmm-pool-size",
      default=None,
@@ -120,6 +134,10 @@ def cuda():
      memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and
      to set the maximum pool size.

+     .. note::
+         When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due
+         to fragmentation.
+
      .. note::
          This size is a per-worker configuration, and not cluster-wide.""",
  )
@@ -326,6 +344,8 @@ def worker(
      name,
      memory_limit,
      device_memory_limit,
+     enable_cudf_spill,
+     cudf_spill_stats,
      rmm_pool_size,
      rmm_maximum_pool_size,
      rmm_managed_memory,
@@ -398,6 +418,8 @@ def worker(
          name,
          memory_limit,
          device_memory_limit,
+         enable_cudf_spill,
+         cudf_spill_stats,
          rmm_pool_size,
          rmm_maximum_pool_size,
          rmm_managed_memory,
dask_cuda/cuda_worker.py CHANGED
@@ -20,7 +20,7 @@ from distributed.worker_memory import parse_memory_limit

  from .device_host_file import DeviceHostFile
  from .initialize import initialize
- from .plugins import CPUAffinity, PreImport, RMMSetup
+ from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
  from .proxify_host_file import ProxifyHostFile
  from .utils import (
      cuda_visible_devices,
@@ -41,6 +41,8 @@ class CUDAWorker(Server):
          name=None,
          memory_limit="auto",
          device_memory_limit="auto",
+         enable_cudf_spill=False,
+         cudf_spill_stats=0,
          rmm_pool_size=None,
          rmm_maximum_pool_size=None,
          rmm_managed_memory=False,
@@ -166,6 +168,12 @@ class CUDAWorker(Server):
          if device_memory_limit is None and memory_limit is None:
              data = lambda _: {}
          elif jit_unspill:
+             if enable_cudf_spill:
+                 warnings.warn(
+                     "Enabling cuDF spilling and JIT-Unspill together is not "
+                     "safe, consider disabling JIT-Unspill."
+                 )
+
              data = lambda i: (
                  ProxifyHostFile,
                  {
@@ -217,6 +225,7 @@ class CUDAWorker(Server):
                          track_allocations=rmm_track_allocations,
                      ),
                      PreImport(pre_import),
+                     CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
                  },
                  name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                  local_directory=local_directory,
dask_cuda/explicit_comms/dataframe/shuffle.py CHANGED
@@ -8,13 +8,18 @@ from math import ceil
  from operator import getitem
  from typing import Any, Callable, Dict, List, Optional, Set, TypeVar

+ import numpy as np
+ import pandas as pd
+
  import dask
  import dask.config
  import dask.dataframe
+ import dask.dataframe as dd
  import dask.utils
  import distributed.worker
  from dask.base import tokenize
- from dask.dataframe.core import DataFrame, Series, _concat as dd_concat, new_dd_object
+ from dask.dataframe import DataFrame, Series
+ from dask.dataframe.core import _concat as dd_concat
  from dask.dataframe.shuffle import group_split_dispatch, hash_object_dispatch
  from distributed import wait
  from distributed.protocol import nested_deserialize, to_serialize
@@ -153,9 +158,16 @@ def compute_map_index(
      if column_names[0] == "_partitions":
          ind = df[column_names[0]]
      else:
-         ind = hash_object_dispatch(
-             df[column_names] if column_names else df, index=False
-         )
+         # Need to cast numerical dtypes to be consistent
+         # with `dask.dataframe.shuffle.partitioning_index`
+         dtypes = {}
+         index = df[column_names] if column_names else df
+         for col, dtype in index.dtypes.items():
+             if pd.api.types.is_numeric_dtype(dtype):
+                 dtypes[col] = np.float64
+         if dtypes:
+             index = index.astype(dtypes, errors="ignore")
+         ind = hash_object_dispatch(index, index=False)
      return ind % npartitions


@@ -185,15 +197,8 @@ def partition_dataframe(
      partitions
          Dict of dataframe-partitions, mapping partition-ID to dataframe
      """
-     if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"):
-         return dict(
-             zip(
-                 range(npartitions),
-                 df.partition_by_hash(
-                     column_names, npartitions, keep_index=not ignore_index
-                 ),
-             )
-         )
+     # TODO: Use `partition_by_hash` if/when dtype-casting is added
+     # (See: https://github.com/rapidsai/cudf/issues/16221)
      map_index = compute_map_index(df, column_names, npartitions)
      return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index)


@@ -468,18 +473,19 @@ def shuffle(
      npartitions = df.npartitions

      # Step (a):
-     df = df.persist()  # Make sure optimizations are apply on the existing graph
+     df = df.persist()  # Make sure optimizations are applied on the existing graph
      wait([df])  # Make sure all keys has been materialized on workers
+     persisted_keys = [f.key for f in c.client.futures_of(df)]
      name = (
          "explicit-comms-shuffle-"
-         f"{tokenize(df, column_names, npartitions, ignore_index)}"
+         f"{tokenize(df, column_names, npartitions, ignore_index, batchsize)}"
      )
      df_meta: DataFrame = df._meta

      # Stage all keys of `df` on the workers and cancel them, which makes it possible
      # for the shuffle to free memory as the partitions of `df` are consumed.
      # See CommsContext.stage_keys() for a description of staging.
-     rank_to_inkeys = c.stage_keys(name=name, keys=df.__dask_keys__())
+     rank_to_inkeys = c.stage_keys(name=name, keys=persisted_keys)
      c.client.cancel(df)

      # Get batchsize
@@ -526,23 +532,27 @@ def shuffle(
      # TODO: can we do this without using `submit()` to avoid the overhead
      # of creating a Future for each dataframe partition?

-     dsk = {}
+     _futures = {}
      for rank in ranks:
          for part_id in rank_to_out_part_ids[rank]:
-             dsk[(name, part_id)] = c.client.submit(
+             _futures[part_id] = c.client.submit(
                  getitem,
                  shuffle_result[rank],
                  part_id,
                  workers=[c.worker_addresses[rank]],
              )

+     # Make sure partitions are properly ordered
+     futures = [_futures.pop(i) for i in range(npartitions)]
+
      # Create a distributed Dataframe from all the pieces
-     divs = [None] * (len(dsk) + 1)
-     ret = new_dd_object(dsk, name, df_meta, divs).persist()
+     divs = [None] * (len(futures) + 1)
+     kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"}
+     ret = dd.from_delayed(futures, **kwargs).persist()
      wait([ret])

      # Release all temporary dataframes
-     for fut in [*shuffle_result.values(), *dsk.values()]:
+     for fut in [*shuffle_result.values(), *futures]:
          fut.release()
      return ret

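The shuffle now orders its output futures explicitly and assembles the result with `dd.from_delayed`. For orientation, a hedged sketch of calling this API directly; it mirrors the tests further down in this diff and assumes a running distributed cluster with dask-cuda installed:

```python
import pandas as pd

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from dask_cuda.explicit_comms import comms
from dask_cuda.explicit_comms.dataframe.shuffle import shuffle as explicit_comms_shuffle

with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster):
    comms.default_comms()  # set up the explicit-comms context on the workers
    ddf = dd.from_pandas(pd.DataFrame({"key": range(100)}), npartitions=4)
    shuffled = explicit_comms_shuffle(ddf, ["key"], npartitions=8)
    result = shuffled.compute()
```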
dask_cuda/local_cuda_cluster.py CHANGED
@@ -10,7 +10,7 @@ from distributed.worker_memory import parse_memory_limit

  from .device_host_file import DeviceHostFile
  from .initialize import initialize
- from .plugins import CPUAffinity, PreImport, RMMSetup
+ from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
  from .proxify_host_file import ProxifyHostFile
  from .utils import (
      cuda_visible_devices,
@@ -73,6 +73,14 @@ class LocalCUDACluster(LocalCluster):
          starts spilling to host memory. Can be an integer (bytes), float (fraction of
          total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0,
          or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+     enable_cudf_spill : bool, default False
+         Enable automatic cuDF spilling.
+
+         .. warning::
+             This should NOT be used together with JIT-Unspill.
+     cudf_spill_stats : int, default 0
+         Set the cuDF spilling statistics level. This option has no effect if
+         ``enable_cudf_spill=False``.
      local_directory : str or None, default None
          Path on local machine to store temporary files. Can be a string (like
          ``"path/to/files"``) or ``None`` to fall back on the value of
@@ -114,6 +122,10 @@ class LocalCUDACluster(LocalCluster):
          memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool
          and to set the maximum pool size.

+         .. note::
+             When paired with `--enable-rmm-async` the maximum size cannot be guaranteed
+             due to fragmentation.
+
          .. note::
              This size is a per-worker configuration, and not cluster-wide.
      rmm_managed_memory : bool, default False
@@ -205,6 +217,8 @@ class LocalCUDACluster(LocalCluster):
          threads_per_worker=1,
          memory_limit="auto",
          device_memory_limit=0.8,
+         enable_cudf_spill=False,
+         cudf_spill_stats=0,
          data=None,
          local_directory=None,
          shared_filesystem=None,
@@ -255,6 +269,8 @@ class LocalCUDACluster(LocalCluster):
          self.device_memory_limit = parse_device_memory_limit(
              device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
          )
+         self.enable_cudf_spill = enable_cudf_spill
+         self.cudf_spill_stats = cudf_spill_stats

          self.rmm_pool_size = rmm_pool_size
          self.rmm_maximum_pool_size = rmm_maximum_pool_size
@@ -298,6 +314,12 @@ class LocalCUDACluster(LocalCluster):
          if device_memory_limit is None and memory_limit is None:
              data = {}
          elif jit_unspill:
+             if enable_cudf_spill:
+                 warnings.warn(
+                     "Enabling cuDF spilling and JIT-Unspill together is not "
+                     "safe, consider disabling JIT-Unspill."
+                 )
+
              data = (
                  ProxifyHostFile,
                  {
@@ -410,6 +432,7 @@ class LocalCUDACluster(LocalCluster):
                          track_allocations=self.rmm_track_allocations,
                      ),
                      PreImport(self.pre_import),
+                     CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
                  },
              }
          )
dask_cuda/plugins.py CHANGED
@@ -14,6 +14,21 @@ class CPUAffinity(WorkerPlugin):
          os.sched_setaffinity(0, self.cores)


+ class CUDFSetup(WorkerPlugin):
+     def __init__(self, spill, spill_stats):
+         self.spill = spill
+         self.spill_stats = spill_stats
+
+     def setup(self, worker=None):
+         try:
+             import cudf
+
+             cudf.set_option("spill", self.spill)
+             cudf.set_option("spill_stats", self.spill_stats)
+         except ImportError:
+             pass
+
+
  class RMMSetup(WorkerPlugin):
      def __init__(
          self,
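
The plugin is normally injected by `CUDAWorker` and `LocalCUDACluster` themselves, but as a sketch it can also be registered manually on an existing cluster (the scheduler address below is a placeholder):

```python
from dask.distributed import Client

from dask_cuda.plugins import CUDFSetup

client = Client("tcp://scheduler-address:8786")
client.register_worker_plugin(CUDFSetup(spill=True, spill_stats=1))
```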
dask_cuda/tests/test_cudf_builtin_spilling.py CHANGED
@@ -20,7 +20,7 @@ from cudf.core.buffer.spill_manager import (  # noqa: E402
      get_global_manager,
      set_global_manager,
  )
- from cudf.testing._utils import assert_eq  # noqa: E402
+ from cudf.testing import assert_eq  # noqa: E402

  if get_global_manager() is not None:
      pytest.skip(
dask_cuda/tests/test_dask_cuda_worker.py CHANGED
@@ -231,6 +231,64 @@ def test_rmm_logging(loop):  # noqa: F811
              assert v is rmm.mr.LoggingResourceAdaptor


+ def test_cudf_spill_disabled(loop):  # noqa: F811
+     cudf = pytest.importorskip("cudf")
+     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+         with popen(
+             [
+                 "dask",
+                 "cuda",
+                 "worker",
+                 "127.0.0.1:9369",
+                 "--host",
+                 "127.0.0.1",
+                 "--no-dashboard",
+             ]
+         ):
+             with Client("127.0.0.1:9369", loop=loop) as client:
+                 assert wait_workers(client, n_gpus=get_n_gpus())
+
+                 cudf_spill = client.run(
+                     cudf.get_option,
+                     "spill",
+                 )
+                 for v in cudf_spill.values():
+                     assert v is False
+
+                 cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                 for v in cudf_spill_stats.values():
+                     assert v == 0
+
+
+ def test_cudf_spill(loop):  # noqa: F811
+     cudf = pytest.importorskip("cudf")
+     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+         with popen(
+             [
+                 "dask",
+                 "cuda",
+                 "worker",
+                 "127.0.0.1:9369",
+                 "--host",
+                 "127.0.0.1",
+                 "--no-dashboard",
+                 "--enable-cudf-spill",
+                 "--cudf-spill-stats",
+                 "2",
+             ]
+         ):
+             with Client("127.0.0.1:9369", loop=loop) as client:
+                 assert wait_workers(client, n_gpus=get_n_gpus())
+
+                 cudf_spill = client.run(cudf.get_option, "spill")
+                 for v in cudf_spill.values():
+                     assert v is True
+
+                 cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                 for v in cudf_spill_stats.values():
+                     assert v == 2
+
+
  @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
  def test_dashboard_address(loop):  # noqa: F811
      with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
@@ -15,6 +15,10 @@ mp = mp.get_context("spawn") # type: ignore
15
15
  psutil = pytest.importorskip("psutil")
16
16
 
17
17
 
18
+ def _is_ucx_116(ucp):
19
+ return ucp.get_ucx_version()[:2] == (1, 16)
20
+
21
+
18
22
  class DGXVersion(Enum):
19
23
  DGX_1 = auto()
20
24
  DGX_2 = auto()
@@ -102,9 +106,11 @@ def _test_tcp_over_ucx(protocol):
102
106
  )
103
107
  def test_tcp_over_ucx(protocol):
104
108
  if protocol == "ucx":
105
- pytest.importorskip("ucp")
109
+ ucp = pytest.importorskip("ucp")
106
110
  elif protocol == "ucxx":
107
- pytest.importorskip("ucxx")
111
+ ucp = pytest.importorskip("ucxx")
112
+ if _is_ucx_116(ucp):
113
+ pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
108
114
 
109
115
  p = mp.Process(target=_test_tcp_over_ucx, args=(protocol,))
110
116
  p.start()
@@ -217,9 +223,11 @@ def _test_ucx_infiniband_nvlink(
217
223
  )
218
224
  def test_ucx_infiniband_nvlink(protocol, params):
219
225
  if protocol == "ucx":
220
- pytest.importorskip("ucp")
226
+ ucp = pytest.importorskip("ucp")
221
227
  elif protocol == "ucxx":
222
- pytest.importorskip("ucxx")
228
+ ucp = pytest.importorskip("ucxx")
229
+ if _is_ucx_116(ucp) and params["enable_infiniband"] is False:
230
+ pytest.skip("https://github.com/rapidsai/ucx-py/issues/1037")
223
231
 
224
232
  skip_queue = mp.Queue()
225
233
 
dask_cuda/tests/test_explicit_comms.py CHANGED
@@ -25,6 +25,22 @@ from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
  mp = mp.get_context("spawn")  # type: ignore
  ucp = pytest.importorskip("ucp")

+ QUERY_PLANNING_ON = dask.config.get("dataframe.query-planning", None) is not False
+
+ # Skip these tests when dask-expr is active (for now)
+ query_planning_skip = pytest.mark.skipif(
+     QUERY_PLANNING_ON,
+     reason=(
+         "The 'explicit-comms' config is not supported "
+         "when query planning is enabled."
+     ),
+ )
+
+ # Set default shuffle method to "tasks"
+ if dask.config.get("dataframe.shuffle.method", None) is None:
+     dask.config.set({"dataframe.shuffle.method": "tasks"})
+
+
  # Notice, all of the following tests is executed in a new process such
  # that UCX options of the different tests doesn't conflict.

@@ -82,6 +98,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions):
      pd.testing.assert_frame_equal(got, expected)


+ @query_planning_skip
  def test_dataframe_merge_empty_partitions():
      # Notice, we use more partitions than rows
      p = mp.Process(target=_test_dataframe_merge_empty_partitions, args=(2, 4))
@@ -92,7 +109,14 @@ def test_dataframe_merge_empty_partitions():

  def check_partitions(df, npartitions):
      """Check that all values in `df` hashes to the same"""
-     hashes = partitioning_index(df, npartitions)
+     dtypes = {}
+     for col, dtype in df.dtypes.items():
+         if pd.api.types.is_numeric_dtype(dtype):
+             dtypes[col] = np.float64
+     if not dtypes:
+         dtypes = None
+
+     hashes = partitioning_index(df, npartitions, cast_dtype=dtypes)
      if len(hashes) > 0:
          return len(hashes.unique()) == 1
      else:
@@ -111,11 +135,10 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
          worker_class=IncreasedCloseTimeoutNanny,
          processes=True,
      ) as cluster:
-         with Client(cluster) as client:
-             all_workers = list(client.get_worker_logs().keys())
+         with Client(cluster):
              comms.default_comms()
              np.random.seed(42)
-             df = pd.DataFrame({"key": np.random.random(100)})
+             df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)})
              if backend == "cudf":
                  df = cudf.DataFrame.from_pandas(df)

@@ -124,15 +147,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):

              for input_nparts in range(1, 5):
                  for output_nparts in range(1, 5):
-                     ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist(
-                         workers=all_workers
-                     )
+                     ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts)
                      # To reduce test runtime, we change the batchsizes here instead
                      # of using a test parameter.
                      for batchsize in (-1, 1, 2):
                          with dask.config.set(explicit_comms_batchsize=batchsize):
                              ddf = explicit_comms_shuffle(
-                                 ddf,
+                                 ddf1,
                                  ["_partitions"] if _partitions else ["key"],
                                  npartitions=output_nparts,
                                  batchsize=batchsize,
@@ -160,6 +181,32 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
                              got = ddf.compute().sort_values("key")
                              assert_eq(got, expected)

+                             # Check that partitioning is consistent with "tasks"
+                             ddf_tasks = ddf1.shuffle(
+                                 ["key"],
+                                 npartitions=output_nparts,
+                                 shuffle_method="tasks",
+                             )
+                             for i in range(output_nparts):
+                                 expected_partition = ddf_tasks.partitions[
+                                     i
+                                 ].compute()["key"]
+                                 actual_partition = ddf.partitions[i].compute()[
+                                     "key"
+                                 ]
+                                 if backend == "cudf":
+                                     expected_partition = (
+                                         expected_partition.values_host
+                                     )
+                                     actual_partition = actual_partition.values_host
+                                 else:
+                                     expected_partition = expected_partition.values
+                                     actual_partition = actual_partition.values
+                                 assert all(
+                                     np.sort(expected_partition)
+                                     == np.sort(actual_partition)
+                                 )
+

  @pytest.mark.parametrize("nworkers", [1, 2, 3])
  @pytest.mark.parametrize("backend", ["pandas", "cudf"])
@@ -220,6 +267,7 @@ def _test_dask_use_explicit_comms(in_cluster):
      check_shuffle()


+ @query_planning_skip
  @pytest.mark.parametrize("in_cluster", [True, False])
  def test_dask_use_explicit_comms(in_cluster):
      def _timeout(process, function, timeout):
@@ -282,6 +330,7 @@ def _test_dataframe_shuffle_merge(backend, protocol, n_workers):
      assert_eq(got, expected)


+ @query_planning_skip
  @pytest.mark.parametrize("nworkers", [1, 2, 4])
  @pytest.mark.parametrize("backend", ["pandas", "cudf"])
  @pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
dask_cuda/tests/test_local_cuda_cluster.py CHANGED
@@ -500,6 +500,54 @@ async def test_worker_fraction_limits():
      )


+ @gen_test(timeout=20)
+ async def test_cudf_spill_disabled():
+     cudf = pytest.importorskip("cudf")
+
+     async with LocalCUDACluster(
+         asynchronous=True,
+     ) as cluster:
+         async with Client(cluster, asynchronous=True) as client:
+             cudf_spill = await client.run(
+                 cudf.get_option,
+                 "spill",
+             )
+             for v in cudf_spill.values():
+                 assert v is False
+
+             cudf_spill_stats = await client.run(
+                 cudf.get_option,
+                 "spill_stats",
+             )
+             for v in cudf_spill_stats.values():
+                 assert v == 0
+
+
+ @gen_test(timeout=20)
+ async def test_cudf_spill():
+     cudf = pytest.importorskip("cudf")
+
+     async with LocalCUDACluster(
+         enable_cudf_spill=True,
+         cudf_spill_stats=2,
+         asynchronous=True,
+     ) as cluster:
+         async with Client(cluster, asynchronous=True) as client:
+             cudf_spill = await client.run(
+                 cudf.get_option,
+                 "spill",
+             )
+             for v in cudf_spill.values():
+                 assert v is True
+
+             cudf_spill_stats = await client.run(
+                 cudf.get_option,
+                 "spill_stats",
+             )
+             for v in cudf_spill_stats.values():
+                 assert v == 2
+
+
  @pytest.mark.parametrize(
      "protocol",
      ["ucx", "ucxx"],
dask_cuda/tests/test_proxy.py CHANGED
@@ -537,10 +537,10 @@ def test_from_cudf_of_proxy_object():
      assert has_parallel_type(df)

      ddf = dask_cudf.from_cudf(df, npartitions=1)
-     assert has_parallel_type(ddf)
+     assert has_parallel_type(ddf._meta)

      # Notice, the output is a dask-cudf dataframe and not a proxy object
-     assert type(ddf) is dask_cudf.core.DataFrame
+     assert type(ddf._meta) is cudf.DataFrame


  def test_proxy_object_parquet(tmp_path):
dask_cuda/tests/test_version.py ADDED
@@ -0,0 +1,12 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION.
+
+ import dask_cuda
+
+
+ def test_version_constants_are_populated():
+     # __git_commit__ will only be non-empty in a built distribution
+     assert isinstance(dask_cuda.__git_commit__, str)
+
+     # __version__ should always be non-empty
+     assert isinstance(dask_cuda.__version__, str)
+     assert len(dask_cuda.__version__) > 0
@@ -1,9 +1,9 @@
  Metadata-Version: 2.1
  Name: dask-cuda
- Version: 24.4.0
+ Version: 24.8.2
  Summary: Utilities for Dask and CUDA interactions
  Author: NVIDIA Corporation
- License: Apache-2.0
+ License: Apache 2.0
  Project-URL: Homepage, https://github.com/rapidsai/dask-cuda
  Project-URL: Documentation, https://docs.rapids.ai/api/dask-cuda/stable/
  Project-URL: Source, https://github.com/rapidsai/dask-cuda
@@ -18,25 +18,25 @@ Classifier: Programming Language :: Python :: 3.11
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: click >=8.1
- Requires-Dist: numba >=0.57
- Requires-Dist: numpy <2.0a0,>=1.23
- Requires-Dist: pandas >=1.3
- Requires-Dist: pynvml <11.5,>=11.0.0
- Requires-Dist: rapids-dask-dependency ==24.4.*
- Requires-Dist: zict >=2.0.0
+ Requires-Dist: click>=8.1
+ Requires-Dist: numba>=0.57
+ Requires-Dist: numpy<2.0a0,>=1.23
+ Requires-Dist: pandas>=1.3
+ Requires-Dist: pynvml<11.5,>=11.0.0
+ Requires-Dist: rapids-dask-dependency==24.8.*
+ Requires-Dist: zict>=2.0.0
  Provides-Extra: docs
- Requires-Dist: numpydoc >=1.1.0 ; extra == 'docs'
- Requires-Dist: sphinx ; extra == 'docs'
- Requires-Dist: sphinx-click >=2.7.1 ; extra == 'docs'
- Requires-Dist: sphinx-rtd-theme >=0.5.1 ; extra == 'docs'
+ Requires-Dist: numpydoc>=1.1.0; extra == "docs"
+ Requires-Dist: sphinx; extra == "docs"
+ Requires-Dist: sphinx-click>=2.7.1; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme>=0.5.1; extra == "docs"
  Provides-Extra: test
- Requires-Dist: cudf ==24.4.* ; extra == 'test'
- Requires-Dist: dask-cudf ==24.4.* ; extra == 'test'
- Requires-Dist: kvikio ==24.4.* ; extra == 'test'
- Requires-Dist: pytest ; extra == 'test'
- Requires-Dist: pytest-cov ; extra == 'test'
- Requires-Dist: ucx-py ==0.37.* ; extra == 'test'
+ Requires-Dist: cudf==24.8.*; extra == "test"
+ Requires-Dist: dask-cudf==24.8.*; extra == "test"
+ Requires-Dist: kvikio==24.8.*; extra == "test"
+ Requires-Dist: pytest; extra == "test"
+ Requires-Dist: pytest-cov; extra == "test"
+ Requires-Dist: ucx-py==0.39.*; extra == "test"

  Dask CUDA
  =========
@@ -1,16 +1,16 @@
- dask_cuda/VERSION,sha256=cS6_wRwcl4ZhhE4gk3mZbC_4dD-r28YRdEQeaKdKu1U,9
- dask_cuda/__init__.py,sha256=Vt42yCT1WhjZgehiNLDLw2uvfxjqLThD8uKDMyKQYsw,1454
- dask_cuda/_version.py,sha256=OcuA1fDRuukb6OC8LzTwSO2aJ3ZtB1N2pm08Nrga8y8,778
- dask_cuda/cli.py,sha256=XNRH0bu-6jzRoyWJB5qSWuzePJSh3z_5Ng6rDCnz7lg,15970
- dask_cuda/cuda_worker.py,sha256=bIu-ESeIpJG_WaTYrv0z9z5juJ1qR5i_5Ng3CN1WK8s,8579
+ dask_cuda/VERSION,sha256=5YtjwV2EoD7E5Ed4K-PvnU0eEtdkkn33JHuNFDy8oKA,8
+ dask_cuda/__init__.py,sha256=JLDWev7vI_dPusLgRdOwXBz-xfhlX_hc-DzmLtrEYO0,1918
+ dask_cuda/_version.py,sha256=cHDO9AzNtxkCVhwYu7hL3H7RPAkQnxpKBjElOst3rkI,964
+ dask_cuda/cli.py,sha256=Y3aObfAyMwOIo0oVz3-NC2InGLShOpeINwW5ROTF2s8,16616
+ dask_cuda/cuda_worker.py,sha256=uqyoDKsSe7sKN3StMVyz_971rj0Sjpmwfv7Bj083Wss,8959
  dask_cuda/device_host_file.py,sha256=yS31LGtt9VFAG78uBBlTDr7HGIng2XymV1OxXIuEMtM,10272
  dask_cuda/disk_io.py,sha256=urSLKiPvJvYmKCzDPOUDCYuLI3r1RUiyVh3UZGRoF_Y,6626
  dask_cuda/get_device_memory_objects.py,sha256=R3U2cq4fJZPgtsUKyIguy9161p3Q99oxmcCmTcg6BtQ,4075
  dask_cuda/initialize.py,sha256=Gjcxs_c8DTafgsHe5-2mw4lJdOmbFJJAZVOnxA8lTjM,6462
  dask_cuda/is_device_object.py,sha256=CnajvbQiX0FzFzwft0MqK1OPomx3ZGDnDxT56wNjixw,1046
  dask_cuda/is_spillable_object.py,sha256=CddGmg0tuSpXh2m_TJSY6GRpnl1WRHt1CRcdWgHPzWA,1457
- dask_cuda/local_cuda_cluster.py,sha256=hoEiEfJqAQrRS7N632VatSl1245GiWMT5B77Wc-i5C0,17928
- dask_cuda/plugins.py,sha256=cnHsdrXx7PBPmrzHX6YEkCH5byCsUk8LE2FeTeu8ZLU,4259
+ dask_cuda/local_cuda_cluster.py,sha256=jgXjd6OvEDfQ3iXU8hV_UfULa13GZsli0SGC2PIouZk,18882
+ dask_cuda/plugins.py,sha256=DCf7PnIBu_VNjFfrFeb1zCNuEnCaX9oz4Umn76t02Mc,4630
  dask_cuda/proxify_device_objects.py,sha256=99CD7LOE79YiQGJ12sYl_XImVhJXpFR4vG5utdkjTQo,8108
  dask_cuda/proxify_host_file.py,sha256=Wf5CFCC1JN5zmfvND3ls0M5FL01Y8VhHrk0xV3UQ9kk,30850
  dask_cuda/proxy_object.py,sha256=bZq92kjgFB-ad_luSAFT_RItV3nssmiEk4OOSp34laU,29812
@@ -18,36 +18,37 @@ dask_cuda/utils.py,sha256=RWlLK2cPHaCuNNhr8bW8etBeGklwREQJOafQbTydStk,25121
  dask_cuda/utils_test.py,sha256=WNMR0gic2tuP3pgygcR9g52NfyX8iGMOan6juXhpkCE,1694
  dask_cuda/worker_spec.py,sha256=7-Uq_e5q2SkTlsmctMcYLCa9_3RiiVHZLIN7ctfaFmE,4376
  dask_cuda/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dask_cuda/benchmarks/common.py,sha256=sEIFnRZS6wbyKCQyB4fDclYLc2YqC0PolurR5qzuRxw,6393
+ dask_cuda/benchmarks/common.py,sha256=2MnDdQjvHfGaUWDgiTcTGI_EeKPmVBEwoWfsJUNpOjU,6613
  dask_cuda/benchmarks/local_cudf_groupby.py,sha256=T9lA9nb4Wzu46AH--SJEVCeCm3650J7slapdNR_08FU,8904
- dask_cuda/benchmarks/local_cudf_merge.py,sha256=POjxoPx4zY1TjG2S_anElL6rDtC5Jhn3nF4HABlnwZg,12447
- dask_cuda/benchmarks/local_cudf_shuffle.py,sha256=M-Lp3O3q8uyY50imQqMKZYwkAmyR0NApjx2ipGxDkXw,8608
+ dask_cuda/benchmarks/local_cudf_merge.py,sha256=AsuVnMA3H93sJwjjgi4KaIdYKnnX1OeRMPiXizrwHGk,12577
+ dask_cuda/benchmarks/local_cudf_shuffle.py,sha256=2xWJZf3gwDNimXKZN2ivtU3OE_qec1KNOhgL4_AGQZU,8655
  dask_cuda/benchmarks/local_cupy.py,sha256=aUKIYfeR7c77K4kKk697Rxo8tG8kFabQ9jQEVGr-oTs,10762
  dask_cuda/benchmarks/local_cupy_map_overlap.py,sha256=_texYmam1K_XbzIvURltui5KRsISGFNylXiGUtgRIz0,6442
- dask_cuda/benchmarks/utils.py,sha256=baL5zK6VS6Mw_M4x9zJe8vMLUd2SZd1lS78JrL-h6oo,26896
+ dask_cuda/benchmarks/utils.py,sha256=4k8KnJPOczKDQNBPRWlaGsU2zdEA09BDGgklUXggwMU,30008
  dask_cuda/explicit_comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dask_cuda/explicit_comms/comms.py,sha256=Su6PuNo68IyS-AwoqU4S9TmqWsLvUdNa0jot2hx8jQQ,10400
  dask_cuda/explicit_comms/dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=YferHNWKsMea8tele-ynPVr_6RAZNZIR-VzK_uFuEQU,20131
- dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=u3kW91YRLdHFycvpGfSQKrEucu5khMJ1k4sjmddO490,4910
- dask_cuda/tests/test_dask_cuda_worker.py,sha256=gViHaMCSfB6ip125OEi9D0nfKC-qBXRoHz6BRodEdb4,17729
+ dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=4xfhfbTGa36YPs_ex1_fFhzfGMYJq-QkS5q0RwgeHh8,20645
+ dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=qVN9J0Hdv66A9COFArLIdRriyyxEKpS3lEZGHbVHaq8,4903
+ dask_cuda/tests/test_dask_cuda_worker.py,sha256=o5g0_t-2M_2lfPeOPTS4NVF4rnQF0ZWAZekXw2h0xPc,19610
  dask_cuda/tests/test_device_host_file.py,sha256=79ssUISo1YhsW_7HdwqPfsH2LRzS2bi5BjPym1Sdgqw,5882
- dask_cuda/tests/test_dgx.py,sha256=Oh2vwL_CdUzSVQQoiIu6SPwXGRtmXwaW_Hh3ipXPUOc,7162
- dask_cuda/tests/test_explicit_comms.py,sha256=I4lSW-NQ0E08baEoG7cY4Ix3blGb1Auz88q2BNd1cPA,13136
+ dask_cuda/tests/test_dgx.py,sha256=BPCF4ZvhrVKkT43OOFHdijuo-M34vW3V18C8rRH1HXg,7489
+ dask_cuda/tests/test_explicit_comms.py,sha256=Pa5vVx63qWtScnVJuS31WESXIt2FPyTJVFO-0OUbbmU,15276
  dask_cuda/tests/test_from_array.py,sha256=okT1B6UqHmLxoy0uER0Ylm3UyOmi5BAXwJpTuTAw44I,601
  dask_cuda/tests/test_gds.py,sha256=6jf0HPTHAIG8Mp_FC4Ai4zpn-U1K7yk0fSXg8He8-r8,1513
  dask_cuda/tests/test_initialize.py,sha256=Rba59ZbljEm1yyN94_sWZPEE_f7hWln95aiBVc49pmY,6960
- dask_cuda/tests/test_local_cuda_cluster.py,sha256=G3kR-4o-vCqWWfSuQLFKVEK0F243FaDSgRlDTUll5aU,18376
+ dask_cuda/tests/test_local_cuda_cluster.py,sha256=Lc9QncyGwBwhaZPGBfreXJf3ZC9Zd8SjDc2fpeQ-BT0,19710
  dask_cuda/tests/test_proxify_host_file.py,sha256=Yiv0sDcUoWw0d2oiPeHGoHqqSSM4lfQ4rChCiaxb6EU,18994
- dask_cuda/tests/test_proxy.py,sha256=6iicSYYT2BGo1iKUQ7jM00mCjC4gtfwwxFXfGwH3QHc,23807
+ dask_cuda/tests/test_proxy.py,sha256=OnGnPkl5ksCb-3hpEKG2z1OfPK9DbnOCtBHOjcUUjhg,23809
  dask_cuda/tests/test_spill.py,sha256=xN9PbVERBYMuZxvscSO0mAM22loq9WT3ltZVBFxlmM4,10239
  dask_cuda/tests/test_utils.py,sha256=JRIwXfemc3lWSzLJX0VcvR1_0wB4yeoOTsw7kB6z6pU,9176
+ dask_cuda/tests/test_version.py,sha256=vK2HjlRLX0nxwvRsYxBqhoZryBNZklzA-vdnyuWDxVg,365
  dask_cuda/tests/test_worker_spec.py,sha256=Bvu85vkqm6ZDAYPXKMJlI2pm9Uc5tiYKNtO4goXSw-I,2399
  examples/ucx/client_initialize.py,sha256=YN3AXHF8btcMd6NicKKhKR9SXouAsK1foJhFspbOn70,1262
  examples/ucx/local_cuda_cluster.py,sha256=7xVY3EhwhkY2L4VZin_BiMCbrjhirDNChoC86KiETNc,1983
- dask_cuda-24.4.0.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
- dask_cuda-24.4.0.dist-info/METADATA,sha256=mjy5YntsqIKdldcPEj-jjR70Aphe-B0LXqjcZjVFS9U,2570
- dask_cuda-24.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- dask_cuda-24.4.0.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
- dask_cuda-24.4.0.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
- dask_cuda-24.4.0.dist-info/RECORD,,
+ dask_cuda-24.8.2.dist-info/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
+ dask_cuda-24.8.2.dist-info/METADATA,sha256=6iMwPI8cWrEYDYz73vm8pw-LkVeEgTQzymJgRxj32VQ,2546
+ dask_cuda-24.8.2.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ dask_cuda-24.8.2.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
+ dask_cuda-24.8.2.dist-info/top_level.txt,sha256=3kKxJxeM108fuYc_lwwlklP7YBU9IEmdmRAouzi397o,33
+ dask_cuda-24.8.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (72.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any