dask-cuda 25.12.0__py3-none-manylinux_2_28_aarch64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -0
- dask_cuda/VERSION +1 -0
- dask_cuda/__init__.py +57 -0
- dask_cuda/_compat.py +19 -0
- dask_cuda/_version.py +19 -0
- dask_cuda/benchmarks/__init__.py +2 -0
- dask_cuda/benchmarks/common.py +216 -0
- dask_cuda/benchmarks/local_cudf_groupby.py +278 -0
- dask_cuda/benchmarks/local_cudf_merge.py +373 -0
- dask_cuda/benchmarks/local_cudf_shuffle.py +327 -0
- dask_cuda/benchmarks/local_cupy.py +327 -0
- dask_cuda/benchmarks/local_cupy_map_overlap.py +198 -0
- dask_cuda/benchmarks/read_parquet.py +270 -0
- dask_cuda/benchmarks/utils.py +936 -0
- dask_cuda/cli.py +546 -0
- dask_cuda/cuda_worker.py +237 -0
- dask_cuda/device_host_file.py +325 -0
- dask_cuda/disk_io.py +227 -0
- dask_cuda/explicit_comms/__init__.py +2 -0
- dask_cuda/explicit_comms/comms.py +359 -0
- dask_cuda/explicit_comms/dataframe/__init__.py +2 -0
- dask_cuda/explicit_comms/dataframe/shuffle.py +722 -0
- dask_cuda/get_device_memory_objects.py +155 -0
- dask_cuda/initialize.py +245 -0
- dask_cuda/is_device_object.py +44 -0
- dask_cuda/is_spillable_object.py +59 -0
- dask_cuda/local_cuda_cluster.py +459 -0
- dask_cuda/plugins.py +209 -0
- dask_cuda/proxify_device_objects.py +263 -0
- dask_cuda/proxify_host_file.py +795 -0
- dask_cuda/proxy_object.py +951 -0
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_cudf_builtin_spilling.py +155 -0
- dask_cuda/tests/test_dask_cuda_worker.py +696 -0
- dask_cuda/tests/test_dask_setup.py +193 -0
- dask_cuda/tests/test_device_host_file.py +204 -0
- dask_cuda/tests/test_dgx.py +227 -0
- dask_cuda/tests/test_explicit_comms.py +566 -0
- dask_cuda/tests/test_from_array.py +20 -0
- dask_cuda/tests/test_gds.py +47 -0
- dask_cuda/tests/test_initialize.py +434 -0
- dask_cuda/tests/test_local_cuda_cluster.py +661 -0
- dask_cuda/tests/test_proxify_host_file.py +534 -0
- dask_cuda/tests/test_proxy.py +698 -0
- dask_cuda/tests/test_spill.py +504 -0
- dask_cuda/tests/test_utils.py +348 -0
- dask_cuda/tests/test_version.py +13 -0
- dask_cuda/tests/test_worker_spec.py +83 -0
- dask_cuda/utils.py +974 -0
- dask_cuda/utils_test.py +48 -0
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +131 -0
- dask_cuda-25.12.0.dist-info/METADATA +75 -0
- dask_cuda-25.12.0.dist-info/RECORD +61 -0
- dask_cuda-25.12.0.dist-info/WHEEL +6 -0
- dask_cuda-25.12.0.dist-info/entry_points.txt +6 -0
- dask_cuda-25.12.0.dist-info/licenses/LICENSE +201 -0
- dask_cuda-25.12.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +425 -0
dask_cuda/GIT_COMMIT
ADDED
@@ -0,0 +1 @@
+7edf2c69a732ebd24f8dd3e76cb06235a473f7a5
dask_cuda/VERSION
ADDED
@@ -0,0 +1 @@
+25.12.000
dask_cuda/__init__.py
ADDED
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+if sys.platform != "linux":
+    raise ImportError("Only Linux is supported by Dask-CUDA at this time")
+
+import dask
+import dask.utils
+from distributed.protocol.cuda import cuda_deserialize, cuda_serialize
+from distributed.protocol.serialize import dask_deserialize, dask_serialize
+
+from ._version import __git_commit__, __version__
+from .cuda_worker import CUDAWorker
+
+from .local_cuda_cluster import LocalCUDACluster
+
+
+try:
+    import dask.dataframe as dask_dataframe
+except ImportError:
+    # Dask DataFrame (optional) isn't installed
+    dask_dataframe = None
+
+
+if dask_dataframe is not None:
+    from .explicit_comms.dataframe.shuffle import patch_shuffle_expression
+    from .proxify_device_objects import proxify_decorator, unproxify_decorator
+
+    # Monkey patching Dask to make use of explicit-comms when `DASK_EXPLICIT_COMMS=True`
+    patch_shuffle_expression()
+    # Monkey patching Dask to make use of proxify and unproxify in compatibility mode
+    dask_dataframe.shuffle.shuffle_group = proxify_decorator(
+        dask.dataframe.shuffle.shuffle_group
+    )
+    dask_dataframe.core._concat = unproxify_decorator(dask.dataframe.core._concat)
+
+def _register_cudf_spill_aware():
+    import cudf
+
+    # Only enable Dask/cuDF spilling if cuDF spilling is disabled, see
+    # https://github.com/rapidsai/dask-cuda/issues/1363
+    if not cudf.get_option("spill"):
+        # This reproduces the implementation of `_register_cudf`, see
+        # https://github.com/dask/distributed/blob/40fcd65e991382a956c3b879e438be1b100dff97/distributed/protocol/__init__.py#L106-L115
+        from cudf.comm import serialize
+
+for registry in [
+    cuda_serialize,
+    cuda_deserialize,
+    dask_serialize,
+    dask_deserialize,
+]:
+    for lib in ["cudf", "dask_cudf"]:
+        if lib in registry._lazy:
+            registry._lazy[lib] = _register_cudf_spill_aware
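All of the patching above happens at import time, so importing dask_cuda is enough to activate it. A minimal usage sketch (not part of the packaged files, assuming a Linux machine with at least one NVIDIA GPU and distributed installed):

    # Start one Dask-CUDA worker per GPU visible through CUDA_VISIBLE_DEVICES
    # and connect a Dask client to the resulting local cluster.
    from distributed import Client

    from dask_cuda import LocalCUDACluster

    if __name__ == "__main__":
        cluster = LocalCUDACluster()
        client = Client(cluster)
        print(len(client.scheduler_info()["workers"]), "GPU worker(s)")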
dask_cuda/_compat.py
ADDED
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+import functools
+import importlib.metadata
+
+import packaging.version
+
+
+@functools.lru_cache(maxsize=None)
+def get_dask_version() -> packaging.version.Version:
+    return packaging.version.parse(importlib.metadata.version("dask"))
+
+
+@functools.lru_cache(maxsize=None)
+def DASK_2025_4_0():
+    # dask 2025.4.0 isn't currently released, so we're relying
+    # on strictly greater than here.
+    return get_dask_version() > packaging.version.parse("2025.3.0")
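DASK_2025_4_0 is a cached predicate on the installed Dask version. A hypothetical call site (the branch bodies are placeholders, not code from this package):

    from dask_cuda._compat import DASK_2025_4_0

    if DASK_2025_4_0():
        pass  # installed Dask is newer than 2025.3.0, take the newer code path
    else:
        pass  # older Dask, keep the previous behaviour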
dask_cuda/_version.py
ADDED
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
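Both attributes are read from the VERSION and GIT_COMMIT data files shipped in the wheel and can be inspected directly, for example:

    from dask_cuda._version import __git_commit__, __version__

    print(__version__)     # contents of dask_cuda/VERSION
    print(__git_commit__)  # contents of dask_cuda/GIT_COMMIT, or "" if the file is missing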
dask_cuda/benchmarks/common.py
ADDED
@@ -0,0 +1,216 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+from argparse import Namespace
+from functools import partial
+from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
+from warnings import filterwarnings
+
+import numpy as np
+import pandas as pd
+
+import dask
+from distributed import Client, performance_report
+
+from dask_cuda.benchmarks.utils import (
+    address_to_index,
+    aggregate_transfer_log_data,
+    bandwidth_statistics,
+    get_cluster_options,
+    peer_to_peer_bandwidths,
+    save_benchmark_data,
+    setup_memory_pools,
+    wait_for_cluster,
+)
+from dask_cuda.utils import all_to_all
+
+__all__ = ("execute_benchmark", "Config")
+
+
+class Config(NamedTuple):
+    """Benchmark configuration"""
+
+    args: Namespace
+    """Parsed benchmark arguments"""
+    bench_once: Callable[[Client, Namespace, Optional[str]], Any]
+    """Callable to run a single benchmark iteration
+
+    Parameters
+    ----------
+    client
+        distributed Client object
+    args
+        Benchmark parsed arguments
+    write_profile
+        Should a profile be written?
+
+    Returns
+    -------
+    Benchmark data to be interpreted by ``pretty_print_results`` and
+    ``create_tidy_results``.
+    """
+    create_tidy_results: Callable[
+        [Namespace, np.ndarray, List[Any]], Tuple[pd.DataFrame, np.ndarray]
+    ]
+    """Callable to create tidy results for saving to disk
+
+    Parameters
+    ----------
+    args
+        Benchmark parsed arguments
+    p2p_bw
+        Array of point-to-point bandwidths
+    results: list
+        List of results from running ``bench_once``
+    Returns
+    -------
+    tuple
+        two-tuple of a pandas dataframe and the point-to-point bandwidths
+    """
+    pretty_print_results: Callable[
+        [Namespace, Mapping[str, int], np.ndarray, List[Any], Optional[Client]], None
+    ]
+    """Callable to pretty-print results for human consumption
+
+    Parameters
+    ----------
+    args
+        Benchmark parsed arguments
+    address_to_index
+        Mapping from worker addresses to indices
+    p2p_bw
+        Array of point-to-point bandwidths
+    results: list
+        List of results from running ``bench_once``
+    """
+
+
+def run_benchmark(client: Client, args: Namespace, config: Config):
+    """Run a benchmark a specified number of times
+
+    If ``args.profile`` is set, the final run is profiled.
+    """
+
+    results = []
+    for _ in range(max(0, args.warmup_runs)):
+        config.bench_once(client, args, write_profile=None)
+
+    ctx = contextlib.nullcontext()
+    if args.profile is not None:
+        ctx = performance_report(filename=args.profile)
+    with ctx:
+        for _ in range(max(1, args.runs) - 1):
+            res = config.bench_once(client, args, write_profile=None)
+            results.append(res)
+        results.append(config.bench_once(client, args, write_profile=args.profile_last))
+    return results
+
+
+def gather_bench_results(client: Client, args: Namespace, config: Config):
+    """Collect benchmark results from the workers"""
+    address2index = address_to_index(client)
+    if args.all_to_all:
+        all_to_all(client)
+    results = run_benchmark(client, args, config)
+    # Collect aggregated peer-to-peer bandwidth
+    message_data = client.run(
+        partial(aggregate_transfer_log_data, bandwidth_statistics, args.ignore_size)
+    )
+    return address2index, results, message_data
+
+
+def run(client: Client, args: Namespace, config: Config):
+    """Run the full benchmark on the cluster
+
+    Waits for the cluster, sets up memory pools, prints and saves results
+    """
+
+    wait_for_cluster(client, shutdown_on_failure=True)
+    assert len(client.scheduler_info(n_workers=-1)["workers"]) > 0
+    setup_memory_pools(
+        client=client,
+        is_gpu=args.type == "gpu",
+        disable_rmm=args.disable_rmm,
+        disable_rmm_pool=args.disable_rmm_pool,
+        pool_size=args.rmm_pool_size,
+        maximum_pool_size=args.rmm_maximum_pool_size,
+        rmm_async=args.enable_rmm_async,
+        rmm_managed=args.enable_rmm_managed,
+        release_threshold=args.rmm_release_threshold,
+        log_directory=args.rmm_log_directory,
+        statistics=args.enable_rmm_statistics,
+        rmm_track_allocations=args.enable_rmm_track_allocations,
+    )
+    address_to_index, results, message_data = gather_bench_results(client, args, config)
+    p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index)
+    config.pretty_print_results(args, address_to_index, p2p_bw, results, client=client)
+    if args.output_basename:
+        df, p2p_bw = config.create_tidy_results(args, p2p_bw, results)
+        df["num_workers"] = len(address_to_index)
+        save_benchmark_data(
+            args.output_basename,
+            address_to_index,
+            df,
+            p2p_bw,
+        )
+
+
+def run_client_from_existing_scheduler(args: Namespace, config: Config):
+    """Set up a client by connecting to a scheduler
+
+    Shuts down the cluster at the end of the benchmark conditional on
+    ``args.shutdown_cluster``.
+    """
+    if args.scheduler_address is not None:
+        kwargs = {"address": args.scheduler_address}
+    elif args.scheduler_file is not None:
+        kwargs = {"scheduler_file": args.scheduler_file}
+    else:
+        raise RuntimeError(
+            "Need to specify either --scheduler-file or --scheduler-address"
+        )
+    with Client(**kwargs) as client:
+        run(client, args, config)
+        if args.shutdown_cluster:
+            client.shutdown()
+
+
+def run_create_client(args: Namespace, config: Config):
+    """Create a client + cluster and run
+
+    Shuts down the cluster at the end of the benchmark
+    """
+    cluster_options = get_cluster_options(args)
+    Cluster = cluster_options["class"]
+    cluster_args = cluster_options["args"]
+    cluster_kwargs = cluster_options["kwargs"]
+    scheduler_addr = cluster_options["scheduler_addr"]
+
+    filterwarnings("ignore", message=".*NVLink.*rmm_pool_size.*", category=UserWarning)
+
+    with Cluster(*cluster_args, **cluster_kwargs) as cluster:
+        # Use the scheduler address with an SSHCluster rather than the cluster
+        # object, otherwise we can't shut it down.
+        with Client(scheduler_addr if args.multi_node else cluster) as client:
+            run(client, args, config)
+            # An SSHCluster will not automatically shut down, we have to
+            # ensure it does.
+            if args.multi_node:
+                client.shutdown()
+
+
+def execute_benchmark(config: Config):
+    """Run complete benchmark given a configuration"""
+    args = config.args
+    if args.multiprocessing_method == "forkserver":
+        import multiprocessing.forkserver as f
+
+        f.ensure_running()
+    with dask.config.set(
+        {"distributed.worker.multiprocessing-method": args.multiprocessing_method}
+    ):
+        if args.scheduler_file is not None or args.scheduler_address is not None:
+            run_client_from_existing_scheduler(args, config)
+        else:
+            run_create_client(args, config)
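execute_benchmark drives everything from a Config namedtuple: a benchmark module supplies bench_once plus the two reporting callables, while common.py handles cluster creation or attachment, warmup and profiled runs, and peer-to-peer bandwidth collection. The next file, local_cudf_groupby.py, is a complete concrete example; the skeleton below is only a hypothetical illustration of that contract:

    # Hypothetical skeleton of a benchmark module built on dask_cuda.benchmarks.common.
    import pandas as pd

    from dask_cuda.benchmarks.common import Config, execute_benchmark
    from dask_cuda.benchmarks.utils import parse_benchmark_args

    def bench_once(client, args, write_profile=None):
        # Run one iteration and return whatever the reporting callables expect.
        return 0.0

    def pretty_print_results(args, address_to_index, p2p_bw, results, client=None):
        print(results)

    def create_tidy_results(args, p2p_bw, results):
        return pd.DataFrame({"wallclock": results}), p2p_bw

    if __name__ == "__main__":
        execute_benchmark(
            Config(
                args=parse_benchmark_args(description="Hypothetical benchmark", args_list=[]),
                bench_once=bench_once,
                create_tidy_results=create_tidy_results,
                pretty_print_results=pretty_print_results,
            )
        )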
dask_cuda/benchmarks/local_cudf_groupby.py
ADDED
@@ -0,0 +1,278 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+from collections import ChainMap
+from time import perf_counter as clock
+
+import pandas as pd
+
+import dask
+import dask.dataframe as dd
+from dask.distributed import performance_report, wait
+from dask.utils import format_bytes
+
+from dask_cuda.benchmarks.common import Config, execute_benchmark
+from dask_cuda.benchmarks.utils import (
+    as_noop,
+    parse_benchmark_args,
+    print_key_value,
+    print_separator,
+    print_throughput_bandwidth,
+)
+
+
+def apply_groupby(
+    df,
+    backend,
+    sort=False,
+    split_out=1,
+    split_every=8,
+    shuffle=None,
+):
+    if backend == "dask-noop" and shuffle == "explicit-comms":
+        raise RuntimeError("dask-noop not valid for explicit-comms shuffle")
+    # Handle special "explicit-comms" case
+    config = {}
+    if shuffle == "explicit-comms":
+        shuffle = "tasks"
+        config = {"explicit-comms": True}
+
+    with dask.config.set(config):
+        agg = df.groupby("key", sort=sort).agg(
+            {"int64": ["max", "count"], "float64": "mean"},
+            split_out=split_out,
+            split_every=split_every,
+            shuffle=shuffle,
+        )
+    if backend == "dask-noop":
+        agg = as_noop(agg)
+
+    wait(agg.persist())
+    return agg
+
+
+def generate_chunk(chunk_info, unique_size=1, gpu=True):
+    # Setting a seed that triggers max amount of comm in the two-GPU case.
+    if gpu:
+        import cupy as xp
+
+        import cudf as xdf
+    else:
+        import numpy as xp
+        import pandas as xdf
+
+    i_chunk, local_size = chunk_info
+    xp.random.seed(i_chunk * 1_000)
+    return xdf.DataFrame(
+        {
+            "key": xp.random.randint(0, unique_size, size=local_size, dtype="int64"),
+            "int64": xp.random.permutation(xp.arange(local_size, dtype="int64")),
+            "float64": xp.random.permutation(xp.arange(local_size, dtype="float64")),
+        }
+    )
+
+
+def get_random_ddf(args):
+    total_size = args.chunk_size * args.in_parts
+    chunk_kwargs = {
+        "unique_size": max(int(args.unique_ratio * total_size), 1),
+        "gpu": True if args.type == "gpu" else False,
+    }
+
+    return dd.from_map(
+        generate_chunk,
+        [(i, args.chunk_size) for i in range(args.in_parts)],
+        meta=generate_chunk((0, 1), **chunk_kwargs),
+        enforce_metadata=False,
+        **chunk_kwargs,
+    )
+
+
+def bench_once(client, args, write_profile=None):
+    # Generate random Dask dataframe
+    df = get_random_ddf(args)
+
+    data_processed = len(df) * sum([t.itemsize for t in df.dtypes])
+    shuffle = {
+        "True": "tasks",
+        "False": False,
+    }.get(args.shuffle, args.shuffle)
+
+    ctx = contextlib.nullcontext()
+    if write_profile is not None:
+        ctx = performance_report(filename=write_profile)
+
+    with ctx:
+        t1 = clock()
+        agg = apply_groupby(
+            df,
+            backend=args.backend,
+            sort=args.sort,
+            split_out=args.split_out,
+            split_every=args.split_every,
+            shuffle=shuffle,
+        )
+        t2 = clock()
+
+    output_size = agg.memory_usage(index=True, deep=True).compute().sum()
+    return (data_processed, output_size, t2 - t1)
+
+
+def pretty_print_results(args, address_to_index, p2p_bw, results, client=None):
+    if args.markdown:
+        print("```")
+    print("Groupby benchmark")
+    print_separator(separator="-")
+    print_key_value(key="Use shuffle", value=f"{args.shuffle}")
+    print_key_value(key="Backend", value=f"{args.backend}")
+    print_key_value(key="Output partitions", value=f"{args.split_out}")
+    print_key_value(key="Input partitions", value=f"{args.in_parts}")
+    print_key_value(key="Sort Groups", value=f"{args.sort}")
+    print_key_value(key="Rows-per-chunk", value=f"{args.chunk_size}")
+    print_key_value(key="Unique-group ratio", value=f"{args.unique_ratio}")
+    print_key_value(key="Protocol", value=f"{args.protocol}")
+    print_key_value(key="Device(s)", value=f"{args.devs}")
+    print_key_value(key="Tree-reduction width", value=f"{args.split_every}")
+    if args.device_memory_limit:
+        print_key_value(
+            key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
+        )
+    print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
+    if args.protocol in ["ucx", "ucxx"]:
+        print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
+        print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
+        print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
+    print_key_value(key="Worker thread(s)", value=f"{args.threads_per_worker}")
+    print_key_value(key="Data processed", value=f"{format_bytes(results[0][0])}")
+    print_key_value(key="Output size", value=f"{format_bytes(results[0][1])}")
+    if args.markdown:
+        print("\n```")
+    data_processed, output_size, durations = zip(*results)
+    print_throughput_bandwidth(
+        args, durations, data_processed, p2p_bw, address_to_index
+    )
+
+
+def create_tidy_results(args, p2p_bw, results):
+    configuration = {
+        "dataframe_type": "cudf" if args.type == "gpu" else "pandas",
+        "shuffle": args.shuffle,
+        "backend": args.backend,
+        "sort": args.sort,
+        "split_out": args.split_out,
+        "split_every": args.split_every,
+        "in_parts": args.in_parts,
+        "rows_per_chunk": args.chunk_size,
+        "unique_ratio": args.unique_ratio,
+        "protocol": args.protocol,
+        "devs": args.devs,
+        "device_memory_limit": args.device_memory_limit,
+        "rmm_pool": not args.disable_rmm_pool,
+        "tcp": args.enable_tcp_over_ucx,
+        "ib": args.enable_infiniband,
+        "nvlink": args.enable_nvlink,
+    }
+    timing_data = pd.DataFrame(
+        [
+            pd.Series(
+                data=ChainMap(
+                    configuration,
+                    {
+                        "wallclock": duration,
+                        "data_processed": data_processed,
+                        "output_size": output_size,
+                    },
+                )
+            )
+            for data_processed, output_size, duration in results
+        ]
+    )
+    return timing_data, p2p_bw
+
+
+def parse_args():
+    special_args = [
+        {
+            "name": "--in-parts",
+            "default": 100,
+            "metavar": "n",
+            "type": int,
+            "help": "Number of input partitions (default '100')",
+        },
+        {
+            "name": [
+                "-c",
+                "--chunk-size",
+            ],
+            "default": 1_000_000,
+            "metavar": "n",
+            "type": int,
+            "help": "Chunk size (default 1_000_000)",
+        },
+        {
+            "name": "--unique-ratio",
+            "default": 0.01,
+            "type": float,
+            "help": "Fraction of rows that are unique groups",
+        },
+        {
+            "name": "--sort",
+            "default": False,
+            "action": "store_true",
+            "help": "Whether to sort the output group order.",
+        },
+        {
+            "name": "--split_out",
+            "default": 1,
+            "type": int,
+            "help": "How many partitions to return.",
+        },
+        {
+            "name": "--split_every",
+            "default": 8,
+            "type": int,
+            "help": "Tree-reduction width.",
+        },
+        {
+            "name": "--shuffle",
+            "choices": ["False", "True", "tasks", "explicit-comms"],
+            "default": "False",
+            "type": str,
+            "help": "Whether to use shuffle-based groupby.",
+        },
+        {
+            "name": "--backend",
+            "choices": ["dask", "dask-noop"],
+            "default": "dask",
+            "type": str,
+            "help": (
+                "Compute engine to use, dask-noop turns the graph into a noop graph"
+            ),
+        },
+        {
+            "name": [
+                "-t",
+                "--type",
+            ],
+            "choices": ["cpu", "gpu"],
+            "default": "gpu",
+            "type": str,
+            "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')",
+        },
+    ]
+
+    return parse_benchmark_args(
+        description="Distributed groupby (dask/cudf) benchmark", args_list=special_args
+    )
+
+
+if __name__ == "__main__":
+    execute_benchmark(
+        Config(
+            args=parse_args(),
+            bench_once=bench_once,
+            create_tidy_results=create_tidy_results,
+            pretty_print_results=pretty_print_results,
+        )
+    )
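As apply_groupby shows, the explicit-comms path is selected purely through the "explicit-comms" Dask config key (relying on the patching performed in dask_cuda/__init__.py), and the whole benchmark is runnable as a module via python -m dask_cuda.benchmarks.local_cudf_groupby. Outside the benchmark the same switch can be flipped directly; a sketch, assuming dask_cuda has been imported and a Dask-CUDA cluster is running:

    import dask

    # Route shuffle-based operations through dask-cuda's explicit-comms implementation
    # for everything executed inside this context.
    with dask.config.set({"explicit-comms": True}):
        pass  # shuffle-heavy dask.dataframe work, e.g. ddf.groupby("key").agg(...)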