dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
- dask_cuda/get_device_memory_objects.py +3 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +12 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +109 -25
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +30 -12
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +159 -52
- dask_cuda/tests/test_proxify_host_file.py +19 -3
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +7 -0
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +261 -38
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.6.0.dist-info/RECORD +0 -57
- dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
import asyncio
|
|
5
|
+
import contextlib
|
|
5
6
|
import os
|
|
6
7
|
import pkgutil
|
|
7
8
|
import sys
|
|
@@ -20,15 +21,17 @@ from dask_cuda.utils import (
|
|
|
20
21
|
get_device_total_memory,
|
|
21
22
|
get_gpu_count_mig,
|
|
22
23
|
get_gpu_uuid,
|
|
24
|
+
has_device_memory_resource,
|
|
23
25
|
print_cluster_config,
|
|
24
26
|
)
|
|
25
|
-
from dask_cuda.utils_test import MockWorker
|
|
27
|
+
from dask_cuda.utils_test import MockWorker, get_ucx_implementation
|
|
26
28
|
|
|
27
29
|
|
|
28
30
|
@gen_test(timeout=20)
|
|
29
31
|
async def test_local_cuda_cluster():
|
|
30
32
|
async with LocalCUDACluster(
|
|
31
|
-
scheduler_port=0,
|
|
33
|
+
scheduler_port=0,
|
|
34
|
+
asynchronous=True,
|
|
32
35
|
) as cluster:
|
|
33
36
|
async with Client(cluster, asynchronous=True) as client:
|
|
34
37
|
assert len(cluster.workers) == utils.get_n_gpus()
|
|
@@ -68,8 +71,8 @@ async def test_with_subset_of_cuda_visible_devices():
|
|
|
68
71
|
async with LocalCUDACluster(
|
|
69
72
|
scheduler_port=0,
|
|
70
73
|
asynchronous=True,
|
|
71
|
-
device_memory_limit=1,
|
|
72
74
|
worker_class=MockWorker,
|
|
75
|
+
data=dict,
|
|
73
76
|
) as cluster:
|
|
74
77
|
async with Client(cluster, asynchronous=True) as client:
|
|
75
78
|
assert len(cluster.workers) == 4
|
|
@@ -92,14 +95,11 @@ async def test_with_subset_of_cuda_visible_devices():
|
|
|
92
95
|
|
|
93
96
|
@pytest.mark.parametrize(
|
|
94
97
|
"protocol",
|
|
95
|
-
["ucx", "
|
|
98
|
+
["ucx", "ucx-old"],
|
|
96
99
|
)
|
|
97
100
|
@gen_test(timeout=20)
|
|
98
101
|
async def test_ucx_protocol(protocol):
|
|
99
|
-
|
|
100
|
-
pytest.importorskip("ucp")
|
|
101
|
-
elif protocol == "ucxx":
|
|
102
|
-
pytest.importorskip("ucxx")
|
|
102
|
+
get_ucx_implementation(protocol)
|
|
103
103
|
|
|
104
104
|
async with LocalCUDACluster(
|
|
105
105
|
protocol=protocol, asynchronous=True, data=dict
|
|
@@ -112,35 +112,32 @@ async def test_ucx_protocol(protocol):
|
|
|
112
112
|
|
|
113
113
|
@pytest.mark.parametrize(
|
|
114
114
|
"protocol",
|
|
115
|
-
["ucx", "
|
|
115
|
+
["ucx", "ucx-old"],
|
|
116
116
|
)
|
|
117
117
|
@gen_test(timeout=20)
|
|
118
118
|
async def test_explicit_ucx_with_protocol_none(protocol):
|
|
119
|
-
|
|
120
|
-
pytest.importorskip("ucp")
|
|
121
|
-
elif protocol == "ucxx":
|
|
122
|
-
pytest.importorskip("ucxx")
|
|
119
|
+
get_ucx_implementation(protocol)
|
|
123
120
|
|
|
124
121
|
initialize(protocol=protocol, enable_tcp_over_ucx=True)
|
|
125
122
|
async with LocalCUDACluster(
|
|
126
|
-
protocol=None,
|
|
123
|
+
protocol=None,
|
|
124
|
+
enable_tcp_over_ucx=True,
|
|
125
|
+
asynchronous=True,
|
|
127
126
|
) as cluster:
|
|
128
127
|
assert all(
|
|
129
|
-
ws.address.startswith("
|
|
128
|
+
ws.address.startswith(f"{protocol}://")
|
|
129
|
+
for ws in cluster.scheduler.workers.values()
|
|
130
130
|
)
|
|
131
131
|
|
|
132
132
|
|
|
133
133
|
@pytest.mark.filterwarnings("ignore:Exception ignored in")
|
|
134
134
|
@pytest.mark.parametrize(
|
|
135
135
|
"protocol",
|
|
136
|
-
["ucx", "
|
|
136
|
+
["ucx", "ucx-old"],
|
|
137
137
|
)
|
|
138
138
|
@gen_test(timeout=20)
|
|
139
139
|
async def test_ucx_protocol_type_error(protocol):
|
|
140
|
-
|
|
141
|
-
pytest.importorskip("ucp")
|
|
142
|
-
elif protocol == "ucxx":
|
|
143
|
-
pytest.importorskip("ucxx")
|
|
140
|
+
get_ucx_implementation(protocol)
|
|
144
141
|
|
|
145
142
|
initialize(protocol=protocol, enable_tcp_over_ucx=True)
|
|
146
143
|
with pytest.raises(TypeError):
|
|
@@ -153,7 +150,10 @@ async def test_ucx_protocol_type_error(protocol):
|
|
|
153
150
|
@gen_test(timeout=20)
|
|
154
151
|
async def test_n_workers():
|
|
155
152
|
async with LocalCUDACluster(
|
|
156
|
-
CUDA_VISIBLE_DEVICES="0,1",
|
|
153
|
+
CUDA_VISIBLE_DEVICES="0,1",
|
|
154
|
+
worker_class=MockWorker,
|
|
155
|
+
asynchronous=True,
|
|
156
|
+
data=dict,
|
|
157
157
|
) as cluster:
|
|
158
158
|
assert len(cluster.workers) == 2
|
|
159
159
|
assert len(cluster.worker_spec) == 2
|
|
@@ -208,10 +208,13 @@ async def test_no_memory_limits_cudaworker():
|
|
|
208
208
|
@gen_test(timeout=20)
|
|
209
209
|
async def test_all_to_all():
|
|
210
210
|
async with LocalCUDACluster(
|
|
211
|
-
CUDA_VISIBLE_DEVICES="0,1",
|
|
211
|
+
CUDA_VISIBLE_DEVICES="0,1",
|
|
212
|
+
worker_class=MockWorker,
|
|
213
|
+
asynchronous=True,
|
|
214
|
+
data=dict,
|
|
212
215
|
) as cluster:
|
|
213
216
|
async with Client(cluster, asynchronous=True) as client:
|
|
214
|
-
workers = list(client.scheduler_info()["workers"])
|
|
217
|
+
workers = list(client.scheduler_info(n_workers=-1)["workers"])
|
|
215
218
|
n_workers = len(workers)
|
|
216
219
|
await utils.all_to_all(client)
|
|
217
220
|
# assert all to all has resulted in all data on every worker
|
|
@@ -263,11 +266,6 @@ async def test_rmm_managed():
|
|
|
263
266
|
async def test_rmm_async():
|
|
264
267
|
rmm = pytest.importorskip("rmm")
|
|
265
268
|
|
|
266
|
-
driver_version = rmm._cuda.gpu.driverGetVersion()
|
|
267
|
-
runtime_version = rmm._cuda.gpu.runtimeGetVersion()
|
|
268
|
-
if driver_version < 11020 or runtime_version < 11020:
|
|
269
|
-
pytest.skip("cudaMallocAsync not supported")
|
|
270
|
-
|
|
271
269
|
async with LocalCUDACluster(
|
|
272
270
|
rmm_async=True,
|
|
273
271
|
rmm_pool_size="2GB",
|
|
@@ -290,11 +288,6 @@ async def test_rmm_async():
|
|
|
290
288
|
async def test_rmm_async_with_maximum_pool_size():
|
|
291
289
|
rmm = pytest.importorskip("rmm")
|
|
292
290
|
|
|
293
|
-
driver_version = rmm._cuda.gpu.driverGetVersion()
|
|
294
|
-
runtime_version = rmm._cuda.gpu.runtimeGetVersion()
|
|
295
|
-
if driver_version < 11020 or runtime_version < 11020:
|
|
296
|
-
pytest.skip("cudaMallocAsync not supported")
|
|
297
|
-
|
|
298
291
|
async with LocalCUDACluster(
|
|
299
292
|
rmm_async=True,
|
|
300
293
|
rmm_pool_size="2GB",
|
|
@@ -381,7 +374,6 @@ async def test_cluster_worker():
|
|
|
381
374
|
async with LocalCUDACluster(
|
|
382
375
|
scheduler_port=0,
|
|
383
376
|
asynchronous=True,
|
|
384
|
-
device_memory_limit=1,
|
|
385
377
|
n_workers=1,
|
|
386
378
|
) as cluster:
|
|
387
379
|
assert len(cluster.workers) == 1
|
|
@@ -464,7 +456,7 @@ async def test_get_cluster_configuration():
|
|
|
464
456
|
async with LocalCUDACluster(
|
|
465
457
|
rmm_pool_size="2GB",
|
|
466
458
|
rmm_maximum_pool_size="3GB",
|
|
467
|
-
device_memory_limit="30B",
|
|
459
|
+
device_memory_limit="30B" if has_device_memory_resource() else None,
|
|
468
460
|
CUDA_VISIBLE_DEVICES="0",
|
|
469
461
|
scheduler_port=0,
|
|
470
462
|
asynchronous=True,
|
|
@@ -474,10 +466,14 @@ async def test_get_cluster_configuration():
|
|
|
474
466
|
assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
|
|
475
467
|
assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
|
|
476
468
|
assert ret["jit-unspill"] is False
|
|
477
|
-
|
|
469
|
+
if has_device_memory_resource():
|
|
470
|
+
assert ret["device-memory-limit"] == 30
|
|
478
471
|
|
|
479
472
|
|
|
480
473
|
@gen_test(timeout=20)
|
|
474
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
475
|
+
"Devices without dedicated memory resources do not support fractional limits"
|
|
476
|
+
)
|
|
481
477
|
async def test_worker_fraction_limits():
|
|
482
478
|
async with LocalCUDACluster(
|
|
483
479
|
dashboard_address=None,
|
|
@@ -503,6 +499,40 @@ async def test_worker_fraction_limits():
|
|
|
503
499
|
)
|
|
504
500
|
|
|
505
501
|
|
|
502
|
+
# Intentionally not using @gen_test to skip cleanup checks
|
|
503
|
+
@pytest.mark.parametrize(
|
|
504
|
+
"argument", ["pool_size", "maximum_pool_size", "release_threshold"]
|
|
505
|
+
)
|
|
506
|
+
@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
|
|
507
|
+
@pytest.mark.skip_if_device_memory(
|
|
508
|
+
"Devices with dedicated memory resources cannot test error"
|
|
509
|
+
)
|
|
510
|
+
def test_worker_fraction_limits_no_dedicated_memory(argument):
|
|
511
|
+
async def _test_worker_fraction_limits_no_dedicated_memory():
|
|
512
|
+
if argument == "pool_size":
|
|
513
|
+
kwargs = {"rmm_pool_size": "0.1"}
|
|
514
|
+
elif argument == "maximum_pool_size":
|
|
515
|
+
kwargs = {"rmm_pool_size": "1 GiB", "rmm_maximum_pool_size": "0.1"}
|
|
516
|
+
else:
|
|
517
|
+
kwargs = {"rmm_async": True, "rmm_release_threshold": "0.1"}
|
|
518
|
+
|
|
519
|
+
with raises_with_cause(
|
|
520
|
+
RuntimeError,
|
|
521
|
+
"Nanny failed to start",
|
|
522
|
+
RuntimeError,
|
|
523
|
+
"Worker failed to start",
|
|
524
|
+
ValueError,
|
|
525
|
+
"Fractional of total device memory not supported in devices without a "
|
|
526
|
+
"dedicated memory resource",
|
|
527
|
+
):
|
|
528
|
+
await LocalCUDACluster(
|
|
529
|
+
asynchronous=True,
|
|
530
|
+
**kwargs,
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
asyncio.run(_test_worker_fraction_limits_no_dedicated_memory())
|
|
534
|
+
|
|
535
|
+
|
|
506
536
|
@gen_test(timeout=20)
|
|
507
537
|
async def test_cudf_spill_disabled():
|
|
508
538
|
cudf = pytest.importorskip("cudf")
|
|
@@ -527,6 +557,9 @@ async def test_cudf_spill_disabled():
|
|
|
527
557
|
|
|
528
558
|
|
|
529
559
|
@gen_test(timeout=20)
|
|
560
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
561
|
+
"Devices without dedicated memory resources cannot enable cuDF spill"
|
|
562
|
+
)
|
|
530
563
|
async def test_cudf_spill():
|
|
531
564
|
cudf = pytest.importorskip("cudf")
|
|
532
565
|
|
|
@@ -551,27 +584,101 @@ async def test_cudf_spill():
|
|
|
551
584
|
assert v == 2
|
|
552
585
|
|
|
553
586
|
|
|
587
|
+
@pytest.mark.skip_if_device_memory(
|
|
588
|
+
"Devices with dedicated memory resources cannot test error"
|
|
589
|
+
)
|
|
590
|
+
@gen_test(timeout=20)
|
|
591
|
+
async def test_cudf_spill_no_dedicated_memory():
|
|
592
|
+
cudf = pytest.importorskip("cudf") # noqa: F841
|
|
593
|
+
|
|
594
|
+
with pytest.raises(
|
|
595
|
+
ValueError,
|
|
596
|
+
match="cuDF spilling is not supported on devices without dedicated memory",
|
|
597
|
+
):
|
|
598
|
+
await LocalCUDACluster(
|
|
599
|
+
enable_cudf_spill=True,
|
|
600
|
+
cudf_spill_stats=2,
|
|
601
|
+
asynchronous=True,
|
|
602
|
+
)
|
|
603
|
+
|
|
604
|
+
|
|
554
605
|
@pytest.mark.parametrize(
|
|
555
606
|
"protocol",
|
|
556
|
-
["ucx", "
|
|
607
|
+
["ucx", "ucx-old"],
|
|
557
608
|
)
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
609
|
+
@pytest.mark.parametrize(
|
|
610
|
+
"jit_unspill",
|
|
611
|
+
[False, True],
|
|
612
|
+
)
|
|
613
|
+
@pytest.mark.parametrize(
|
|
614
|
+
"device_memory_limit",
|
|
615
|
+
[None, "1B"],
|
|
616
|
+
)
|
|
617
|
+
def test_print_cluster_config(capsys, protocol, jit_unspill, device_memory_limit):
|
|
618
|
+
get_ucx_implementation(protocol)
|
|
563
619
|
|
|
564
620
|
pytest.importorskip("rich")
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
621
|
+
|
|
622
|
+
ctx = contextlib.nullcontext()
|
|
623
|
+
if not has_device_memory_resource():
|
|
624
|
+
if device_memory_limit:
|
|
625
|
+
ctx = pytest.raises(
|
|
626
|
+
ValueError,
|
|
627
|
+
match="device_memory_limit is set but device has no dedicated memory.",
|
|
628
|
+
)
|
|
629
|
+
if jit_unspill:
|
|
630
|
+
# JIT-Unspill exception has precedence, thus overwrite ctx if both are
|
|
631
|
+
# enabled
|
|
632
|
+
ctx = pytest.raises(
|
|
633
|
+
ValueError,
|
|
634
|
+
match="JIT-Unspill is not supported on devices without dedicated "
|
|
635
|
+
"memory",
|
|
636
|
+
)
|
|
637
|
+
|
|
638
|
+
with ctx:
|
|
639
|
+
with LocalCUDACluster(
|
|
640
|
+
n_workers=1,
|
|
641
|
+
device_memory_limit=device_memory_limit,
|
|
642
|
+
jit_unspill=jit_unspill,
|
|
643
|
+
protocol=protocol,
|
|
644
|
+
) as cluster:
|
|
645
|
+
with Client(cluster) as client:
|
|
646
|
+
print_cluster_config(client)
|
|
647
|
+
captured = capsys.readouterr()
|
|
648
|
+
assert "Dask Cluster Configuration" in captured.out
|
|
649
|
+
assert protocol in captured.out
|
|
650
|
+
if device_memory_limit == "1B":
|
|
651
|
+
assert "1 B" in captured.out
|
|
652
|
+
assert "[plugin]" in captured.out
|
|
653
|
+
client.shutdown()
|
|
654
|
+
|
|
655
|
+
def ucxpy_reset(timeout=20):
|
|
656
|
+
"""Reset UCX-Py with a timeout.
|
|
657
|
+
|
|
658
|
+
Attempt to reset UCX-Py, not doing so may cause a deadlock because UCX-Py is
|
|
659
|
+
not thread-safe and the Dask cluster may still be alive while a new cluster
|
|
660
|
+
and UCX-Py instances are initalized.
|
|
661
|
+
"""
|
|
662
|
+
import time
|
|
663
|
+
|
|
664
|
+
import ucp
|
|
665
|
+
|
|
666
|
+
start = time.monotonic()
|
|
667
|
+
while True:
|
|
668
|
+
try:
|
|
669
|
+
ucp.reset()
|
|
670
|
+
except ucp._libs.exceptions.UCXError as e:
|
|
671
|
+
if time.monotonic() - start > timeout:
|
|
672
|
+
raise RuntimeError(
|
|
673
|
+
f"Could not reset UCX-Py in {timeout} seconds, this may result "
|
|
674
|
+
f"in a deadlock. Failure:\n{e}"
|
|
675
|
+
)
|
|
676
|
+
continue
|
|
677
|
+
else:
|
|
678
|
+
break
|
|
679
|
+
|
|
680
|
+
if protocol == "ucx-old":
|
|
681
|
+
ucxpy_reset()
|
|
575
682
|
|
|
576
683
|
|
|
577
684
|
@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
3
|
|
|
3
4
|
from typing import Iterable
|
|
4
5
|
from unittest.mock import patch
|
|
@@ -219,6 +220,9 @@ def test_one_item_host_limit(capsys, root_dir):
|
|
|
219
220
|
assert len(dhf.manager) == 0
|
|
220
221
|
|
|
221
222
|
|
|
223
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
224
|
+
"Devices without dedicated memory resources do not support spilling"
|
|
225
|
+
)
|
|
222
226
|
def test_spill_on_demand(root_dir):
|
|
223
227
|
"""
|
|
224
228
|
Test spilling on demand by disabling the device_memory_limit
|
|
@@ -241,6 +245,9 @@ def test_spill_on_demand(root_dir):
|
|
|
241
245
|
|
|
242
246
|
|
|
243
247
|
@pytest.mark.parametrize("jit_unspill", [True, False])
|
|
248
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
249
|
+
"Devices without dedicated memory resources do not support spilling"
|
|
250
|
+
)
|
|
244
251
|
@gen_test(timeout=20)
|
|
245
252
|
async def test_local_cuda_cluster(jit_unspill):
|
|
246
253
|
"""Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
|
|
@@ -377,9 +384,9 @@ def test_externals(root_dir):
|
|
|
377
384
|
|
|
378
385
|
@patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,))
|
|
379
386
|
def test_incompatible_types(root_dir):
|
|
380
|
-
"""Check that ProxifyHostFile unproxifies
|
|
387
|
+
"""Check that ProxifyHostFile unproxifies ``cupy.ndarray`` on retrieval
|
|
381
388
|
|
|
382
|
-
Notice, in this test we add
|
|
389
|
+
Notice, in this test we add ``cupy.ndarray`` to the incompatible_types temporarily.
|
|
383
390
|
"""
|
|
384
391
|
cupy = pytest.importorskip("cupy")
|
|
385
392
|
cudf = pytest.importorskip("cudf")
|
|
@@ -398,6 +405,9 @@ def test_incompatible_types(root_dir):
|
|
|
398
405
|
|
|
399
406
|
@pytest.mark.parametrize("npartitions", [1, 2, 3])
|
|
400
407
|
@pytest.mark.parametrize("compatibility_mode", [True, False])
|
|
408
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
409
|
+
"Devices without dedicated memory resources do not support JIT-Unspill"
|
|
410
|
+
)
|
|
401
411
|
@gen_test(timeout=30)
|
|
402
412
|
async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions):
|
|
403
413
|
cudf = pytest.importorskip("cudf")
|
|
@@ -430,6 +440,9 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
|
|
|
430
440
|
assert all(res) # Only proxy objects
|
|
431
441
|
|
|
432
442
|
|
|
443
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
444
|
+
"Devices without dedicated memory resources do not support JIT-Unspill"
|
|
445
|
+
)
|
|
433
446
|
@gen_test(timeout=60)
|
|
434
447
|
async def test_worker_force_spill_to_disk():
|
|
435
448
|
"""Test Dask triggering CPU-to-Disk spilling"""
|
|
@@ -465,6 +478,9 @@ async def test_worker_force_spill_to_disk():
|
|
|
465
478
|
assert "Unmanaged memory use is high" not in log
|
|
466
479
|
|
|
467
480
|
|
|
481
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
482
|
+
"Devices without dedicated memory resources do not support JIT-Unspill"
|
|
483
|
+
)
|
|
468
484
|
def test_on_demand_debug_info():
|
|
469
485
|
"""Test worker logging when on-demand-spilling fails"""
|
|
470
486
|
rmm = pytest.importorskip("rmm")
|
dask_cuda/tests/test_proxy.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
1
4
|
import operator
|
|
2
5
|
import os
|
|
3
6
|
import pickle
|
|
@@ -23,7 +26,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
|
|
|
23
26
|
from dask_cuda.disk_io import SpillToDiskFile
|
|
24
27
|
from dask_cuda.proxify_device_objects import proxify_device_objects
|
|
25
28
|
from dask_cuda.proxify_host_file import ProxifyHostFile
|
|
26
|
-
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
|
|
29
|
+
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
|
|
27
30
|
|
|
28
31
|
# Make the "disk" serializer available and use a directory that are
|
|
29
32
|
# remove on exit.
|
|
@@ -242,7 +245,7 @@ def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
|
|
|
242
245
|
|
|
243
246
|
@pytest.mark.parametrize("backend", ["numpy", "cupy"])
|
|
244
247
|
def test_fixed_attribute_length(backend):
|
|
245
|
-
"""Test fixed attribute
|
|
248
|
+
"""Test fixed attribute ``x.__len__`` access
|
|
246
249
|
|
|
247
250
|
Notice, accessing fixed attributes shouldn't de-serialize the proxied object
|
|
248
251
|
"""
|
|
@@ -263,7 +266,7 @@ def test_fixed_attribute_length(backend):
|
|
|
263
266
|
|
|
264
267
|
|
|
265
268
|
def test_fixed_attribute_name():
|
|
266
|
-
"""Test fixed attribute
|
|
269
|
+
"""Test fixed attribute ``x.name`` access
|
|
267
270
|
|
|
268
271
|
Notice, accessing fixed attributes shouldn't de-serialize the proxied object
|
|
269
272
|
"""
|
|
@@ -284,6 +287,9 @@ def test_fixed_attribute_name():
|
|
|
284
287
|
|
|
285
288
|
|
|
286
289
|
@pytest.mark.parametrize("jit_unspill", [True, False])
|
|
290
|
+
@pytest.mark.skip_if_no_device_memory(
|
|
291
|
+
"Spilling not supported in devices without dedicated memory resource"
|
|
292
|
+
)
|
|
287
293
|
@gen_test(timeout=20)
|
|
288
294
|
async def test_spilling_local_cuda_cluster(jit_unspill):
|
|
289
295
|
"""Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
|
|
@@ -386,8 +392,8 @@ def test_serializing_array_to_disk(backend, serializers, size):
|
|
|
386
392
|
class _PxyObjTest(proxy_object.ProxyObject):
|
|
387
393
|
"""
|
|
388
394
|
A class that:
|
|
389
|
-
- defines
|
|
390
|
-
calling
|
|
395
|
+
- defines ``__dask_tokenize__`` in order to avoid deserialization when
|
|
396
|
+
calling ``client.scatter()``
|
|
391
397
|
- Asserts that no deserialization is performaned when communicating.
|
|
392
398
|
"""
|
|
393
399
|
|
|
@@ -401,14 +407,12 @@ class _PxyObjTest(proxy_object.ProxyObject):
|
|
|
401
407
|
|
|
402
408
|
|
|
403
409
|
@pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
|
|
404
|
-
@pytest.mark.parametrize("protocol", ["tcp", "ucx", "
|
|
410
|
+
@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
|
|
405
411
|
@gen_test(timeout=120)
|
|
406
412
|
async def test_communicating_proxy_objects(protocol, send_serializers):
|
|
407
413
|
"""Testing serialization of cuDF dataframe when communicating"""
|
|
408
|
-
if protocol
|
|
409
|
-
|
|
410
|
-
elif protocol == "ucxx":
|
|
411
|
-
pytest.importorskip("ucxx")
|
|
414
|
+
if protocol.startswith("ucx"):
|
|
415
|
+
get_ucx_implementation(protocol)
|
|
412
416
|
cudf = pytest.importorskip("cudf")
|
|
413
417
|
|
|
414
418
|
def task(x):
|
|
@@ -417,7 +421,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
|
|
|
417
421
|
serializers_used = x._pxy_get().serializer
|
|
418
422
|
|
|
419
423
|
# Check that `x` is serialized with the expected serializers
|
|
420
|
-
if protocol in ["ucx", "
|
|
424
|
+
if protocol in ["ucx", "ucx-old"]:
|
|
421
425
|
if send_serializers is None:
|
|
422
426
|
assert serializers_used == "cuda"
|
|
423
427
|
else:
|
|
@@ -448,15 +452,13 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
|
|
|
448
452
|
await client.submit(task, df)
|
|
449
453
|
|
|
450
454
|
|
|
451
|
-
@pytest.mark.parametrize("protocol", ["tcp", "ucx", "
|
|
455
|
+
@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
|
|
452
456
|
@pytest.mark.parametrize("shared_fs", [True, False])
|
|
453
457
|
@gen_test(timeout=20)
|
|
454
458
|
async def test_communicating_disk_objects(protocol, shared_fs):
|
|
455
459
|
"""Testing disk serialization of cuDF dataframe when communicating"""
|
|
456
|
-
if protocol
|
|
457
|
-
|
|
458
|
-
elif protocol == "ucxx":
|
|
459
|
-
pytest.importorskip("ucxx")
|
|
460
|
+
if protocol.startswith("ucx"):
|
|
461
|
+
get_ucx_implementation(protocol)
|
|
460
462
|
cudf = pytest.importorskip("cudf")
|
|
461
463
|
ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
|
|
462
464
|
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
import io
|
|
7
|
+
import multiprocessing as mp
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
from dask_cuda import LocalCUDACluster
|
|
13
|
+
|
|
14
|
+
mp = mp.get_context("spawn") # type: ignore
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _has_distributed_ucxx() -> bool:
|
|
18
|
+
return bool(importlib.util.find_spec("distributed_ucxx"))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _test_protocol_ucx():
|
|
22
|
+
with LocalCUDACluster(protocol="ucx") as cluster:
|
|
23
|
+
assert cluster.scheduler_comm.address.startswith("ucx://")
|
|
24
|
+
|
|
25
|
+
if _has_distributed_ucxx():
|
|
26
|
+
import distributed_ucxx
|
|
27
|
+
|
|
28
|
+
assert all(
|
|
29
|
+
isinstance(batched_send.comm, distributed_ucxx.ucxx.UCXX)
|
|
30
|
+
for batched_send in cluster.scheduler.stream_comms.values()
|
|
31
|
+
)
|
|
32
|
+
else:
|
|
33
|
+
import rapids_dask_dependency
|
|
34
|
+
|
|
35
|
+
assert all(
|
|
36
|
+
isinstance(
|
|
37
|
+
batched_send.comm,
|
|
38
|
+
rapids_dask_dependency.patches.distributed.comm.__rdd_patch_ucx.UCX,
|
|
39
|
+
)
|
|
40
|
+
for batched_send in cluster.scheduler.stream_comms.values()
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _test_protocol_ucxx():
|
|
45
|
+
if _has_distributed_ucxx():
|
|
46
|
+
with LocalCUDACluster(protocol="ucxx") as cluster:
|
|
47
|
+
assert cluster.scheduler_comm.address.startswith("ucxx://")
|
|
48
|
+
import distributed_ucxx
|
|
49
|
+
|
|
50
|
+
assert all(
|
|
51
|
+
isinstance(batched_send.comm, distributed_ucxx.ucxx.UCXX)
|
|
52
|
+
for batched_send in cluster.scheduler.stream_comms.values()
|
|
53
|
+
)
|
|
54
|
+
else:
|
|
55
|
+
with pytest.raises(RuntimeError, match="Cluster failed to start"):
|
|
56
|
+
LocalCUDACluster(protocol="ucxx")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _test_protocol_ucx_old():
|
|
60
|
+
with LocalCUDACluster(protocol="ucx-old") as cluster:
|
|
61
|
+
assert cluster.scheduler_comm.address.startswith("ucx-old://")
|
|
62
|
+
|
|
63
|
+
import rapids_dask_dependency
|
|
64
|
+
|
|
65
|
+
assert all(
|
|
66
|
+
isinstance(
|
|
67
|
+
batched_send.comm,
|
|
68
|
+
rapids_dask_dependency.patches.distributed.comm.__rdd_patch_ucx.UCX,
|
|
69
|
+
)
|
|
70
|
+
for batched_send in cluster.scheduler.stream_comms.values()
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _run_test_with_output_capture(test_func_name, conn):
|
|
75
|
+
"""Run a test function in a subprocess and capture stdout/stderr."""
|
|
76
|
+
# Redirect stdout and stderr to capture output
|
|
77
|
+
old_stdout = sys.stdout
|
|
78
|
+
old_stderr = sys.stderr
|
|
79
|
+
captured_output = io.StringIO()
|
|
80
|
+
sys.stdout = sys.stderr = captured_output
|
|
81
|
+
|
|
82
|
+
try:
|
|
83
|
+
# Import and run the test function
|
|
84
|
+
if test_func_name == "_test_protocol_ucx":
|
|
85
|
+
_test_protocol_ucx()
|
|
86
|
+
elif test_func_name == "_test_protocol_ucxx":
|
|
87
|
+
_test_protocol_ucxx()
|
|
88
|
+
elif test_func_name == "_test_protocol_ucx_old":
|
|
89
|
+
_test_protocol_ucx_old()
|
|
90
|
+
else:
|
|
91
|
+
raise ValueError(f"Unknown test function: {test_func_name}")
|
|
92
|
+
|
|
93
|
+
output = captured_output.getvalue()
|
|
94
|
+
conn.send((True, output)) # True = success
|
|
95
|
+
except Exception as e:
|
|
96
|
+
output = captured_output.getvalue()
|
|
97
|
+
output += f"\nException: {e}"
|
|
98
|
+
import traceback
|
|
99
|
+
|
|
100
|
+
output += f"\nTraceback:\n{traceback.format_exc()}"
|
|
101
|
+
conn.send((False, output)) # False = failure
|
|
102
|
+
finally:
|
|
103
|
+
# Restore original stdout/stderr
|
|
104
|
+
sys.stdout = old_stdout
|
|
105
|
+
sys.stderr = old_stderr
|
|
106
|
+
conn.close()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "ucx-old"])
|
|
110
|
+
def test_rdd_protocol(protocol):
|
|
111
|
+
"""Test rapids-dask-dependency protocol selection"""
|
|
112
|
+
if protocol == "ucx":
|
|
113
|
+
test_func_name = "_test_protocol_ucx"
|
|
114
|
+
elif protocol == "ucxx":
|
|
115
|
+
test_func_name = "_test_protocol_ucxx"
|
|
116
|
+
else:
|
|
117
|
+
test_func_name = "_test_protocol_ucx_old"
|
|
118
|
+
|
|
119
|
+
# Create a pipe for communication between parent and child processes
|
|
120
|
+
parent_conn, child_conn = mp.Pipe()
|
|
121
|
+
p = mp.Process(
|
|
122
|
+
target=_run_test_with_output_capture, args=(test_func_name, child_conn)
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
p.start()
|
|
126
|
+
p.join(timeout=60)
|
|
127
|
+
|
|
128
|
+
if p.is_alive():
|
|
129
|
+
p.kill()
|
|
130
|
+
p.close()
|
|
131
|
+
raise TimeoutError("Test process timed out")
|
|
132
|
+
|
|
133
|
+
# Get the result from the child process
|
|
134
|
+
success, output = parent_conn.recv()
|
|
135
|
+
|
|
136
|
+
# Check that the test passed
|
|
137
|
+
assert success, f"Test failed in subprocess. Output:\n{output}"
|
|
138
|
+
|
|
139
|
+
# For the ucx protocol, check if warnings are printed when distributed_ucxx is not
|
|
140
|
+
# available
|
|
141
|
+
if protocol == "ucx" and not _has_distributed_ucxx():
|
|
142
|
+
# Check if the warning about protocol='ucx' is printed
|
|
143
|
+
print(f"Output for {protocol} protocol:\n{output}")
|
|
144
|
+
assert (
|
|
145
|
+
"you have requested protocol='ucx'" in output
|
|
146
|
+
), f"Expected warning not found in output: {output}"
|
|
147
|
+
assert (
|
|
148
|
+
"'distributed-ucxx' is not installed" in output
|
|
149
|
+
), f"Expected warning about distributed-ucxx not found in output: {output}"
|
|
150
|
+
elif protocol == "ucx" and _has_distributed_ucxx():
|
|
151
|
+
# When distributed_ucxx is available, the warning should NOT be printed
|
|
152
|
+
assert "you have requested protocol='ucx'" not in output, (
|
|
153
|
+
"Warning should not be printed when distributed_ucxx is available: "
|
|
154
|
+
f"{output}"
|
|
155
|
+
)
|
|
156
|
+
elif protocol == "ucx-old":
|
|
157
|
+
# The ucx-old protocol should not generate warnings
|
|
158
|
+
assert (
|
|
159
|
+
"you have requested protocol='ucx'" not in output
|
|
160
|
+
), f"Warning should not be printed for ucx-old protocol: {output}"
|
dask_cuda/tests/test_spill.py
CHANGED
|
@@ -20,6 +20,13 @@ import dask_cudf
|
|
|
20
20
|
from dask_cuda import LocalCUDACluster, utils
|
|
21
21
|
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
|
|
22
22
|
|
|
23
|
+
if not utils.has_device_memory_resource():
|
|
24
|
+
pytest.skip(
|
|
25
|
+
"No spilling tests supported for devices without memory resources. "
|
|
26
|
+
"See https://github.com/rapidsai/dask-cuda/issues/1510",
|
|
27
|
+
allow_module_level=True,
|
|
28
|
+
)
|
|
29
|
+
|
|
23
30
|
if utils.get_device_total_memory() < 1e10:
|
|
24
31
|
pytest.skip("Not enough GPU memory", allow_module_level=True)
|
|
25
32
|
|