dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/_compat.py +18 -0
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
- dask_cuda/get_device_memory_objects.py +18 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/is_device_object.py +4 -1
- dask_cuda/is_spillable_object.py +4 -1
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +15 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +114 -27
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +51 -18
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +164 -54
- dask_cuda/tests/test_proxify_host_file.py +33 -4
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +107 -27
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +319 -68
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.4.0.dist-info/RECORD +0 -56
- dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/tests/test_initialize.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
 import sys
 
@@ -11,7 +14,7 @@ from distributed.deploy.local import LocalCluster
 
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import get_ucx_config
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 mp = mp.get_context("spawn")  # type: ignore
 
@@ -22,10 +25,7 @@ mp = mp.get_context("spawn")  # type: ignore
 
 
 def _test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_tcp_over_ucx": True}
     initialize(protocol=protocol, **kwargs)
@@ -55,12 +55,9 @@ def _test_initialize_ucx_tcp(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,))
     p.start()
@@ -69,10 +66,7 @@ def test_initialize_ucx_tcp(protocol):
 
 
 def _test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_nvlink": True}
     initialize(protocol=protocol, **kwargs)
@@ -103,12 +97,9 @@ def _test_initialize_ucx_nvlink(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,))
     p.start()
@@ -117,10 +108,7 @@ def test_initialize_ucx_nvlink(protocol):
 
 
 def _test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_infiniband": True}
     initialize(protocol=protocol, **kwargs)
@@ -154,12 +142,9 @@ def _test_initialize_ucx_infiniband(protocol):
 @pytest.mark.skipif(
     "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found"
 )
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,))
     p.start()
@@ -168,10 +153,7 @@ def test_initialize_ucx_infiniband(protocol):
 
 
 def _test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     initialize(protocol=protocol)
     with LocalCluster(
@@ -204,12 +186,9 @@ def _test_initialize_ucx_all(protocol):
     assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,))
     p.start()
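Throughout this file (and the other test modules below), the per-protocol `pytest.importorskip("ucp")` / `pytest.importorskip("ucxx")` chains are collapsed into a single `get_ucx_implementation(protocol)` call imported from `dask_cuda.utils_test`. The helper's body is not shown in this diff (see `dask_cuda/utils_test.py +23 -7` in the file list); the following is only a minimal sketch of what it presumably does, assuming "ucx" now selects the UCXX package and "ucx-old" the legacy UCX-Py package:

import pytest


def get_ucx_implementation(protocol):
    """Hypothetical sketch; the real helper lives in dask_cuda/utils_test.py."""
    # Assumption: "ucx" maps to the UCXX-based implementation, "ucx-old" to
    # the legacy UCX-Py backend. Skip the calling test if the matching
    # Python package is not installed, otherwise return the module.
    if protocol == "ucx":
        return pytest.importorskip("ucxx")
    elif protocol == "ucx-old":
        return pytest.importorskip("ucp")
    raise ValueError(f"Unknown UCX protocol: {protocol!r}")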
dask_cuda/tests/test_local_cuda_cluster.py
CHANGED

@@ -1,4 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
+import contextlib
 import os
 import pkgutil
 import sys
@@ -16,16 +20,18 @@ from dask_cuda.utils import (
     get_cluster_configuration,
     get_device_total_memory,
     get_gpu_count_mig,
-    get_gpu_uuid_from_index,
+    get_gpu_uuid,
+    has_device_memory_resource,
     print_cluster_config,
 )
-from dask_cuda.utils_test import MockWorker
+from dask_cuda.utils_test import MockWorker, get_ucx_implementation
 
 
 @gen_test(timeout=20)
 async def test_local_cuda_cluster():
     async with LocalCUDACluster(
-        scheduler_port=0,
+        scheduler_port=0,
+        asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == utils.get_n_gpus()
@@ -65,8 +71,8 @@ async def test_with_subset_of_cuda_visible_devices():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         worker_class=MockWorker,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 4
@@ -89,14 +95,11 @@ async def test_with_subset_of_cuda_visible_devices():
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     async with LocalCUDACluster(
         protocol=protocol, asynchronous=True, data=dict
@@ -109,35 +112,32 @@ async def test_ucx_protocol(protocol):
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_explicit_ucx_with_protocol_none(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     async with LocalCUDACluster(
-        protocol=None,
+        protocol=None,
+        enable_tcp_over_ucx=True,
+        asynchronous=True,
     ) as cluster:
         assert all(
-            ws.address.startswith("
+            ws.address.startswith(f"{protocol}://")
+            for ws in cluster.scheduler.workers.values()
         )
 
 
 @pytest.mark.filterwarnings("ignore:Exception ignored in")
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol_type_error(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     with pytest.raises(TypeError):
@@ -150,7 +150,10 @@ async def test_ucx_protocol_type_error(protocol):
 @gen_test(timeout=20)
 async def test_n_workers():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         assert len(cluster.workers) == 2
         assert len(cluster.worker_spec) == 2
@@ -205,10 +208,13 @@ async def test_no_memory_limits_cudaworker():
 @gen_test(timeout=20)
 async def test_all_to_all():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
-            workers = list(client.scheduler_info()["workers"])
+            workers = list(client.scheduler_info(n_workers=-1)["workers"])
             n_workers = len(workers)
             await utils.all_to_all(client)
             # assert all to all has resulted in all data on every worker
@@ -260,11 +266,6 @@ async def test_rmm_managed():
 async def test_rmm_async():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -287,11 +288,6 @@ async def test_rmm_async():
 async def test_rmm_async_with_maximum_pool_size():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -378,7 +374,6 @@ async def test_cluster_worker():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         n_workers=1,
     ) as cluster:
         assert len(cluster.workers) == 1
@@ -419,7 +414,7 @@ async def test_available_mig_workers():
 
 @gen_test(timeout=20)
 async def test_gpu_uuid():
-    gpu_uuid = get_gpu_uuid_from_index(0)
+    gpu_uuid = get_gpu_uuid(0)
 
     async with LocalCUDACluster(
         CUDA_VISIBLE_DEVICES=gpu_uuid,
@@ -461,7 +456,7 @@ async def test_get_cluster_configuration():
     async with LocalCUDACluster(
         rmm_pool_size="2GB",
         rmm_maximum_pool_size="3GB",
-        device_memory_limit="30B",
+        device_memory_limit="30B" if has_device_memory_resource() else None,
         CUDA_VISIBLE_DEVICES="0",
         scheduler_port=0,
         asynchronous=True,
@@ -471,10 +466,14 @@ async def test_get_cluster_configuration():
     assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
     assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
     assert ret["jit-unspill"] is False
-    assert ret["device-memory-limit"] == 30
+    if has_device_memory_resource():
+        assert ret["device-memory-limit"] == 30
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 async def test_worker_fraction_limits():
     async with LocalCUDACluster(
         dashboard_address=None,
@@ -500,6 +499,40 @@ async def test_worker_fraction_limits():
     )
 
 
+# Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    async def _test_worker_fraction_limits_no_dedicated_memory():
+        if argument == "pool_size":
+            kwargs = {"rmm_pool_size": "0.1"}
+        elif argument == "maximum_pool_size":
+            kwargs = {"rmm_pool_size": "1 GiB", "rmm_maximum_pool_size": "0.1"}
+        else:
+            kwargs = {"rmm_async": True, "rmm_release_threshold": "0.1"}
+
+        with raises_with_cause(
+            RuntimeError,
+            "Nanny failed to start",
+            RuntimeError,
+            "Worker failed to start",
+            ValueError,
+            "Fractional of total device memory not supported in devices without a "
+            "dedicated memory resource",
+        ):
+            await LocalCUDACluster(
+                asynchronous=True,
+                **kwargs,
+            )
+
+    asyncio.run(_test_worker_fraction_limits_no_dedicated_memory())
+
+
 @gen_test(timeout=20)
 async def test_cudf_spill_disabled():
     cudf = pytest.importorskip("cudf")
@@ -524,6 +557,9 @@ async def test_cudf_spill_disabled():
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 async def test_cudf_spill():
     cudf = pytest.importorskip("cudf")
 
@@ -548,27 +584,101 @@ async def test_cudf_spill():
             assert v == 2
 
 
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+@gen_test(timeout=20)
+async def test_cudf_spill_no_dedicated_memory():
+    cudf = pytest.importorskip("cudf")  # noqa: F841
+
+    with pytest.raises(
+        ValueError,
+        match="cuDF spilling is not supported on devices without dedicated memory",
+    ):
+        await LocalCUDACluster(
+            enable_cudf_spill=True,
+            cudf_spill_stats=2,
+            asynchronous=True,
+        )
+
+
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
+)
+@pytest.mark.parametrize(
+    "jit_unspill",
+    [False, True],
 )
-def test_print_cluster_config(capsys, protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+@pytest.mark.parametrize(
+    "device_memory_limit",
+    [None, "1B"],
+)
+def test_print_cluster_config(capsys, protocol, jit_unspill, device_memory_limit):
+    get_ucx_implementation(protocol)
 
     pytest.importorskip("rich")
-    with LocalCUDACluster(
-        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol
-    ) as cluster:
-        with Client(cluster) as client:
-            print_cluster_config(client)
-            captured = capsys.readouterr()
-            assert "Dask Cluster Configuration" in captured.out
-            assert protocol in captured.out
-            assert "1 B" in captured.out
-            assert "[plugin]" in captured.out
+
+    ctx = contextlib.nullcontext()
+    if not has_device_memory_resource():
+        if device_memory_limit:
+            ctx = pytest.raises(
+                ValueError,
+                match="device_memory_limit is set but device has no dedicated memory.",
+            )
+        if jit_unspill:
+            # JIT-Unspill exception has precedence, thus overwrite ctx if both are
+            # enabled
+            ctx = pytest.raises(
+                ValueError,
+                match="JIT-Unspill is not supported on devices without dedicated "
+                "memory",
+            )
+
+    with ctx:
+        with LocalCUDACluster(
+            n_workers=1,
+            device_memory_limit=device_memory_limit,
+            jit_unspill=jit_unspill,
+            protocol=protocol,
+        ) as cluster:
+            with Client(cluster) as client:
+                print_cluster_config(client)
+                captured = capsys.readouterr()
+                assert "Dask Cluster Configuration" in captured.out
+                assert protocol in captured.out
+                if device_memory_limit == "1B":
+                    assert "1 B" in captured.out
+                assert "[plugin]" in captured.out
+                client.shutdown()
+
+    def ucxpy_reset(timeout=20):
+        """Reset UCX-Py with a timeout.
+
+        Attempt to reset UCX-Py, not doing so may cause a deadlock because UCX-Py is
+        not thread-safe and the Dask cluster may still be alive while a new cluster
+        and UCX-Py instances are initalized.
+        """
+        import time
+
+        import ucp
+
+        start = time.monotonic()
+        while True:
+            try:
+                ucp.reset()
+            except ucp._libs.exceptions.UCXError as e:
+                if time.monotonic() - start > timeout:
+                    raise RuntimeError(
+                        f"Could not reset UCX-Py in {timeout} seconds, this may result "
+                        f"in a deadlock. Failure:\n{e}"
+                    )
                continue
+            else:
+                break
+
+    if protocol == "ucx-old":
+        ucxpy_reset()
 
 
 @pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
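The `skip_if_no_device_memory` and `skip_if_device_memory` markers used above are new in this release and are presumably registered by the new `dask_cuda/tests/conftest.py` (+41 in the file list), pairing with `has_device_memory_resource()` from `dask_cuda.utils`. The hook below is a hypothetical sketch of how such markers could be wired up, not the package's actual conftest:

import pytest

from dask_cuda.utils import has_device_memory_resource


def pytest_configure(config):
    # Register the markers so pytest does not warn about unknown marks.
    config.addinivalue_line(
        "markers",
        "skip_if_no_device_memory(reason): skip unless the device has dedicated memory",
    )
    config.addinivalue_line(
        "markers",
        "skip_if_device_memory(reason): skip when the device has dedicated memory",
    )


def pytest_runtest_setup(item):
    # Skip according to whether the GPU exposes a dedicated memory resource;
    # integrated devices sharing system memory would not.
    marker = item.get_closest_marker("skip_if_no_device_memory")
    if marker is not None and not has_device_memory_resource():
        pytest.skip(*marker.args)
    marker = item.get_closest_marker("skip_if_device_memory")
    if marker is not None and has_device_memory_resource():
        pytest.skip(*marker.args)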
dask_cuda/tests/test_proxify_host_file.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable
 from unittest.mock import patch
 
@@ -217,6 +220,9 @@ def test_one_item_host_limit(capsys, root_dir):
     assert len(dhf.manager) == 0
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 def test_spill_on_demand(root_dir):
     """
     Test spilling on demand by disabling the device_memory_limit
@@ -239,6 +245,9 @@ def test_spill_on_demand(root_dir):
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 @gen_test(timeout=20)
 async def test_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -375,9 +384,9 @@ def test_externals(root_dir):
 
 @patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,))
 def test_incompatible_types(root_dir):
-    """Check that ProxifyHostFile unproxifies
+    """Check that ProxifyHostFile unproxifies ``cupy.ndarray`` on retrieval
 
-    Notice, in this test we add
+    Notice, in this test we add ``cupy.ndarray`` to the incompatible_types temporarily.
     """
     cupy = pytest.importorskip("cupy")
     cudf = pytest.importorskip("cudf")
@@ -396,6 +405,9 @@ def test_incompatible_types(root_dir):
 
 @pytest.mark.parametrize("npartitions", [1, 2, 3])
 @pytest.mark.parametrize("compatibility_mode", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=30)
 async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions):
     cudf = pytest.importorskip("cudf")
@@ -414,7 +426,7 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
         ddf = dask.dataframe.from_pandas(
             cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions
         )
-        res = ddf.shuffle(on="key", shuffle_method="tasks")
+        [res] = client.persist([ddf.shuffle(on="key", shuffle_method="tasks")])
 
         # With compatibility mode on, we shouldn't encounter any proxy objects
         if compatibility_mode:
@@ -428,6 +440,9 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
             assert all(res)  # Only proxy objects
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=60)
 async def test_worker_force_spill_to_disk():
     """Test Dask triggering CPU-to-Disk spilling"""
@@ -440,7 +455,7 @@ async def test_worker_force_spill_to_disk():
     async with Client(cluster, asynchronous=True) as client:
         # Create a df that are spilled to host memory immediately
         df = cudf.DataFrame({"key": np.arange(10**8)})
-        ddf = dask.dataframe.from_pandas(df, npartitions=1)
+        [ddf] = client.persist([dask.dataframe.from_pandas(df, npartitions=1)])
         await ddf
 
         async def f(dask_worker):
@@ -463,6 +478,9 @@ async def test_worker_force_spill_to_disk():
             assert "Unmanaged memory use is high" not in log
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 def test_on_demand_debug_info():
     """Test worker logging when on-demand-spilling fails"""
     rmm = pytest.importorskip("rmm")
@@ -498,3 +516,14 @@ def test_on_demand_debug_info():
     assert f"WARNING - RMM allocation of {size} failed" in log
     assert f"RMM allocs: {size}" in log
     assert "traceback:" in log
+
+
+def test_sizeof_owner_with_cai():
+    cudf = pytest.importorskip("cudf")
+    s = cudf.Series([1, 2, 3])
+
+    items = dask_cuda.get_device_memory_objects.dispatch(s)
+    assert len(items) == 1
+    item = items[0]
+    result = dask.sizeof.sizeof(item)
+    assert result == 24
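Two of the changes above replace bare collection expressions with `[res] = client.persist([...])`. This relies on standard distributed behavior: `Client.persist`, given a list of Dask collections, returns a list of persisted collections in the same order, so the graph is pinned on the cluster before the assertions inspect worker state. A small standalone illustration (CPU-only, using plain pandas rather than the cuDF used by these tests):

import pandas as pd
import dask.dataframe as dd
from distributed import Client

client = Client()  # spins up a local cluster; the tests above use LocalCUDACluster
ddf = dd.from_pandas(pd.DataFrame({"key": range(10)}), npartitions=2)

# persist() on a list returns a matching list, so one-element unpacking works.
[res] = client.persist([ddf.shuffle(on="key", shuffle_method="tasks")])
print(res.npartitions)

client.close()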
dask_cuda/tests/test_proxy.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 import os
 import pickle
@@ -23,7 +26,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -242,7 +245,7 @@ def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
 
 @pytest.mark.parametrize("backend", ["numpy", "cupy"])
 def test_fixed_attribute_length(backend):
-    """Test fixed attribute
+    """Test fixed attribute ``x.__len__`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -263,7 +266,7 @@ def test_fixed_attribute_length(backend):
 
 
 def test_fixed_attribute_name():
-    """Test fixed attribute
+    """Test fixed attribute ``x.name`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -284,6 +287,9 @@ def test_fixed_attribute_name():
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Spilling not supported in devices without dedicated memory resource"
+)
 @gen_test(timeout=20)
 async def test_spilling_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -386,8 +392,8 @@ def test_serializing_array_to_disk(backend, serializers, size):
 class _PxyObjTest(proxy_object.ProxyObject):
     """
     A class that:
-    - defines
-      calling
+    - defines ``__dask_tokenize__`` in order to avoid deserialization when
+      calling ``client.scatter()``
     - Asserts that no deserialization is performaned when communicating.
     """
 
@@ -401,14 +407,12 @@ class _PxyObjTest(proxy_object.ProxyObject):
 
 
 @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @gen_test(timeout=120)
 async def test_communicating_proxy_objects(protocol, send_serializers):
     """Testing serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
 
     def task(x):
@@ -417,7 +421,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         serializers_used = x._pxy_get().serializer
 
         # Check that `x` is serialized with the expected serializers
-        if protocol in ["ucx", "ucxx"]:
+        if protocol in ["ucx", "ucx-old"]:
             if send_serializers is None:
                 assert serializers_used == "cuda"
             else:
@@ -448,15 +452,13 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         await client.submit(task, df)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @pytest.mark.parametrize("shared_fs", [True, False])
 @gen_test(timeout=20)
 async def test_communicating_disk_objects(protocol, shared_fs):
     """Testing disk serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
     ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
 
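Taken together, the recurring parametrization change from `["ucx", "ucxx"]` to `["ucx", "ucx-old"]` reflects the protocol naming in 25.8.0: `"ucx"` now selects the current UCXX-based communication backend, while `"ucx-old"` keeps the legacy UCX-Py one (note that the new `ucxpy_reset` helper above runs only for `"ucx-old"` and resets the `ucp` module). A minimal usage sketch, assuming a CUDA-capable machine with the matching UCX packages installed:

from distributed import Client
from dask_cuda import LocalCUDACluster

# protocol="ucx" uses the UCXX backend; pass "ucx-old" for legacy UCX-Py.
with LocalCUDACluster(protocol="ucx") as cluster:
    with Client(cluster) as client:
        print(sorted(client.scheduler_info()["workers"]))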