dask-cuda 25.8.0__py3-none-any.whl → 25.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +1 -1
- dask_cuda/benchmarks/local_cudf_merge.py +1 -1
- dask_cuda/benchmarks/local_cudf_shuffle.py +1 -1
- dask_cuda/benchmarks/local_cupy.py +1 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +1 -1
- dask_cuda/benchmarks/utils.py +1 -1
- dask_cuda/cuda_worker.py +1 -1
- dask_cuda/get_device_memory_objects.py +1 -4
- dask_cuda/initialize.py +140 -121
- dask_cuda/local_cuda_cluster.py +10 -25
- dask_cuda/tests/test_cudf_builtin_spilling.py +3 -1
- dask_cuda/tests/test_dask_setup.py +193 -0
- dask_cuda/tests/test_dgx.py +16 -32
- dask_cuda/tests/test_explicit_comms.py +11 -10
- dask_cuda/tests/test_from_array.py +1 -5
- dask_cuda/tests/test_initialize.py +230 -41
- dask_cuda/tests/test_local_cuda_cluster.py +16 -62
- dask_cuda/tests/test_proxify_host_file.py +9 -4
- dask_cuda/tests/test_proxy.py +8 -8
- dask_cuda/tests/test_spill.py +3 -3
- dask_cuda/tests/test_utils.py +8 -23
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +12 -66
- dask_cuda/utils_test.py +0 -13
- dask_cuda/worker_spec.py +7 -9
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/METADATA +11 -4
- dask_cuda-25.10.0.dist-info/RECORD +63 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +1 -1
- dask_cuda/tests/test_rdd_ucx.py +0 -160
- dask_cuda-25.8.0.dist-info/RECORD +0 -63
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/licenses/LICENSE +0 -0
- {dask_cuda-25.8.0.dist-info → dask_cuda-25.10.0.dist-info}/top_level.txt +0 -0
dask_cuda/GIT_COMMIT
CHANGED

```diff
@@ -1 +1 @@
-…
+472ca1ce6d1fe836104a5a4f10b284ca9a828ea9
```
dask_cuda/VERSION
CHANGED

```diff
@@ -1 +1 @@
-25.08.00
+25.10.00
```
dask_cuda/benchmarks/local_cudf_groupby.py
CHANGED

```diff
@@ -141,7 +141,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cudf_merge.py
CHANGED

```diff
@@ -227,7 +227,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Frac-match", value=f"{args.frac_match}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cudf_shuffle.py
CHANGED

```diff
@@ -152,7 +152,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
         key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}"
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cupy.py
CHANGED

```diff
@@ -195,7 +195,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
dask_cuda/benchmarks/local_cupy_map_overlap.py
CHANGED

```diff
@@ -80,7 +80,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results):
     )
     print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}")
     print_key_value(key="Protocol", value=f"{args.protocol}")
-    if args.protocol in ["ucx", "ucxx", "ucx-old"]:
+    if args.protocol in ["ucx", "ucxx"]:
         print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}")
         print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}")
         print_key_value(key="NVLink", value=f"{args.enable_nvlink}")
```
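All five benchmark scripts receive the same one-line fix: with the legacy "ucx-old" (UCX-Py) protocol removed, UCX transport flags are reported only for the two remaining UCX protocols. A minimal sketch of the gating, using a hypothetical `args` namespace in place of the benchmarks' parsed CLI options:

```python
from types import SimpleNamespace

# Hypothetical stand-in for the parsed benchmark CLI options.
args = SimpleNamespace(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_infiniband=False,
    enable_nvlink=True,
)

# As of 25.10 only "ucx" and "ucxx" pass the check; "ucx-old" no longer does.
if args.protocol in ["ucx", "ucxx"]:
    print(f"TCP over UCX: {args.enable_tcp_over_ucx}")
    print(f"InfiniBand:   {args.enable_infiniband}")
    print(f"NVLink:       {args.enable_nvlink}")
```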
dask_cuda/benchmarks/utils.py
CHANGED
dask_cuda/cuda_worker.py
CHANGED

```diff
@@ -210,7 +210,7 @@ class CUDAWorker(Server):
                 name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                 local_directory=local_directory,
                 config={
-                    "distributed.comm.ucx": get_ucx_config(
+                    "distributed-ucxx": get_ucx_config(
                         enable_tcp_over_ucx=enable_tcp_over_ucx,
                         enable_infiniband=enable_infiniband,
                         enable_nvlink=enable_nvlink,
```
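The per-worker configuration namespace moves from Distributed's built-in UCX comm (`distributed.comm.ucx` in 25.8) to the distributed-ucxx plugin. A sketch of what the change amounts to, assuming `get_ucx_config` returns a plain mapping of transport settings (the keys below are illustrative, not the exact dask-cuda internals):

```python
import dask

# Illustrative transport settings; in dask-cuda these come from
# dask_cuda.utils.get_ucx_config(enable_tcp_over_ucx=..., ...).
ucx_config = {"tcp": True, "infiniband": False, "nvlink": False}

# 25.10 writes the settings under the "distributed-ucxx" namespace, where the
# distributed-ucxx comm backend reads them, instead of "distributed.comm.ucx".
with dask.config.set({"distributed-ucxx": ucx_config}):
    assert dask.config.get("distributed-ucxx")["tcp"] is True
```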
dask_cuda/get_device_memory_objects.py
CHANGED

```diff
@@ -119,11 +119,8 @@ def get_device_memory_objects_register_cudf():
         return []
 
     @dispatch.register(cudf.core.index.Index)
-    def get_device_memory_objects_cudf_index(obj):
-        return dispatch(obj._values)
-
     @dispatch.register(cudf.core.multiindex.MultiIndex)
-    def get_device_memory_objects_cudf_multiindex(obj):
+    def get_device_memory_objects_cudf_index(obj):
         return dispatch(obj._columns)
 
     @dispatch.register(cudf.core.column.ColumnBase)
```
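The refactor stacks the `MultiIndex` registration on top of the `Index` handler, so both cudf types now share one code path that walks `obj._columns`. A minimal sketch of the decorator-stacking pattern, assuming dask-cuda's `dispatch` is a `dask.utils.Dispatch` instance (which its `@dispatch.register` usage suggests), with toy classes in place of the cudf types:

```python
from dask.utils import Dispatch

dispatch = Dispatch(name="get_device_memory_objects")

# Toy stand-ins for cudf.core.index.Index and cudf.core.multiindex.MultiIndex.
class Index: ...
class MultiIndex: ...

# Dispatch.register returns the function unchanged when used as a decorator,
# so two registrations can stack on a single handler.
@dispatch.register(Index)
@dispatch.register(MultiIndex)
def get_device_memory_objects_index(obj):
    return f"walking columns of {type(obj).__name__}"

print(dispatch(Index()))       # walking columns of Index
print(dispatch(MultiIndex()))  # walking columns of MultiIndex
```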
dask_cuda/initialize.py
CHANGED

```diff
@@ -5,126 +5,177 @@ import logging
 import os
 
 import click
-import numba.cuda
+import cuda.core.experimental
 
 import dask
-from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context
+from distributed.diagnostics.nvml import (
+    CudaDeviceInfo,
+    get_device_index_and_uuid,
+    has_cuda_context,
+)
 
-from .utils import _get_active_ucx_implementation_name, get_ucx_config
+from .utils import get_ucx_config
 
 logger = logging.getLogger(__name__)
 
 
+pre_existing_cuda_context = None
+cuda_context_created = None
+
+
+_warning_suffix = (
+    "This is often the result of a CUDA-enabled library calling a CUDA runtime "
+    "function before Dask-CUDA can spawn worker processes. Please make sure any such "
+    "function calls don't happen at import time or in the global scope of a program."
+)
+
+
+def _get_device_and_uuid_str(device_info: CudaDeviceInfo) -> str:
+    return f"{device_info.device_index} ({str(device_info.uuid)})"
+
+
+def _warn_existing_cuda_context(device_info: CudaDeviceInfo, pid: int) -> None:
+    device_uuid_str = _get_device_and_uuid_str(device_info)
+    logger.warning(
+        f"A CUDA context for device {device_uuid_str} already exists "
+        f"on process ID {pid}. {_warning_suffix}"
+    )
+
+
+def _warn_cuda_context_wrong_device(
+    device_info_expected: CudaDeviceInfo, device_info_actual: CudaDeviceInfo, pid: int
+) -> None:
+    expected_device_uuid_str = _get_device_and_uuid_str(device_info_expected)
+    actual_device_uuid_str = _get_device_and_uuid_str(device_info_actual)
+    logger.warning(
+        f"Worker with process ID {pid} should have a CUDA context assigned to device "
+        f"{expected_device_uuid_str}, but instead the CUDA context is on device "
+        f"{actual_device_uuid_str}. {_warning_suffix}"
+    )
+
+
+def _mock_test_device() -> bool:
+    """Check whether running tests in a single-GPU environment.
+
+    Returns
+    -------
+    Whether running tests in a single-GPU environment, determined by checking whether
+    `DASK_CUDA_TEST_SINGLE_GPU` environment variable is set to a value different than
+    `"0"`.
+    """
+    return int(os.environ.get("DASK_CUDA_TEST_SINGLE_GPU", "0")) != 0
+
+
+def _get_device_str() -> str:
+    """Get the device string.
+
+    Get a string with the first device (first element before the comma), which may be
+    an index or a UUID.
+
+    Always returns "0" when running tests in a single-GPU environment, determined by
+    the result returned by `_mock_test_device()`.
+
+    Returns
+    -------
+    The device string.
+    """
+    if _mock_test_device():
+        return "0"
+    else:
+        return os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+
+
 def _create_cuda_context_handler():
-    if int(os.environ.get("DASK_CUDA_TEST_SINGLE_GPU", "0")) != 0:
+    """Create a CUDA context on the current device.
+
+    A CUDA context is created on the current device if one does not exist yet, and not
+    running tests on a single-GPU environment, determined by the result returned by
+    `_mock_test_device()`.
+
+    Returns
+    -------
+    The device string.
+    """
+    if _mock_test_device():
         try:
-            numba.cuda.current_context()
-        except numba.cuda.cudadrv.error.CudaSupportError:
+            cuda.core.experimental.Device().set_current()
+        except Exception:
             pass
     else:
-        numba.cuda.current_context()
+        cuda.core.experimental.Device().set_current()
 
 
-def _initialize_ucx():
-
-    # TODO: update when UCX-Py is removed, see
-    # https://github.com/rapidsai/dask-cuda/issues/1517
-    import distributed.comm.ucx
+def _create_cuda_context_and_warn():
+    """Create CUDA context and warn depending on certain conditions.
 
-
-
-    cuda_visible_device = get_device_index_and_uuid(
-        os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
-    )
-    ctx = has_cuda_context()
-    if (
-        ctx.has_context
-        and not distributed.comm.ucx.cuda_context_created.has_context
-    ):
-        distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid())
-
-    _create_cuda_context_handler()
-
-    if not distributed.comm.ucx.cuda_context_created.has_context:
-        ctx = has_cuda_context()
-        if ctx.has_context and ctx.device_info != cuda_visible_device:
-            distributed.comm.ucx._warn_cuda_context_wrong_device(
-                cuda_visible_device, ctx.device_info, os.getpid()
-            )
+    Warns if a pre-existing CUDA context already existed or if the resulting CUDA
+    context was created in the wrong device.
 
-
-
+    This function is almost an identical duplicate from
+    `distributed_ucxx.ucxx.init_once`, the duplication is necessary because Dask-CUDA
+    needs to support `protocol="tcp"` as well, even when distributed-ucxx is not
+    installed, but this here runs _after_ comms have started, which is fine for TCP
+    because the time when CUDA context is created is not important. The code needs to
+    live also in distributed-ucxx because there the time when a CUDA context is created
+    matters, and it needs to happen _before_ UCX is initialized, but comms in
+    Distributed is initialized before preload, and thus only after this function
+    executes.
 
+    Raises
+    ------
+    Exception
+        If anything wrong happened during context initialization.
 
-…
+    Returns
+    -------
+    None
+    """
+    global pre_existing_cuda_context, cuda_context_created
+
+    cuda_visible_device = get_device_index_and_uuid(_get_device_str())
+    pre_existing_cuda_context = has_cuda_context()
+    if pre_existing_cuda_context.has_context:
+        _warn_existing_cuda_context(pre_existing_cuda_context.device_info, os.getpid())
+
+    _create_cuda_context_handler()
+
+    cuda_context_created = has_cuda_context()
+    if (
+        cuda_context_created.has_context
+        and cuda_context_created.device_info.uuid != cuda_visible_device.uuid
+    ):
+        _warn_cuda_context_wrong_device(
+            cuda_visible_device, cuda_context_created.device_info, os.getpid()
+        )
 
 
-def _initialize_ucxx():
+def _create_cuda_context():
     try:
         # Added here to ensure the parent `LocalCUDACluster` process creates the CUDA
         # context directly from the UCX module, thus avoiding a similar warning there.
         import distributed_ucxx.ucxx
+    except ImportError:
+        pass
+    else:
+        if distributed_ucxx.ucxx.ucxx is not None:
+            # UCXX has already initialized (and warned if necessary)
+            return
 
-
-
-    cuda_visible_device = get_device_index_and_uuid(
-        os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
-    )
-    ctx = has_cuda_context()
-    if (
-        ctx.has_context
-        and not distributed_ucxx.ucxx.cuda_context_created.has_context
-    ):
-        distributed_ucxx.ucxx._warn_existing_cuda_context(ctx, os.getpid())
-
-    _create_cuda_context_handler()
-
-    if not distributed_ucxx.ucxx.cuda_context_created.has_context:
-        ctx = has_cuda_context()
-        if ctx.has_context and ctx.device_info != cuda_visible_device:
-            distributed_ucxx.ucxx._warn_cuda_context_wrong_device(
-                cuda_visible_device, ctx.device_info, os.getpid()
-            )
-
+    try:
+        _create_cuda_context_and_warn()
     except Exception:
         logger.error("Unable to start CUDA Context", exc_info=True)
 
 
-def _create_cuda_context(protocol="ucx"):
-    if protocol not in ["ucx", "ucxx", "ucx-old"]:
-        return
-
-    try:
-        ucx_implementation = _get_active_ucx_implementation_name(protocol)
-    except ValueError:
-        # Not a UCX protocol, just raise CUDA context warnings if needed.
-        _warn_generic()
-    else:
-        if ucx_implementation == "ucxx":
-            _initialize_ucxx()
-        else:
-            _initialize_ucx()
-        _warn_generic()
-
-
 def initialize(
     create_cuda_context=True,
     enable_tcp_over_ucx=None,
     enable_infiniband=None,
     enable_nvlink=None,
     enable_rdmacm=None,
-    protocol="ucx",
 ):
-    """Create CUDA context and initialize UCX configuration.
+    """Create CUDA context and initialize UCXX configuration.
 
     Sometimes it is convenient to initialize the CUDA context, particularly before
     starting up Dask worker processes which create a variety of threads.
```
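One functional change in this hunk stands out: `_create_cuda_context_handler` now creates the context through the `cuda.core.experimental` API rather than Numba. A minimal sketch of that primitive in isolation, assuming the `cuda-core` package and at least one visible GPU (the `device_id` print is purely illustrative):

```python
import cuda.core.experimental

# Device() resolves the currently selected CUDA device (device 0 unless
# CUDA_VISIBLE_DEVICES narrows the set), and set_current() creates or
# activates a CUDA context on it — the side effect dask-cuda wants to
# happen early in each worker process.
dev = cuda.core.experimental.Device()
dev.set_current()
print(dev.device_id)
```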
```diff
@@ -173,12 +224,11 @@ def initialize(
         enable_infiniband=enable_infiniband,
         enable_nvlink=enable_nvlink,
         enable_rdmacm=enable_rdmacm,
-        protocol=protocol,
     )
-    dask.config.set({"distributed.comm.ucx": ucx_config})
+    dask.config.set({"distributed-ucxx": ucx_config})
 
     if create_cuda_context:
-        _create_cuda_context(protocol)
+        _create_cuda_context()
 
 
 @click.command()
```
```diff
@@ -187,40 +237,9 @@ def initialize(
     default=False,
     help="Create CUDA context",
 )
-@click.option(
-    "--protocol",
-    default=None,
-    type=str,
-    help="Communication protocol, such as: 'tcp', 'tls', 'ucx' or 'ucxx'.",
-)
-@click.option(
-    "--enable-tcp-over-ucx/--disable-tcp-over-ucx",
-    default=False,
-    help="Enable TCP communication over UCX",
-)
-@click.option(
-    "--enable-infiniband/--disable-infiniband",
-    default=False,
-    help="Enable InfiniBand communication",
-)
-@click.option(
-    "--enable-nvlink/--disable-nvlink",
-    default=False,
-    help="Enable NVLink communication",
-)
-@click.option(
-    "--enable-rdmacm/--disable-rdmacm",
-    default=False,
-    help="Enable RDMA connection manager, currently requires InfiniBand enabled.",
-)
 def dask_setup(
-    service,
+    worker,
     create_cuda_context,
-    protocol,
-    enable_tcp_over_ucx,
-    enable_infiniband,
-    enable_nvlink,
-    enable_rdmacm,
 ):
     if create_cuda_context:
-        _create_cuda_context(protocol)
+        _create_cuda_context()
```
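With `--protocol` and the UCX transport flags gone from the preload CLI, `dask_setup` shrinks to the single `--create-cuda-context` switch, and transport configuration flows only through the `initialize()` Python API. A sketch of client-side usage against the new signature (the scheduler address is hypothetical):

```python
from dask.distributed import Client
from dask_cuda.initialize import initialize

# Set UCX transport flags once in the client process; initialize() stores
# them under the distributed-ucxx config namespace and, by default, creates
# a CUDA context in this process as well.
initialize(
    create_cuda_context=True,
    enable_tcp_over_ucx=True,
    enable_infiniband=False,
    enable_nvlink=False,
)

client = Client("ucx://scheduler-host:8786")  # hypothetical address
```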
dask_cuda/local_cuda_cluster.py
CHANGED

```diff
@@ -47,8 +47,8 @@ class LocalCUDACluster(LocalCluster):
     respect this hardware as much as possible.
 
     Each worker process is automatically assigned the correct CPU cores and network
-    interface cards to maximize performance. If UCX and UCX-Py are available, InfiniBand
-    and NVLink connections can be used to optimize data transfer performance.
+    interface cards to maximize performance. If UCX and distributed-ucxx are available,
+    InfiniBand and NVLink connections can be used to optimize data transfer performance.
 
     Parameters
     ----------
@@ -105,16 +105,13 @@ class LocalCUDACluster(LocalCluster):
         are not supported or disabled.
     enable_infiniband : bool, default None
         Set environment variables to enable UCX over InfiniBand, requires
-        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
-        ``enable_tcp_over_ucx=True`` when ``True``.
+        ``protocol="ucx"``, and implies ``enable_tcp_over_ucx=True`` when ``True``.
     enable_nvlink : bool, default None
         Set environment variables to enable UCX over NVLink, requires
-        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
-        ``enable_tcp_over_ucx=True`` when ``True``.
+        ``protocol="ucx"``, and implies ``enable_tcp_over_ucx=True`` when ``True``.
     enable_rdmacm : bool, default None
         Set environment variables to enable UCX RDMA connection manager support,
-        requires ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``,
-        and ``enable_infiniband=True``.
+        requires ``protocol="ucx"``, and ``enable_infiniband=True``.
     rmm_pool_size : int, str or None, default None
         RMM pool size to initialize each worker with. Can be an integer (bytes), float
         (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or
@@ -208,8 +205,7 @@ class LocalCUDACluster(LocalCluster):
     Raises
     ------
     TypeError
-        If InfiniBand or NVLink are enabled and
-        ``protocol not in ("ucx", "ucxx", "ucx-old")``.
+        If InfiniBand or NVLink are enabled and ``protocol != "ucx"``.
     ValueError
         If RMM pool, RMM managed memory or RMM async allocator are requested but RMM
         cannot be imported.
@@ -355,20 +351,9 @@ class LocalCUDACluster(LocalCluster):
 
         if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
             if protocol is None:
-                ucx_protocol = …(
-                    …
-                )
-                if ucx_protocol is not None:
-                    # TODO: remove when UCX-Py is removed,
-                    # see https://github.com/rapidsai/dask-cuda/issues/1517
-                    protocol = ucx_protocol
-                else:
-                    protocol = "ucx"
-            elif protocol not in ("ucx", "ucxx", "ucx-old"):
-                raise TypeError(
-                    "Enabling InfiniBand or NVLink requires protocol='ucx', "
-                    "protocol='ucxx' or protocol='ucx-old'"
-                )
+                protocol = "ucx"
+            if protocol not in ("ucx", "ucxx"):
+                raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")
 
         self.host = kwargs.get("host", None)
 
@@ -420,7 +405,7 @@ class LocalCUDACluster(LocalCluster):
         ) + ["dask_cuda.initialize"]
         self.new_spec["options"]["preload_argv"] = self.new_spec["options"].get(
             "preload_argv", []
-        ) + ["--create-cuda-context", "--protocol", protocol]
+        ) + ["--create-cuda-context"]
 
         self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
         self.scale(n_workers)
```
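For cluster users the visible effect is in argument validation: requesting a UCX transport without an explicit protocol now silently selects `"ucx"`, and anything other than `"ucx"`/`"ucxx"` raises `TypeError`. A short sketch of the common single-node setup:

```python
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# protocol could be omitted here: enable_nvlink/enable_tcp_over_ucx with
# protocol=None now defaults to "ucx"; passing "ucx-old" raises TypeError.
cluster = LocalCUDACluster(
    protocol="ucx",
    enable_tcp_over_ucx=True,
    enable_nvlink=True,
)
client = Client(cluster)
```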
dask_cuda/tests/test_cudf_builtin_spilling.py
CHANGED

```diff
@@ -1,3 +1,5 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 
 from distributed.sizeof import safe_sizeof
@@ -6,6 +8,7 @@ from dask_cuda.device_host_file import DeviceHostFile
 from dask_cuda.is_spillable_object import is_spillable_object
 from dask_cuda.proxify_host_file import ProxifyHostFile
 
+cudf = pytest.importorskip("cudf")
 cupy = pytest.importorskip("cupy")
 pandas = pytest.importorskip("pandas")
 
@@ -14,7 +17,6 @@ pytest.importorskip(
     reason="Current version of cudf doesn't support built-in spilling",
 )
 
-import cudf  # noqa: E402
 from cudf.core.buffer.spill_manager import (  # noqa: E402
     SpillManager,
     get_global_manager,
```