dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/_compat.py +18 -0
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
- dask_cuda/get_device_memory_objects.py +18 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/is_device_object.py +4 -1
- dask_cuda/is_spillable_object.py +4 -1
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +15 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +114 -27
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +51 -18
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +164 -54
- dask_cuda/tests/test_proxify_host_file.py +33 -4
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +107 -27
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +319 -68
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.4.0.dist-info/RECORD +0 -56
- dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/local_cuda_cluster.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import logging
 import os
@@ -8,18 +11,15 @@ import dask
 from distributed import LocalCluster, Nanny, Worker
 from distributed.worker_memory import parse_memory_limit
 
-from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
-from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
-    get_cpu_affinity,
     get_ucx_config,
     nvml_device_index,
     parse_cuda_visible_device,
     parse_device_memory_limit,
 )
+from .worker_common import worker_data_function, worker_plugins
 
 
 class LoggedWorker(Worker):
@@ -68,11 +68,16 @@ class LocalCUDACluster(LocalCluster):
         starts spilling to disk (not available if JIT-Unspill is enabled). Can be an
         integer (bytes), float (fraction of total system memory), string (like ``"5GB"``
         or ``"5000M"``), or ``"auto"``, 0, or ``None`` for no memory management.
-    device_memory_limit : int, float, str, or None, default
+    device_memory_limit : int, float, str, or None, default "default"
         Size of the CUDA device LRU cache, which is used to determine when the worker
         starts spilling to host memory. Can be an integer (bytes), float (fraction of
-        total device memory), string (like ``"5GB"`` or ``"5000M"``),
+        total device memory), string (like ``"5GB"`` or ``"5000M"``), ``"auto"``, ``0``
         or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+        Another special value ``"default"`` (which happens to be the default) is also
+        available and uses the recommended Dask-CUDA's defaults and means 80% of the
+        total device memory (analogous to ``0.8``), and disabled spilling (analogous
+        to ``auto``/``0``) on devices without a dedicated memory resource, such as
+        system on a chip (SoC) devices.
     enable_cudf_spill : bool, default False
         Enable automatic cuDF spilling.
@@ -87,7 +92,7 @@ class LocalCUDACluster(LocalCluster):
         ``dask.temporary-directory`` in the local Dask configuration, using the current
         working directory if this is not set.
     shared_filesystem: bool or None, default None
-        Whether the
+        Whether the ``local_directory`` above is shared between all workers or not.
         If ``None``, the "jit-unspill-shared-fs" config value are used, which
         defaults to True. Notice, in all other cases this option defaults to False,
         but on a local cluster it defaults to True -- we assume all workers use the
@@ -100,13 +105,16 @@ class LocalCUDACluster(LocalCluster):
         are not supported or disabled.
     enable_infiniband : bool, default None
         Set environment variables to enable UCX over InfiniBand, requires
-        ``protocol="ucx"``
+        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
+        ``enable_tcp_over_ucx=True`` when ``True``.
     enable_nvlink : bool, default None
-        Set environment variables to enable UCX over NVLink, requires
+        Set environment variables to enable UCX over NVLink, requires
+        ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``, and implies
+        ``enable_tcp_over_ucx=True`` when ``True``.
     enable_rdmacm : bool, default None
         Set environment variables to enable UCX RDMA connection manager support,
-        requires ``protocol="ucx"``
+        requires ``protocol="ucx"``, ``protocol="ucxx"`` or ``protocol="ucx-old"``,
+        and ``enable_infiniband=True``.
     rmm_pool_size : int, str or None, default None
         RMM pool size to initialize each worker with. Can be an integer (bytes), float
         (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or
@@ -123,8 +131,8 @@ class LocalCUDACluster(LocalCluster):
         and to set the maximum pool size.
 
         .. note::
-            When paired with
-            due to fragmentation.
+            When paired with ``--enable-rmm-async`` the maximum size cannot be
+            guaranteed due to fragmentation.
 
         .. note::
             This size is a per-worker configuration, and not cluster-wide.
@@ -140,9 +148,8 @@ class LocalCUDACluster(LocalCluster):
         See ``rmm.mr.CudaAsyncMemoryResource`` for more info.
 
         .. warning::
-            The asynchronous allocator
-            result in an exception.
+            The asynchronous allocator is incompatible with RMM pools and managed
+            memory. Trying to enable both will result in an exception.
     rmm_allocator_external_lib_list: str, list or None, default None
         List of external libraries for which to set RMM as the allocator.
         Supported options are: ``["torch", "cupy"]``. Can be a comma-separated string
@@ -201,7 +208,8 @@ class LocalCUDACluster(LocalCluster):
     Raises
     ------
     TypeError
-        If InfiniBand or NVLink are enabled and
+        If InfiniBand or NVLink are enabled and
+        ``protocol not in ("ucx", "ucxx", "ucx-old")``.
     ValueError
         If RMM pool, RMM managed memory or RMM async allocator are requested but RMM
         cannot be imported.
@@ -221,10 +229,9 @@ class LocalCUDACluster(LocalCluster):
         n_workers=None,
         threads_per_worker=1,
         memory_limit="auto",
-        device_memory_limit=
+        device_memory_limit="default",
         enable_cudf_spill=False,
         cudf_spill_stats=0,
-        data=None,
         local_directory=None,
         shared_filesystem=None,
         protocol=None,
@@ -242,7 +249,6 @@ class LocalCUDACluster(LocalCluster):
         rmm_track_allocations=False,
         jit_unspill=None,
         log_spilling=False,
-        worker_class=None,
         pre_import=None,
         **kwargs,
     ):
@@ -339,40 +345,29 @@ class LocalCUDACluster(LocalCluster):
             jit_unspill = dask.config.get("jit-unspill", default=False)
         data = kwargs.pop("data", None)
         if data is None:
-                data = (
-                    ProxifyHostFile,
-                    {
-                        "device_memory_limit": self.device_memory_limit,
-                        "memory_limit": self.memory_limit,
-                        "shared_filesystem": shared_filesystem,
-                    },
-                )
-            else:
-                data = (
-                    DeviceHostFile,
-                    {
-                        "device_memory_limit": self.device_memory_limit,
-                        "memory_limit": self.memory_limit,
-                        "log_spilling": log_spilling,
-                    },
-                )
+            self.data = worker_data_function(
+                device_memory_limit=self.device_memory_limit,
+                memory_limit=self.memory_limit,
+                jit_unspill=jit_unspill,
+                enable_cudf_spill=enable_cudf_spill,
+                shared_filesystem=shared_filesystem,
+            )
 
         if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
             if protocol is None:
+                ucx_protocol = dask.config.get(
+                    "distributed.comm.ucx.ucx-protocol", default=None
+                )
+                if ucx_protocol is not None:
+                    # TODO: remove when UCX-Py is removed,
+                    # see https://github.com/rapidsai/dask-cuda/issues/1517
+                    protocol = ucx_protocol
+                else:
+                    protocol = "ucx"
+            elif protocol not in ("ucx", "ucxx", "ucx-old"):
                 raise TypeError(
-                    "Enabling InfiniBand or NVLink requires protocol='ucx'
-                    "protocol='ucxx'"
+                    "Enabling InfiniBand or NVLink requires protocol='ucx', "
+                    "protocol='ucxx' or protocol='ucx-old'"
                 )
 
         self.host = kwargs.get("host", None)
@@ -385,6 +380,7 @@ class LocalCUDACluster(LocalCluster):
             enable_rdmacm=enable_rdmacm,
         )
 
+        worker_class = kwargs.pop("worker_class", None)
        if worker_class is not None:
            if log_spilling is True:
                raise ValueError(
@@ -441,28 +437,29 @@ class LocalCUDACluster(LocalCluster):
         spec = copy.deepcopy(self.new_spec)
         worker_count = self.cuda_visible_devices.index(name)
         visible_devices = cuda_visible_devices(worker_count, self.cuda_visible_devices)
+        device_index = nvml_device_index(0, visible_devices)
         spec["options"].update(
             {
                 "env": {
                     "CUDA_VISIBLE_DEVICES": visible_devices,
                 },
-                external_lib_list=self.rmm_allocator_external_lib_list,
+                **({"data": self.data(device_index)} if hasattr(self, "data") else {}),
+                "plugins": worker_plugins(
+                    device_index=device_index,
+                    rmm_initial_pool_size=self.rmm_pool_size,
+                    rmm_maximum_pool_size=self.rmm_maximum_pool_size,
+                    rmm_managed_memory=self.rmm_managed_memory,
+                    rmm_async_alloc=self.rmm_async,
+                    rmm_release_threshold=self.rmm_release_threshold,
+                    rmm_log_directory=self.rmm_log_directory,
+                    rmm_track_allocations=self.rmm_track_allocations,
+                    rmm_allocator_external_lib_list=(
+                        self.rmm_allocator_external_lib_list
                     ),
+                    pre_import=self.pre_import,
+                    enable_cudf_spill=self.enable_cudf_spill,
+                    cudf_spill_stats=self.cudf_spill_stats,
+                ),
             }
         )
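
The hunks above change how ``LocalCUDACluster`` resolves its spilling threshold (the new ``"default"`` value) and which comm protocols permit InfiniBand/NVLink. As a point of reference, a cluster using these options could be started as in the sketch below; this is not part of the diff and assumes a CUDA-capable host with UCX available.

    # Illustrative sketch (not part of the diff); assumes a CUDA GPU and UCX support.
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    cluster = LocalCUDACluster(
        protocol="ucx",                  # "ucxx" or "ucx-old" are also accepted
        enable_nvlink=True,              # implies enable_tcp_over_ucx=True
        device_memory_limit="default",   # 80% of device memory; spilling disabled on SoCs
        rmm_pool_size="4GB",
    )
    client = Client(cluster)

With ``protocol=None`` and any UCX transport flag set, 25.8.0 now consults the ``distributed.comm.ucx.ucx-protocol`` config value before falling back to ``"ucx"``.
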
dask_cuda/plugins.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import importlib
 import logging
 import os
@@ -5,7 +8,7 @@ from typing import Callable, Dict
 
 from distributed import WorkerPlugin
 
-from .utils import get_rmm_log_file_name,
+from .utils import get_rmm_log_file_name, parse_device_bytes
 
 
 class CPUAffinity(WorkerPlugin):
@@ -75,28 +78,26 @@ class RMMSetup(WorkerPlugin):
         self.external_lib_list = external_lib_list
 
     def setup(self, worker=None):
-            self.initial_pool_size =
-            )
+        self.initial_pool_size = parse_device_bytes(
+            self.initial_pool_size, alignment_size=256
+        )
 
         if self.async_alloc:
             import rmm
 
-            self.release_threshold =
-            )
+            self.release_threshold = parse_device_bytes(
+                self.release_threshold, alignment_size=256
+            )
 
             mr = rmm.mr.CudaAsyncMemoryResource(
                 initial_pool_size=self.initial_pool_size,
                 release_threshold=self.release_threshold,
             )
 
+            self.maximum_pool_size = parse_device_bytes(
+                self.maximum_pool_size, alignment_size=256
+            )
             if self.maximum_pool_size is not None:
-                self.maximum_pool_size = parse_device_memory_limit(
-                    self.maximum_pool_size, alignment_size=256
-                )
                 mr = rmm.mr.LimitingResourceAdaptor(
                     mr, allocation_limit=self.maximum_pool_size
                 )
@@ -114,10 +115,9 @@ class RMMSetup(WorkerPlugin):
             pool_allocator = False if self.initial_pool_size is None else True
 
             if self.initial_pool_size is not None:
-                self.maximum_pool_size =
-                )
+                self.maximum_pool_size = parse_device_bytes(
+                    self.maximum_pool_size, alignment_size=256
+                )
 
             rmm.reinitialize(
                 pool_allocator=pool_allocator,
@@ -129,6 +129,7 @@ class RMMSetup(WorkerPlugin):
                     worker, self.logging, self.log_directory
                 ),
             )
+
             if self.rmm_track_allocations:
                 import rmm
 
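
``RMMSetup`` now routes every size argument through ``parse_device_bytes(..., alignment_size=256)`` instead of ``parse_device_memory_limit``, and for the async allocator it now parses the maximum pool size before the ``None`` check. The helper below is a hypothetical stand-in, not dask-cuda's implementation, meant only to illustrate what parsing a size string and aligning it to 256 bytes looks like.

    # Hypothetical sketch of size parsing with 256-byte alignment; the real
    # dask_cuda.utils.parse_device_bytes may behave differently (e.g. it may also
    # resolve fractions of total device memory).
    from dask.utils import parse_bytes

    def aligned_device_bytes(value, alignment_size=256):
        """Round ``value`` (int or size string) down to a multiple of ``alignment_size``."""
        if value is None:
            return None
        nbytes = parse_bytes(value) if isinstance(value, str) else int(value)
        return (nbytes // alignment_size) * alignment_size

    print(aligned_device_bytes("1GiB"))  # 1073741824 (already a multiple of 256)
    print(aligned_device_bytes(1000))    # 768
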

dask_cuda/proxify_device_objects.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import pydoc
 from collections import defaultdict
@@ -58,9 +61,9 @@ def proxify_device_objects(
 ) -> T:
     """Wrap device objects in ProxyObject
 
-    Search through
-    It uses
-    objects found in
+    Search through ``obj`` and wraps all CUDA device objects in ProxyObject.
+    It uses ``proxied_id_to_proxy`` to make sure that identical CUDA device
+    objects found in ``obj`` are wrapped by the same ProxyObject.
 
     Parameters
     ----------
@@ -68,11 +71,11 @@ def proxify_device_objects(
         Object to search through or wrap in a ProxyObject.
     proxied_id_to_proxy: MutableMapping[int, ProxyObject]
         Dict mapping the id() of proxied objects (CUDA device objects) to
-        their proxy and is updated with all new proxied objects found in
+        their proxy and is updated with all new proxied objects found in ``obj``.
         If None, use an empty dict.
     found_proxies: List[ProxyObject]
-        List of found proxies in
-        including those already in
+        List of found proxies in ``obj``. Notice, this includes all proxies found,
+        including those already in ``proxied_id_to_proxy``.
         If None, use an empty list.
     excl_proxies: bool
         Don't add found objects that are already ProxyObject to found_proxies.
@@ -83,7 +86,7 @@ def proxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of
+        A copy of ``obj`` where all CUDA device objects are wrapped in ProxyObject
     """
     _register_incompatible_types()
 
@@ -102,7 +105,7 @@ def unproxify_device_objects(
 ) -> T:
     """Unproxify device objects
 
-    Search through
+    Search through ``obj`` and un-wraps all CUDA device objects.
 
     Parameters
     ----------
@@ -117,7 +120,7 @@ def unproxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of
+        A copy of ``obj`` where all CUDA device objects are unproxify
     """
     if isinstance(obj, dict):
         return {
@@ -242,7 +245,9 @@ def _register_cudf():
 
     @dispatch.register(cudf.DataFrame)
     @dispatch.register(cudf.Series)
-    @dispatch.register(cudf.
+    @dispatch.register(cudf.Index)
+    @dispatch.register(cudf.MultiIndex)
+    @dispatch.register(cudf.RangeIndex)
     def proxify_device_object_cudf_dataframe(
         obj, proxied_id_to_proxy, found_proxies, excl_proxies
     ):
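
The docstring fixes above spell out the proxify/unproxify contract. A round trip might look like the sketch below; it is illustrative only, requires a CUDA GPU with cudf installed, and assumes the bookkeeping arguments default to ``None`` as the docstrings ("If None, use an empty dict/list") imply.

    # Illustrative sketch only; requires a CUDA GPU with cudf and dask-cuda installed.
    import cudf
    from dask_cuda.proxify_device_objects import (
        proxify_device_objects,
        unproxify_device_objects,
    )

    df = cudf.DataFrame({"a": [1, 2, 3]})
    found = []
    proxied = proxify_device_objects({"x": df}, found_proxies=found)
    # ``found`` now holds the ProxyObject wrapping ``df``; identical device objects
    # found in the input would share a single proxy.
    restored = unproxify_device_objects(proxied)
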
dask_cuda/proxify_host_file.py
CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import abc
 import gc
 import io
@@ -64,29 +67,29 @@ class Proxies(abc.ABC):
 
     @abc.abstractmethod
     def mem_usage_add(self, proxy: ProxyObject) -> None:
-        """Given a new proxy, update
+        """Given a new proxy, update ``self._mem_usage``"""
 
     @abc.abstractmethod
     def mem_usage_remove(self, proxy: ProxyObject) -> None:
-        """Removal of proxy, update
+        """Removal of proxy, update ``self._mem_usage``"""
 
     @abc.abstractmethod
     def buffer_info(self) -> List[Tuple[float, int, List[ProxyObject]]]:
         """Return a list of buffer information
 
         The returned format is:
+            ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         """
 
     def add(self, proxy: ProxyObject) -> None:
-        """Add a proxy for tracking, calls
+        """Add a proxy for tracking, calls ``self.mem_usage_add``"""
         assert not self.contains_proxy_id(id(proxy))
         with self._lock:
             self._proxy_id_to_proxy[id(proxy)] = weakref.ref(proxy)
             self.mem_usage_add(proxy)
 
     def remove(self, proxy: ProxyObject) -> None:
-        """Remove proxy from tracking, calls
+        """Remove proxy from tracking, calls ``self.mem_usage_remove``"""
         with self._lock:
             del self._proxy_id_to_proxy[id(proxy)]
             self.mem_usage_remove(proxy)
@@ -323,13 +326,13 @@ class ProxyManager:
         assert header["serializer"] == pxy.serializer
 
     def proxify(self, obj: T, duplicate_check=True) -> Tuple[T, bool]:
-        """Proxify
+        """Proxify ``obj`` and add found proxies to the ``Proxies`` collections
 
-        Search through
+        Search through ``obj`` and wrap all CUDA device objects in ProxyObject.
         If duplicate_check is True, identical CUDA device objects found in
+        ``obj`` are wrapped by the same ProxyObject.
 
-        Returns the proxified object and a boolean, which is
+        Returns the proxified object and a boolean, which is ``True`` when one or
         more incompatible-types were found.
 
         Parameters
@@ -337,7 +340,7 @@ class ProxyManager:
         obj
             Object to search through or wrap in a ProxyObject.
         duplicate_check
-            Make sure that identical CUDA device objects found in
+            Make sure that identical CUDA device objects found in ``obj`` are
             wrapped by the same ProxyObject. This check comes with a significant
             overhead hence it is recommended setting to False when it is known
             that no duplicate exist.
@@ -380,11 +383,11 @@ class ProxyManager:
         proxies_access: Callable[[], List[Tuple[float, int, List[ProxyObject]]]],
         serializer: Callable[[ProxyObject], None],
     ) -> int:
-        """Evict buffers retrieved by calling
+        """Evict buffers retrieved by calling ``proxies_access``
 
-        Calls
-        enough proxies to free up at a minimum
-        spill a proxy,
+        Calls ``proxies_access`` to retrieve a list of proxies and then spills
+        enough proxies to free up at a minimum ``nbytes`` bytes. In order to
+        spill a proxy, ``serializer`` is called.
 
         Parameters
         ----------
@@ -392,7 +395,7 @@ class ProxyManager:
             Number of bytes to evict.
         proxies_access: callable
             Function that returns a list of proxies pack in a tuple like:
-
+            ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         serializer: callable
             Function that serialize the given proxy object.
 
@@ -423,7 +426,7 @@ class ProxyManager:
     def maybe_evict_from_device(self, extra_dev_mem=0) -> None:
         """Evict buffers until total memory usage is below device-memory-limit
 
-        Adds
+        Adds ``extra_dev_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         mem_over_usage = (
@@ -439,7 +442,7 @@ class ProxyManager:
     def maybe_evict_from_host(self, extra_host_mem=0) -> None:
         """Evict buffers until total memory usage is below host-memory-limit
 
-        Adds
+        Adds ``extra_host_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         assert self._host_memory_limit is not None
@@ -466,7 +469,7 @@ class ProxifyHostFile(MutableMapping):
     workers in Distributed.
 
     It wraps all CUDA device objects in a ProxyObject instance and maintains
-
+    ``device_memory_limit`` by spilling ProxyObject on-the-fly. This addresses
     some issues with the default DeviceHostFile host, which tracks device
     memory inaccurately see <https://github.com/rapidsai/dask-cuda/pull/451>
 
@@ -488,16 +491,16 @@ class ProxifyHostFile(MutableMapping):
     memory_limit: int
         Number of bytes of host memory used before spilling to disk.
     shared_filesystem: bool or None, default None
-        Whether the
+        Whether the ``local_directory`` above is shared between all workers or not.
         If ``None``, the "jit-unspill-shared-fs" config value are used, which
         defaults to False.
-        Notice, a shared filesystem must support the
+        Notice, a shared filesystem must support the ``os.link()`` operation.
     compatibility_mode: bool or None, default None
         Enables compatibility-mode, which means that items are un-proxified before
         retrieval. This makes it possible to get some of the JIT-unspill benefits
         without having to be ProxyObject compatible. In order to still allow specific
-        ProxyObjects, set the
+        ProxyObjects, set the ``mark_as_explicit_proxies=True`` when proxifying with
+        ``proxify_device_objects()``. If ``None``, the "jit-unspill-compatibility-mode"
         config value are used, which defaults to False.
     spill_on_demand: bool or None, default None
         Enables spilling when the RMM memory pool goes out of memory. If ``None``,
@@ -639,7 +642,7 @@ class ProxifyHostFile(MutableMapping):
         """Manually evict 1% of host limit.
 
         Dask uses this to trigger CPU-to-Disk spilling. We don't know how much
-        we need to spill but Dask will call
+        we need to spill but Dask will call ``evict()`` repeatedly until enough
         is spilled. We ask for 1% each time.
 
         Return
@@ -658,9 +661,9 @@ class ProxifyHostFile(MutableMapping):
 
     @property
     def fast(self):
-        """Alternative access to
+        """Alternative access to ``.evict()`` used by Dask
 
-        Dask expects
+        Dask expects ``.fast.evict()`` to be available for manually triggering
         of CPU-to-Disk spilling.
         """
         if len(self.manager._host) == 0:
@@ -758,9 +761,9 @@ class ProxifyHostFile(MutableMapping):
 
     @classmethod
     def serialize_proxy_to_disk_inplace(cls, proxy: ProxyObject) -> None:
-        """Serialize
+        """Serialize ``proxy`` to disk.
 
-        Avoid de-serializing if
+        Avoid de-serializing if ``proxy`` is serialized using "dask" or
         "pickle". In this case the already serialized data is written
         directly to disk.
 
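
``ProxifyHostFile`` is the JIT-unspill data store: device buffers above ``device_memory_limit`` are spilled to host, host buffers above ``memory_limit`` are spilled to disk, and Dask's ``evict()``/``fast.evict()`` hooks drive the CPU-to-Disk path. The sketch below shows the user-facing way to exercise it; it is illustrative only and assumes a CUDA GPU with cupy installed and enough data to cross the deliberately small limits.

    # Illustrative sketch (not part of the diff); assumes a CUDA GPU and cupy.
    import cupy
    import dask.array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    cluster = LocalCUDACluster(
        n_workers=1,
        jit_unspill=True,             # use ProxifyHostFile as the worker data store
        device_memory_limit="1GB",    # spill device buffers to host past ~1 GB
        memory_limit="4GB",           # spill host buffers to disk past ~4 GB
    )
    client = Client(cluster)

    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random((20_000, 20_000), chunks="256 MiB")  # ~3.2 GB of float64 on device
    total = x.sum().compute()         # proxies are spilled and unspilled on demand
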