dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
- dask_cuda/get_device_memory_objects.py +3 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +12 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +109 -25
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +30 -12
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +159 -52
- dask_cuda/tests/test_proxify_host_file.py +19 -3
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +7 -0
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +261 -38
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.6.0.dist-info/RECORD +0 -57
- dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/proxify_device_objects.py
CHANGED

@@ -1,4 +1,6 @@
-# Copyright (c) 2025 NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import pydoc
 from collections import defaultdict
@@ -59,9 +61,9 @@ def proxify_device_objects(
 ) -> T:
     """Wrap device objects in ProxyObject

-    Search through
-    It uses
-    objects found in
+    Search through ``obj`` and wraps all CUDA device objects in ProxyObject.
+    It uses ``proxied_id_to_proxy`` to make sure that identical CUDA device
+    objects found in ``obj`` are wrapped by the same ProxyObject.

     Parameters
     ----------
@@ -69,11 +71,11 @@ def proxify_device_objects(
         Object to search through or wrap in a ProxyObject.
     proxied_id_to_proxy: MutableMapping[int, ProxyObject]
         Dict mapping the id() of proxied objects (CUDA device objects) to
-        their proxy and is updated with all new proxied objects found in
+        their proxy and is updated with all new proxied objects found in ``obj``.
         If None, use an empty dict.
     found_proxies: List[ProxyObject]
-        List of found proxies in
-        including those already in
+        List of found proxies in ``obj``. Notice, this includes all proxies found,
+        including those already in ``proxied_id_to_proxy``.
         If None, use an empty list.
     excl_proxies: bool
         Don't add found objects that are already ProxyObject to found_proxies.
@@ -84,7 +86,7 @@ def proxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of
+        A copy of ``obj`` where all CUDA device objects are wrapped in ProxyObject
     """
     _register_incompatible_types()

@@ -103,7 +105,7 @@ def unproxify_device_objects(
 ) -> T:
     """Unproxify device objects

-    Search through
+    Search through ``obj`` and un-wraps all CUDA device objects.

     Parameters
     ----------
@@ -118,7 +120,7 @@ def unproxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of
+        A copy of ``obj`` where all CUDA device objects are unproxify
     """
     if isinstance(obj, dict):
         return {
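The hunks above only complete truncated docstrings, but they spell out the de-duplication contract of proxify_device_objects(). A minimal sketch of that contract, assuming a GPU environment with CuPy installed (the array and variable names are illustrative, not from the diff):

```python
# Sketch: identical CUDA device objects inside ``obj`` end up wrapped by the
# same ProxyObject, per the docstring above.
import cupy

from dask_cuda.proxify_device_objects import (
    proxify_device_objects,
    unproxify_device_objects,
)

arr = cupy.arange(10)
found = []  # populated with every proxy discovered during the walk
proxied = proxify_device_objects([arr, arr], found_proxies=found)

assert proxied[0] is proxied[1]  # one proxy shared by both list entries

# Returns a copy of the input with all CUDA device objects un-wrapped.
unwrapped = unproxify_device_objects(proxied)
```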
dask_cuda/proxify_host_file.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import abc
 import gc
 import io
@@ -64,29 +67,29 @@ class Proxies(abc.ABC):

     @abc.abstractmethod
     def mem_usage_add(self, proxy: ProxyObject) -> None:
-        """Given a new proxy, update
+        """Given a new proxy, update ``self._mem_usage``"""

     @abc.abstractmethod
     def mem_usage_remove(self, proxy: ProxyObject) -> None:
-        """Removal of proxy, update
+        """Removal of proxy, update ``self._mem_usage``"""

     @abc.abstractmethod
     def buffer_info(self) -> List[Tuple[float, int, List[ProxyObject]]]:
         """Return a list of buffer information

         The returned format is:
-
+            ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         """

     def add(self, proxy: ProxyObject) -> None:
-        """Add a proxy for tracking, calls
+        """Add a proxy for tracking, calls ``self.mem_usage_add``"""
         assert not self.contains_proxy_id(id(proxy))
         with self._lock:
             self._proxy_id_to_proxy[id(proxy)] = weakref.ref(proxy)
             self.mem_usage_add(proxy)

     def remove(self, proxy: ProxyObject) -> None:
-        """Remove proxy from tracking, calls
+        """Remove proxy from tracking, calls ``self.mem_usage_remove``"""
         with self._lock:
             del self._proxy_id_to_proxy[id(proxy)]
             self.mem_usage_remove(proxy)
@@ -323,13 +326,13 @@ class ProxyManager:
         assert header["serializer"] == pxy.serializer

     def proxify(self, obj: T, duplicate_check=True) -> Tuple[T, bool]:
-        """Proxify
+        """Proxify ``obj`` and add found proxies to the ``Proxies`` collections

-        Search through
+        Search through ``obj`` and wrap all CUDA device objects in ProxyObject.
         If duplicate_check is True, identical CUDA device objects found in
-
+        ``obj`` are wrapped by the same ProxyObject.

-        Returns the proxified object and a boolean, which is
+        Returns the proxified object and a boolean, which is ``True`` when one or
         more incompatible-types were found.

         Parameters
@@ -337,7 +340,7 @@ class ProxyManager:
         obj
             Object to search through or wrap in a ProxyObject.
         duplicate_check
-            Make sure that identical CUDA device objects found in
+            Make sure that identical CUDA device objects found in ``obj`` are
             wrapped by the same ProxyObject. This check comes with a significant
             overhead hence it is recommended setting to False when it is known
             that no duplicate exist.
@@ -380,11 +383,11 @@ class ProxyManager:
         proxies_access: Callable[[], List[Tuple[float, int, List[ProxyObject]]]],
         serializer: Callable[[ProxyObject], None],
     ) -> int:
-        """Evict buffers retrieved by calling
+        """Evict buffers retrieved by calling ``proxies_access``

-        Calls
-        enough proxies to free up at a minimum
-        spill a proxy,
+        Calls ``proxies_access`` to retrieve a list of proxies and then spills
+        enough proxies to free up at a minimum ``nbytes`` bytes. In order to
+        spill a proxy, ``serializer`` is called.

         Parameters
         ----------
@@ -392,7 +395,7 @@ class ProxyManager:
             Number of bytes to evict.
         proxies_access: callable
             Function that returns a list of proxies pack in a tuple like:
-
+            ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         serializer: callable
             Function that serialize the given proxy object.

@@ -423,7 +426,7 @@ class ProxyManager:
     def maybe_evict_from_device(self, extra_dev_mem=0) -> None:
         """Evict buffers until total memory usage is below device-memory-limit

-        Adds
+        Adds ``extra_dev_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         mem_over_usage = (
@@ -439,7 +442,7 @@ class ProxyManager:
     def maybe_evict_from_host(self, extra_host_mem=0) -> None:
         """Evict buffers until total memory usage is below host-memory-limit

-        Adds
+        Adds ``extra_host_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         assert self._host_memory_limit is not None
@@ -466,7 +469,7 @@ class ProxifyHostFile(MutableMapping):
     workers in Distributed.

     It wraps all CUDA device objects in a ProxyObject instance and maintains
-
+    ``device_memory_limit`` by spilling ProxyObject on-the-fly. This addresses
     some issues with the default DeviceHostFile host, which tracks device
     memory inaccurately see <https://github.com/rapidsai/dask-cuda/pull/451>

@@ -488,16 +491,16 @@ class ProxifyHostFile(MutableMapping):
     memory_limit: int
         Number of bytes of host memory used before spilling to disk.
     shared_filesystem: bool or None, default None
-        Whether the
+        Whether the ``local_directory`` above is shared between all workers or not.
         If ``None``, the "jit-unspill-shared-fs" config value are used, which
         defaults to False.
-        Notice, a shared filesystem must support the
+        Notice, a shared filesystem must support the ``os.link()`` operation.
     compatibility_mode: bool or None, default None
         Enables compatibility-mode, which means that items are un-proxified before
         retrieval. This makes it possible to get some of the JIT-unspill benefits
         without having to be ProxyObject compatible. In order to still allow specific
-        ProxyObjects, set the
-
+        ProxyObjects, set the ``mark_as_explicit_proxies=True`` when proxifying with
+        ``proxify_device_objects()``. If ``None``, the "jit-unspill-compatibility-mode"
         config value are used, which defaults to False.
     spill_on_demand: bool or None, default None
         Enables spilling when the RMM memory pool goes out of memory. If ``None``,
@@ -639,7 +642,7 @@ class ProxifyHostFile(MutableMapping):
         """Manually evict 1% of host limit.

         Dask uses this to trigger CPU-to-Disk spilling. We don't know how much
-        we need to spill but Dask will call
+        we need to spill but Dask will call ``evict()`` repeatedly until enough
         is spilled. We ask for 1% each time.

         Return
@@ -658,9 +661,9 @@ class ProxifyHostFile(MutableMapping):

     @property
     def fast(self):
-        """Alternative access to
+        """Alternative access to ``.evict()`` used by Dask

-        Dask expects
+        Dask expects ``.fast.evict()`` to be available for manually triggering
         of CPU-to-Disk spilling.
         """
         if len(self.manager._host) == 0:
@@ -758,9 +761,9 @@ class ProxifyHostFile(MutableMapping):

     @classmethod
     def serialize_proxy_to_disk_inplace(cls, proxy: ProxyObject) -> None:
-        """Serialize
+        """Serialize ``proxy`` to disk.

-        Avoid de-serializing if
+        Avoid de-serializing if ``proxy`` is serialized using "dask" or
         "pickle". In this case the already serialized data is written
         directly to disk.

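ProxifyHostFile is only active when JIT-unspill is enabled. A hedged sketch of switching it on through the public API (the limit values below are illustrative, not from the diff):

```python
# Sketch: a local cluster using ProxifyHostFile via jit_unspill=True.
from distributed import Client

from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    cluster = LocalCUDACluster(
        jit_unspill=True,            # use ProxifyHostFile instead of DeviceHostFile
        device_memory_limit="4GB",   # device-to-host spilling threshold
        memory_limit="8GB",          # host-to-disk spilling threshold
    )
    client = Client(cluster)
```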
dask_cuda/proxy_object.py
CHANGED

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import copy as _copy
 import functools
 import operator
@@ -52,21 +55,21 @@ def asproxy(
     serializers: Optional[Iterable[str]] = None,
     subclass: Optional[Type["ProxyObject"]] = None,
 ) -> "ProxyObject":
-    """Wrap
+    """Wrap ``obj`` in a ProxyObject object if it isn't already.

     Parameters
     ----------
     obj: object
         Object to wrap in a ProxyObject object.
     serializers: Iterable[str], optional
-        Serializers to use to serialize
+        Serializers to use to serialize ``obj``. If None, no serialization is done.
     subclass: class, optional
         Specify a subclass of ProxyObject to create instead of ProxyObject.
-
+        ``subclass`` must be pickable.

     Returns
     -------
-    The ProxyObject proxying
+    The ProxyObject proxying ``obj``
     """
     if isinstance(obj, ProxyObject):  # Already a proxy object
         ret = obj
@@ -119,7 +122,7 @@ def unproxy(obj):

     Returns
     -------
-    The proxied object or
+    The proxied object or ``obj`` itself if it isn't a ProxyObject
     """
     try:
         obj = obj._pxy_deserialize()
@@ -185,16 +188,16 @@ class ProxyDetail:
         Dictionary of attributes that are accessible without deserializing
         the proxied object.
     type_serialized: bytes
-        Pickled type of
+        Pickled type of ``obj``.
     typename: str
-        Name of the type of
+        Name of the type of ``obj``.
     is_cuda_object: boolean
-        Whether
+        Whether ``obj`` is a CUDA object or not.
     subclass: bytes
         Pickled type to use instead of ProxyObject when deserializing. The type
         must inherit from ProxyObject.
     serializers: str, optional
-        Serializers to use to serialize
+        Serializers to use to serialize ``obj``. If None, no serialization is done.
     explicit_proxy: bool
         Mark the proxy object as "explicit", which means that the user allows it
         as input argument to dask tasks even in compatibility-mode.
@@ -258,7 +261,7 @@ class ProxyDetail:
         return self.serializer is not None

     def serialize(self, serializers: Iterable[str]) -> Tuple[dict, list]:
-        """Inplace serialization of the proxied object using the
+        """Inplace serialization of the proxied object using the ``serializers``

         Parameters
         ----------
@@ -333,7 +336,7 @@ class ProxyObject:
     ProxyObject has some limitations and doesn't mimic the proxied object perfectly.
     Thus, if encountering problems remember that it is always possible to use unproxy()
     to access the proxied object directly or disable JIT deserialization completely
-    with
+    with ``jit_unspill=False``.

     Type checking using instance() works as expected but direct type checking
     doesn't:
@@ -386,7 +389,7 @@ class ProxyObject:
         serializers: Iterable[str],
         proxy_detail: Optional[ProxyDetail] = None,
     ) -> None:
-        """Inplace serialization of the proxied object using the
+        """Inplace serialization of the proxied object using the ``serializers``

         Parameters
         ----------
@@ -787,8 +790,8 @@
 def obj_pxy_is_device_object(obj: ProxyObject):
     """
     In order to avoid de-serializing the proxied object,
-    we check
-
+    we check ``is_cuda_object`` instead of the default
+    ``hasattr(o, "__cuda_array_interface__")`` check.
     """
     return obj._pxy_get().is_cuda_object

@@ -830,7 +833,7 @@ def obj_pxy_dask_serialize(obj: ProxyObject):

     As serializers, it uses "dask" or "pickle", which means that proxied CUDA objects
     are spilled to main memory before communicated. Deserialization is needed, unless
-    obj is serialized to disk on a shared filesystem see
+    obj is serialized to disk on a shared filesystem see ``handle_disk_serialized()``.
     """
     pxy = obj._pxy_get(copy=True)
     if pxy.serializer == "disk":
@@ -851,7 +854,7 @@ def obj_pxy_cuda_serialize(obj: ProxyObject):

     As serializers, it uses "cuda", which means that proxied CUDA objects are _not_
     spilled to main memory before communicated. However, we still have to handle disk
-    serialized proxied like in
+    serialized proxied like in ``obj_pxy_dask_serialize()``
     """
     pxy = obj._pxy_get(copy=True)
     if pxy.serializer in ("dask", "pickle"):
@@ -897,7 +900,7 @@ def obj_pxy_dask_deserialize(header, frames):


 def unproxify_input_wrapper(func):
-    """Unproxify the input of
+    """Unproxify the input of ``func``"""

     @functools.wraps(func)
     def wrapper(*args, **kwargs):
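The asproxy()/unproxy() pair documented above round-trips a device object; a hedged sketch assuming CuPy is available (since no serializers are passed here, unproxy() should hand back the original object):

```python
# Sketch: wrap and unwrap a device object with asproxy()/unproxy().
import cupy

from dask_cuda.proxy_object import ProxyObject, asproxy, unproxy

arr = cupy.arange(3)
proxy = asproxy(arr)  # no-op if ``arr`` were already a ProxyObject

assert isinstance(proxy, ProxyObject)
assert unproxy(proxy) is arr  # returns the proxied object
assert unproxy(arr) is arr    # non-proxies pass through untouched
```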
dask_cuda/tests/conftest.py
ADDED

@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from dask_cuda.utils import has_device_memory_resource
+
+
+def pytest_configure(config):
+    """Register custom markers."""
+    config.addinivalue_line(
+        "markers",
+        "skip_if_no_device_memory: mark test to skip if device has no dedicated memory "
+        "resource",
+    )
+    config.addinivalue_line(
+        "markers",
+        "skip_if_device_memory: mark test to skip if device has dedicated memory "
+        "resource",
+    )
+
+
+def pytest_collection_modifyitems(items):
+    """Handle skip_if_no_device_memory marker."""
+    for item in items:
+        if item.get_closest_marker("skip_if_no_device_memory"):
+            skip_marker = item.get_closest_marker("skip_if_no_device_memory")
+            reason = skip_marker.kwargs.get(
+                "reason", "Test requires device with dedicated memory resource"
+            )
+            item.add_marker(
+                pytest.mark.skipif(not has_device_memory_resource(), reason=reason)
+            )
+        if item.get_closest_marker("skip_if_device_memory"):
+            skip_marker = item.get_closest_marker("skip_if_device_memory")
+            reason = skip_marker.kwargs.get(
+                "reason", "Test requires device without dedicated memory resource"
+            )
+            item.add_marker(
+                pytest.mark.skipif(has_device_memory_resource(), reason=reason)
+            )
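The two markers registered above are translated into skipif conditions at collection time. A hedged sketch of how a test module might use them (the test names are illustrative); note the hooks read the skip message from the ``reason`` keyword, so a positional message falls back to the default text:

```python
# Sketch: opting tests in or out based on the device's memory resource.
import pytest


@pytest.mark.skip_if_no_device_memory(reason="needs a dedicated memory resource")
def test_runs_on_dedicated_memory_devices():
    ...


@pytest.mark.skip_if_device_memory  # the default reason is used
def test_runs_on_integrated_memory_devices():
    ...
```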
dask_cuda/tests/test_dask_cuda_worker.py
CHANGED

@@ -21,13 +21,16 @@ from dask_cuda.utils import (
     get_gpu_count_mig,
     get_gpu_uuid,
     get_n_gpus,
+    has_device_memory_resource,
     wait_workers,
 )


-@patch.dict(
-
-
+@patch.dict(
+    os.environ,
+    {"CUDA_VISIBLE_DEVICES": "0,3,7,8", "DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC": "1"},
+)
+def test_cuda_visible_devices(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
         with popen(
             [
@@ -37,14 +40,10 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
                 "127.0.0.1:9359",
                 "--host",
                 "127.0.0.1",
-                "--device-memory-limit",
-                "1 MB",
-                "--nthreads",
-                str(nthreads),
                 "--no-dashboard",
                 "--worker-class",
                 "dask_cuda.utils_test.MockWorker",
-            ]
+            ],
         ):
             with Client("127.0.0.1:9359", loop=loop) as client:
                 assert wait_workers(client, n_gpus=4)
@@ -58,12 +57,43 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
                 for v in result.values():
                     del expected[v]

-
+                assert len(expected) == 0
+
+
+def test_memory_limit_and_nthreads(loop):  # noqa: F811
+    nthreads = 4
+
+    device_memory_limit_args = []
+    if has_device_memory_resource():
+        device_memory_limit_args = ["--device-memory-limit", "1 MB"]
+
+    with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9359",
+                "--host",
+                "127.0.0.1",
+                *device_memory_limit_args,
+                "--nthreads",
+                str(nthreads),
+                "--no-dashboard",
+                "--worker-class",
+                "dask_cuda.utils_test.MockWorker",
+            ],
+        ):
+            with Client("127.0.0.1:9359", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                def get_visible_devices():
+                    return os.environ["CUDA_VISIBLE_DEVICES"]
+
+                workers = client.scheduler_info(n_workers=-1)["workers"]
                 for w in workers.values():
                     assert w["memory_limit"] == MEMORY_LIMIT // len(workers)

-                assert len(expected) == 0
-

 def test_rmm_pool(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")
@@ -119,11 +149,6 @@ def test_rmm_managed(loop):  # noqa: F811
 def test_rmm_async(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")

-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -159,11 +184,6 @@ def test_rmm_async(loop):  # noqa: F811
 def test_rmm_async_with_maximum_pool_size(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")

-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -263,8 +283,12 @@ def test_cudf_spill_disabled(loop):  # noqa: F811
                     assert v == 0


+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 def test_cudf_spill(loop):  # noqa: F811
     cudf = pytest.importorskip("cudf")
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -292,6 +316,24 @@ def test_cudf_spill(loop):  # noqa: F811
                     assert v == 2


+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_cudf_spill_no_dedicated_memory_error():
+    pytest.importorskip("cudf")
+
+    ret = subprocess.run(
+        ["dask", "cuda", "worker", "127.0.0.1:9369", "--enable-cudf-spill"],
+        capture_output=True,
+    )
+
+    assert ret.returncode != 0
+    assert (
+        b"cuDF spilling is not supported on devices without dedicated memory"
+        in ret.stderr
+    )
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_dashboard_address(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
@@ -472,6 +514,11 @@ def test_rmm_track_allocations(loop):  # noqa: F811
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_get_cluster_configuration(loop):  # noqa: F811
     pytest.importorskip("rmm")
+
+    device_memory_limit_args = []
+    if has_device_memory_resource():
+        device_memory_limit_args += ["--device-memory-limit", "30 B"]
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -481,8 +528,7 @@ def test_get_cluster_configuration(loop):  # noqa: F811
                 "127.0.0.1:9369",
                 "--host",
                 "127.0.0.1",
-
-                "30 B",
+                *device_memory_limit_args,
                 "--rmm-pool-size",
                 "2 GB",
                 "--rmm-maximum-pool-size",
@@ -499,12 +545,17 @@
             assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
             assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
             assert ret["jit-unspill"] is False
-
+            if has_device_memory_resource():
+                assert ret["device-memory-limit"] == 30


 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 def test_worker_fraction_limits(loop):  # noqa: F811
     pytest.importorskip("rmm")
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -545,6 +596,33 @@ def test_worker_fraction_limits(loop):  # noqa: F811
         )


+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    if argument == "pool_size":
+        argument_list = ["--rmm-pool-size", "0.1"]
+    elif argument == "maximum_pool_size":
+        argument_list = ["--rmm-pool-size", "1 GiB", "--rmm-maximum-pool-size", "0.1"]
+    else:
+        argument_list = ["--rmm-async", "--rmm-release-threshold", "0.1"]
+
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        ret = subprocess.run(
+            ["dask", "cuda", "worker", "127.0.0.1:9369", *argument_list],
+            capture_output=True,
+        )
+
+        assert ret.returncode != 0
+        assert (
+            b"Fractional of total device memory not supported in devices without a "
+            b"dedicated memory resource" in ret.stderr
+        )
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_worker_timeout():
     ret = subprocess.run(
@@ -595,6 +673,12 @@ def test_worker_cudf_spill_warning(enable_cudf_spill_warning):  # noqa: F811
         capture_output=True,
     )
     if enable_cudf_spill_warning:
-
+        if has_device_memory_resource():
+            assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
+        else:
+            assert (
+                b"cuDF spilling is not supported on devices without dedicated "
+                b"memory" in ret.stderr
+            )
     else:
         assert b"UserWarning: cuDF spilling is enabled" not in ret.stderr