dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as released.
Files changed (50)
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/benchmarks/common.py +4 -1
  4. dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
  5. dask_cuda/benchmarks/local_cudf_merge.py +5 -2
  6. dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
  7. dask_cuda/benchmarks/local_cupy.py +4 -1
  8. dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
  9. dask_cuda/benchmarks/utils.py +7 -4
  10. dask_cuda/cli.py +21 -15
  11. dask_cuda/cuda_worker.py +27 -57
  12. dask_cuda/device_host_file.py +31 -15
  13. dask_cuda/disk_io.py +7 -4
  14. dask_cuda/explicit_comms/comms.py +11 -7
  15. dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
  16. dask_cuda/get_device_memory_objects.py +3 -3
  17. dask_cuda/initialize.py +80 -44
  18. dask_cuda/local_cuda_cluster.py +63 -66
  19. dask_cuda/plugins.py +17 -16
  20. dask_cuda/proxify_device_objects.py +12 -10
  21. dask_cuda/proxify_host_file.py +30 -27
  22. dask_cuda/proxy_object.py +20 -17
  23. dask_cuda/tests/conftest.py +41 -0
  24. dask_cuda/tests/test_dask_cuda_worker.py +109 -25
  25. dask_cuda/tests/test_dgx.py +10 -18
  26. dask_cuda/tests/test_explicit_comms.py +30 -12
  27. dask_cuda/tests/test_from_array.py +7 -5
  28. dask_cuda/tests/test_initialize.py +16 -37
  29. dask_cuda/tests/test_local_cuda_cluster.py +159 -52
  30. dask_cuda/tests/test_proxify_host_file.py +19 -3
  31. dask_cuda/tests/test_proxy.py +18 -16
  32. dask_cuda/tests/test_rdd_ucx.py +160 -0
  33. dask_cuda/tests/test_spill.py +7 -0
  34. dask_cuda/tests/test_utils.py +106 -20
  35. dask_cuda/tests/test_worker_spec.py +5 -2
  36. dask_cuda/utils.py +261 -38
  37. dask_cuda/utils_test.py +23 -7
  38. dask_cuda/worker_common.py +196 -0
  39. dask_cuda/worker_spec.py +12 -5
  40. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
  41. dask_cuda-25.8.0.dist-info/RECORD +63 -0
  42. dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
  43. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  44. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  45. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  46. dask_cuda-25.6.0.dist-info/RECORD +0 -57
  47. dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
  48. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
  49. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
  50. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/proxify_device_objects.py CHANGED
@@ -1,4 +1,6 @@
-# Copyright (c) 2025 NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import pydoc
 from collections import defaultdict
@@ -59,9 +61,9 @@ def proxify_device_objects(
 ) -> T:
     """Wrap device objects in ProxyObject
 
-    Search through `obj` and wraps all CUDA device objects in ProxyObject.
-    It uses `proxied_id_to_proxy` to make sure that identical CUDA device
-    objects found in `obj` are wrapped by the same ProxyObject.
+    Search through ``obj`` and wraps all CUDA device objects in ProxyObject.
+    It uses ``proxied_id_to_proxy`` to make sure that identical CUDA device
+    objects found in ``obj`` are wrapped by the same ProxyObject.
 
     Parameters
     ----------
@@ -69,11 +71,11 @@ def proxify_device_objects(
         Object to search through or wrap in a ProxyObject.
     proxied_id_to_proxy: MutableMapping[int, ProxyObject]
         Dict mapping the id() of proxied objects (CUDA device objects) to
-        their proxy and is updated with all new proxied objects found in `obj`.
+        their proxy and is updated with all new proxied objects found in ``obj``.
         If None, use an empty dict.
     found_proxies: List[ProxyObject]
-        List of found proxies in `obj`. Notice, this includes all proxies found,
-        including those already in `proxied_id_to_proxy`.
+        List of found proxies in ``obj``. Notice, this includes all proxies found,
+        including those already in ``proxied_id_to_proxy``.
         If None, use an empty list.
     excl_proxies: bool
         Don't add found objects that are already ProxyObject to found_proxies.
@@ -84,7 +86,7 @@ def proxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of `obj` where all CUDA device objects are wrapped in ProxyObject
+        A copy of ``obj`` where all CUDA device objects are wrapped in ProxyObject
     """
     _register_incompatible_types()
 
@@ -103,7 +105,7 @@ def unproxify_device_objects(
 ) -> T:
     """Unproxify device objects
 
-    Search through `obj` and un-wraps all CUDA device objects.
+    Search through ``obj`` and un-wraps all CUDA device objects.
 
     Parameters
     ----------
@@ -118,7 +120,7 @@ def unproxify_device_objects(
     Returns
     -------
     ret: Any
-        A copy of `obj` where all CUDA device objects are unproxify
+        A copy of ``obj`` where all CUDA device objects are unproxify
     """
    if isinstance(obj, dict):
        return {
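
The changes above are cosmetic: single backticks become Sphinx-style double backticks and the copyright line becomes an SPDX header; the API itself is unchanged. For orientation, a minimal sketch of the documented API, assuming a CUDA-capable environment with CuPy installed:

    import cupy

    from dask_cuda.proxify_device_objects import (
        proxify_device_objects,
        unproxify_device_objects,
    )

    arr = cupy.arange(10)
    data = {"a": arr, "b": arr}  # the same device buffer appears twice

    proxied_id_to_proxy = {}  # id() of proxied object -> its ProxyObject
    found_proxies = []  # filled with every proxy encountered
    proxied = proxify_device_objects(data, proxied_id_to_proxy, found_proxies)

    # Per the docstring, identical CUDA device objects share one ProxyObject.
    assert proxied["a"] is proxied["b"]

    # Un-wrapping restores plain device objects.
    unwrapped = unproxify_device_objects(proxied)
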
dask_cuda/proxify_host_file.py CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import abc
 import gc
 import io
@@ -64,29 +67,29 @@ class Proxies(abc.ABC):
 
     @abc.abstractmethod
     def mem_usage_add(self, proxy: ProxyObject) -> None:
-        """Given a new proxy, update `self._mem_usage`"""
+        """Given a new proxy, update ``self._mem_usage``"""
 
     @abc.abstractmethod
     def mem_usage_remove(self, proxy: ProxyObject) -> None:
-        """Removal of proxy, update `self._mem_usage`"""
+        """Removal of proxy, update ``self._mem_usage``"""
 
     @abc.abstractmethod
     def buffer_info(self) -> List[Tuple[float, int, List[ProxyObject]]]:
         """Return a list of buffer information
 
         The returned format is:
-        `[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]
+        ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         """
 
     def add(self, proxy: ProxyObject) -> None:
-        """Add a proxy for tracking, calls `self.mem_usage_add`"""
+        """Add a proxy for tracking, calls ``self.mem_usage_add``"""
         assert not self.contains_proxy_id(id(proxy))
         with self._lock:
             self._proxy_id_to_proxy[id(proxy)] = weakref.ref(proxy)
         self.mem_usage_add(proxy)
 
     def remove(self, proxy: ProxyObject) -> None:
-        """Remove proxy from tracking, calls `self.mem_usage_remove`"""
+        """Remove proxy from tracking, calls ``self.mem_usage_remove``"""
         with self._lock:
             del self._proxy_id_to_proxy[id(proxy)]
             self.mem_usage_remove(proxy)
@@ -323,13 +326,13 @@ class ProxyManager:
         assert header["serializer"] == pxy.serializer
 
     def proxify(self, obj: T, duplicate_check=True) -> Tuple[T, bool]:
-        """Proxify `obj` and add found proxies to the `Proxies` collections
+        """Proxify ``obj`` and add found proxies to the ``Proxies`` collections
 
-        Search through `obj` and wrap all CUDA device objects in ProxyObject.
+        Search through ``obj`` and wrap all CUDA device objects in ProxyObject.
         If duplicate_check is True, identical CUDA device objects found in
-        `obj` are wrapped by the same ProxyObject.
+        ``obj`` are wrapped by the same ProxyObject.
 
-        Returns the proxified object and a boolean, which is `True` when one or
+        Returns the proxified object and a boolean, which is ``True`` when one or
         more incompatible-types were found.
 
         Parameters
@@ -337,7 +340,7 @@ class ProxyManager:
         obj
             Object to search through or wrap in a ProxyObject.
         duplicate_check
-            Make sure that identical CUDA device objects found in `obj` are
+            Make sure that identical CUDA device objects found in ``obj`` are
             wrapped by the same ProxyObject. This check comes with a significant
             overhead hence it is recommended setting to False when it is known
             that no duplicate exist.
@@ -380,11 +383,11 @@ class ProxyManager:
         proxies_access: Callable[[], List[Tuple[float, int, List[ProxyObject]]]],
         serializer: Callable[[ProxyObject], None],
     ) -> int:
-        """Evict buffers retrieved by calling `proxies_access`
+        """Evict buffers retrieved by calling ``proxies_access``
 
-        Calls `proxies_access` to retrieve a list of proxies and then spills
-        enough proxies to free up at a minimum `nbytes` bytes. In order to
-        spill a proxy, `serializer` is called.
+        Calls ``proxies_access`` to retrieve a list of proxies and then spills
+        enough proxies to free up at a minimum ``nbytes`` bytes. In order to
+        spill a proxy, ``serializer`` is called.
 
         Parameters
         ----------
@@ -392,7 +395,7 @@ class ProxyManager:
             Number of bytes to evict.
         proxies_access: callable
             Function that returns a list of proxies pack in a tuple like:
-            `[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]
+            ``[(<access-time>, <size-of-buffer>, <list-of-proxies>), ...]``
         serializer: callable
             Function that serialize the given proxy object.
 
@@ -423,7 +426,7 @@ class ProxyManager:
     def maybe_evict_from_device(self, extra_dev_mem=0) -> None:
         """Evict buffers until total memory usage is below device-memory-limit
 
-        Adds `extra_dev_mem` to the current total memory usage when comparing
+        Adds ``extra_dev_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         mem_over_usage = (
@@ -439,7 +442,7 @@ class ProxyManager:
     def maybe_evict_from_host(self, extra_host_mem=0) -> None:
         """Evict buffers until total memory usage is below host-memory-limit
 
-        Adds `extra_host_mem` to the current total memory usage when comparing
+        Adds ``extra_host_mem`` to the current total memory usage when comparing
         against device-memory-limit.
         """
         assert self._host_memory_limit is not None
@@ -466,7 +469,7 @@ class ProxifyHostFile(MutableMapping):
     workers in Distributed.
 
     It wraps all CUDA device objects in a ProxyObject instance and maintains
-    `device_memory_limit` by spilling ProxyObject on-the-fly. This addresses
+    ``device_memory_limit`` by spilling ProxyObject on-the-fly. This addresses
     some issues with the default DeviceHostFile host, which tracks device
     memory inaccurately see <https://github.com/rapidsai/dask-cuda/pull/451>
 
@@ -488,16 +491,16 @@ class ProxifyHostFile(MutableMapping):
     memory_limit: int
         Number of bytes of host memory used before spilling to disk.
     shared_filesystem: bool or None, default None
-        Whether the `local_directory` above is shared between all workers or not.
+        Whether the ``local_directory`` above is shared between all workers or not.
         If ``None``, the "jit-unspill-shared-fs" config value are used, which
         defaults to False.
-        Notice, a shared filesystem must support the `os.link()` operation.
+        Notice, a shared filesystem must support the ``os.link()`` operation.
     compatibility_mode: bool or None, default None
         Enables compatibility-mode, which means that items are un-proxified before
         retrieval. This makes it possible to get some of the JIT-unspill benefits
         without having to be ProxyObject compatible. In order to still allow specific
-        ProxyObjects, set the `mark_as_explicit_proxies=True` when proxifying with
-        `proxify_device_objects()`. If ``None``, the "jit-unspill-compatibility-mode"
+        ProxyObjects, set the ``mark_as_explicit_proxies=True`` when proxifying with
+        ``proxify_device_objects()``. If ``None``, the "jit-unspill-compatibility-mode"
         config value are used, which defaults to False.
     spill_on_demand: bool or None, default None
         Enables spilling when the RMM memory pool goes out of memory. If ``None``,
@@ -639,7 +642,7 @@ class ProxifyHostFile(MutableMapping):
         """Manually evict 1% of host limit.
 
         Dask uses this to trigger CPU-to-Disk spilling. We don't know how much
-        we need to spill but Dask will call `evict()` repeatedly until enough
+        we need to spill but Dask will call ``evict()`` repeatedly until enough
         is spilled. We ask for 1% each time.
 
         Return
@@ -658,9 +661,9 @@
 
     @property
     def fast(self):
-        """Alternative access to `.evict()` used by Dask
+        """Alternative access to ``.evict()`` used by Dask
 
-        Dask expects `.fast.evict()` to be available for manually triggering
+        Dask expects ``.fast.evict()`` to be available for manually triggering
         of CPU-to-Disk spilling.
         """
         if len(self.manager._host) == 0:
@@ -758,9 +761,9 @@
 
     @classmethod
     def serialize_proxy_to_disk_inplace(cls, proxy: ProxyObject) -> None:
-        """Serialize `proxy` to disk.
+        """Serialize ``proxy`` to disk.
 
-        Avoid de-serializing if `proxy` is serialized using "dask" or
+        Avoid de-serializing if ``proxy`` is serialized using "dask" or
         "pickle". In this case the already serialized data is written
         directly to disk.

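ProxifyHostFile is normally not instantiated directly; it becomes the worker data store when JIT-unspill is enabled on the cluster. A minimal sketch of enabling it (the limit values are illustrative):

    from dask.distributed import Client

    from dask_cuda import LocalCUDACluster

    # With jit_unspill=True, each worker keeps task data in a ProxifyHostFile,
    # spilling proxied CUDA objects device -> host -> disk as the limits
    # below are exceeded.
    cluster = LocalCUDACluster(
        jit_unspill=True,
        device_memory_limit="4GB",  # spill device -> host above this
        memory_limit="8GB",  # spill host -> disk above this
    )
    client = Client(cluster)
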
dask_cuda/proxy_object.py CHANGED
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import copy as _copy
 import functools
 import operator
@@ -52,21 +55,21 @@ def asproxy(
     serializers: Optional[Iterable[str]] = None,
     subclass: Optional[Type["ProxyObject"]] = None,
 ) -> "ProxyObject":
-    """Wrap `obj` in a ProxyObject object if it isn't already.
+    """Wrap ``obj`` in a ProxyObject object if it isn't already.
 
     Parameters
     ----------
     obj: object
         Object to wrap in a ProxyObject object.
     serializers: Iterable[str], optional
-        Serializers to use to serialize `obj`. If None, no serialization is done.
+        Serializers to use to serialize ``obj``. If None, no serialization is done.
     subclass: class, optional
         Specify a subclass of ProxyObject to create instead of ProxyObject.
-        `subclass` must be pickable.
+        ``subclass`` must be pickable.
 
     Returns
     -------
-    The ProxyObject proxying `obj`
+    The ProxyObject proxying ``obj``
     """
     if isinstance(obj, ProxyObject):  # Already a proxy object
         ret = obj
@@ -119,7 +122,7 @@ def unproxy(obj):
 
     Returns
     -------
-    The proxied object or `obj` itself if it isn't a ProxyObject
+    The proxied object or ``obj`` itself if it isn't a ProxyObject
     """
     try:
         obj = obj._pxy_deserialize()
@@ -185,16 +188,16 @@ class ProxyDetail:
         Dictionary of attributes that are accessible without deserializing
         the proxied object.
     type_serialized: bytes
-        Pickled type of `obj`.
+        Pickled type of ``obj``.
     typename: str
-        Name of the type of `obj`.
+        Name of the type of ``obj``.
     is_cuda_object: boolean
-        Whether `obj` is a CUDA object or not.
+        Whether ``obj`` is a CUDA object or not.
     subclass: bytes
         Pickled type to use instead of ProxyObject when deserializing. The type
         must inherit from ProxyObject.
     serializers: str, optional
-        Serializers to use to serialize `obj`. If None, no serialization is done.
+        Serializers to use to serialize ``obj``. If None, no serialization is done.
     explicit_proxy: bool
         Mark the proxy object as "explicit", which means that the user allows it
         as input argument to dask tasks even in compatibility-mode.
@@ -258,7 +261,7 @@ class ProxyDetail:
         return self.serializer is not None
 
     def serialize(self, serializers: Iterable[str]) -> Tuple[dict, list]:
-        """Inplace serialization of the proxied object using the `serializers`
+        """Inplace serialization of the proxied object using the ``serializers``
 
         Parameters
         ----------
@@ -333,7 +336,7 @@ class ProxyObject:
     ProxyObject has some limitations and doesn't mimic the proxied object perfectly.
     Thus, if encountering problems remember that it is always possible to use unproxy()
     to access the proxied object directly or disable JIT deserialization completely
-    with `jit_unspill=False`.
+    with ``jit_unspill=False``.
 
     Type checking using instance() works as expected but direct type checking
     doesn't:
@@ -386,7 +389,7 @@ class ProxyObject:
         serializers: Iterable[str],
         proxy_detail: Optional[ProxyDetail] = None,
     ) -> None:
-        """Inplace serialization of the proxied object using the `serializers`
+        """Inplace serialization of the proxied object using the ``serializers``
 
         Parameters
         ----------
@@ -787,8 +790,8 @@ class ProxyObject:
     def obj_pxy_is_device_object(obj: ProxyObject):
         """
         In order to avoid de-serializing the proxied object,
-        we check `is_cuda_object` instead of the default
-        `hasattr(o, "__cuda_array_interface__")` check.
+        we check ``is_cuda_object`` instead of the default
+        ``hasattr(o, "__cuda_array_interface__")`` check.
         """
         return obj._pxy_get().is_cuda_object
 
@@ -830,7 +833,7 @@ def obj_pxy_dask_serialize(obj: ProxyObject):
 
     As serializers, it uses "dask" or "pickle", which means that proxied CUDA objects
     are spilled to main memory before communicated. Deserialization is needed, unless
-    obj is serialized to disk on a shared filesystem see `handle_disk_serialized()`.
+    obj is serialized to disk on a shared filesystem see ``handle_disk_serialized()``.
     """
     pxy = obj._pxy_get(copy=True)
     if pxy.serializer == "disk":
@@ -851,7 +854,7 @@ def obj_pxy_cuda_serialize(obj: ProxyObject):
 
     As serializers, it uses "cuda", which means that proxied CUDA objects are _not_
     spilled to main memory before communicated. However, we still have to handle disk
-    serialized proxied like in `obj_pxy_dask_serialize()`
+    serialized proxied like in ``obj_pxy_dask_serialize()``
     """
     pxy = obj._pxy_get(copy=True)
     if pxy.serializer in ("dask", "pickle"):
@@ -897,7 +900,7 @@ def obj_pxy_dask_deserialize(header, frames):
 
 
 def unproxify_input_wrapper(func):
-    """Unproxify the input of `func`"""
+    """Unproxify the input of ``func``"""
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
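
The asproxy()/unproxy() pair documented above can also be exercised directly; a small sketch, assuming CuPy is available:

    import cupy

    from dask_cuda.proxy_object import asproxy, unproxy

    arr = cupy.arange(5)
    proxy = asproxy(arr)  # serializers=None, so no serialization happens

    # As the ProxyObject docstring notes, isinstance() works as expected
    # but direct type checks do not.
    assert isinstance(proxy, cupy.ndarray)
    assert type(proxy) is not cupy.ndarray

    # unproxy() returns the proxied object (never serialized here);
    # non-proxies pass through unchanged.
    assert unproxy(proxy) is arr
    assert unproxy(arr) is arr
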
dask_cuda/tests/conftest.py ADDED
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from dask_cuda.utils import has_device_memory_resource
+
+
+def pytest_configure(config):
+    """Register custom markers."""
+    config.addinivalue_line(
+        "markers",
+        "skip_if_no_device_memory: mark test to skip if device has no dedicated memory "
+        "resource",
+    )
+    config.addinivalue_line(
+        "markers",
+        "skip_if_device_memory: mark test to skip if device has dedicated memory "
+        "resource",
+    )
+
+
+def pytest_collection_modifyitems(items):
+    """Handle skip_if_no_device_memory marker."""
+    for item in items:
+        if item.get_closest_marker("skip_if_no_device_memory"):
+            skip_marker = item.get_closest_marker("skip_if_no_device_memory")
+            reason = skip_marker.kwargs.get(
+                "reason", "Test requires device with dedicated memory resource"
+            )
+            item.add_marker(
+                pytest.mark.skipif(not has_device_memory_resource(), reason=reason)
+            )
+        if item.get_closest_marker("skip_if_device_memory"):
+            skip_marker = item.get_closest_marker("skip_if_device_memory")
+            reason = skip_marker.kwargs.get(
+                "reason", "Test requires device without dedicated memory resource"
+            )
+            item.add_marker(
+                pytest.mark.skipif(has_device_memory_resource(), reason=reason)
+            )
dask_cuda/tests/test_dask_cuda_worker.py CHANGED
@@ -21,13 +21,16 @@ from dask_cuda.utils import (
     get_gpu_count_mig,
     get_gpu_uuid,
     get_n_gpus,
+    has_device_memory_resource,
     wait_workers,
 )
 
 
-@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,3,7,8"})
-def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
-    nthreads = 4
+@patch.dict(
+    os.environ,
+    {"CUDA_VISIBLE_DEVICES": "0,3,7,8", "DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC": "1"},
+)
+def test_cuda_visible_devices(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
         with popen(
             [
@@ -37,14 +40,10 @@ def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop):  # noqa: F811
                 "127.0.0.1:9359",
                 "--host",
                 "127.0.0.1",
-                "--device-memory-limit",
-                "1 MB",
-                "--nthreads",
-                str(nthreads),
                 "--no-dashboard",
                 "--worker-class",
                 "dask_cuda.utils_test.MockWorker",
-            ]
+            ],
         ):
             with Client("127.0.0.1:9359", loop=loop) as client:
                 assert wait_workers(client, n_gpus=4)
@@ -58,12 +57,43 @@
                 for v in result.values():
                     del expected[v]
 
-                workers = client.scheduler_info()["workers"]
+                assert len(expected) == 0
+
+
+def test_memory_limit_and_nthreads(loop):  # noqa: F811
+    nthreads = 4
+
+    device_memory_limit_args = []
+    if has_device_memory_resource():
+        device_memory_limit_args = ["--device-memory-limit", "1 MB"]
+
+    with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9359",
+                "--host",
+                "127.0.0.1",
+                *device_memory_limit_args,
+                "--nthreads",
+                str(nthreads),
+                "--no-dashboard",
+                "--worker-class",
+                "dask_cuda.utils_test.MockWorker",
+            ],
+        ):
+            with Client("127.0.0.1:9359", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                def get_visible_devices():
+                    return os.environ["CUDA_VISIBLE_DEVICES"]
+
+                workers = client.scheduler_info(n_workers=-1)["workers"]
                 for w in workers.values():
                     assert w["memory_limit"] == MEMORY_LIMIT // len(workers)
 
-                assert len(expected) == 0
-
 
 def test_rmm_pool(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")
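
The rewritten tests adapt to the hardware via the new dask_cuda.utils.has_device_memory_resource() helper shown above. The same gating pattern works outside the test suite when building a worker command line; a sketch (the scheduler address is illustrative):

    from dask_cuda.utils import has_device_memory_resource

    worker_cmd = ["dask", "cuda", "worker", "127.0.0.1:8786", "--no-dashboard"]
    if has_device_memory_resource():
        # A device memory limit is only meaningful on GPUs with their own
        # dedicated memory resource.
        worker_cmd += ["--device-memory-limit", "1GB"]
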
@@ -119,11 +149,6 @@ def test_rmm_async(loop):  # noqa: F811
 def test_rmm_async(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -159,11 +184,6 @@ def test_rmm_async_with_maximum_pool_size(loop):  # noqa: F811
 def test_rmm_async_with_maximum_pool_size(loop):  # noqa: F811
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -263,8 +283,12 @@ def test_cudf_spill_disabled(loop):  # noqa: F811
             assert v == 0
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 def test_cudf_spill(loop):  # noqa: F811
     cudf = pytest.importorskip("cudf")
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -292,6 +316,24 @@
                 assert v == 2
 
 
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_cudf_spill_no_dedicated_memory_error():
+    pytest.importorskip("cudf")
+
+    ret = subprocess.run(
+        ["dask", "cuda", "worker", "127.0.0.1:9369", "--enable-cudf-spill"],
+        capture_output=True,
+    )
+
+    assert ret.returncode != 0
+    assert (
+        b"cuDF spilling is not supported on devices without dedicated memory"
+        in ret.stderr
+    )
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_dashboard_address(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
@@ -472,6 +514,11 @@ def test_rmm_track_allocations(loop):  # noqa: F811
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_get_cluster_configuration(loop):  # noqa: F811
     pytest.importorskip("rmm")
+
+    device_memory_limit_args = []
+    if has_device_memory_resource():
+        device_memory_limit_args += ["--device-memory-limit", "30 B"]
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -481,8 +528,7 @@
                 "127.0.0.1:9369",
                 "--host",
                 "127.0.0.1",
-                "--device-memory-limit",
-                "30 B",
+                *device_memory_limit_args,
                 "--rmm-pool-size",
                 "2 GB",
                 "--rmm-maximum-pool-size",
@@ -499,12 +545,17 @@
             assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
             assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
             assert ret["jit-unspill"] is False
-            assert ret["device-memory-limit"] == 30
+            if has_device_memory_resource():
+                assert ret["device-memory-limit"] == 30
 
 
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 def test_worker_fraction_limits(loop):  # noqa: F811
     pytest.importorskip("rmm")
+
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
         with popen(
             [
@@ -545,6 +596,33 @@
             )
 
 
+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    if argument == "pool_size":
+        argument_list = ["--rmm-pool-size", "0.1"]
+    elif argument == "maximum_pool_size":
+        argument_list = ["--rmm-pool-size", "1 GiB", "--rmm-maximum-pool-size", "0.1"]
+    else:
+        argument_list = ["--rmm-async", "--rmm-release-threshold", "0.1"]
+
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        ret = subprocess.run(
+            ["dask", "cuda", "worker", "127.0.0.1:9369", *argument_list],
+            capture_output=True,
+        )
+
+        assert ret.returncode != 0
+        assert (
+            b"Fractional of total device memory not supported in devices without a "
+            b"dedicated memory resource" in ret.stderr
+        )
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_worker_timeout():
     ret = subprocess.run(
@@ -595,6 +673,12 @@ def test_worker_cudf_spill_warning(enable_cudf_spill_warning):  # noqa: F811
         capture_output=True,
     )
     if enable_cudf_spill_warning:
-        assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
+        if has_device_memory_resource():
+            assert b"UserWarning: cuDF spilling is enabled" in ret.stderr
+        else:
+            assert (
+                b"cuDF spilling is not supported on devices without dedicated "
+                b"memory" in ret.stderr
+            )
     else:
         assert b"UserWarning: cuDF spilling is enabled" not in ret.stderr
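
Taken together, the new tests pin down the 25.8.0 behavior on devices without a dedicated memory resource: --enable-cudf-spill and fractional RMM pool sizes now fail fast with a clear error instead of misbehaving. On discrete GPUs, fractional sizes keep working; a sketch, assuming the cluster API mirrors the CLI's fractional support:

    from dask_cuda import LocalCUDACluster

    # 0.5 is interpreted as a fraction of total device memory, which, per the
    # checks above, requires a device with a dedicated memory resource.
    cluster = LocalCUDACluster(rmm_pool_size=0.5)
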