dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/benchmarks/common.py +4 -1
  4. dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
  5. dask_cuda/benchmarks/local_cudf_merge.py +5 -2
  6. dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
  7. dask_cuda/benchmarks/local_cupy.py +4 -1
  8. dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
  9. dask_cuda/benchmarks/utils.py +7 -4
  10. dask_cuda/cli.py +21 -15
  11. dask_cuda/cuda_worker.py +27 -57
  12. dask_cuda/device_host_file.py +31 -15
  13. dask_cuda/disk_io.py +7 -4
  14. dask_cuda/explicit_comms/comms.py +11 -7
  15. dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
  16. dask_cuda/get_device_memory_objects.py +3 -3
  17. dask_cuda/initialize.py +80 -44
  18. dask_cuda/local_cuda_cluster.py +63 -66
  19. dask_cuda/plugins.py +17 -16
  20. dask_cuda/proxify_device_objects.py +12 -10
  21. dask_cuda/proxify_host_file.py +30 -27
  22. dask_cuda/proxy_object.py +20 -17
  23. dask_cuda/tests/conftest.py +41 -0
  24. dask_cuda/tests/test_dask_cuda_worker.py +109 -25
  25. dask_cuda/tests/test_dgx.py +10 -18
  26. dask_cuda/tests/test_explicit_comms.py +30 -12
  27. dask_cuda/tests/test_from_array.py +7 -5
  28. dask_cuda/tests/test_initialize.py +16 -37
  29. dask_cuda/tests/test_local_cuda_cluster.py +159 -52
  30. dask_cuda/tests/test_proxify_host_file.py +19 -3
  31. dask_cuda/tests/test_proxy.py +18 -16
  32. dask_cuda/tests/test_rdd_ucx.py +160 -0
  33. dask_cuda/tests/test_spill.py +7 -0
  34. dask_cuda/tests/test_utils.py +106 -20
  35. dask_cuda/tests/test_worker_spec.py +5 -2
  36. dask_cuda/utils.py +261 -38
  37. dask_cuda/utils_test.py +23 -7
  38. dask_cuda/worker_common.py +196 -0
  39. dask_cuda/worker_spec.py +12 -5
  40. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
  41. dask_cuda-25.8.0.dist-info/RECORD +63 -0
  42. dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
  43. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  44. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  45. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  46. dask_cuda-25.6.0.dist-info/RECORD +0 -57
  47. dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
  48. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
  49. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
  50. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
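Two themes dominate the test changes shown below: the UCX protocol names (parametrizations move from "ucx"/"ucxx" to "ucx"/"ucx-old", and the new test_rdd_ucx.py asserts that "ucx" selects the UCXX-based comm from distributed-ucxx when that package is installed, falling back to the legacy UCX-Py comm otherwise, while "ucx-old" always selects the legacy comm), and support for devices without a dedicated memory resource (the new has_device_memory_resource() utility plus skip_if_no_device_memory / skip_if_device_memory markers). A minimal sketch of the protocol selection the new tests assert, assuming dask-cuda 25.8.0 on a CUDA-capable host:

from dask_cuda import LocalCUDACluster

# protocol="ucx" uses the UCXX-based comm (distributed-ucxx) when that package
# is installed; otherwise it falls back to the legacy UCX-Py comm and warns.
with LocalCUDACluster(protocol="ucx") as cluster:
    assert cluster.scheduler_comm.address.startswith("ucx://")

# protocol="ucx-old" explicitly selects the legacy UCX-Py comm.
with LocalCUDACluster(protocol="ucx-old") as cluster:
    assert cluster.scheduler_comm.address.startswith("ucx-old://")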
dask_cuda/tests/test_local_cuda_cluster.py +159 -52

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import asyncio
+import contextlib
 import os
 import pkgutil
 import sys
@@ -20,15 +21,17 @@ from dask_cuda.utils import (
     get_device_total_memory,
     get_gpu_count_mig,
     get_gpu_uuid,
+    has_device_memory_resource,
     print_cluster_config,
 )
-from dask_cuda.utils_test import MockWorker
+from dask_cuda.utils_test import MockWorker, get_ucx_implementation
 
 
 @gen_test(timeout=20)
 async def test_local_cuda_cluster():
     async with LocalCUDACluster(
-        scheduler_port=0, asynchronous=True, device_memory_limit=1
+        scheduler_port=0,
+        asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == utils.get_n_gpus()
@@ -68,8 +71,8 @@ async def test_with_subset_of_cuda_visible_devices():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         worker_class=MockWorker,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 4
@@ -92,14 +95,11 @@ async def test_with_subset_of_cuda_visible_devices():
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     async with LocalCUDACluster(
         protocol=protocol, asynchronous=True, data=dict
@@ -112,35 +112,32 @@ async def test_ucx_protocol(protocol):
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_explicit_ucx_with_protocol_none(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     async with LocalCUDACluster(
-        protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict
+        protocol=None,
+        enable_tcp_over_ucx=True,
+        asynchronous=True,
     ) as cluster:
         assert all(
-            ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values()
+            ws.address.startswith(f"{protocol}://")
+            for ws in cluster.scheduler.workers.values()
         )
 
 
 @pytest.mark.filterwarnings("ignore:Exception ignored in")
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol_type_error(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     with pytest.raises(TypeError):
@@ -153,7 +150,10 @@ async def test_ucx_protocol_type_error(protocol):
 @gen_test(timeout=20)
 async def test_n_workers():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1", worker_class=MockWorker, asynchronous=True
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         assert len(cluster.workers) == 2
         assert len(cluster.worker_spec) == 2
@@ -208,10 +208,13 @@ async def test_no_memory_limits_cudaworker():
 @gen_test(timeout=20)
 async def test_all_to_all():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1", worker_class=MockWorker, asynchronous=True
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
-            workers = list(client.scheduler_info()["workers"])
+            workers = list(client.scheduler_info(n_workers=-1)["workers"])
             n_workers = len(workers)
             await utils.all_to_all(client)
             # assert all to all has resulted in all data on every worker
@@ -263,11 +266,6 @@ async def test_rmm_managed():
 async def test_rmm_async():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -290,11 +288,6 @@ async def test_rmm_async():
 async def test_rmm_async_with_maximum_pool_size():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -381,7 +374,6 @@ async def test_cluster_worker():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         n_workers=1,
     ) as cluster:
         assert len(cluster.workers) == 1
@@ -464,7 +456,7 @@ async def test_get_cluster_configuration():
     async with LocalCUDACluster(
         rmm_pool_size="2GB",
         rmm_maximum_pool_size="3GB",
-        device_memory_limit="30B",
+        device_memory_limit="30B" if has_device_memory_resource() else None,
         CUDA_VISIBLE_DEVICES="0",
         scheduler_port=0,
         asynchronous=True,
@@ -474,10 +466,14 @@ async def test_get_cluster_configuration():
             assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
             assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
             assert ret["jit-unspill"] is False
-            assert ret["device-memory-limit"] == 30
+            if has_device_memory_resource():
+                assert ret["device-memory-limit"] == 30
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 async def test_worker_fraction_limits():
     async with LocalCUDACluster(
         dashboard_address=None,
@@ -503,6 +499,40 @@ async def test_worker_fraction_limits():
             )
 
 
+# Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    async def _test_worker_fraction_limits_no_dedicated_memory():
+        if argument == "pool_size":
+            kwargs = {"rmm_pool_size": "0.1"}
+        elif argument == "maximum_pool_size":
+            kwargs = {"rmm_pool_size": "1 GiB", "rmm_maximum_pool_size": "0.1"}
+        else:
+            kwargs = {"rmm_async": True, "rmm_release_threshold": "0.1"}
+
+        with raises_with_cause(
+            RuntimeError,
+            "Nanny failed to start",
+            RuntimeError,
+            "Worker failed to start",
+            ValueError,
+            "Fractional of total device memory not supported in devices without a "
+            "dedicated memory resource",
+        ):
+            await LocalCUDACluster(
+                asynchronous=True,
+                **kwargs,
+            )
+
+    asyncio.run(_test_worker_fraction_limits_no_dedicated_memory())
+
+
 @gen_test(timeout=20)
 async def test_cudf_spill_disabled():
     cudf = pytest.importorskip("cudf")
@@ -527,6 +557,9 @@ async def test_cudf_spill_disabled():
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 async def test_cudf_spill():
     cudf = pytest.importorskip("cudf")
 
@@ -551,27 +584,101 @@ async def test_cudf_spill():
                 assert v == 2
 
 
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+@gen_test(timeout=20)
+async def test_cudf_spill_no_dedicated_memory():
+    cudf = pytest.importorskip("cudf")  # noqa: F841
+
+    with pytest.raises(
+        ValueError,
+        match="cuDF spilling is not supported on devices without dedicated memory",
+    ):
+        await LocalCUDACluster(
+            enable_cudf_spill=True,
+            cudf_spill_stats=2,
+            asynchronous=True,
+        )
+
+
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
-def test_print_cluster_config(capsys, protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+@pytest.mark.parametrize(
+    "jit_unspill",
+    [False, True],
+)
+@pytest.mark.parametrize(
+    "device_memory_limit",
+    [None, "1B"],
+)
+def test_print_cluster_config(capsys, protocol, jit_unspill, device_memory_limit):
+    get_ucx_implementation(protocol)
 
     pytest.importorskip("rich")
-    with LocalCUDACluster(
-        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol
-    ) as cluster:
-        with Client(cluster) as client:
-            print_cluster_config(client)
-            captured = capsys.readouterr()
-            assert "Dask Cluster Configuration" in captured.out
-            assert protocol in captured.out
-            assert "1 B" in captured.out
-            assert "[plugin]" in captured.out
+
+    ctx = contextlib.nullcontext()
+    if not has_device_memory_resource():
+        if device_memory_limit:
+            ctx = pytest.raises(
+                ValueError,
+                match="device_memory_limit is set but device has no dedicated memory.",
+            )
+        if jit_unspill:
+            # JIT-Unspill exception has precedence, thus overwrite ctx if both are
+            # enabled
+            ctx = pytest.raises(
+                ValueError,
+                match="JIT-Unspill is not supported on devices without dedicated "
+                "memory",
+            )
+
+    with ctx:
+        with LocalCUDACluster(
+            n_workers=1,
+            device_memory_limit=device_memory_limit,
+            jit_unspill=jit_unspill,
+            protocol=protocol,
+        ) as cluster:
+            with Client(cluster) as client:
+                print_cluster_config(client)
+                captured = capsys.readouterr()
+                assert "Dask Cluster Configuration" in captured.out
+                assert protocol in captured.out
+                if device_memory_limit == "1B":
+                    assert "1 B" in captured.out
+                assert "[plugin]" in captured.out
+                client.shutdown()
+
+    def ucxpy_reset(timeout=20):
+        """Reset UCX-Py with a timeout.
+
+        Attempt to reset UCX-Py, not doing so may cause a deadlock because UCX-Py is
+        not thread-safe and the Dask cluster may still be alive while a new cluster
+        and UCX-Py instances are initalized.
+        """
+        import time
+
+        import ucp
+
+        start = time.monotonic()
+        while True:
+            try:
+                ucp.reset()
+            except ucp._libs.exceptions.UCXError as e:
+                if time.monotonic() - start > timeout:
+                    raise RuntimeError(
+                        f"Could not reset UCX-Py in {timeout} seconds, this may result "
+                        f"in a deadlock. Failure:\n{e}"
+                    )
                continue
            else:
                break

+    if protocol == "ucx-old":
+        ucxpy_reset()
 
 
 @pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
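Several of the rewrites above make spilling-related options conditional on the new has_device_memory_resource() check rather than assuming dedicated device memory. A minimal sketch of that pattern, assuming dask-cuda 25.8.0 (the option values are illustrative only):

from dask_cuda import LocalCUDACluster
from dask_cuda.utils import has_device_memory_resource

# Only set a device memory limit when the GPU has its own memory resource;
# on other devices the tests above expect such options to raise ValueError.
kwargs = {"device_memory_limit": "30B"} if has_device_memory_resource() else {}

with LocalCUDACluster(n_workers=1, **kwargs) as cluster:
    pass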

dask_cuda/tests/test_proxify_host_file.py +19 -3

@@ -1,4 +1,5 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
 
 from typing import Iterable
 from unittest.mock import patch
@@ -219,6 +220,9 @@ def test_one_item_host_limit(capsys, root_dir):
     assert len(dhf.manager) == 0
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 def test_spill_on_demand(root_dir):
     """
     Test spilling on demand by disabling the device_memory_limit
@@ -241,6 +245,9 @@ def test_spill_on_demand(root_dir):
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 @gen_test(timeout=20)
 async def test_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -377,9 +384,9 @@ def test_externals(root_dir):
 
 @patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,))
 def test_incompatible_types(root_dir):
-    """Check that ProxifyHostFile unproxifies `cupy.ndarray` on retrieval
+    """Check that ProxifyHostFile unproxifies ``cupy.ndarray`` on retrieval
 
-    Notice, in this test we add `cupy.ndarray` to the incompatible_types temporarily.
+    Notice, in this test we add ``cupy.ndarray`` to the incompatible_types temporarily.
     """
     cupy = pytest.importorskip("cupy")
     cudf = pytest.importorskip("cudf")
@@ -398,6 +405,9 @@ def test_incompatible_types(root_dir):
 
 @pytest.mark.parametrize("npartitions", [1, 2, 3])
 @pytest.mark.parametrize("compatibility_mode", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=30)
 async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions):
     cudf = pytest.importorskip("cudf")
@@ -430,6 +440,9 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
         assert all(res)  # Only proxy objects
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=60)
 async def test_worker_force_spill_to_disk():
     """Test Dask triggering CPU-to-Disk spilling"""
@@ -465,6 +478,9 @@ async def test_worker_force_spill_to_disk():
         assert "Unmanaged memory use is high" not in log
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 def test_on_demand_debug_info():
     """Test worker logging when on-demand-spilling fails"""
     rmm = pytest.importorskip("rmm")

dask_cuda/tests/test_proxy.py +18 -16

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 import os
 import pickle
@@ -23,7 +26,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -242,7 +245,7 @@ def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
 
 @pytest.mark.parametrize("backend", ["numpy", "cupy"])
 def test_fixed_attribute_length(backend):
-    """Test fixed attribute `x.__len__` access
+    """Test fixed attribute ``x.__len__`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -263,7 +266,7 @@ def test_fixed_attribute_length(backend):
 
 
 def test_fixed_attribute_name():
-    """Test fixed attribute `x.name` access
+    """Test fixed attribute ``x.name`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -284,6 +287,9 @@ def test_fixed_attribute_name():
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Spilling not supported in devices without dedicated memory resource"
+)
 @gen_test(timeout=20)
 async def test_spilling_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -386,8 +392,8 @@ def test_serializing_array_to_disk(backend, serializers, size):
 class _PxyObjTest(proxy_object.ProxyObject):
     """
     A class that:
-    - defines `__dask_tokenize__` in order to avoid deserialization when
-      calling `client.scatter()`
+    - defines ``__dask_tokenize__`` in order to avoid deserialization when
+      calling ``client.scatter()``
     - Asserts that no deserialization is performaned when communicating.
     """
 
@@ -401,14 +407,12 @@ class _PxyObjTest(proxy_object.ProxyObject):
 
 
 @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @gen_test(timeout=120)
 async def test_communicating_proxy_objects(protocol, send_serializers):
     """Testing serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
 
     def task(x):
@@ -417,7 +421,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
         serializers_used = x._pxy_get().serializer
 
         # Check that `x` is serialized with the expected serializers
-        if protocol in ["ucx", "ucxx"]:
+        if protocol in ["ucx", "ucx-old"]:
             if send_serializers is None:
                 assert serializers_used == "cuda"
             else:
@@ -448,15 +452,13 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
             await client.submit(task, df)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @pytest.mark.parametrize("shared_fs", [True, False])
 @gen_test(timeout=20)
 async def test_communicating_disk_objects(protocol, shared_fs):
     """Testing disk serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
     ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
 

dask_cuda/tests/test_rdd_ucx.py +160 -0 (new file)

@@ -0,0 +1,160 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import importlib
+import io
+import multiprocessing as mp
+import sys
+
+import pytest
+
+from dask_cuda import LocalCUDACluster
+
+mp = mp.get_context("spawn")  # type: ignore
+
+
+def _has_distributed_ucxx() -> bool:
+    return bool(importlib.util.find_spec("distributed_ucxx"))
+
+
+def _test_protocol_ucx():
+    with LocalCUDACluster(protocol="ucx") as cluster:
+        assert cluster.scheduler_comm.address.startswith("ucx://")
+
+        if _has_distributed_ucxx():
+            import distributed_ucxx
+
+            assert all(
+                isinstance(batched_send.comm, distributed_ucxx.ucxx.UCXX)
+                for batched_send in cluster.scheduler.stream_comms.values()
+            )
+        else:
+            import rapids_dask_dependency
+
+            assert all(
+                isinstance(
+                    batched_send.comm,
+                    rapids_dask_dependency.patches.distributed.comm.__rdd_patch_ucx.UCX,
+                )
+                for batched_send in cluster.scheduler.stream_comms.values()
+            )
+
+
+def _test_protocol_ucxx():
+    if _has_distributed_ucxx():
+        with LocalCUDACluster(protocol="ucxx") as cluster:
+            assert cluster.scheduler_comm.address.startswith("ucxx://")
+            import distributed_ucxx
+
+            assert all(
+                isinstance(batched_send.comm, distributed_ucxx.ucxx.UCXX)
+                for batched_send in cluster.scheduler.stream_comms.values()
+            )
+    else:
+        with pytest.raises(RuntimeError, match="Cluster failed to start"):
+            LocalCUDACluster(protocol="ucxx")
+
+
+def _test_protocol_ucx_old():
+    with LocalCUDACluster(protocol="ucx-old") as cluster:
+        assert cluster.scheduler_comm.address.startswith("ucx-old://")
+
+        import rapids_dask_dependency
+
+        assert all(
+            isinstance(
+                batched_send.comm,
+                rapids_dask_dependency.patches.distributed.comm.__rdd_patch_ucx.UCX,
+            )
+            for batched_send in cluster.scheduler.stream_comms.values()
+        )
+
+
+def _run_test_with_output_capture(test_func_name, conn):
+    """Run a test function in a subprocess and capture stdout/stderr."""
+    # Redirect stdout and stderr to capture output
+    old_stdout = sys.stdout
+    old_stderr = sys.stderr
+    captured_output = io.StringIO()
+    sys.stdout = sys.stderr = captured_output
+
+    try:
+        # Import and run the test function
+        if test_func_name == "_test_protocol_ucx":
+            _test_protocol_ucx()
+        elif test_func_name == "_test_protocol_ucxx":
+            _test_protocol_ucxx()
+        elif test_func_name == "_test_protocol_ucx_old":
+            _test_protocol_ucx_old()
+        else:
+            raise ValueError(f"Unknown test function: {test_func_name}")
+
+        output = captured_output.getvalue()
+        conn.send((True, output))  # True = success
+    except Exception as e:
+        output = captured_output.getvalue()
+        output += f"\nException: {e}"
+        import traceback
+
+        output += f"\nTraceback:\n{traceback.format_exc()}"
+        conn.send((False, output))  # False = failure
+    finally:
+        # Restore original stdout/stderr
+        sys.stdout = old_stdout
+        sys.stderr = old_stderr
+        conn.close()
+
+
+@pytest.mark.parametrize("protocol", ["ucx", "ucxx", "ucx-old"])
+def test_rdd_protocol(protocol):
+    """Test rapids-dask-dependency protocol selection"""
+    if protocol == "ucx":
+        test_func_name = "_test_protocol_ucx"
+    elif protocol == "ucxx":
+        test_func_name = "_test_protocol_ucxx"
+    else:
+        test_func_name = "_test_protocol_ucx_old"
+
+    # Create a pipe for communication between parent and child processes
+    parent_conn, child_conn = mp.Pipe()
+    p = mp.Process(
+        target=_run_test_with_output_capture, args=(test_func_name, child_conn)
+    )
+
+    p.start()
+    p.join(timeout=60)
+
+    if p.is_alive():
+        p.kill()
+        p.close()
+        raise TimeoutError("Test process timed out")
+
+    # Get the result from the child process
+    success, output = parent_conn.recv()
+
+    # Check that the test passed
+    assert success, f"Test failed in subprocess. Output:\n{output}"
+
+    # For the ucx protocol, check if warnings are printed when distributed_ucxx is not
+    # available
+    if protocol == "ucx" and not _has_distributed_ucxx():
+        # Check if the warning about protocol='ucx' is printed
+        print(f"Output for {protocol} protocol:\n{output}")
+        assert (
+            "you have requested protocol='ucx'" in output
+        ), f"Expected warning not found in output: {output}"
+        assert (
+            "'distributed-ucxx' is not installed" in output
+        ), f"Expected warning about distributed-ucxx not found in output: {output}"
+    elif protocol == "ucx" and _has_distributed_ucxx():
+        # When distributed_ucxx is available, the warning should NOT be printed
+        assert "you have requested protocol='ucx'" not in output, (
+            "Warning should not be printed when distributed_ucxx is available: "
+            f"{output}"
+        )
+    elif protocol == "ucx-old":
+        # The ucx-old protocol should not generate warnings
+        assert (
+            "you have requested protocol='ucx'" not in output
+        ), f"Warning should not be printed for ucx-old protocol: {output}"

dask_cuda/tests/test_spill.py +7 -0

@@ -20,6 +20,13 @@ import dask_cudf
 from dask_cuda import LocalCUDACluster, utils
 from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
 
+if not utils.has_device_memory_resource():
+    pytest.skip(
+        "No spilling tests supported for devices without memory resources. "
+        "See https://github.com/rapidsai/dask-cuda/issues/1510",
+        allow_module_level=True,
+    )
+
 if utils.get_device_total_memory() < 1e10:
     pytest.skip("Not enough GPU memory", allow_module_level=True)
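
The skip_if_no_device_memory and skip_if_device_memory markers used throughout these tests are registered by the new dask_cuda/tests/conftest.py (+41 lines), whose contents are not included in this diff. A hypothetical sketch of how such markers are commonly wired up with standard pytest hooks, assuming they are driven by the has_device_memory_resource() utility the tests import:

# Hypothetical sketch only; the actual conftest.py additions are not shown above.
import pytest

from dask_cuda.utils import has_device_memory_resource


def pytest_configure(config):
    # Register the custom markers so pytest does not warn about unknown marks.
    config.addinivalue_line(
        "markers", "skip_if_no_device_memory(reason): skip without dedicated device memory"
    )
    config.addinivalue_line(
        "markers", "skip_if_device_memory(reason): skip when dedicated device memory exists"
    )


def pytest_collection_modifyitems(config, items):
    # Apply skips based on whether the device has a dedicated memory resource.
    has_memory = has_device_memory_resource()
    for item in items:
        for marker in item.iter_markers(name="skip_if_no_device_memory"):
            if not has_memory:
                item.add_marker(pytest.mark.skip(reason=marker.args[0]))
        for marker in item.iter_markers(name="skip_if_device_memory"):
            if has_memory:
                item.add_marker(pytest.mark.skip(reason=marker.args[0]))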