dask-cuda 25.4.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/_compat.py +18 -0
  4. dask_cuda/benchmarks/common.py +4 -1
  5. dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
  6. dask_cuda/benchmarks/local_cudf_merge.py +5 -2
  7. dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
  8. dask_cuda/benchmarks/local_cupy.py +4 -1
  9. dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
  10. dask_cuda/benchmarks/utils.py +7 -4
  11. dask_cuda/cli.py +21 -15
  12. dask_cuda/cuda_worker.py +27 -57
  13. dask_cuda/device_host_file.py +31 -15
  14. dask_cuda/disk_io.py +7 -4
  15. dask_cuda/explicit_comms/comms.py +11 -7
  16. dask_cuda/explicit_comms/dataframe/shuffle.py +147 -55
  17. dask_cuda/get_device_memory_objects.py +18 -3
  18. dask_cuda/initialize.py +80 -44
  19. dask_cuda/is_device_object.py +4 -1
  20. dask_cuda/is_spillable_object.py +4 -1
  21. dask_cuda/local_cuda_cluster.py +63 -66
  22. dask_cuda/plugins.py +17 -16
  23. dask_cuda/proxify_device_objects.py +15 -10
  24. dask_cuda/proxify_host_file.py +30 -27
  25. dask_cuda/proxy_object.py +20 -17
  26. dask_cuda/tests/conftest.py +41 -0
  27. dask_cuda/tests/test_dask_cuda_worker.py +114 -27
  28. dask_cuda/tests/test_dgx.py +10 -18
  29. dask_cuda/tests/test_explicit_comms.py +51 -18
  30. dask_cuda/tests/test_from_array.py +7 -5
  31. dask_cuda/tests/test_initialize.py +16 -37
  32. dask_cuda/tests/test_local_cuda_cluster.py +164 -54
  33. dask_cuda/tests/test_proxify_host_file.py +33 -4
  34. dask_cuda/tests/test_proxy.py +18 -16
  35. dask_cuda/tests/test_rdd_ucx.py +160 -0
  36. dask_cuda/tests/test_spill.py +107 -27
  37. dask_cuda/tests/test_utils.py +106 -20
  38. dask_cuda/tests/test_worker_spec.py +5 -2
  39. dask_cuda/utils.py +319 -68
  40. dask_cuda/utils_test.py +23 -7
  41. dask_cuda/worker_common.py +196 -0
  42. dask_cuda/worker_spec.py +12 -5
  43. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +5 -4
  44. dask_cuda-25.8.0.dist-info/RECORD +63 -0
  45. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +1 -1
  46. dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
  47. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  48. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  49. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  50. dask_cuda-25.4.0.dist-info/RECORD +0 -56
  51. dask_cuda-25.4.0.dist-info/top_level.txt +0 -5
  52. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
  53. {dask_cuda-25.4.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
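The test diffs below change the UCX protocol parametrizations from ["ucx", "ucxx"] to ["ucx", "ucx-old"], and the updated test_print_cluster_config only resets UCX-Py when protocol == "ucx-old". This suggests (an inference from the tests shown here, not from release notes) that in 25.8.0 "ucx" selects the current UCXX-backed comms while "ucx-old" selects the legacy UCX-Py path. A minimal usage sketch under that assumption:

    # Minimal sketch, assuming the protocol naming inferred from the test diffs below:
    # "ucx" selects the current UCX backend, "ucx-old" the legacy UCX-Py one.
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if __name__ == "__main__":
        with LocalCUDACluster(protocol="ucx") as cluster:  # or protocol="ucx-old"
            with Client(cluster) as client:
                # Per the updated tests, worker addresses start with "<protocol>://".
                print(list(client.scheduler_info()["workers"]))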

dask_cuda/tests/test_initialize.py +16 -37

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import multiprocessing as mp
 import sys
 
@@ -11,7 +14,7 @@ from distributed.deploy.local import LocalCluster
 
 from dask_cuda.initialize import initialize
 from dask_cuda.utils import get_ucx_config
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 mp = mp.get_context("spawn") # type: ignore
 
@@ -22,10 +25,7 @@ mp = mp.get_context("spawn") # type: ignore
 
 
 def _test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_tcp_over_ucx": True}
     initialize(protocol=protocol, **kwargs)
@@ -55,12 +55,9 @@ def _test_initialize_ucx_tcp(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_tcp(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_tcp, args=(protocol,))
     p.start()
@@ -69,10 +66,7 @@ def test_initialize_ucx_tcp(protocol):
 
 
 def _test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_nvlink": True}
     initialize(protocol=protocol, **kwargs)
@@ -103,12 +97,9 @@ def _test_initialize_ucx_nvlink(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_nvlink(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_nvlink, args=(protocol,))
     p.start()
@@ -117,10 +108,7 @@ def test_initialize_ucx_nvlink(protocol):
 
 
 def _test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     kwargs = {"enable_infiniband": True}
     initialize(protocol=protocol, **kwargs)
@@ -154,12 +142,9 @@ def _test_initialize_ucx_infiniband(protocol):
 @pytest.mark.skipif(
     "ib0" not in psutil.net_if_addrs(), reason="Infiniband interface ib0 not found"
 )
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_infiniband(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_infiniband, args=(protocol,))
     p.start()
@@ -168,10 +153,7 @@ def test_initialize_ucx_infiniband(protocol):
 
 
 def _test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        ucp = pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        ucp = pytest.importorskip("ucxx")
+    ucp = get_ucx_implementation(protocol)
 
     initialize(protocol=protocol)
     with LocalCluster(
@@ -204,12 +186,9 @@ def _test_initialize_ucx_all(protocol):
             assert all(client.run(check_ucx_options).values())
 
 
-@pytest.mark.parametrize("protocol", ["ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["ucx", "ucx-old"])
 def test_initialize_ucx_all(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     p = mp.Process(target=_test_initialize_ucx_all, args=(protocol,))
     p.start()
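Throughout this file the per-protocol pytest.importorskip blocks are collapsed into a single get_ucx_implementation(protocol) call imported from dask_cuda.utils_test. The helper's body is not part of the hunks shown here; a plausible sketch, assuming it merely maps the protocol name to the backing UCX module and skips the test when that module is missing:

    # Hypothetical sketch of get_ucx_implementation (dask_cuda/utils_test.py); the
    # real helper added in 25.8.0 is not reproduced in this diff, only its call sites.
    import pytest


    def get_ucx_implementation(protocol):
        """Return the UCX module backing ``protocol``, skipping the test if unavailable."""
        if protocol == "ucx":
            # assumed: the default "ucx" protocol is backed by the newer ucxx bindings
            return pytest.importorskip("ucxx")
        elif protocol == "ucx-old":
            # assumed: "ucx-old" keeps the legacy UCX-Py (ucp) bindings
            return pytest.importorskip("ucp")
        raise ValueError(f"Unknown UCX protocol: {protocol!r}")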

dask_cuda/tests/test_local_cuda_cluster.py +164 -54

@@ -1,4 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
+import contextlib
 import os
 import pkgutil
 import sys
@@ -16,16 +20,18 @@ from dask_cuda.utils import (
     get_cluster_configuration,
     get_device_total_memory,
     get_gpu_count_mig,
-    get_gpu_uuid_from_index,
+    get_gpu_uuid,
+    has_device_memory_resource,
     print_cluster_config,
 )
-from dask_cuda.utils_test import MockWorker
+from dask_cuda.utils_test import MockWorker, get_ucx_implementation
 
 
 @gen_test(timeout=20)
 async def test_local_cuda_cluster():
     async with LocalCUDACluster(
-        scheduler_port=0, asynchronous=True, device_memory_limit=1
+        scheduler_port=0,
+        asynchronous=True,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == utils.get_n_gpus()
@@ -65,8 +71,8 @@ async def test_with_subset_of_cuda_visible_devices():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         worker_class=MockWorker,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
             assert len(cluster.workers) == 4
@@ -89,14 +95,11 @@ async def test_with_subset_of_cuda_visible_devices():
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     async with LocalCUDACluster(
         protocol=protocol, asynchronous=True, data=dict
@@ -109,35 +112,32 @@ async def test_ucx_protocol(protocol):
 
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_explicit_ucx_with_protocol_none(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     async with LocalCUDACluster(
-        protocol=None, enable_tcp_over_ucx=True, asynchronous=True, data=dict
+        protocol=None,
+        enable_tcp_over_ucx=True,
+        asynchronous=True,
     ) as cluster:
         assert all(
-            ws.address.startswith("ucx://") for ws in cluster.scheduler.workers.values()
+            ws.address.startswith(f"{protocol}://")
+            for ws in cluster.scheduler.workers.values()
        )
 
 
 @pytest.mark.filterwarnings("ignore:Exception ignored in")
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
 )
 @gen_test(timeout=20)
 async def test_ucx_protocol_type_error(protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    get_ucx_implementation(protocol)
 
     initialize(protocol=protocol, enable_tcp_over_ucx=True)
     with pytest.raises(TypeError):
@@ -150,7 +150,10 @@ async def test_ucx_protocol_type_error(protocol):
 @gen_test(timeout=20)
 async def test_n_workers():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1", worker_class=MockWorker, asynchronous=True
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         assert len(cluster.workers) == 2
         assert len(cluster.worker_spec) == 2
@@ -205,10 +208,13 @@ async def test_no_memory_limits_cudaworker():
 @gen_test(timeout=20)
 async def test_all_to_all():
     async with LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1", worker_class=MockWorker, asynchronous=True
+        CUDA_VISIBLE_DEVICES="0,1",
+        worker_class=MockWorker,
+        asynchronous=True,
+        data=dict,
     ) as cluster:
         async with Client(cluster, asynchronous=True) as client:
-            workers = list(client.scheduler_info()["workers"])
+            workers = list(client.scheduler_info(n_workers=-1)["workers"])
             n_workers = len(workers)
             await utils.all_to_all(client)
             # assert all to all has resulted in all data on every worker
@@ -260,11 +266,6 @@ async def test_rmm_managed():
 async def test_rmm_async():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -287,11 +288,6 @@ async def test_rmm_async():
 async def test_rmm_async_with_maximum_pool_size():
     rmm = pytest.importorskip("rmm")
 
-    driver_version = rmm._cuda.gpu.driverGetVersion()
-    runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-    if driver_version < 11020 or runtime_version < 11020:
-        pytest.skip("cudaMallocAsync not supported")
-
     async with LocalCUDACluster(
         rmm_async=True,
         rmm_pool_size="2GB",
@@ -378,7 +374,6 @@ async def test_cluster_worker():
     async with LocalCUDACluster(
         scheduler_port=0,
         asynchronous=True,
-        device_memory_limit=1,
         n_workers=1,
     ) as cluster:
         assert len(cluster.workers) == 1
@@ -419,7 +414,7 @@ async def test_available_mig_workers():
 
 @gen_test(timeout=20)
 async def test_gpu_uuid():
-    gpu_uuid = get_gpu_uuid_from_index(0)
+    gpu_uuid = get_gpu_uuid(0)
 
     async with LocalCUDACluster(
         CUDA_VISIBLE_DEVICES=gpu_uuid,
@@ -461,7 +456,7 @@ async def test_get_cluster_configuration():
     async with LocalCUDACluster(
         rmm_pool_size="2GB",
         rmm_maximum_pool_size="3GB",
-        device_memory_limit="30B",
+        device_memory_limit="30B" if has_device_memory_resource() else None,
         CUDA_VISIBLE_DEVICES="0",
         scheduler_port=0,
         asynchronous=True,
@@ -471,10 +466,14 @@ async def test_get_cluster_configuration():
     assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000
     assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000
     assert ret["jit-unspill"] is False
-    assert ret["device-memory-limit"] == 30
+    if has_device_memory_resource():
+        assert ret["device-memory-limit"] == 30
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support fractional limits"
+)
 async def test_worker_fraction_limits():
     async with LocalCUDACluster(
         dashboard_address=None,
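The skip_if_no_device_memory / skip_if_device_memory markers used from here on are defined in the new dask_cuda/tests/conftest.py (+41 -0 in the file list), whose body is not shown in this diff. A plausible sketch of how such markers could be wired up, assuming they key off dask_cuda.utils.has_device_memory_resource(), which the tests above import:

    # Hypothetical sketch of the new conftest.py markers; the actual file is not
    # reproduced in this diff, only its usage in the tests.
    import pytest

    from dask_cuda.utils import has_device_memory_resource


    def pytest_configure(config):
        config.addinivalue_line(
            "markers",
            "skip_if_no_device_memory(reason): skip when the GPU has no dedicated memory",
        )
        config.addinivalue_line(
            "markers",
            "skip_if_device_memory(reason): skip when the GPU has dedicated memory",
        )


    def pytest_collection_modifyitems(config, items):
        has_memory = has_device_memory_resource()
        for item in items:
            for marker in item.iter_markers(name="skip_if_no_device_memory"):
                if not has_memory:
                    item.add_marker(pytest.mark.skip(reason=marker.args[0]))
            for marker in item.iter_markers(name="skip_if_device_memory"):
                if has_memory:
                    item.add_marker(pytest.mark.skip(reason=marker.args[0]))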

@@ -500,6 +499,40 @@ async def test_worker_fraction_limits():
            )
 
 
+# Intentionally not using @gen_test to skip cleanup checks
+@pytest.mark.parametrize(
+    "argument", ["pool_size", "maximum_pool_size", "release_threshold"]
+)
+@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+def test_worker_fraction_limits_no_dedicated_memory(argument):
+    async def _test_worker_fraction_limits_no_dedicated_memory():
+        if argument == "pool_size":
+            kwargs = {"rmm_pool_size": "0.1"}
+        elif argument == "maximum_pool_size":
+            kwargs = {"rmm_pool_size": "1 GiB", "rmm_maximum_pool_size": "0.1"}
+        else:
+            kwargs = {"rmm_async": True, "rmm_release_threshold": "0.1"}
+
+        with raises_with_cause(
+            RuntimeError,
+            "Nanny failed to start",
+            RuntimeError,
+            "Worker failed to start",
+            ValueError,
+            "Fractional of total device memory not supported in devices without a "
+            "dedicated memory resource",
+        ):
+            await LocalCUDACluster(
+                asynchronous=True,
+                **kwargs,
+            )
+
+    asyncio.run(_test_worker_fraction_limits_no_dedicated_memory())
+
+
 @gen_test(timeout=20)
 async def test_cudf_spill_disabled():
     cudf = pytest.importorskip("cudf")
@@ -524,6 +557,9 @@ async def test_cudf_spill_disabled():
 
 
 @gen_test(timeout=20)
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources cannot enable cuDF spill"
+)
 async def test_cudf_spill():
     cudf = pytest.importorskip("cudf")
 
@@ -548,27 +584,101 @@ async def test_cudf_spill():
                assert v == 2
 
 
+@pytest.mark.skip_if_device_memory(
+    "Devices with dedicated memory resources cannot test error"
+)
+@gen_test(timeout=20)
+async def test_cudf_spill_no_dedicated_memory():
+    cudf = pytest.importorskip("cudf") # noqa: F841
+
+    with pytest.raises(
+        ValueError,
+        match="cuDF spilling is not supported on devices without dedicated memory",
+    ):
+        await LocalCUDACluster(
+            enable_cudf_spill=True,
+            cudf_spill_stats=2,
+            asynchronous=True,
+        )
+
+
 @pytest.mark.parametrize(
     "protocol",
-    ["ucx", "ucxx"],
+    ["ucx", "ucx-old"],
+)
+@pytest.mark.parametrize(
+    "jit_unspill",
+    [False, True],
 )
-def test_print_cluster_config(capsys, protocol):
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+@pytest.mark.parametrize(
+    "device_memory_limit",
+    [None, "1B"],
+)
+def test_print_cluster_config(capsys, protocol, jit_unspill, device_memory_limit):
+    get_ucx_implementation(protocol)
 
     pytest.importorskip("rich")
-    with LocalCUDACluster(
-        n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol=protocol
-    ) as cluster:
-        with Client(cluster) as client:
-            print_cluster_config(client)
-            captured = capsys.readouterr()
-            assert "Dask Cluster Configuration" in captured.out
-            assert protocol in captured.out
-            assert "1 B" in captured.out
-            assert "[plugin]" in captured.out
+
+    ctx = contextlib.nullcontext()
+    if not has_device_memory_resource():
+        if device_memory_limit:
+            ctx = pytest.raises(
+                ValueError,
+                match="device_memory_limit is set but device has no dedicated memory.",
+            )
+        if jit_unspill:
+            # JIT-Unspill exception has precedence, thus overwrite ctx if both are
+            # enabled
+            ctx = pytest.raises(
+                ValueError,
+                match="JIT-Unspill is not supported on devices without dedicated "
+                "memory",
+            )
+
+    with ctx:
+        with LocalCUDACluster(
+            n_workers=1,
+            device_memory_limit=device_memory_limit,
+            jit_unspill=jit_unspill,
+            protocol=protocol,
+        ) as cluster:
+            with Client(cluster) as client:
+                print_cluster_config(client)
+                captured = capsys.readouterr()
+                assert "Dask Cluster Configuration" in captured.out
+                assert protocol in captured.out
+                if device_memory_limit == "1B":
+                    assert "1 B" in captured.out
+                assert "[plugin]" in captured.out
+                client.shutdown()
+
+    def ucxpy_reset(timeout=20):
+        """Reset UCX-Py with a timeout.
+
+        Attempt to reset UCX-Py, not doing so may cause a deadlock because UCX-Py is
+        not thread-safe and the Dask cluster may still be alive while a new cluster
+        and UCX-Py instances are initalized.
+        """
+        import time
+
+        import ucp
+
+        start = time.monotonic()
+        while True:
+            try:
+                ucp.reset()
+            except ucp._libs.exceptions.UCXError as e:
+                if time.monotonic() - start > timeout:
+                    raise RuntimeError(
+                        f"Could not reset UCX-Py in {timeout} seconds, this may result "
+                        f"in a deadlock. Failure:\n{e}"
+                    )
                continue
+            else:
+                break
+
+    if protocol == "ucx-old":
+        ucxpy_reset()
 
 
 @pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/1265")

dask_cuda/tests/test_proxify_host_file.py +33 -4

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Iterable
 from unittest.mock import patch
 
@@ -217,6 +220,9 @@ def test_one_item_host_limit(capsys, root_dir):
     assert len(dhf.manager) == 0
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 def test_spill_on_demand(root_dir):
     """
     Test spilling on demand by disabling the device_memory_limit
@@ -239,6 +245,9 @@ def test_spill_on_demand(root_dir):
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support spilling"
+)
 @gen_test(timeout=20)
 async def test_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -375,9 +384,9 @@ def test_externals(root_dir):
 
 @patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,))
 def test_incompatible_types(root_dir):
-    """Check that ProxifyHostFile unproxifies `cupy.ndarray` on retrieval
+    """Check that ProxifyHostFile unproxifies ``cupy.ndarray`` on retrieval
 
-    Notice, in this test we add `cupy.ndarray` to the incompatible_types temporarily.
+    Notice, in this test we add ``cupy.ndarray`` to the incompatible_types temporarily.
     """
     cupy = pytest.importorskip("cupy")
     cudf = pytest.importorskip("cudf")
@@ -396,6 +405,9 @@ def test_incompatible_types(root_dir):
 
 @pytest.mark.parametrize("npartitions", [1, 2, 3])
 @pytest.mark.parametrize("compatibility_mode", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=30)
 async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions):
     cudf = pytest.importorskip("cudf")
@@ -414,7 +426,7 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
                ddf = dask.dataframe.from_pandas(
                    cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions
                )
-                res = ddf.shuffle(on="key", shuffle_method="tasks").persist()
+                [res] = client.persist([ddf.shuffle(on="key", shuffle_method="tasks")])
 
                # With compatibility mode on, we shouldn't encounter any proxy objects
                if compatibility_mode:
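The hunk above (and a similar one below) replaces collection.persist() with client.persist([collection]). Both are standard Dask APIs; the client form routes persistence through the explicit Client and, given a list of collections, returns the persisted collections as a list. A small CPU-only illustration of the two spellings:

    # Illustration of the two persist spellings used in the updated tests; any Dask
    # collection works, the cudf DataFrame from the test is not required here.
    import dask.array as da
    from distributed import Client, LocalCluster

    if __name__ == "__main__":
        with LocalCluster(n_workers=1, processes=False) as cluster, Client(cluster) as client:
            x = da.ones((1000, 1000), chunks=(100, 100))

            y1 = x.sum().persist()            # old spelling: persist via the collection
            [y2] = client.persist([x.sum()])  # new spelling: persist via the client, list in / list out

            print(y1.compute(), y2.compute())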

@@ -428,6 +440,9 @@ async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartiti
                    assert all(res) # Only proxy objects
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 @gen_test(timeout=60)
 async def test_worker_force_spill_to_disk():
     """Test Dask triggering CPU-to-Disk spilling"""
@@ -440,7 +455,7 @@ async def test_worker_force_spill_to_disk():
            async with Client(cluster, asynchronous=True) as client:
                # Create a df that are spilled to host memory immediately
                df = cudf.DataFrame({"key": np.arange(10**8)})
-                ddf = dask.dataframe.from_pandas(df, npartitions=1).persist()
+                [ddf] = client.persist([dask.dataframe.from_pandas(df, npartitions=1)])
                await ddf
 
                async def f(dask_worker):
@@ -463,6 +478,9 @@ async def test_worker_force_spill_to_disk():
                assert "Unmanaged memory use is high" not in log
 
 
+@pytest.mark.skip_if_no_device_memory(
+    "Devices without dedicated memory resources do not support JIT-Unspill"
+)
 def test_on_demand_debug_info():
     """Test worker logging when on-demand-spilling fails"""
     rmm = pytest.importorskip("rmm")
@@ -498,3 +516,14 @@ def test_on_demand_debug_info():
            assert f"WARNING - RMM allocation of {size} failed" in log
            assert f"RMM allocs: {size}" in log
            assert "traceback:" in log
+
+
+def test_sizeof_owner_with_cai():
+    cudf = pytest.importorskip("cudf")
+    s = cudf.Series([1, 2, 3])
+
+    items = dask_cuda.get_device_memory_objects.dispatch(s)
+    assert len(items) == 1
+    item = items[0]
+    result = dask.sizeof.sizeof(item)
+    assert result == 24
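The new test_sizeof_owner_with_cai expects sizeof() to report 24 bytes: cudf.Series([1, 2, 3]) uses an int64 column by default (as in pandas), so the underlying device buffer holds 3 × 8 bytes. A host-only check of that arithmetic:

    # Sanity check of the 24-byte expectation above: three int64 values occupy
    # 3 * 8 = 24 bytes, which is what sizeof() of the device buffer owner reports.
    import numpy as np

    values = np.array([1, 2, 3], dtype="int64")
    assert values.nbytes == 3 * 8 == 24
    print(values.nbytes)  # 24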

dask_cuda/tests/test_proxy.py +18 -16

@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 import os
 import pickle
@@ -23,7 +26,7 @@ from dask_cuda import LocalCUDACluster, proxy_object
 from dask_cuda.disk_io import SpillToDiskFile
 from dask_cuda.proxify_device_objects import proxify_device_objects
 from dask_cuda.proxify_host_file import ProxifyHostFile
-from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
+from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation
 
 # Make the "disk" serializer available and use a directory that are
 # remove on exit.
@@ -242,7 +245,7 @@ def test_serialize_of_proxied_cudf(proxy_serializers, dask_serializers):
 
 @pytest.mark.parametrize("backend", ["numpy", "cupy"])
 def test_fixed_attribute_length(backend):
-    """Test fixed attribute `x.__len__` access
+    """Test fixed attribute ``x.__len__`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -263,7 +266,7 @@ def test_fixed_attribute_length(backend):
 
 
 def test_fixed_attribute_name():
-    """Test fixed attribute `x.name` access
+    """Test fixed attribute ``x.name`` access
 
     Notice, accessing fixed attributes shouldn't de-serialize the proxied object
     """
@@ -284,6 +287,9 @@ def test_fixed_attribute_name():
 
 
 @pytest.mark.parametrize("jit_unspill", [True, False])
+@pytest.mark.skip_if_no_device_memory(
+    "Spilling not supported in devices without dedicated memory resource"
+)
 @gen_test(timeout=20)
 async def test_spilling_local_cuda_cluster(jit_unspill):
     """Testing spilling of a proxied cudf dataframe in a local cuda cluster"""
@@ -386,8 +392,8 @@ def test_serializing_array_to_disk(backend, serializers, size):
 class _PxyObjTest(proxy_object.ProxyObject):
     """
     A class that:
-    - defines `__dask_tokenize__` in order to avoid deserialization when
-      calling `client.scatter()`
+    - defines ``__dask_tokenize__`` in order to avoid deserialization when
+      calling ``client.scatter()``
     - Asserts that no deserialization is performaned when communicating.
     """
 
@@ -401,14 +407,12 @@ class _PxyObjTest(proxy_object.ProxyObject):
 
 
 @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)])
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @gen_test(timeout=120)
 async def test_communicating_proxy_objects(protocol, send_serializers):
     """Testing serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
 
     def task(x):
@@ -417,7 +421,7 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
        serializers_used = x._pxy_get().serializer
 
        # Check that `x` is serialized with the expected serializers
-        if protocol in ["ucx", "ucxx"]:
+        if protocol in ["ucx", "ucx-old"]:
            if send_serializers is None:
                assert serializers_used == "cuda"
            else:
@@ -448,15 +452,13 @@ async def test_communicating_proxy_objects(protocol, send_serializers):
        await client.submit(task, df)
 
 
-@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucxx"])
+@pytest.mark.parametrize("protocol", ["tcp", "ucx", "ucx-old"])
 @pytest.mark.parametrize("shared_fs", [True, False])
 @gen_test(timeout=20)
 async def test_communicating_disk_objects(protocol, shared_fs):
     """Testing disk serialization of cuDF dataframe when communicating"""
-    if protocol == "ucx":
-        pytest.importorskip("ucp")
-    elif protocol == "ucxx":
-        pytest.importorskip("ucxx")
+    if protocol.startswith("ucx"):
+        get_ucx_implementation(protocol)
     cudf = pytest.importorskip("cudf")
     ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs
 