dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. dask_cuda/GIT_COMMIT +1 -1
  2. dask_cuda/VERSION +1 -1
  3. dask_cuda/benchmarks/common.py +4 -1
  4. dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
  5. dask_cuda/benchmarks/local_cudf_merge.py +5 -2
  6. dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
  7. dask_cuda/benchmarks/local_cupy.py +4 -1
  8. dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
  9. dask_cuda/benchmarks/utils.py +7 -4
  10. dask_cuda/cli.py +21 -15
  11. dask_cuda/cuda_worker.py +27 -57
  12. dask_cuda/device_host_file.py +31 -15
  13. dask_cuda/disk_io.py +7 -4
  14. dask_cuda/explicit_comms/comms.py +11 -7
  15. dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
  16. dask_cuda/get_device_memory_objects.py +3 -3
  17. dask_cuda/initialize.py +80 -44
  18. dask_cuda/local_cuda_cluster.py +63 -66
  19. dask_cuda/plugins.py +17 -16
  20. dask_cuda/proxify_device_objects.py +12 -10
  21. dask_cuda/proxify_host_file.py +30 -27
  22. dask_cuda/proxy_object.py +20 -17
  23. dask_cuda/tests/conftest.py +41 -0
  24. dask_cuda/tests/test_dask_cuda_worker.py +109 -25
  25. dask_cuda/tests/test_dgx.py +10 -18
  26. dask_cuda/tests/test_explicit_comms.py +30 -12
  27. dask_cuda/tests/test_from_array.py +7 -5
  28. dask_cuda/tests/test_initialize.py +16 -37
  29. dask_cuda/tests/test_local_cuda_cluster.py +159 -52
  30. dask_cuda/tests/test_proxify_host_file.py +19 -3
  31. dask_cuda/tests/test_proxy.py +18 -16
  32. dask_cuda/tests/test_rdd_ucx.py +160 -0
  33. dask_cuda/tests/test_spill.py +7 -0
  34. dask_cuda/tests/test_utils.py +106 -20
  35. dask_cuda/tests/test_worker_spec.py +5 -2
  36. dask_cuda/utils.py +261 -38
  37. dask_cuda/utils_test.py +23 -7
  38. dask_cuda/worker_common.py +196 -0
  39. dask_cuda/worker_spec.py +12 -5
  40. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
  41. dask_cuda-25.8.0.dist-info/RECORD +63 -0
  42. dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
  43. shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
  44. shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
  45. shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
  46. dask_cuda-25.6.0.dist-info/RECORD +0 -57
  47. dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
  48. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
  49. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
  50. {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/utils_test.py CHANGED
@@ -1,14 +1,19 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+
  from typing import Literal

  import distributed
  from distributed import Nanny, Worker

+ from .utils import _get_active_ucx_implementation_name
+

  class MockWorker(Worker):
      """Mock Worker class preventing NVML from getting used by SystemMonitor.

      By preventing the Worker from initializing NVML in the SystemMonitor, we can
-     mock test multiple devices in `CUDA_VISIBLE_DEVICES` behavior with single-GPU
+     mock test multiple devices in ``CUDA_VISIBLE_DEVICES`` behavior with single-GPU
      machines.
      """

@@ -26,20 +31,31 @@ class MockWorker(Worker):


  class IncreasedCloseTimeoutNanny(Nanny):
-     """Increase `Nanny`'s close timeout.
+     """Increase ``Nanny``'s close timeout.

-     The internal close timeout mechanism of `Nanny` recomputes the time left to kill
-     the `Worker` process based on elapsed time of the close task, which may leave
+     The internal close timeout mechanism of ``Nanny`` recomputes the time left to kill
+     the ``Worker`` process based on elapsed time of the close task, which may leave
      very little time for the subprocess to shutdown cleanly, which may cause tests
      to fail when the system is under higher load. This class increases the default
-     close timeout of 5.0 seconds that `Nanny` sets by default, which can be overriden
+     close timeout of 5.0 seconds that ``Nanny`` sets by default, which can be overriden
      via Distributed's public API.

-     This class can be used with the `worker_class` argument of `LocalCluster` or
-     `LocalCUDACluster` to provide a much higher default of 30.0 seconds.
+     This class can be used with the ``worker_class`` argument of ``LocalCluster`` or
+     ``LocalCUDACluster`` to provide a much higher default of 30.0 seconds.
      """

      async def close( # type:ignore[override]
          self, timeout: float = 30.0, reason: str = "nanny-close"
      ) -> Literal["OK"]:
          return await super().close(timeout=timeout, reason=reason)
+
+
+ def get_ucx_implementation(protocol):
+     import pytest
+
+     protocol = _get_active_ucx_implementation_name(protocol)
+
+     if protocol == "ucxx":
+         return pytest.importorskip("ucxx")
+     else:
+         return pytest.importorskip("ucp")
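For context, a minimal usage sketch (not part of the package diff) of the test helpers above; the cluster arguments are illustrative, and it assumes LocalCUDACluster accepts the worker_class argument mentioned in the docstring:

# Hypothetical test using the helpers from dask_cuda.utils_test; values are
# illustrative, not taken from the diff.
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny, get_ucx_implementation


def test_ucx_cluster_starts():
    # Skip the test unless a UCX implementation ("ucxx" or legacy "ucp") is importable.
    get_ucx_implementation("ucx")
    with LocalCUDACluster(
        protocol="ucx",
        worker_class=IncreasedCloseTimeoutNanny,  # 30.0 s close timeout instead of 5.0 s
    ) as cluster:
        assert len(cluster.workers) >= 1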
dask_cuda/worker_common.py ADDED
@@ -0,0 +1,196 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+
+ import os
+ import warnings
+
+ from .device_host_file import DeviceHostFile
+ from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
+ from .proxify_host_file import ProxifyHostFile
+ from .utils import (
+     get_cpu_affinity,
+     has_device_memory_resource,
+     parse_device_memory_limit,
+ )
+
+
+ def worker_data_function(
+     device_memory_limit=None,
+     memory_limit=None,
+     jit_unspill=False,
+     enable_cudf_spill=False,
+     shared_filesystem=None,
+ ):
+     """
+     Create a data function for CUDA workers based on memory configuration.
+
+     This function creates and returns a callable that generates data configuration
+     for CUDA workers. The returned callable takes a device index parameter and
+     returns the appropriate data configuration for that device.
+
+     Parameters
+     ----------
+     device_memory_limit : str or int, optional
+         Limit of device memory, defaults to None
+     memory_limit : str or int, optional
+         Limit of host memory, defaults to None
+     jit_unspill : bool, optional
+         Whether to enable JIT unspill functionality, defaults to False
+     enable_cudf_spill : bool, optional
+         Whether to enable cuDF spilling, defaults to False
+     shared_filesystem : str or bool, optional
+         Whether to use shared filesystem for spilling, defaults to None
+
+     Returns
+     -------
+     callable
+         A function that takes device index `device_index` and returns appropriate
+         data configuration based on the availability of a dedicated device memory
+         resource and arguments passed to the worker.
+     """
+
+     def data(device_index):
+         if int(os.environ.get("DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC", "0")) != 0:
+             return {}
+
+         # First raise errors for invalid configurations
+         if not has_device_memory_resource(device_index):
+             if jit_unspill:
+                 raise ValueError(
+                     "JIT-Unspill is not supported on devices without dedicated memory, "
+                     "such as system on a chip (SoC) devices."
+                 )
+             elif enable_cudf_spill:
+                 raise ValueError(
+                     "cuDF spilling is not supported on devices without dedicated "
+                     "memory, such as system on a chip (SoC) devices."
+                 )
+             elif device_memory_limit not in [None, "default"]:
+                 raise ValueError(
+                     "device_memory_limit is set but device has no dedicated memory."
+                 )
+
+         if device_memory_limit is None and memory_limit is None:
+             # All spilling is disabled
+             return {}
+         elif not has_device_memory_resource(device_index):
+             if device_memory_limit == "default" and memory_limit is None:
+                 # Devices without a dedicated memory resource only support default
+                 # host<->disk spilling via Dask's default mechanism.
+                 return {}
+             # Devices without a dedicated memory resource only support default
+             # host<->disk spilling via Dask's default mechanism.
+             return None
+         else:
+             if jit_unspill:
+                 # JIT-Unspill is enabled
+                 if enable_cudf_spill:
+                     warnings.warn(
+                         "Enabling cuDF spilling and JIT-Unspill together is not "
+                         "safe, consider disabling JIT-Unspill."
+                     )
+
+                 return (
+                     ProxifyHostFile,
+                     {
+                         "device_memory_limit": parse_device_memory_limit(
+                             device_memory_limit, device_index=device_index
+                         ),
+                         "memory_limit": memory_limit,
+                         "shared_filesystem": shared_filesystem,
+                     },
+                 )
+             else:
+                 # Device has dedicated memory and host memory is limited
+                 return (
+                     DeviceHostFile,
+                     {
+                         "device_memory_limit": parse_device_memory_limit(
+                             device_memory_limit, device_index=device_index
+                         ),
+                         "memory_limit": memory_limit,
+                     },
+                 )
+
+     return data
+
+
+ def worker_plugins(
+     *,
+     device_index,
+     rmm_initial_pool_size,
+     rmm_maximum_pool_size,
+     rmm_managed_memory,
+     rmm_async_alloc,
+     rmm_release_threshold,
+     rmm_log_directory,
+     rmm_track_allocations,
+     rmm_allocator_external_lib_list,
+     pre_import,
+     enable_cudf_spill,
+     cudf_spill_stats,
+ ):
+     """Create a set of plugins for CUDA workers with specified configurations.
+
+     This function creates and returns a set of plugins that configure various aspects
+     of CUDA worker behavior, including CPU affinity, RMM memory management, pre-import
+     modules and cuDF spilling functionality.
+
+     Parameters
+     ----------
+     device_index : int
+         The CUDA device index to configure
+     rmm_initial_pool_size : int or str
+         Initial size of the RMM memory pool
+     rmm_maximum_pool_size : int or str
+         Maximum size of the RMM memory pool
+     rmm_managed_memory : bool
+         Whether to use CUDA managed memory
+     rmm_async_alloc : bool
+         Whether to use asynchronous allocation
+     rmm_release_threshold : int
+         Memory threshold for releasing memory back to the system
+     rmm_log_directory : str
+         Directory for RMM logging
+     rmm_track_allocations : bool
+         Whether to track memory allocations
+     rmm_allocator_external_lib_list : list
+         List of external libraries to use with RMM allocator
+     pre_import : list
+         List of modules to pre-import
+     enable_cudf_spill : bool
+         Whether to enable cuDF spilling
+     cudf_spill_stats : bool
+         Whether to track cuDF spilling statistics
+
+     Returns
+     -------
+     set
+         A set of configured plugins including:
+         - CPUAffinity: Configures CPU affinity for the worker
+         - RMMSetup: Configures RMM memory management
+         - PreImport: Handles module pre-importing
+         - CUDFSetup: Configures cuDF functionality and spilling
+     """
+     if int(os.environ.get("DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC", "0")) != 0:
+         return {
+             PreImport(pre_import),
+             CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
+         }
+     return {
+         CPUAffinity(
+             get_cpu_affinity(device_index),
+         ),
+         RMMSetup(
+             initial_pool_size=rmm_initial_pool_size,
+             maximum_pool_size=rmm_maximum_pool_size,
+             managed_memory=rmm_managed_memory,
+             async_alloc=rmm_async_alloc,
+             release_threshold=rmm_release_threshold,
+             log_directory=rmm_log_directory,
+             track_allocations=rmm_track_allocations,
+             external_lib_list=rmm_allocator_external_lib_list,
+         ),
+         PreImport(pre_import),
+         CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
+     }
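A rough sketch (hypothetical wiring, not taken from the diff) of how worker_data_function and worker_plugins could be combined when configuring a single CUDA worker; every argument value below is a placeholder:

# Hypothetical use of dask_cuda.worker_common; argument values are placeholders.
from dask_cuda.worker_common import worker_data_function, worker_plugins

device_index = 0

# Build the per-device data callable; depending on the arguments it returns {},
# None, or a (storage_class, kwargs) pair such as (DeviceHostFile, {...}) or
# (ProxifyHostFile, {...}).
data = worker_data_function(
    device_memory_limit="default",
    memory_limit="auto",
    jit_unspill=False,
    enable_cudf_spill=False,
    shared_filesystem=None,
)
worker_data = data(device_index)

# Build the plugin set for the same device; each keyword mirrors the
# worker_plugins signature above, again with placeholder values.
plugins = worker_plugins(
    device_index=device_index,
    rmm_initial_pool_size=None,
    rmm_maximum_pool_size=None,
    rmm_managed_memory=False,
    rmm_async_alloc=False,
    rmm_release_threshold=None,
    rmm_log_directory=None,
    rmm_track_allocations=False,
    rmm_allocator_external_lib_list=None,
    pre_import=None,
    enable_cudf_spill=False,
    cudf_spill_stats=False,
)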
dask_cuda/worker_spec.py CHANGED
@@ -1,3 +1,6 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
+ # SPDX-License-Identifier: Apache-2.0
+
  import os

  from dask.distributed import Nanny
@@ -6,7 +9,7 @@ from distributed.system import MEMORY_LIMIT
  from .initialize import initialize
  from .local_cuda_cluster import cuda_visible_devices
  from .plugins import CPUAffinity
- from .utils import get_cpu_affinity, get_gpu_count
+ from .utils import _get_active_ucx_implementation_name, get_cpu_affinity, get_gpu_count


  def worker_spec(
@@ -81,10 +84,14 @@ def worker_spec(
      'preload_argv': ['--create-cuda-context']}}}

      """
-     if (
-         enable_tcp_over_ucx or enable_infiniband or enable_nvlink
-     ) and protocol != "ucx":
-         raise TypeError("Enabling InfiniBand or NVLink requires protocol='ucx'")
+     if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
+         try:
+             _get_active_ucx_implementation_name(protocol)
+         except ValueError:
+             raise TypeError(
+                 "Enabling InfiniBand or NVLink requires protocol='ucx', "
+                 "protocol='ucxx' or protocol='ucx-old'"
+             ) from None

      if CUDA_VISIBLE_DEVICES is None:
          CUDA_VISIBLE_DEVICES = os.environ.get(
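In practice, the relaxed check above means any protocol recognized by _get_active_ucx_implementation_name passes, while other protocols still fail fast. A small illustrative sketch (hypothetical call; GPU availability and the remaining worker_spec arguments are assumed):

# Illustrative only: "ucx", "ucxx" and "ucx-old" are now all accepted alongside
# NVLink/InfiniBand, while non-UCX protocols still raise TypeError.
from dask_cuda.worker_spec import worker_spec

spec = worker_spec(enable_nvlink=True, protocol="ucxx")  # now accepted

try:
    worker_spec(enable_nvlink=True, protocol="tcp")
except TypeError as exc:
    print(exc)  # Enabling InfiniBand or NVLink requires protocol='ucx', ...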
{dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dask-cuda
- Version: 25.6.0
+ Version: 25.8.0
  Summary: Utilities for Dask and CUDA interactions
  Author: NVIDIA Corporation
  License: Apache-2.0
@@ -24,7 +24,7 @@ Requires-Dist: numba<0.62.0a0,>=0.59.1
  Requires-Dist: numpy<3.0a0,>=1.23
  Requires-Dist: pandas>=1.3
  Requires-Dist: pynvml<13.0.0a0,>=12.0.0
- Requires-Dist: rapids-dask-dependency==25.6.*
+ Requires-Dist: rapids-dask-dependency==25.8.*
  Requires-Dist: zict>=2.0.0
  Provides-Extra: docs
  Requires-Dist: numpydoc>=1.1.0; extra == "docs"
dask_cuda-25.8.0.dist-info/RECORD ADDED
@@ -0,0 +1,63 @@
+ dask_cuda/GIT_COMMIT,sha256=RAxzl5uQmWcKe5o7s9iHwueNfKc91R_-kniXtpRqmng,41
+ dask_cuda/VERSION,sha256=mZz9G1Ul4kEOksaMu07UE-AVtGzT1t91nQu3CK9KUAk,8
+ dask_cuda/__init__.py,sha256=Wbc7R0voN4vsQkb7SKuVXH0YXuXtfnAxrupxfM4lT10,1933
+ dask_cuda/_compat.py,sha256=AG2lKGAtZitDPBjHeFDKLTN_B5HKodrhZ2kHlk1Z-D0,498
+ dask_cuda/_version.py,sha256=cHDO9AzNtxkCVhwYu7hL3H7RPAkQnxpKBjElOst3rkI,964
+ dask_cuda/cli.py,sha256=VRYuryhViVWkCH7H7fDDTMToSOC17nAUMIPbd3K2jRs,17490
+ dask_cuda/cuda_worker.py,sha256=7ZLZ3AY3l1fLumx2XynUOej5Sx6bwZQRomT4Nj9pbyA,8590
+ dask_cuda/device_host_file.py,sha256=wTiTyYthUrR8l2WM7iV5lvjQDzeYxnnK7GUlPsHI6p4,11042
+ dask_cuda/disk_io.py,sha256=IpD2hA-AjTw3SEu3w9pT9ELAUpKW0XcphPh8WwlKH70,6757
+ dask_cuda/get_device_memory_objects.py,sha256=Nk0f5kv2f0e6JSd5hrwenAOARgF4NS21Zv_x3zF3ONI,4577
+ dask_cuda/initialize.py,sha256=JBgXNzt52OzcusC89k0eVn2ivbCoCzZxOPNvQgj3YfE,7385
+ dask_cuda/is_device_object.py,sha256=x9klFdeQzLcug7wZMxN3GK2AS121tlDe-LQ2uznm5yo,1179
+ dask_cuda/is_spillable_object.py,sha256=8gj6QgtKcmzrpQwy8rE-pS1R8tjaJOeD-Fzr6LumjJg,1596
+ dask_cuda/local_cuda_cluster.py,sha256=pocJgHbs8h2z_hfChU2_s7kwYKuYTgFZtmrEgYHjWwc,20735
+ dask_cuda/plugins.py,sha256=u4gWQy8DgaTAsd59KCNXLG6zmdmGg5qhaI8ha2rMEFs,7085
+ dask_cuda/proxify_device_objects.py,sha256=hGHK2gBkuaMYyrZqcitRGUiH_up56R1hsC7b_6YcCBU,8325
+ dask_cuda/proxify_host_file.py,sha256=TbdszPvAgV4CRPMz99tumnCWwiCTqMBZqHQua3RRpHE,31031
+ dask_cuda/proxy_object.py,sha256=eII-S0vdpFa6NLebpHFVQuJ2eyXJ5cBg8sucgFEa91g,30307
+ dask_cuda/utils.py,sha256=smUv6DGJlHgXjfASVJMlRL1NgECiwwQ2zgn_FCWxMQ4,33454
+ dask_cuda/utils_test.py,sha256=CKRMB5KUAg-7VSf21AU1TQoqGpJXm9ftgRV7mGIGQ3s,2132
+ dask_cuda/worker_common.py,sha256=uXoYZ1IZanAbHddfpL3NbVDV5WqCF0m94nOiqqvEnxc,7168
+ dask_cuda/worker_spec.py,sha256=cI4vS08gyrIU3PKJIjjWZNXChUm5Pv9LwaShPqYYMUQ,4698
+ dask_cuda/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dask_cuda/benchmarks/common.py,sha256=4AHgWkomo1RsPwH6eogz4vbE9vg_Dg-krScn9B1BRuw,7057
+ dask_cuda/benchmarks/local_cudf_groupby.py,sha256=ehKOJqnHgQoEyNWuebIWdJP8f_bRiwBd4pax3hkQP_8,8646
+ dask_cuda/benchmarks/local_cudf_merge.py,sha256=D2yXcEj1dLvfdd-ugV6kLgcUoiHzLmN3jNNs_BjaWME,12341
+ dask_cuda/benchmarks/local_cudf_shuffle.py,sha256=MxUHzif_7TFPj96yE-Y3-U_NMXD6VJLNB1fB39LXgrk,8386
+ dask_cuda/benchmarks/local_cupy.py,sha256=1riE9_hVkDxfSP2pWU0h2VF9HTXhF3C2ArKb19KMnVo,10457
+ dask_cuda/benchmarks/local_cupy_map_overlap.py,sha256=27OtGOhmHIQhHSAmyOm0GgUX8u10npmwBo_q5fLAqUM,6151
+ dask_cuda/benchmarks/read_parquet.py,sha256=spKu6RLWYngPZq9hnaoU0mz7INIaJnErfqjBG2wH8Zc,7614
+ dask_cuda/benchmarks/utils.py,sha256=pl0VG8BQnIrVvpdVQtP8ybX1SOcv1E01jZzhkfOIPw4,30235
+ dask_cuda/explicit_comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dask_cuda/explicit_comms/comms.py,sha256=FbSDPQUz0gwQNa8EQMNEGEwWCM2KhuDk4CiNNjZy6qo,11611
+ dask_cuda/explicit_comms/dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=vq-NBPYtKvKcfo7-eikPYaeWDf-V_DTHR5PVE_E-dyE,23863
+ dask_cuda/tests/conftest.py,sha256=NjTnsgu7UewTpoTakt-2bpe0rtarwni25anjZPWbbb0,1501
+ dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=qVN9J0Hdv66A9COFArLIdRriyyxEKpS3lEZGHbVHaq8,4903
+ dask_cuda/tests/test_dask_cuda_worker.py,sha256=iOkYAGSYfql7Wxxnb1hZJ7oedQyLmweXAHpQhckaTgY,23232
+ dask_cuda/tests/test_device_host_file.py,sha256=79ssUISo1YhsW_7HdwqPfsH2LRzS2bi5BjPym1Sdgqw,5882
+ dask_cuda/tests/test_dgx.py,sha256=Y7OgFlrq_BzoF4L8qoA5byKX1kDA1foaFzT-gEfhrsI,7294
+ dask_cuda/tests/test_explicit_comms.py,sha256=ULrVhOcVavEWyZBxVSXDOFT_nznF8mRrU1M3fq_DUjo,20622
+ dask_cuda/tests/test_from_array.py,sha256=-mXVwrw3MjB32vFfdUvxiwojqu9mEYNyjZjGPg1h1sE,733
+ dask_cuda/tests/test_gds.py,sha256=j1Huud6UGm1fbkyRLQEz_ysrVw__5AimwSn_M-2GEvs,1513
+ dask_cuda/tests/test_initialize.py,sha256=2Fm84sAN0A9Kxhm7PiWgrwppNCaJLA2yj65Y3LPOrNw,7542
+ dask_cuda/tests/test_local_cuda_cluster.py,sha256=wYhY4OzQcYDtevL2peT6k5bAmZs2MDpQMrwPs2To7AY,23083
+ dask_cuda/tests/test_proxify_host_file.py,sha256=6YPQia-V5tcVOy2ZNwL9ZpD6qKxKCcaNZZK-8f4aqpM,19996
+ dask_cuda/tests/test_proxy.py,sha256=3uREA-d4y0lmGHKGJ5z9OiAYTBie1Y3bpB8djQRrluE,23996
+ dask_cuda/tests/test_rdd_ucx.py,sha256=nMbEm7kxAATHfVyZOkfpexyk-ZKtCbLzvPHdspK877M,5433
+ dask_cuda/tests/test_spill.py,sha256=QmBdFBvmZsDbOo0IAeUAJQfMyx-BBxokosch4gZHa00,15704
+ dask_cuda/tests/test_utils.py,sha256=bTpiNQyKTxLHzHTLn-G0SWVgetq_tqUGq7rbafvdwgg,12297
+ dask_cuda/tests/test_version.py,sha256=vK2HjlRLX0nxwvRsYxBqhoZryBNZklzA-vdnyuWDxVg,365
+ dask_cuda/tests/test_worker_spec.py,sha256=dlZ4OIOl7CVm4euMtvCUif1QetCWcntSRdwzAmkH6ec,2550
+ dask_cuda-25.8.0.dist-info/licenses/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
+ examples/ucx/client_initialize.py,sha256=YN3AXHF8btcMd6NicKKhKR9SXouAsK1foJhFspbOn70,1262
+ examples/ucx/local_cuda_cluster.py,sha256=7xVY3EhwhkY2L4VZin_BiMCbrjhirDNChoC86KiETNc,1983
+ shared-actions/check_nightly_success/check-nightly-success/check.py,sha256=lBhwgJALfUXdk4B9IhYf1AV0OUu0dQol3GtUe-CnUgY,5379
+ shared-actions/telemetry-impls/summarize/bump_time.py,sha256=FXcGDqjCabsL6vlqd5RIV2PCWi2ns_ju8Ul0ERM21cA,2033
+ shared-actions/telemetry-impls/summarize/send_trace.py,sha256=i30O_cKZ1OtXIbqaTE3R1JLJSw2XoYdmoVpM2Nc9wj8,16555
+ dask_cuda-25.8.0.dist-info/METADATA,sha256=euY_vTYPba5yD4W8FtKDhCAjF4qs0ZU6-z5cs2lxYQg,2345
+ dask_cuda-25.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dask_cuda-25.8.0.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
+ dask_cuda-25.8.0.dist-info/top_level.txt,sha256=SaQxjNeXM9ZhEBjU6CY25OIOt2UojSREGiVxg9UIc08,63
+ dask_cuda-25.8.0.dist-info/RECORD,,
dask_cuda-25.8.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,6 @@
+ ci
+ conda
+ dask_cuda
+ examples
+ shared-actions
+ telemetry-artifacts
shared-actions/check_nightly_success/check-nightly-success/check.py ADDED
@@ -0,0 +1,148 @@
+ # Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+ """Check whether a GHA workflow has run successfully in the last N days."""
+ # ruff: noqa: INP001
+
+ import argparse
+ import os
+ import re
+ import sys
+ from collections import defaultdict
+ from datetime import datetime
+
+ import requests
+
+ # Constants
+ GITHUB_TOKEN = os.environ["RAPIDS_GH_TOKEN"]
+ GOOD_STATUSES = {"success"}
+
+
+ def main(
+     repo: str,
+     repo_owner: str,
+     workflow_id: str,
+     max_days_without_success: int,
+     num_attempts: int = 5,
+ ) -> bool:
+     """Check whether a GHA workflow has run successfully in the last N days.
+
+     Returns True if the workflow has not run successfully in the last N days, False
+     otherwise (values are inverted for use as a return code).
+     """
+     headers = {"Authorization": f"token {GITHUB_TOKEN}"}
+     url = f"https://api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs"
+     exceptions = []
+     for _ in range(num_attempts):
+         try:
+             response = requests.get(url, headers=headers, timeout=10)
+             response.raise_for_status()
+             break
+         except requests.RequestException as e:
+             exceptions.append(e)
+     else:
+         sep = "\n\t"
+         msg = (
+             f"Failed to fetch {url} after {num_attempts} attempts with the following "
+             f"errors: {sep}{'{sep}'.join(exceptions)}"
+         )
+         raise RuntimeError(msg)
+
+     runs = response.json()["workflow_runs"]
+     tz = datetime.fromisoformat(runs[0]["run_started_at"]).tzinfo
+     now = datetime.now(tz=tz)
+
+     latest_success = {}
+     workflow_active_for_max_days = {}
+     # Rather frustratingly, the workflow runs returned from the GitHub API can
+     # have alternating ordering of `head_branch`
+     # e.g.
+     # run[0]['head_branch'] == "branch-25.02"
+     # run[1]['head_branch'] == "branch-25.04"
+     # run[2]['head_branch'] == "branch-25.02"
+     #
+     # In this situation, the behavior of `itertools.groupby` (previously used
+     # here) is to only group _consecutive_ runs, so the results of the
+     # subsequent branch match (i.e. the second group of `branch-25.02` runs)
+     # will overwrite the results of the first one, potentially overwriting a
+     # previous success. The snippet below unifies the groups so it's more like a
+     # SQL groupby and there is no chance of overwriting.
+     branch_dict = defaultdict(list)
+     for run in runs:
+         branch_dict[run["head_branch"]].append(run)
+
+     for branch, branch_runs in branch_dict.items():
+         # Only consider RAPIDS release branches, which have versions like
+         # '25.02' (RAPIDS) or '0.42' (ucxx, ucx-py).
+         if not re.match("branch-[0-9]{1,2}.[0-9]{2}", branch):
+             continue
+
+         latest_success[branch] = None
+         runs = sorted(branch_runs, key=lambda r: r["run_started_at"], reverse=True)
+         for run in runs:
+             days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days
+             if days_since_run > max_days_without_success:
+                 break
+             if run["conclusion"] in GOOD_STATUSES:
+                 latest_success[branch] = run
+                 break
+
+         workflow_active_for_max_days[branch] = False
+         if len(runs) > 0:
+             run = runs[-1]
+             days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days
+             if days_since_run > max_days_without_success:
+                 workflow_active_for_max_days[branch] = True
+
+     latest_branch = max(latest_success)
+     has_latest_success = latest_success[latest_branch] is not None
+
+     # We are producing Unix return codes so success/failure is inverted from the
+     # expected Python boolean values.
+     if has_latest_success:
+         print( # noqa: T201
+             f"The most recent successful run of the {workflow_id} workflow on "
+             f"{latest_branch} was "
+             f"{datetime.fromisoformat(latest_success[latest_branch]['run_started_at'])}, "
+             f"which is within the last {max_days_without_success} days. View logs:"
+             f"\n - {latest_success[latest_branch]['html_url']}"
+         )
+         return 0
+     elif not workflow_active_for_max_days[latest_branch]:
+         print( # noqa: T201
+             f"The oldest run of the {workflow_id} workflow on {latest_branch} was less "
+             f"than {max_days_without_success} days ago. This exempts the workflow from "
+             "check-nightly-success because the workflow has not been running for very long."
+         )
+         return 0
+
+     print( # noqa: T201
+         f"{latest_branch} has no successful runs of {workflow_id} in the last {max_days_without_success} days"
+     )
+     return 1
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("repo", type=str, help="Repository name")
+     parser.add_argument(
+         "--repo-owner",
+         default="rapidsai",
+         help="Repository organization/owner",
+     )
+     parser.add_argument("--workflow-id", default="test.yaml", help="Workflow ID")
+     parser.add_argument(
+         "--max-days-without-success",
+         type=int,
+         default=7,
+         help="Maximum number of days without a successful run",
+     )
+     args = parser.parse_args()
+
+     sys.exit(
+         main(
+             args.repo,
+             args.repo_owner,
+             args.workflow_id,
+             args.max_days_without_success,
+         ),
+     )
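The long comment in check.py about itertools.groupby explains why the grouping was moved to a defaultdict; a small standalone illustration of the difference (synthetic data, not from the script):

# Illustration of the grouping issue described in check.py's comment:
# itertools.groupby only groups consecutive keys, so a dict built from it keeps
# just the last group per branch, while defaultdict accumulates every run.
import itertools
from collections import defaultdict

runs = [
    {"head_branch": "branch-25.02", "conclusion": "success"},
    {"head_branch": "branch-25.04", "conclusion": "failure"},
    {"head_branch": "branch-25.02", "conclusion": "failure"},
]

grouped = {
    key: list(group)
    for key, group in itertools.groupby(runs, key=lambda r: r["head_branch"])
}
# The second "branch-25.02" group overwrote the first, so the success is gone.
assert [r["conclusion"] for r in grouped["branch-25.02"]] == ["failure"]

branch_dict = defaultdict(list)
for run in runs:
    branch_dict[run["head_branch"]].append(run)
# All runs for the branch are retained, so the earlier success is not lost.
assert [r["conclusion"] for r in branch_dict["branch-25.02"]] == ["success", "failure"]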
shared-actions/telemetry-impls/summarize/bump_time.py ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env python
+ # Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+ # This script is meant to act on an 'all_jobs.json' file that comes from
+ # the summarize job when debug info is enabled. Bumping the time makes
+ # it easier to re-run the span-sending python script and check results
+ # in either Jaeger or Grafana
+
+ import datetime
+ import json
+
+ with open("all_jobs.json") as f:
+     jobs = json.load(f)
+
+
+ def _parse_time(x: str) -> int:
+     return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9)
+
+
+ start_time = _parse_time(jobs[0]["created_at"])
+ needed_time = _parse_time(jobs[-3]["completed_at"]) - _parse_time(jobs[0]["created_at"])
+ new_start_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
+
+ for idx, job in enumerate(jobs):
+     if job["created_at"]:
+         job["created_at"] = (
+             new_start_time + datetime.timedelta(seconds=(_parse_time(job["created_at"]) - start_time) / 1e9)
+         ).strftime("%Y-%m-%dT%H:%M:%SZ")
+     if job["started_at"]:
+         job["started_at"] = (
+             new_start_time + datetime.timedelta(seconds=(_parse_time(job["started_at"]) - start_time) / 1e9)
+         ).strftime("%Y-%m-%dT%H:%M:%SZ")
+     if job["completed_at"]:
+         job["completed_at"] = (
+             new_start_time + datetime.timedelta(seconds=(_parse_time(job["completed_at"]) - start_time) / 1e9)
+         ).strftime("%Y-%m-%dT%H:%M:%SZ")
+     steps = []
+     for step in job["steps"]:
+         if step["started_at"]:
+             step["started_at"] = (
+                 new_start_time + datetime.timedelta(seconds=(_parse_time(step["started_at"]) - start_time) / 1e9)
+             ).strftime("%Y-%m-%dT%H:%M:%SZ")
+         if step["completed_at"]:
+             step["completed_at"] = (
+                 new_start_time + datetime.timedelta(seconds=(_parse_time(step["completed_at"]) - start_time) / 1e9)
+             ).strftime("%Y-%m-%dT%H:%M:%SZ")
+         steps.append(step)
+     job["steps"] = steps
+
+     jobs[idx] = job
+
+
+ with open("all_jobs.json", "w") as f:
+     json.dump(jobs, f)
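For reference, a small sketch (synthetic timestamps, reusing the script's own helper) showing that the rebasing in bump_time.py shifts every timestamp by the same delta, so the relative gaps between jobs and steps are preserved:

# Sanity-check sketch of the time rebasing; timestamps here are made up.
import datetime


def _parse_time(x: str) -> int:
    # Same helper as in bump_time.py: ISO-8601 "Z" timestamp -> nanoseconds.
    return int(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timestamp() * 1e9)


start_time = _parse_time("2025-01-01T00:00:00Z")
new_start_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)


def rebase(ts: str) -> str:
    # The same arithmetic bump_time.py applies to created_at/started_at/completed_at.
    return (
        new_start_time
        + datetime.timedelta(seconds=(_parse_time(ts) - start_time) / 1e9)
    ).strftime("%Y-%m-%dT%H:%M:%SZ")


a, b = "2025-01-01T00:05:00Z", "2025-01-01T00:20:00Z"
# The 15-minute gap between the two timestamps survives the rebase.
assert _parse_time(rebase(b)) - _parse_time(rebase(a)) == _parse_time(b) - _parse_time(a)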