dask-cuda 25.6.0__py3-none-any.whl → 25.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dask_cuda/GIT_COMMIT +1 -1
- dask_cuda/VERSION +1 -1
- dask_cuda/benchmarks/common.py +4 -1
- dask_cuda/benchmarks/local_cudf_groupby.py +4 -1
- dask_cuda/benchmarks/local_cudf_merge.py +5 -2
- dask_cuda/benchmarks/local_cudf_shuffle.py +5 -2
- dask_cuda/benchmarks/local_cupy.py +4 -1
- dask_cuda/benchmarks/local_cupy_map_overlap.py +4 -1
- dask_cuda/benchmarks/utils.py +7 -4
- dask_cuda/cli.py +21 -15
- dask_cuda/cuda_worker.py +27 -57
- dask_cuda/device_host_file.py +31 -15
- dask_cuda/disk_io.py +7 -4
- dask_cuda/explicit_comms/comms.py +11 -7
- dask_cuda/explicit_comms/dataframe/shuffle.py +23 -23
- dask_cuda/get_device_memory_objects.py +3 -3
- dask_cuda/initialize.py +80 -44
- dask_cuda/local_cuda_cluster.py +63 -66
- dask_cuda/plugins.py +17 -16
- dask_cuda/proxify_device_objects.py +12 -10
- dask_cuda/proxify_host_file.py +30 -27
- dask_cuda/proxy_object.py +20 -17
- dask_cuda/tests/conftest.py +41 -0
- dask_cuda/tests/test_dask_cuda_worker.py +109 -25
- dask_cuda/tests/test_dgx.py +10 -18
- dask_cuda/tests/test_explicit_comms.py +30 -12
- dask_cuda/tests/test_from_array.py +7 -5
- dask_cuda/tests/test_initialize.py +16 -37
- dask_cuda/tests/test_local_cuda_cluster.py +159 -52
- dask_cuda/tests/test_proxify_host_file.py +19 -3
- dask_cuda/tests/test_proxy.py +18 -16
- dask_cuda/tests/test_rdd_ucx.py +160 -0
- dask_cuda/tests/test_spill.py +7 -0
- dask_cuda/tests/test_utils.py +106 -20
- dask_cuda/tests/test_worker_spec.py +5 -2
- dask_cuda/utils.py +261 -38
- dask_cuda/utils_test.py +23 -7
- dask_cuda/worker_common.py +196 -0
- dask_cuda/worker_spec.py +12 -5
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/METADATA +2 -2
- dask_cuda-25.8.0.dist-info/RECORD +63 -0
- dask_cuda-25.8.0.dist-info/top_level.txt +6 -0
- shared-actions/check_nightly_success/check-nightly-success/check.py +148 -0
- shared-actions/telemetry-impls/summarize/bump_time.py +54 -0
- shared-actions/telemetry-impls/summarize/send_trace.py +409 -0
- dask_cuda-25.6.0.dist-info/RECORD +0 -57
- dask_cuda-25.6.0.dist-info/top_level.txt +0 -4
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/WHEEL +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/entry_points.txt +0 -0
- {dask_cuda-25.6.0.dist-info → dask_cuda-25.8.0.dist-info}/licenses/LICENSE +0 -0
dask_cuda/utils_test.py
CHANGED
|
@@ -1,14 +1,19 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
1
4
|
from typing import Literal
|
|
2
5
|
|
|
3
6
|
import distributed
|
|
4
7
|
from distributed import Nanny, Worker
|
|
5
8
|
|
|
9
|
+
from .utils import _get_active_ucx_implementation_name
|
|
10
|
+
|
|
6
11
|
|
|
7
12
|
class MockWorker(Worker):
|
|
8
13
|
"""Mock Worker class preventing NVML from getting used by SystemMonitor.
|
|
9
14
|
|
|
10
15
|
By preventing the Worker from initializing NVML in the SystemMonitor, we can
|
|
11
|
-
mock test multiple devices in
|
|
16
|
+
mock test multiple devices in ``CUDA_VISIBLE_DEVICES`` behavior with single-GPU
|
|
12
17
|
machines.
|
|
13
18
|
"""
|
|
14
19
|
|
|
@@ -26,20 +31,31 @@ class MockWorker(Worker):
|
|
|
26
31
|
|
|
27
32
|
|
|
28
33
|
class IncreasedCloseTimeoutNanny(Nanny):
|
|
29
|
-
"""Increase
|
|
34
|
+
"""Increase ``Nanny``'s close timeout.
|
|
30
35
|
|
|
31
|
-
The internal close timeout mechanism of
|
|
32
|
-
the
|
|
36
|
+
The internal close timeout mechanism of ``Nanny`` recomputes the time left to kill
|
|
37
|
+
the ``Worker`` process based on elapsed time of the close task, which may leave
|
|
33
38
|
very little time for the subprocess to shutdown cleanly, which may cause tests
|
|
34
39
|
to fail when the system is under higher load. This class increases the default
|
|
35
|
-
close timeout of 5.0 seconds that
|
|
40
|
+
close timeout of 5.0 seconds that ``Nanny`` sets by default, which can be overridden
|
|
36
41
|
via Distributed's public API.
|
|
37
42
|
|
|
38
|
-
This class can be used with the
|
|
39
|
-
|
|
43
|
+
This class can be used with the ``worker_class`` argument of ``LocalCluster`` or
|
|
44
|
+
``LocalCUDACluster`` to provide a much higher default of 30.0 seconds.
|
|
40
45
|
"""
|
|
41
46
|
|
|
42
47
|
async def close( # type:ignore[override]
|
|
43
48
|
self, timeout: float = 30.0, reason: str = "nanny-close"
|
|
44
49
|
) -> Literal["OK"]:
|
|
45
50
|
return await super().close(timeout=timeout, reason=reason)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_ucx_implementation(protocol):
    """Import and return the UCX Python module to use for ``protocol``.

    The protocol is first passed through ``_get_active_ucx_implementation_name``
    (presumably resolving aliases to the implementation that is active -- confirm
    against ``dask_cuda.utils``). If the result is ``"ucxx"`` the ``ucxx`` module
    is imported, otherwise the ``ucp`` module is imported.

    The import goes through ``pytest.importorskip``, so the calling test is
    skipped when the selected module is not installed.
    """
    # pytest is imported lazily so that importing this module does not require it.
    import pytest

    implementation = _get_active_ucx_implementation_name(protocol)
    module_name = "ucxx" if implementation == "ucxx" else "ucp"
    return pytest.importorskip(module_name)
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import warnings
|
|
6
|
+
|
|
7
|
+
from .device_host_file import DeviceHostFile
|
|
8
|
+
from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
|
|
9
|
+
from .proxify_host_file import ProxifyHostFile
|
|
10
|
+
from .utils import (
|
|
11
|
+
get_cpu_affinity,
|
|
12
|
+
has_device_memory_resource,
|
|
13
|
+
parse_device_memory_limit,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def worker_data_function(
    device_memory_limit=None,
    memory_limit=None,
    jit_unspill=False,
    enable_cudf_spill=False,
    shared_filesystem=None,
):
    """
    Create a data function for CUDA workers based on memory configuration.

    This function creates and returns a callable that generates data configuration
    for CUDA workers. The returned callable takes a device index parameter and
    returns the appropriate data configuration for that device.

    Parameters
    ----------
    device_memory_limit : str or int, optional
        Limit of device memory, defaults to None
    memory_limit : str or int, optional
        Limit of host memory, defaults to None
    jit_unspill : bool, optional
        Whether to enable JIT unspill functionality, defaults to False
    enable_cudf_spill : bool, optional
        Whether to enable cuDF spilling, defaults to False
    shared_filesystem : str or bool, optional
        Whether to use shared filesystem for spilling, defaults to None

    Returns
    -------
    callable
        A function that takes device index `device_index` and returns appropriate
        data configuration based on the availability of a dedicated device memory
        resource and arguments passed to the worker. The result is one of:

        - ``{}`` when spilling is disabled, or when the environment variable
          ``DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC`` is set to a non-zero integer;
        - ``None`` when the device has no dedicated memory resource and a limit
          was requested (host<->disk spilling is left to Dask's default
          mechanism);
        - a ``(ProxifyHostFile, kwargs)`` tuple when ``jit_unspill`` is enabled;
        - a ``(DeviceHostFile, kwargs)`` tuple otherwise.

    Raises
    ------
    ValueError
        Raised by the returned callable (not by this factory) when the device has
        no dedicated memory resource and ``jit_unspill``, ``enable_cudf_spill`` or
        an explicit ``device_memory_limit`` was requested.
    """

    def data(device_index):
        if int(os.environ.get("DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC", "0")) != 0:
            return {}

        # Query the device only once; the answer drives both the validation and
        # the selection of the spilling backend below.
        has_device_memory = has_device_memory_resource(device_index)

        # First raise errors for invalid configurations
        if not has_device_memory:
            if jit_unspill:
                raise ValueError(
                    "JIT-Unspill is not supported on devices without dedicated memory, "
                    "such as system on a chip (SoC) devices."
                )
            elif enable_cudf_spill:
                raise ValueError(
                    "cuDF spilling is not supported on devices without dedicated "
                    "memory, such as system on a chip (SoC) devices."
                )
            elif device_memory_limit not in [None, "default"]:
                raise ValueError(
                    "device_memory_limit is set but device has no dedicated memory."
                )

        if device_memory_limit is None and memory_limit is None:
            # All spilling is disabled
            return {}

        if not has_device_memory:
            # Devices without a dedicated memory resource only support default
            # host<->disk spilling via Dask's default mechanism.
            if device_memory_limit == "default" and memory_limit is None:
                return {}
            return None

        if jit_unspill:
            # JIT-Unspill is enabled
            if enable_cudf_spill:
                warnings.warn(
                    "Enabling cuDF spilling and JIT-Unspill together is not "
                    "safe, consider disabling JIT-Unspill."
                )

            return (
                ProxifyHostFile,
                {
                    "device_memory_limit": parse_device_memory_limit(
                        device_memory_limit, device_index=device_index
                    ),
                    "memory_limit": memory_limit,
                    "shared_filesystem": shared_filesystem,
                },
            )

        # Device has dedicated memory and host memory is limited
        return (
            DeviceHostFile,
            {
                "device_memory_limit": parse_device_memory_limit(
                    device_memory_limit, device_index=device_index
                ),
                "memory_limit": memory_limit,
            },
        )

    return data
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def worker_plugins(
    *,
    device_index,
    rmm_initial_pool_size,
    rmm_maximum_pool_size,
    rmm_managed_memory,
    rmm_async_alloc,
    rmm_release_threshold,
    rmm_log_directory,
    rmm_track_allocations,
    rmm_allocator_external_lib_list,
    pre_import,
    enable_cudf_spill,
    cudf_spill_stats,
):
    """Build the set of worker plugins for one CUDA worker.

    All arguments are keyword-only and are forwarded unchanged to the plugin
    constructors.

    Parameters
    ----------
    device_index : int
        CUDA device index, used to look up the CPU affinity via
        ``get_cpu_affinity``
    rmm_initial_pool_size : int or str
        Forwarded to ``RMMSetup`` as ``initial_pool_size``
    rmm_maximum_pool_size : int or str
        Forwarded to ``RMMSetup`` as ``maximum_pool_size``
    rmm_managed_memory : bool
        Forwarded to ``RMMSetup`` as ``managed_memory``
    rmm_async_alloc : bool
        Forwarded to ``RMMSetup`` as ``async_alloc``
    rmm_release_threshold : int
        Forwarded to ``RMMSetup`` as ``release_threshold``
    rmm_log_directory : str
        Forwarded to ``RMMSetup`` as ``log_directory``
    rmm_track_allocations : bool
        Forwarded to ``RMMSetup`` as ``track_allocations``
    rmm_allocator_external_lib_list : list
        Forwarded to ``RMMSetup`` as ``external_lib_list``
    pre_import : list
        Forwarded to ``PreImport``
    enable_cudf_spill : bool
        Forwarded to ``CUDFSetup`` as ``spill``
    cudf_spill_stats : bool
        Forwarded to ``CUDFSetup`` as ``spill_stats``

    Returns
    -------
    set
        ``{CPUAffinity, RMMSetup, PreImport, CUDFSetup}`` plugin instances. When
        the environment variable ``DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC`` is
        set to a non-zero integer, only ``PreImport`` and ``CUDFSetup`` are
        included.
    """
    device_specific_disabled = (
        int(os.environ.get("DASK_CUDA_TEST_DISABLE_DEVICE_SPECIFIC", "0")) != 0
    )

    plugins = set()
    if not device_specific_disabled:
        # These two plugins depend on the concrete device and are therefore
        # left out when device-specific setup is disabled.
        affinity = get_cpu_affinity(device_index)
        plugins.add(CPUAffinity(affinity))
        plugins.add(
            RMMSetup(
                initial_pool_size=rmm_initial_pool_size,
                maximum_pool_size=rmm_maximum_pool_size,
                managed_memory=rmm_managed_memory,
                async_alloc=rmm_async_alloc,
                release_threshold=rmm_release_threshold,
                log_directory=rmm_log_directory,
                track_allocations=rmm_track_allocations,
                external_lib_list=rmm_allocator_external_lib_list,
            )
        )

    plugins.add(PreImport(pre_import))
    plugins.add(CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats))
    return plugins
|
dask_cuda/worker_spec.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION & AFFILIATES.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
1
4
|
import os
|
|
2
5
|
|
|
3
6
|
from dask.distributed import Nanny
|
|
@@ -6,7 +9,7 @@ from distributed.system import MEMORY_LIMIT
|
|
|
6
9
|
from .initialize import initialize
|
|
7
10
|
from .local_cuda_cluster import cuda_visible_devices
|
|
8
11
|
from .plugins import CPUAffinity
|
|
9
|
-
from .utils import get_cpu_affinity, get_gpu_count
|
|
12
|
+
from .utils import _get_active_ucx_implementation_name, get_cpu_affinity, get_gpu_count
|
|
10
13
|
|
|
11
14
|
|
|
12
15
|
def worker_spec(
|
|
@@ -81,10 +84,14 @@ def worker_spec(
|
|
|
81
84
|
'preload_argv': ['--create-cuda-context']}}}
|
|
82
85
|
|
|
83
86
|
"""
|
|
84
|
-
if
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
87
|
+
if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
|
|
88
|
+
try:
|
|
89
|
+
_get_active_ucx_implementation_name(protocol)
|
|
90
|
+
except ValueError:
|
|
91
|
+
raise TypeError(
|
|
92
|
+
"Enabling InfiniBand or NVLink requires protocol='ucx', "
|
|
93
|
+
"protocol='ucxx' or protocol='ucx-old'"
|
|
94
|
+
) from None
|
|
88
95
|
|
|
89
96
|
if CUDA_VISIBLE_DEVICES is None:
|
|
90
97
|
CUDA_VISIBLE_DEVICES = os.environ.get(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dask-cuda
|
|
3
|
-
Version: 25.
|
|
3
|
+
Version: 25.8.0
|
|
4
4
|
Summary: Utilities for Dask and CUDA interactions
|
|
5
5
|
Author: NVIDIA Corporation
|
|
6
6
|
License: Apache-2.0
|
|
@@ -24,7 +24,7 @@ Requires-Dist: numba<0.62.0a0,>=0.59.1
|
|
|
24
24
|
Requires-Dist: numpy<3.0a0,>=1.23
|
|
25
25
|
Requires-Dist: pandas>=1.3
|
|
26
26
|
Requires-Dist: pynvml<13.0.0a0,>=12.0.0
|
|
27
|
-
Requires-Dist: rapids-dask-dependency==25.
|
|
27
|
+
Requires-Dist: rapids-dask-dependency==25.8.*
|
|
28
28
|
Requires-Dist: zict>=2.0.0
|
|
29
29
|
Provides-Extra: docs
|
|
30
30
|
Requires-Dist: numpydoc>=1.1.0; extra == "docs"
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
dask_cuda/GIT_COMMIT,sha256=RAxzl5uQmWcKe5o7s9iHwueNfKc91R_-kniXtpRqmng,41
|
|
2
|
+
dask_cuda/VERSION,sha256=mZz9G1Ul4kEOksaMu07UE-AVtGzT1t91nQu3CK9KUAk,8
|
|
3
|
+
dask_cuda/__init__.py,sha256=Wbc7R0voN4vsQkb7SKuVXH0YXuXtfnAxrupxfM4lT10,1933
|
|
4
|
+
dask_cuda/_compat.py,sha256=AG2lKGAtZitDPBjHeFDKLTN_B5HKodrhZ2kHlk1Z-D0,498
|
|
5
|
+
dask_cuda/_version.py,sha256=cHDO9AzNtxkCVhwYu7hL3H7RPAkQnxpKBjElOst3rkI,964
|
|
6
|
+
dask_cuda/cli.py,sha256=VRYuryhViVWkCH7H7fDDTMToSOC17nAUMIPbd3K2jRs,17490
|
|
7
|
+
dask_cuda/cuda_worker.py,sha256=7ZLZ3AY3l1fLumx2XynUOej5Sx6bwZQRomT4Nj9pbyA,8590
|
|
8
|
+
dask_cuda/device_host_file.py,sha256=wTiTyYthUrR8l2WM7iV5lvjQDzeYxnnK7GUlPsHI6p4,11042
|
|
9
|
+
dask_cuda/disk_io.py,sha256=IpD2hA-AjTw3SEu3w9pT9ELAUpKW0XcphPh8WwlKH70,6757
|
|
10
|
+
dask_cuda/get_device_memory_objects.py,sha256=Nk0f5kv2f0e6JSd5hrwenAOARgF4NS21Zv_x3zF3ONI,4577
|
|
11
|
+
dask_cuda/initialize.py,sha256=JBgXNzt52OzcusC89k0eVn2ivbCoCzZxOPNvQgj3YfE,7385
|
|
12
|
+
dask_cuda/is_device_object.py,sha256=x9klFdeQzLcug7wZMxN3GK2AS121tlDe-LQ2uznm5yo,1179
|
|
13
|
+
dask_cuda/is_spillable_object.py,sha256=8gj6QgtKcmzrpQwy8rE-pS1R8tjaJOeD-Fzr6LumjJg,1596
|
|
14
|
+
dask_cuda/local_cuda_cluster.py,sha256=pocJgHbs8h2z_hfChU2_s7kwYKuYTgFZtmrEgYHjWwc,20735
|
|
15
|
+
dask_cuda/plugins.py,sha256=u4gWQy8DgaTAsd59KCNXLG6zmdmGg5qhaI8ha2rMEFs,7085
|
|
16
|
+
dask_cuda/proxify_device_objects.py,sha256=hGHK2gBkuaMYyrZqcitRGUiH_up56R1hsC7b_6YcCBU,8325
|
|
17
|
+
dask_cuda/proxify_host_file.py,sha256=TbdszPvAgV4CRPMz99tumnCWwiCTqMBZqHQua3RRpHE,31031
|
|
18
|
+
dask_cuda/proxy_object.py,sha256=eII-S0vdpFa6NLebpHFVQuJ2eyXJ5cBg8sucgFEa91g,30307
|
|
19
|
+
dask_cuda/utils.py,sha256=smUv6DGJlHgXjfASVJMlRL1NgECiwwQ2zgn_FCWxMQ4,33454
|
|
20
|
+
dask_cuda/utils_test.py,sha256=CKRMB5KUAg-7VSf21AU1TQoqGpJXm9ftgRV7mGIGQ3s,2132
|
|
21
|
+
dask_cuda/worker_common.py,sha256=uXoYZ1IZanAbHddfpL3NbVDV5WqCF0m94nOiqqvEnxc,7168
|
|
22
|
+
dask_cuda/worker_spec.py,sha256=cI4vS08gyrIU3PKJIjjWZNXChUm5Pv9LwaShPqYYMUQ,4698
|
|
23
|
+
dask_cuda/benchmarks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
+
dask_cuda/benchmarks/common.py,sha256=4AHgWkomo1RsPwH6eogz4vbE9vg_Dg-krScn9B1BRuw,7057
|
|
25
|
+
dask_cuda/benchmarks/local_cudf_groupby.py,sha256=ehKOJqnHgQoEyNWuebIWdJP8f_bRiwBd4pax3hkQP_8,8646
|
|
26
|
+
dask_cuda/benchmarks/local_cudf_merge.py,sha256=D2yXcEj1dLvfdd-ugV6kLgcUoiHzLmN3jNNs_BjaWME,12341
|
|
27
|
+
dask_cuda/benchmarks/local_cudf_shuffle.py,sha256=MxUHzif_7TFPj96yE-Y3-U_NMXD6VJLNB1fB39LXgrk,8386
|
|
28
|
+
dask_cuda/benchmarks/local_cupy.py,sha256=1riE9_hVkDxfSP2pWU0h2VF9HTXhF3C2ArKb19KMnVo,10457
|
|
29
|
+
dask_cuda/benchmarks/local_cupy_map_overlap.py,sha256=27OtGOhmHIQhHSAmyOm0GgUX8u10npmwBo_q5fLAqUM,6151
|
|
30
|
+
dask_cuda/benchmarks/read_parquet.py,sha256=spKu6RLWYngPZq9hnaoU0mz7INIaJnErfqjBG2wH8Zc,7614
|
|
31
|
+
dask_cuda/benchmarks/utils.py,sha256=pl0VG8BQnIrVvpdVQtP8ybX1SOcv1E01jZzhkfOIPw4,30235
|
|
32
|
+
dask_cuda/explicit_comms/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
dask_cuda/explicit_comms/comms.py,sha256=FbSDPQUz0gwQNa8EQMNEGEwWCM2KhuDk4CiNNjZy6qo,11611
|
|
34
|
+
dask_cuda/explicit_comms/dataframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
|
+
dask_cuda/explicit_comms/dataframe/shuffle.py,sha256=vq-NBPYtKvKcfo7-eikPYaeWDf-V_DTHR5PVE_E-dyE,23863
|
|
36
|
+
dask_cuda/tests/conftest.py,sha256=NjTnsgu7UewTpoTakt-2bpe0rtarwni25anjZPWbbb0,1501
|
|
37
|
+
dask_cuda/tests/test_cudf_builtin_spilling.py,sha256=qVN9J0Hdv66A9COFArLIdRriyyxEKpS3lEZGHbVHaq8,4903
|
|
38
|
+
dask_cuda/tests/test_dask_cuda_worker.py,sha256=iOkYAGSYfql7Wxxnb1hZJ7oedQyLmweXAHpQhckaTgY,23232
|
|
39
|
+
dask_cuda/tests/test_device_host_file.py,sha256=79ssUISo1YhsW_7HdwqPfsH2LRzS2bi5BjPym1Sdgqw,5882
|
|
40
|
+
dask_cuda/tests/test_dgx.py,sha256=Y7OgFlrq_BzoF4L8qoA5byKX1kDA1foaFzT-gEfhrsI,7294
|
|
41
|
+
dask_cuda/tests/test_explicit_comms.py,sha256=ULrVhOcVavEWyZBxVSXDOFT_nznF8mRrU1M3fq_DUjo,20622
|
|
42
|
+
dask_cuda/tests/test_from_array.py,sha256=-mXVwrw3MjB32vFfdUvxiwojqu9mEYNyjZjGPg1h1sE,733
|
|
43
|
+
dask_cuda/tests/test_gds.py,sha256=j1Huud6UGm1fbkyRLQEz_ysrVw__5AimwSn_M-2GEvs,1513
|
|
44
|
+
dask_cuda/tests/test_initialize.py,sha256=2Fm84sAN0A9Kxhm7PiWgrwppNCaJLA2yj65Y3LPOrNw,7542
|
|
45
|
+
dask_cuda/tests/test_local_cuda_cluster.py,sha256=wYhY4OzQcYDtevL2peT6k5bAmZs2MDpQMrwPs2To7AY,23083
|
|
46
|
+
dask_cuda/tests/test_proxify_host_file.py,sha256=6YPQia-V5tcVOy2ZNwL9ZpD6qKxKCcaNZZK-8f4aqpM,19996
|
|
47
|
+
dask_cuda/tests/test_proxy.py,sha256=3uREA-d4y0lmGHKGJ5z9OiAYTBie1Y3bpB8djQRrluE,23996
|
|
48
|
+
dask_cuda/tests/test_rdd_ucx.py,sha256=nMbEm7kxAATHfVyZOkfpexyk-ZKtCbLzvPHdspK877M,5433
|
|
49
|
+
dask_cuda/tests/test_spill.py,sha256=QmBdFBvmZsDbOo0IAeUAJQfMyx-BBxokosch4gZHa00,15704
|
|
50
|
+
dask_cuda/tests/test_utils.py,sha256=bTpiNQyKTxLHzHTLn-G0SWVgetq_tqUGq7rbafvdwgg,12297
|
|
51
|
+
dask_cuda/tests/test_version.py,sha256=vK2HjlRLX0nxwvRsYxBqhoZryBNZklzA-vdnyuWDxVg,365
|
|
52
|
+
dask_cuda/tests/test_worker_spec.py,sha256=dlZ4OIOl7CVm4euMtvCUif1QetCWcntSRdwzAmkH6ec,2550
|
|
53
|
+
dask_cuda-25.8.0.dist-info/licenses/LICENSE,sha256=MjI3I-EgxfEvZlgjk82rgiFsZqSDXHFETd2QJ89UwDA,11348
|
|
54
|
+
examples/ucx/client_initialize.py,sha256=YN3AXHF8btcMd6NicKKhKR9SXouAsK1foJhFspbOn70,1262
|
|
55
|
+
examples/ucx/local_cuda_cluster.py,sha256=7xVY3EhwhkY2L4VZin_BiMCbrjhirDNChoC86KiETNc,1983
|
|
56
|
+
shared-actions/check_nightly_success/check-nightly-success/check.py,sha256=lBhwgJALfUXdk4B9IhYf1AV0OUu0dQol3GtUe-CnUgY,5379
|
|
57
|
+
shared-actions/telemetry-impls/summarize/bump_time.py,sha256=FXcGDqjCabsL6vlqd5RIV2PCWi2ns_ju8Ul0ERM21cA,2033
|
|
58
|
+
shared-actions/telemetry-impls/summarize/send_trace.py,sha256=i30O_cKZ1OtXIbqaTE3R1JLJSw2XoYdmoVpM2Nc9wj8,16555
|
|
59
|
+
dask_cuda-25.8.0.dist-info/METADATA,sha256=euY_vTYPba5yD4W8FtKDhCAjF4qs0ZU6-z5cs2lxYQg,2345
|
|
60
|
+
dask_cuda-25.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
61
|
+
dask_cuda-25.8.0.dist-info/entry_points.txt,sha256=UcRaKVEpywtxc6pF1VnfMB0UK4sJg7a8_NdZF67laPM,136
|
|
62
|
+
dask_cuda-25.8.0.dist-info/top_level.txt,sha256=SaQxjNeXM9ZhEBjU6CY25OIOt2UojSREGiVxg9UIc08,63
|
|
63
|
+
dask_cuda-25.8.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
|
|
2
|
+
|
|
3
|
+
"""Check whether a GHA workflow has run successfully in the last N days."""
|
|
4
|
+
# ruff: noqa: INP001
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import sys
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
|
|
15
|
+
# Constants
|
|
16
|
+
GITHUB_TOKEN = os.environ["RAPIDS_GH_TOKEN"]
|
|
17
|
+
GOOD_STATUSES = {"success"}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def main(
    repo: str,
    repo_owner: str,
    workflow_id: str,
    max_days_without_success: int,
    num_attempts: int = 5,
) -> int:
    """Check whether a GHA workflow has run successfully in the last N days.

    The workflow runs are fetched from the GitHub REST API (retrying up to
    ``num_attempts`` times), grouped by ``head_branch``, and only release
    branches named like ``branch-25.02`` or ``branch-0.42`` are considered. The
    decision is made for the release branch with the highest version.

    Returns a Unix-style exit code, so success/failure is inverted from the
    usual Python boolean meaning:

    - ``0`` if the newest release branch has a successful run within the last
      ``max_days_without_success`` days, or if the oldest fetched run on that
      branch is not older than ``max_days_without_success`` days (the workflow
      has not been running long enough to be judged);
    - ``1`` otherwise, including when no runs or no release-branch runs were
      returned at all.

    Raises ``RuntimeError`` if every fetch attempt fails.
    """
    headers = {"Authorization": f"token {GITHUB_TOKEN}"}
    url = f"https://api.github.com/repos/{repo_owner}/{repo}/actions/workflows/{workflow_id}/runs"
    exceptions = []
    for _ in range(num_attempts):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.RequestException as e:
            exceptions.append(e)
    else:
        # `sep` is kept in a variable because a backslash is not allowed inside
        # an f-string expression before Python 3.12. The exceptions must be
        # converted to `str` before joining.
        sep = "\n\t"
        msg = (
            f"Failed to fetch {url} after {num_attempts} attempts with the following "
            f"errors: {sep}{sep.join(str(e) for e in exceptions)}"
        )
        raise RuntimeError(msg)

    runs = response.json()["workflow_runs"]
    if not runs:
        # Without any run there is nothing to derive a timezone or a verdict
        # from; report a failure instead of crashing on `runs[0]`.
        print(  # noqa: T201
            f"No runs of the {workflow_id} workflow were returned for {repo_owner}/{repo}"
        )
        return 1

    tz = datetime.fromisoformat(runs[0]["run_started_at"]).tzinfo
    now = datetime.now(tz=tz)

    # Rather frustratingly, the workflow runs returned from the GitHub API can
    # have alternating ordering of `head_branch`
    # e.g.
    #   run[0]['head_branch'] == "branch-25.02"
    #   run[1]['head_branch'] == "branch-25.04"
    #   run[2]['head_branch'] == "branch-25.02"
    #
    # Grouping only _consecutive_ runs (as `itertools.groupby` does) would let a
    # later group of the same branch overwrite an earlier success, so the runs
    # are collected per branch first, like a SQL groupby.
    branch_dict = defaultdict(list)
    for run in runs:
        branch_dict[run["head_branch"]].append(run)

    # Only consider RAPIDS release branches, which have versions like
    # '25.02' (RAPIDS) or '0.42' (ucxx, ucx-py).
    release_branch = re.compile(r"branch-([0-9]{1,2})\.([0-9]{2})")

    branch_versions = {}
    latest_success = {}
    workflow_active_for_max_days = {}
    for branch, branch_runs in branch_dict.items():
        # `head_branch` may be null in the API response; treat it as a
        # non-release branch rather than passing `None` to the regex.
        matched = release_branch.match(branch) if isinstance(branch, str) else None
        if matched is None:
            continue
        branch_versions[branch] = (int(matched.group(1)), int(matched.group(2)))

        newest_first = sorted(
            branch_runs, key=lambda r: r["run_started_at"], reverse=True
        )

        latest_success[branch] = None
        for run in newest_first:
            days_since_run = (now - datetime.fromisoformat(run["run_started_at"])).days
            if days_since_run > max_days_without_success:
                # Runs are sorted newest first, so all remaining ones are too old.
                break
            if run["conclusion"] in GOOD_STATUSES:
                latest_success[branch] = run
                break

        # The workflow counts as "active for max days" when its oldest fetched
        # run on this branch is older than the allowed window.
        oldest_run = newest_first[-1]
        days_since_oldest = (
            now - datetime.fromisoformat(oldest_run["run_started_at"])
        ).days
        workflow_active_for_max_days[branch] = (
            days_since_oldest > max_days_without_success
        )

    if not latest_success:
        print(  # noqa: T201
            f"No runs of the {workflow_id} workflow were found on a release branch"
        )
        return 1

    # Compare versions numerically; the branch name is only a tie-breaker.
    latest_branch = max(latest_success, key=lambda b: (branch_versions[b], b))
    has_latest_success = latest_success[latest_branch] is not None

    # We are producing Unix return codes so success/failure is inverted from the
    # expected Python boolean values.
    if has_latest_success:
        print(  # noqa: T201
            f"The most recent successful run of the {workflow_id} workflow on "
            f"{latest_branch} was "
            f"{datetime.fromisoformat(latest_success[latest_branch]['run_started_at'])}, "
            f"which is within the last {max_days_without_success} days. View logs:"
            f"\n  - {latest_success[latest_branch]['html_url']}"
        )
        return 0
    elif not workflow_active_for_max_days[latest_branch]:
        print(  # noqa: T201
            f"The oldest run of the {workflow_id} workflow on {latest_branch} was less "
            f"than {max_days_without_success} days ago. This exempts the workflow from "
            "check-nightly-success because the workflow has not been running for very long."
        )
        return 0

    print(  # noqa: T201
        f"{latest_branch} has no successful runs of {workflow_id} in the last {max_days_without_success} days"
    )
    return 1
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check and use its
    # result as the process exit status.
    cli = argparse.ArgumentParser()
    cli.add_argument("repo", type=str, help="Repository name")
    cli.add_argument(
        "--repo-owner", default="rapidsai", help="Repository organization/owner"
    )
    cli.add_argument("--workflow-id", default="test.yaml", help="Workflow ID")
    cli.add_argument(
        "--max-days-without-success",
        type=int,
        default=7,
        help="Maximum number of days without a successful run",
    )
    options = cli.parse_args()

    exit_status = main(
        options.repo,
        options.repo_owner,
        options.workflow_id,
        options.max_days_without_success,
    )
    sys.exit(exit_status)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
|
|
3
|
+
|
|
4
|
+
# This script is meant to act on an 'all_jobs.json' file that comes from
|
|
5
|
+
# the summarize job when debug info is enabled. Bumping the time makes
|
|
6
|
+
# it easier to re-run the span-sending python script and check results
|
|
7
|
+
# in either Jaeger or Grafana
|
|
8
|
+
|
|
9
|
+
import datetime
|
|
10
|
+
import json
|
|
11
|
+
|
|
12
|
+
# Load the job list produced by the summarize job. JSON interchange files are
# UTF-8, so do not rely on the locale's default encoding.
with open("all_jobs.json", encoding="utf-8") as f:
    jobs = json.load(f)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_time(x: str) -> int:
    """Convert a ``%Y-%m-%dT%H:%M:%SZ`` timestamp to nanoseconds since the Unix epoch.

    The trailing ``Z`` marks the value as UTC, so the parsed datetime is tagged
    as UTC before conversion. A naive datetime would be interpreted in the
    machine's local timezone by ``timestamp()``, which makes the result
    timezone-dependent and inconsistent across daylight-saving changes.
    """
    parsed = datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").replace(
        tzinfo=datetime.timezone.utc
    )
    # The format has whole-second resolution, so integer arithmetic is exact.
    return int(parsed.timestamp()) * 1_000_000_000
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Timestamp of the first job; every other timestamp is shifted relative to it.
start_time = _parse_time(jobs[0]["created_at"])
# Re-anchor the whole run so that it starts one hour before the current UTC time.
new_start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=60)


def _bump(timestamp: str) -> str:
    """Shift one timestamp so that ``start_time`` maps onto ``new_start_time``."""
    offset = datetime.timedelta(seconds=(_parse_time(timestamp) - start_time) / 1e9)
    return (new_start_time + offset).strftime("%Y-%m-%dT%H:%M:%SZ")


# Jobs and steps are updated in place; empty/null timestamps (e.g. for jobs
# that never started) are left untouched.
for job in jobs:
    for key in ("created_at", "started_at", "completed_at"):
        if job[key]:
            job[key] = _bump(job[key])
    for step in job["steps"]:
        for key in ("started_at", "completed_at"):
            if step[key]:
                step[key] = _bump(step[key])


with open("all_jobs.json", "w", encoding="utf-8") as f:
    json.dump(jobs, f)
|