nvidia-nccl-cu13 2.28.3__py3-none-manylinux_2_18_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nvidia/nccl/include/nccl.h +571 -0
- nvidia/nccl/include/nccl_device/comm.h +10 -0
- nvidia/nccl/include/nccl_device/coop.h +152 -0
- nvidia/nccl/include/nccl_device/core.h +150 -0
- nvidia/nccl/include/nccl_device/impl/comm__funcs.h +10 -0
- nvidia/nccl/include/nccl_device/impl/comm__types.h +40 -0
- nvidia/nccl/include/nccl_device/impl/core__funcs.h +210 -0
- nvidia/nccl/include/nccl_device/impl/core__types.h +26 -0
- nvidia/nccl/include/nccl_device/impl/ll_a2a__funcs.h +229 -0
- nvidia/nccl/include/nccl_device/impl/ll_a2a__types.h +37 -0
- nvidia/nccl/include/nccl_device/impl/mem_barrier__funcs.h +126 -0
- nvidia/nccl/include/nccl_device/impl/mem_barrier__types.h +46 -0
- nvidia/nccl/include/nccl_device/impl/ptr__funcs.h +157 -0
- nvidia/nccl/include/nccl_device/impl/ptr__types.h +11 -0
- nvidia/nccl/include/nccl_device/ll_a2a.h +53 -0
- nvidia/nccl/include/nccl_device/mem_barrier.h +35 -0
- nvidia/nccl/include/nccl_device/ptr.h +61 -0
- nvidia/nccl/include/nccl_device/utility.h +352 -0
- nvidia/nccl/include/nccl_device.h +15 -0
- nvidia/nccl/lib/libnccl.so.2 +0 -0
- nvidia_nccl_cu13-2.28.3.dist-info/METADATA +45 -0
- nvidia_nccl_cu13-2.28.3.dist-info/RECORD +25 -0
- nvidia_nccl_cu13-2.28.3.dist-info/WHEEL +5 -0
- nvidia_nccl_cu13-2.28.3.dist-info/licenses/License.txt +39 -0
- nvidia_nccl_cu13-2.28.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/*************************************************************************
|
|
2
|
+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* See LICENSE.txt for license information
|
|
5
|
+
************************************************************************/
|
|
6
|
+
|
|
7
|
+
#ifndef _NCCL_DEVICE_PTR_H_
|
|
8
|
+
#define _NCCL_DEVICE_PTR_H_
|
|
9
|
+
#include "core.h"
|
|
10
|
+
#include <stdint.h>
|
|
11
|
+
|
|
12
|
+
#if __cplusplus
|
|
13
|
+
template<typename T>
|
|
14
|
+
struct ncclSymPtr {
|
|
15
|
+
using ElementType = T;
|
|
16
|
+
ncclWindow_t window;
|
|
17
|
+
size_t offset;
|
|
18
|
+
|
|
19
|
+
NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr(ncclWindow_t window=nullptr, size_t offset=0);
|
|
20
|
+
|
|
21
|
+
template<typename U>
|
|
22
|
+
NCCL_HOST_DEVICE_INLINE operator ncclSymPtr<U>() const;
|
|
23
|
+
|
|
24
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(int d);
|
|
25
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned int d);
|
|
26
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(long d);
|
|
27
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned long d);
|
|
28
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(long long d);
|
|
29
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned long long d);
|
|
30
|
+
|
|
31
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(int d);
|
|
32
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned int d);
|
|
33
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(long d);
|
|
34
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned long d);
|
|
35
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(long long d);
|
|
36
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned long long d);
|
|
37
|
+
|
|
38
|
+
#if __CUDACC__
|
|
39
|
+
NCCL_DEVICE_INLINE T* localPtr() const;
|
|
40
|
+
NCCL_DEVICE_INLINE T* lsaPtr(int peer) const;
|
|
41
|
+
NCCL_DEVICE_INLINE T* peerPtr(int peer) const;
|
|
42
|
+
NCCL_DEVICE_INLINE T* peerPtr(ncclTeam team, int peer) const;
|
|
43
|
+
NCCL_DEVICE_INLINE T* multimemPtr(ncclMultimemHandle mmHandle) const;
|
|
44
|
+
NCCL_DEVICE_INLINE T* lsaMultimemPtr(ncclDevComm const&) const;
|
|
45
|
+
#endif
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
template<typename T, typename Int>
|
|
49
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator+(ncclSymPtr<T> p, Int d);
|
|
50
|
+
template<typename T, typename Int>
|
|
51
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator-(ncclSymPtr<T> p, Int d);
|
|
52
|
+
template<typename T>
|
|
53
|
+
NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr<T> a, ncclSymPtr<T> b);
|
|
54
|
+
|
|
55
|
+
template<typename T, typename Int>
|
|
56
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator==(ncclSymPtr<T> a, ncclSymPtr<T> b);
|
|
57
|
+
template<typename T, typename Int>
|
|
58
|
+
NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator!=(ncclSymPtr<T> a, ncclSymPtr<T> b);
|
|
59
|
+
#endif
|
|
60
|
+
|
|
61
|
+
#endif
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
/*************************************************************************
|
|
2
|
+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* See LICENSE.txt for license information
|
|
5
|
+
************************************************************************/
|
|
6
|
+
|
|
7
|
+
#ifndef _NCCL_DEVICE_UTILITY_H_
|
|
8
|
+
#define _NCCL_DEVICE_UTILITY_H_
|
|
9
|
+
|
|
10
|
+
#if __CUDACC__
|
|
11
|
+
#define NCCL_DEVICE_INLINE __device__ __forceinline__
|
|
12
|
+
#define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
|
|
13
|
+
#else
|
|
14
|
+
#ifndef __host__
|
|
15
|
+
#define __host__
|
|
16
|
+
#endif
|
|
17
|
+
#define NCCL_DEVICE_INLINE
|
|
18
|
+
#define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline))
|
|
19
|
+
#endif
|
|
20
|
+
|
|
21
|
+
#if __cplusplus
|
|
22
|
+
#define NCCL_EXTERN_C extern "C"
|
|
23
|
+
#else
|
|
24
|
+
#define NCCL_EXTERN_C
|
|
25
|
+
#endif
|
|
26
|
+
|
|
27
|
+
#include <stdint.h>
|
|
28
|
+
#include <stdbool.h>
|
|
29
|
+
|
|
30
|
+
#if __CUDACC__
|
|
31
|
+
#include <cuda/atomic>
|
|
32
|
+
#endif
|
|
33
|
+
|
|
34
|
+
#if __cplusplus
|
|
35
|
+
namespace nccl {
|
|
36
|
+
namespace utility {
|
|
37
|
+
|
|
38
|
+
// Produces an expression of type T&& for use in unevaluated contexts such as
// decltype(...). It must never actually be called.
template<class T>
T&& declval() noexcept {
  // The condition depends on T, so it is only checked (and fails) if this body
  // is instantiated, i.e. if somebody really evaluates declval().
  static_assert(!(sizeof(T) == sizeof(T)), "You can't evaluate declval.");
}
|
|
42
|
+
|
|
43
|
+
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
|
44
|
+
NCCL_HOST_DEVICE_INLINE constexpr Z divUp(X x, Y y) {
|
|
45
|
+
return (x+y-1)/y;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
|
49
|
+
NCCL_HOST_DEVICE_INLINE constexpr Z roundUp(X x, Y y) {
|
|
50
|
+
return (x+y-1) - (x+y-1)%y;
|
|
51
|
+
}
|
|
52
|
+
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
|
53
|
+
NCCL_HOST_DEVICE_INLINE constexpr Z roundDown(X x, Y y) {
|
|
54
|
+
return x - x%y;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// assumes second argument is a power of 2
|
|
58
|
+
template<typename X, typename Y, typename Z = decltype(X()+Y())>
|
|
59
|
+
NCCL_HOST_DEVICE_INLINE constexpr Z alignUp(X x, Y a) {
|
|
60
|
+
return (x + a-1) & -Z(a);
|
|
61
|
+
}
|
|
62
|
+
template<typename T>
|
|
63
|
+
NCCL_HOST_DEVICE_INLINE T* alignUp(T* x, size_t a) {
|
|
64
|
+
static_assert(sizeof(T) == 1, "Only single byte types allowed.");
|
|
65
|
+
return reinterpret_cast<T*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a));
|
|
66
|
+
}
|
|
67
|
+
template<typename T>
|
|
68
|
+
NCCL_HOST_DEVICE_INLINE void* alignUp(void const* x, size_t a) {
|
|
69
|
+
return reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a));
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// assumes second argument is a power of 2
|
|
73
|
+
template<typename X, typename Y, typename Z = decltype(X()+int())>
|
|
74
|
+
NCCL_HOST_DEVICE_INLINE constexpr Z alignDown(X x, Y a) {
|
|
75
|
+
return x & -Z(a);
|
|
76
|
+
}
|
|
77
|
+
template<typename T>
|
|
78
|
+
NCCL_HOST_DEVICE_INLINE T* alignDown(T* x, size_t a) {
|
|
79
|
+
static_assert(sizeof(T) == 1, "Only single byte types allowed.");
|
|
80
|
+
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a));
|
|
81
|
+
}
|
|
82
|
+
template<typename T>
|
|
83
|
+
NCCL_HOST_DEVICE_INLINE void* alignDown(void const* x, size_t a) {
|
|
84
|
+
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a));
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
template<typename T>
|
|
88
|
+
NCCL_HOST_DEVICE_INLINE T add4G(T base, int delta4G) {
|
|
89
|
+
union { uint32_t u32[2]; T tmp; };
|
|
90
|
+
tmp = base;
|
|
91
|
+
u32[1] += delta4G;
|
|
92
|
+
return tmp;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
template<typename Int>
|
|
97
|
+
NCCL_HOST_DEVICE_INLINE constexpr bool isPow2(Int x) {
|
|
98
|
+
return (x & (x-1)) == 0;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Produce the reciprocal of x for use in idivByRcp
|
|
102
|
+
NCCL_HOST_DEVICE_INLINE constexpr uint32_t idivRcp32(uint32_t x) {
|
|
103
|
+
return uint32_t(-1)/x + isPow2(x);
|
|
104
|
+
}
|
|
105
|
+
// 64-bit counterpart of idivRcp32: floor((2^64-1)/x), plus one when x is a
// power of 2. For x == 1 the sum wraps to 0. x must not be 0.
NCCL_HOST_DEVICE_INLINE constexpr uint64_t idivRcp64(uint64_t x) {
  return ~uint64_t(0)/x + (isPow2(x) ? 1 : 0);
}
|
|
108
|
+
|
|
109
|
+
// Upper 32 bits of the 64-bit product a*b.
// Device code uses the __umulhi intrinsic; host code widens to 64 bits.
NCCL_HOST_DEVICE_INLINE uint32_t mul32hi(uint32_t a, uint32_t b) {
#if __CUDA_ARCH__
  return __umulhi(a, b);
#else
  const uint64_t wide = static_cast<uint64_t>(a) * b;
  return static_cast<uint32_t>(wide >> 32);
#endif
}
|
|
116
|
+
// Upper 64 bits of the 128-bit product a*b.
// Device code uses the __umul64hi intrinsic; host code relies on the compiler's
// unsigned __int128 extension.
NCCL_HOST_DEVICE_INLINE uint64_t mul64hi(uint64_t a, uint64_t b) {
#if __CUDA_ARCH__
  return __umul64hi(a, b);
#else
  const unsigned __int128 wide = static_cast<unsigned __int128>(a) * b;
  return static_cast<uint64_t>(wide >> 64);
#endif
}
|
|
123
|
+
|
|
124
|
+
// Produce the reciprocal of x*y given their respective reciprocals. This incurs
|
|
125
|
+
// no integer division on device.
|
|
126
|
+
NCCL_HOST_DEVICE_INLINE uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
|
|
127
|
+
if (xrcp == 0) return yrcp;
|
|
128
|
+
if (yrcp == 0) return xrcp;
|
|
129
|
+
uint32_t rcp = mul32hi(xrcp, yrcp);
|
|
130
|
+
uint32_t rem = -x*y*rcp;
|
|
131
|
+
if (x*y <= rem) rcp += 1;
|
|
132
|
+
return rcp;
|
|
133
|
+
}
|
|
134
|
+
// 64-bit counterpart of imulRcp32: reciprocal of x*y from the reciprocals of x
// and y, without an integer division. A reciprocal of 0 stands for a divisor
// of 1, so the other operand's reciprocal is returned unchanged.
NCCL_HOST_DEVICE_INLINE uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
  if (xrcp == 0) return yrcp;
  if (yrcp == 0) return xrcp;
  const uint64_t product = x*y;
  uint64_t estimate = mul64hi(xrcp, yrcp);
  // Modulo-2^64 remainder left by the estimate; bump the estimate when a whole
  // extra multiple of x*y still fits.
  const uint64_t leftover = -product*estimate;
  if (product <= leftover) ++estimate;
  return estimate;
}
|
|
142
|
+
|
|
143
|
+
// Fast unsigned integer division where divisor has precomputed reciprocal.
|
|
144
|
+
// idivFast(x, y, idivRcp(y)) == x/y
|
|
145
|
+
NCCL_HOST_DEVICE_INLINE void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
|
|
146
|
+
uint32_t q = yrcp == 0 ? x : mul32hi(x, yrcp);
|
|
147
|
+
uint32_t r = x - y*q;
|
|
148
|
+
if (r >= y) { q += 1; r -= y; }
|
|
149
|
+
*quo = q;
|
|
150
|
+
*rem = r;
|
|
151
|
+
}
|
|
152
|
+
// 64-bit fast unsigned division with a precomputed reciprocal:
//   idivmodFast64(&q, &r, x, y, idivRcp64(y))  gives  q == x/y, r == x%y
// A reciprocal of 0 means y == 1, in which case the quotient is x itself.
//
// The intermediates are kept in 64 bits. Holding them in uint32_t would
// silently truncate any quotient or remainder that does not fit in 32 bits
// (e.g. x = 2^40, y = 1) before it is stored through quo/rem.
NCCL_HOST_DEVICE_INLINE void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q = yrcp == 0 ? x : mul64hi(x, yrcp);
  uint64_t r = x - y*q;
  // The multiply-high estimate may be one short; a single correction follows.
  if (r >= y) { q += 1; r -= y; }
  *quo = q;
  *rem = r;
}
|
|
159
|
+
|
|
160
|
+
// Quotient x/y computed through idivmodFast32; yrcp must be the reciprocal of y
// (as produced by idivRcp32).
NCCL_HOST_DEVICE_INLINE uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient;
  uint32_t remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return quotient;
}
|
|
165
|
+
// Quotient x/y computed through idivmodFast64; yrcp must be the reciprocal of y
// (as produced by idivRcp64).
//
// Returns uint64_t: the quotient of two 64-bit operands can exceed 32 bits, and
// a uint32_t return type would drop its upper half. Callers that store the
// result in a 32-bit variable keep compiling through the usual implicit
// conversion.
NCCL_HOST_DEVICE_INLINE uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return q;
}
|
|
170
|
+
|
|
171
|
+
// Remainder x%y computed through idivmodFast32; yrcp must be the reciprocal of
// y (as produced by idivRcp32).
NCCL_HOST_DEVICE_INLINE uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
  uint32_t quotient;
  uint32_t remainder;
  idivmodFast32(&quotient, &remainder, x, y, yrcp);
  return remainder;
}
|
|
176
|
+
// Remainder x%y computed through idivmodFast64; yrcp must be the reciprocal of
// y (as produced by idivRcp64).
//
// Returns uint64_t: the remainder can be as large as y-1, which need not fit in
// 32 bits, and a uint32_t return type would drop its upper half. Callers that
// store the result in a 32-bit variable keep compiling through the usual
// implicit conversion.
NCCL_HOST_DEVICE_INLINE uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
  uint64_t q, r;
  idivmodFast64(&q, &r, x, y, yrcp);
  return r;
}
|
|
181
|
+
|
|
182
|
+
#if __CUDACC__
|
|
183
|
+
// Precomputed integer reciprocoals for denominator values 1..64 inclusive.
|
|
184
|
+
// Pass these to idivFast64() for fast division on the GPU.
|
|
185
|
+
NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) {
|
|
186
|
+
static constexpr uint64_t table[65] = {
|
|
187
|
+
idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03),
|
|
188
|
+
idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07),
|
|
189
|
+
idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b),
|
|
190
|
+
idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f),
|
|
191
|
+
idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13),
|
|
192
|
+
idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17),
|
|
193
|
+
idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b),
|
|
194
|
+
idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f),
|
|
195
|
+
idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23),
|
|
196
|
+
idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27),
|
|
197
|
+
idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b),
|
|
198
|
+
idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f),
|
|
199
|
+
idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33),
|
|
200
|
+
idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37),
|
|
201
|
+
idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b),
|
|
202
|
+
idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f),
|
|
203
|
+
idivRcp64(0x40)
|
|
204
|
+
};
|
|
205
|
+
return table[x];
|
|
206
|
+
}
|
|
207
|
+
#endif
|
|
208
|
+
|
|
209
|
+
#if __CUDACC__
|
|
210
|
+
// 32-bit reciprocal for denominators up to 64, taken as the upper 32 bits of
// the corresponding idivRcp64_upto64() table entry.
NCCL_DEVICE_INLINE uint32_t idivRcp32_upto64(int x) {
  const uint64_t rcp64 = idivRcp64_upto64(x);
  return static_cast<uint32_t>(rcp64 >> 32);
}
|
|
213
|
+
#endif
|
|
214
|
+
|
|
215
|
+
#if __CUDACC__
|
|
216
|
+
// Used as a GPU-scope acquire fence: performs an `ld.acquire.gpu` load from a
// function-local __device__ variable and writes the loaded value back to it.
NCCL_DEVICE_INLINE void fenceAcquireGpu() {
  static __device__ int dummy;
  int loaded;
  // The "memory" clobber keeps the compiler from moving memory accesses across
  // this statement.
  asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(loaded) : "l"(&dummy) : "memory");
  dummy = loaded;
}
|
|
222
|
+
// Release fence at device scope, issued through libcu++'s atomic_thread_fence.
NCCL_DEVICE_INLINE void fenceReleaseGpu() {
  cuda::atomic_thread_fence(
    cuda::memory_order_release,
    cuda::thread_scope_device);
}
|
|
225
|
+
#endif
|
|
226
|
+
|
|
227
|
+
#if __CUDACC__
|
|
228
|
+
// Keeps only the acquire half of a memory order:
//   release -> relaxed, acq_rel -> acquire, anything else is returned unchanged.
NCCL_DEVICE_INLINE cuda::memory_order acquireOrderOf(cuda::memory_order ord) {
  if (ord == cuda::memory_order_release) return cuda::memory_order_relaxed;
  if (ord == cuda::memory_order_acq_rel) return cuda::memory_order_acquire;
  return ord;
}
|
|
233
|
+
// Keeps only the release half of a memory order:
//   acquire -> relaxed, acq_rel -> release, anything else is returned unchanged.
NCCL_DEVICE_INLINE cuda::memory_order releaseOrderOf(cuda::memory_order ord) {
  if (ord == cuda::memory_order_acquire) return cuda::memory_order_relaxed;
  if (ord == cuda::memory_order_acq_rel) return cuda::memory_order_release;
  return ord;
}
|
|
238
|
+
#endif
|
|
239
|
+
|
|
240
|
+
#if __CUDACC__
|
|
241
|
+
// Reads the %laneid special register.
NCCL_DEVICE_INLINE int lane() {
  int laneId;
  asm("mov.u32 %0, %%laneid;" : "=r"(laneId));
  return laneId;
}
|
|
246
|
+
// Reads the %lanemask_lt special register.
NCCL_DEVICE_INLINE unsigned int lanemask_lt() {
  unsigned int mask;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  return mask;
}
|
|
251
|
+
#endif
|
|
252
|
+
|
|
253
|
+
#if __CUDACC__
|
|
254
|
+
// Load anything, but cache like its constant memory.
|
|
255
|
+
template<typename T>
|
|
256
|
+
NCCL_DEVICE_INLINE T loadConst(T const *p) {
|
|
257
|
+
if (alignof(T) == 1) {
|
|
258
|
+
union { uint8_t part[sizeof(T)]; T ret; };
|
|
259
|
+
for (int i=0; i < (int)sizeof(T); i++) part[i] = __ldg((uint8_t const*)p + i);
|
|
260
|
+
return ret;
|
|
261
|
+
} else if (alignof(T) == 2) {
|
|
262
|
+
union { uint16_t part[sizeof(T)/2]; T ret; };
|
|
263
|
+
for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = __ldg((uint16_t const*)p + i);
|
|
264
|
+
return ret;
|
|
265
|
+
} else if (alignof(T) == 4) {
|
|
266
|
+
union { uint32_t part[sizeof(T)/4]; T ret; };
|
|
267
|
+
for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = __ldg((uint32_t const*)p + i);
|
|
268
|
+
return ret;
|
|
269
|
+
} else if (alignof(T) == 8) {
|
|
270
|
+
union { uint64_t part[sizeof(T)/8]; T ret; };
|
|
271
|
+
for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = __ldg((uint64_t const*)p + i);
|
|
272
|
+
return ret;
|
|
273
|
+
} else { // alignof(T) >= 16
|
|
274
|
+
union { ulonglong2 part[sizeof(T)/16]; T ret; };
|
|
275
|
+
for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = __ldg((ulonglong2 const*)p + i);
|
|
276
|
+
return ret;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
#endif
|
|
280
|
+
|
|
281
|
+
////////////////////////////////////////////////////////////////////////////////
|
|
282
|
+
// Optional<T>: Holds a T that may or may not be constructed. An Optional
|
|
283
|
+
// constructed with a Present<Arg...> will have its T constructed via the
|
|
284
|
+
// T::T(Arg...) constructor. An Optional constructed with an Absent will not
|
|
285
|
+
// have its T constructed.
|
|
286
|
+
|
|
287
|
+
// Compile-time list of ints.
template<int ...values>
struct IntSeq {};

// IntSeqUpTo<n, m>::Type is IntSeq<m, m+1, ..., n-1>.
// Each level of the recursion appends the current counter m to the accumulated
// pack and continues with m+1.
template<int n, int m, int ...acc>
struct IntSeqUpTo: IntSeqUpTo<n, m+1, acc..., m> {};
// The recursion stops once the counter reaches n.
template<int n, int ...acc>
struct IntSeqUpTo<n, n, acc...> { typedef IntSeq<acc...> Type; };
|
|
294
|
+
|
|
295
|
+
// Present<Arg...>: Packs a list of arguments together to be passed to Optional<T>.
|
|
296
|
+
template<typename ...Arg>
|
|
297
|
+
struct Present;
|
|
298
|
+
template<>
|
|
299
|
+
struct Present<> {};
|
|
300
|
+
template<typename H, typename ...T>
|
|
301
|
+
struct Present<H, T...> {
|
|
302
|
+
H h;
|
|
303
|
+
Present<T...> t;
|
|
304
|
+
|
|
305
|
+
NCCL_HOST_DEVICE_INLINE H get(IntSeq<0>) {
|
|
306
|
+
return static_cast<H>(h);
|
|
307
|
+
}
|
|
308
|
+
template<int i>
|
|
309
|
+
NCCL_HOST_DEVICE_INLINE decltype(auto) get(IntSeq<i>) {
|
|
310
|
+
return t.get(IntSeq<i-1>{});
|
|
311
|
+
}
|
|
312
|
+
};
|
|
313
|
+
|
|
314
|
+
// Builds an empty argument pack.
NCCL_HOST_DEVICE_INLINE Present<> present() {
  return {};
}
// Builds a Present<> holding references to the given arguments. Each argument
// is forwarded with its original value category, so the pack refers to the
// caller's objects rather than to copies.
template<typename Head, typename ...Rest>
NCCL_HOST_DEVICE_INLINE Present<Head&&, Rest&&...> present(Head&& h, Rest&& ...t) {
  using Packed = Present<Head&&, Rest&&...>;
  return Packed{static_cast<Head&&>(h), present(static_cast<Rest&&>(t)...)};
}
|
|
321
|
+
|
|
322
|
+
struct Absent {};
|
|
323
|
+
|
|
324
|
+
template<typename T>
|
|
325
|
+
struct Optional {
|
|
326
|
+
bool present; // Is `thing` constructed.
|
|
327
|
+
union { T thing; };
|
|
328
|
+
|
|
329
|
+
// Construct with absent thing:
|
|
330
|
+
NCCL_HOST_DEVICE_INLINE constexpr Optional(): present(false) {}
|
|
331
|
+
NCCL_HOST_DEVICE_INLINE constexpr Optional(Absent): present(false) {}
|
|
332
|
+
|
|
333
|
+
// Helper constructor
|
|
334
|
+
template<int ...i, typename ...Arg>
|
|
335
|
+
NCCL_HOST_DEVICE_INLINE Optional(Present<Arg...> args, IntSeq<i...>):
|
|
336
|
+
present(true),
|
|
337
|
+
thing{args.get(IntSeq<i>())...} {
|
|
338
|
+
}
|
|
339
|
+
// Construct with present thing:
|
|
340
|
+
template<typename ...Arg>
|
|
341
|
+
NCCL_HOST_DEVICE_INLINE Optional(Present<Arg...> args):
|
|
342
|
+
Optional(args, IntSeqUpTo<sizeof...(Arg), 0>::Type()) {
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
NCCL_HOST_DEVICE_INLINE ~Optional() {
|
|
346
|
+
if (present) thing.~T();
|
|
347
|
+
}
|
|
348
|
+
};
|
|
349
|
+
|
|
350
|
+
}}
|
|
351
|
+
#endif // __cplusplus
|
|
352
|
+
#endif
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/*************************************************************************
|
|
2
|
+
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* See LICENSE.txt for license information
|
|
5
|
+
************************************************************************/
|
|
6
|
+
|
|
7
|
+
#include "nccl_device/impl/comm__funcs.h"
|
|
8
|
+
#include "nccl_device/coop.h"
|
|
9
|
+
#include "nccl_device/impl/core__funcs.h"
|
|
10
|
+
#include "nccl_device/impl/ll_a2a__funcs.h"
|
|
11
|
+
#include "nccl_device/impl/mem_barrier__funcs.h"
|
|
12
|
+
//#include "nccl_device/net_barrier__funcs.h"
|
|
13
|
+
//#include "nccl_device/net_scratch_a2a__funcs.h"
|
|
14
|
+
//#include "nccl_device/barrier__funcs.h"
|
|
15
|
+
#include "nccl_device/impl/ptr__funcs.h"
|
|
Binary file
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nvidia-nccl-cu13
|
|
3
|
+
Version: 2.28.3
|
|
4
|
+
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
|
|
5
|
+
Home-page: https://developer.nvidia.com/cuda-zone
|
|
6
|
+
Author: Nvidia CUDA Installer Team
|
|
7
|
+
Author-email: compute_installer@nvidia.com
|
|
8
|
+
License-Expression: LicenseRef-NVIDIA-Proprietary
|
|
9
|
+
Keywords: cuda,nvidia,runtime,machine learning,deep learning
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Natural Language :: English
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.5
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
27
|
+
Classifier: Topic :: Software Development
|
|
28
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
29
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
30
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
31
|
+
Requires-Python: >=3
|
|
32
|
+
License-File: License.txt
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: home-page
|
|
38
|
+
Dynamic: keywords
|
|
39
|
+
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
Dynamic: license-expression
|
|
42
|
+
Dynamic: requires-python
|
|
43
|
+
Dynamic: summary
|
|
44
|
+
|
|
45
|
+
NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
nvidia/nccl/include/nccl.h,sha256=IK7t8WAvdNaI5fZ2B-2XWbfEEU1VjzuLRR4bRWa5f9g,24962
|
|
2
|
+
nvidia/nccl/include/nccl_device.h,sha256=ezE--yGdfirMW4jOKjKLy4a60_bVwcHzohSrIcIK1jc,647
|
|
3
|
+
nvidia/nccl/include/nccl_device/comm.h,sha256=NI1zfO81NPABwxXDSNrXCSkXuVDomHTDbhW8HllDCxo,344
|
|
4
|
+
nvidia/nccl/include/nccl_device/coop.h,sha256=lDqtFaHmEarBqCOVdqVu3fOuzcmgwBZHCf30GQIoiDs,4558
|
|
5
|
+
nvidia/nccl/include/nccl_device/core.h,sha256=b4lIT2As3nv7Pb8S2SsYsb_kkRl7f69VTU6FKtCQcSQ,5775
|
|
6
|
+
nvidia/nccl/include/nccl_device/ll_a2a.h,sha256=rL8t8ygiBwuCcQyR9g5v-_2VQVJ81QsUcmij7pBQP8E,2059
|
|
7
|
+
nvidia/nccl/include/nccl_device/mem_barrier.h,sha256=tXAWPUKr545oOfTWoPivtXh97GC-c8of6kBl3nRuFcE,1400
|
|
8
|
+
nvidia/nccl/include/nccl_device/ptr.h,sha256=Ee2Osw362B4l6ro8hVmPAbVvHSYsnDSOcPisPzhNHZs,2401
|
|
9
|
+
nvidia/nccl/include/nccl_device/utility.h,sha256=FxKHc6FUeMriKOOH-q-zs2WpiDjVdkAX_aoog5tG8BE,11135
|
|
10
|
+
nvidia/nccl/include/nccl_device/impl/comm__funcs.h,sha256=CyFNrVnQJfH2XKEypZXNxgt770UOk9V-Hn6rrRz4j2U,396
|
|
11
|
+
nvidia/nccl/include/nccl_device/impl/comm__types.h,sha256=VlxpsIjoJvq7tTBXkndAm4CSln6RVgBEDmZXpl5t2HE,1015
|
|
12
|
+
nvidia/nccl/include/nccl_device/impl/core__funcs.h,sha256=XeInTurZfwAabc3ulv0OT34P31V9XivQ5yTsUHtQ0_8,7217
|
|
13
|
+
nvidia/nccl/include/nccl_device/impl/core__types.h,sha256=e0rNUgtoo9bMSlx5p8kvLz-eNCvSZvZ3xgCz7FYuPoc,690
|
|
14
|
+
nvidia/nccl/include/nccl_device/impl/ll_a2a__funcs.h,sha256=ngfj-F0SJuimkbm2LThw_fKGf6HzVRvLS-IopLJKXQ8,7613
|
|
15
|
+
nvidia/nccl/include/nccl_device/impl/ll_a2a__types.h,sha256=VaxEOmgP-HHSAzlPQibeSc1zuRyAA1sG1eodS5MuCVM,927
|
|
16
|
+
nvidia/nccl/include/nccl_device/impl/mem_barrier__funcs.h,sha256=ZvNkcxUhlViryJNl18YB9BIYDEx-6MC9YNd21175fHQ,4263
|
|
17
|
+
nvidia/nccl/include/nccl_device/impl/mem_barrier__types.h,sha256=ASicvg5QKNEpE8scE34YagS2WOBiN-E7ELou_FSGw9M,1401
|
|
18
|
+
nvidia/nccl/include/nccl_device/impl/ptr__funcs.h,sha256=9DCVRMfgmy6fo37LNgtAsNClXAr5n3krDEhfvLyVV_4,4872
|
|
19
|
+
nvidia/nccl/include/nccl_device/impl/ptr__types.h,sha256=VF55-p5iHnuSH8BaG7asLH3SbCzJI9NNMOO-ck_7Lx4,413
|
|
20
|
+
nvidia/nccl/lib/libnccl.so.2,sha256=zVPO2EjUZGSY92iMgpKlCPpa2Nv-8Stt5NxVPkj07u8,215851232
|
|
21
|
+
nvidia_nccl_cu13-2.28.3.dist-info/licenses/License.txt,sha256=DwF0prTgszrCY3W_cpUzB1sy9MUaW2gCo9dC19zcmnY,1895
|
|
22
|
+
nvidia_nccl_cu13-2.28.3.dist-info/METADATA,sha256=HW7-ITCQK2pNT1f4oUJ66lU-c5XFLlgQi1MAuBQmuyI,2026
|
|
23
|
+
nvidia_nccl_cu13-2.28.3.dist-info/WHEEL,sha256=AKe2MPjYnM9usl7274sMSObT8kQGfc080ndvOeoBBiQ,110
|
|
24
|
+
nvidia_nccl_cu13-2.28.3.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
|
|
25
|
+
nvidia_nccl_cu13-2.28.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
|
|
2
|
+
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
|
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
|
5
|
+
modification, are permitted provided that the following conditions
|
|
6
|
+
are met:
|
|
7
|
+
* Redistributions of source code must retain the above copyright
|
|
8
|
+
notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
documentation and/or other materials provided with the distribution.
|
|
12
|
+
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
|
|
13
|
+
Laboratory, the U.S. Department of Energy, nor the names of their
|
|
14
|
+
contributors may be used to endorse or promote products derived
|
|
15
|
+
from this software without specific prior written permission.
|
|
16
|
+
|
|
17
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
|
18
|
+
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
19
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
20
|
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
21
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
22
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
23
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
24
|
+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
25
|
+
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
26
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
27
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
28
|
+
|
|
29
|
+
The U.S. Department of Energy funded the development of this software
|
|
30
|
+
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
This code also includes files from the NVIDIA Tools Extension SDK project.
|
|
34
|
+
|
|
35
|
+
See:
|
|
36
|
+
|
|
37
|
+
https://github.com/NVIDIA/NVTX
|
|
38
|
+
|
|
39
|
+
for more information and license details.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nvidia
|