nvidia-nccl-cu13 2.28.3 (py3-none-manylinux_2_18_aarch64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_PTR_H_
+ #define _NCCL_DEVICE_PTR_H_
+ #include "core.h"
+ #include <stdint.h>
+
+ #if __cplusplus
+ template<typename T>
+ struct ncclSymPtr {
+   using ElementType = T;
+   ncclWindow_t window;
+   size_t offset;
+
+   NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr(ncclWindow_t window=nullptr, size_t offset=0);
+
+   template<typename U>
+   NCCL_HOST_DEVICE_INLINE operator ncclSymPtr<U>() const;
+
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(int d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned int d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(long long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator+=(unsigned long long d);
+
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(int d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned int d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(long long d);
+   NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& operator-=(unsigned long long d);
+
+   #if __CUDACC__
+   NCCL_DEVICE_INLINE T* localPtr() const;
+   NCCL_DEVICE_INLINE T* lsaPtr(int peer) const;
+   NCCL_DEVICE_INLINE T* peerPtr(int peer) const;
+   NCCL_DEVICE_INLINE T* peerPtr(ncclTeam team, int peer) const;
+   NCCL_DEVICE_INLINE T* multimemPtr(ncclMultimemHandle mmHandle) const;
+   NCCL_DEVICE_INLINE T* lsaMultimemPtr(ncclDevComm const&) const;
+   #endif
+ };
+
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator+(ncclSymPtr<T> p, Int d);
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator-(ncclSymPtr<T> p, Int d);
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr<T> a, ncclSymPtr<T> b);
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator==(ncclSymPtr<T> a, ncclSymPtr<T> b);
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator!=(ncclSymPtr<T> a, ncclSymPtr<T> b);
+ #endif // __cplusplus
+
+ #endif // _NCCL_DEVICE_PTR_H_
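The header above only declares the symmetric-pointer API; the definitions live in nccl_device/impl/ptr__funcs.h (listed in the RECORD below). As a hedged illustration of how the declarations compose, the sketch below offsets an ncclSymPtr<float> with ordinary pointer arithmetic and resolves it to a peer's address. The kernel name, the window contents, and the peer index are assumptions for illustration, not part of the package:

    // Hypothetical CUDA kernel, assuming <nccl_device.h> (which pulls in the
    // impl headers) is included and `win` is a symmetric window over a float
    // buffer of at least n elements on every rank.
    __global__ void scalePeerBuffer(ncclWindow_t win, int peer, int n) {
      ncclSymPtr<float> base(win, 0);                  // start of the window (offset 0)
      ncclSymPtr<float> cur = base + blockIdx.x*blockDim.x + threadIdx.x;
      if (cur - base < n) {                            // operator- yields a ptrdiff_t
        float* remote = cur.peerPtr(peer);             // raw pointer into peer's copy
        *remote = 2.0f * *remote;                      // direct remote load/store
      }
    }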
@@ -0,0 +1,352 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_UTILITY_H_
+ #define _NCCL_DEVICE_UTILITY_H_
+
+ #if __CUDACC__
+ #define NCCL_DEVICE_INLINE __device__ __forceinline__
+ #define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+ #else
+ #ifndef __host__
+ #define __host__
+ #endif
+ #define NCCL_DEVICE_INLINE
+ #define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline))
+ #endif
+
+ #if __cplusplus
+ #define NCCL_EXTERN_C extern "C"
+ #else
+ #define NCCL_EXTERN_C
+ #endif
+
+ #include <stdint.h>
+ #include <stdbool.h>
+
+ #if __CUDACC__
+ #include <cuda/atomic>
+ #endif
+
+ #if __cplusplus
+ namespace nccl {
+ namespace utility {
+
+ template<typename T>
+ T&& declval() noexcept {
+   static_assert(sizeof(T)!=sizeof(T), "You can't evaluate declval.");
+ }
+
+ template<typename X, typename Y, typename Z = decltype(X()+Y())>
+ NCCL_HOST_DEVICE_INLINE constexpr Z divUp(X x, Y y) {
+   return (x+y-1)/y;
+ }
+
+ template<typename X, typename Y, typename Z = decltype(X()+Y())>
+ NCCL_HOST_DEVICE_INLINE constexpr Z roundUp(X x, Y y) {
+   return (x+y-1) - (x+y-1)%y;
+ }
+ template<typename X, typename Y, typename Z = decltype(X()+Y())>
+ NCCL_HOST_DEVICE_INLINE constexpr Z roundDown(X x, Y y) {
+   return x - x%y;
+ }
+
+ // assumes second argument is a power of 2
+ template<typename X, typename Y, typename Z = decltype(X()+Y())>
+ NCCL_HOST_DEVICE_INLINE constexpr Z alignUp(X x, Y a) {
+   return (x + a-1) & -Z(a);
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE T* alignUp(T* x, size_t a) {
+   static_assert(sizeof(T) == 1, "Only single byte types allowed.");
+   return reinterpret_cast<T*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a));
+ }
+ NCCL_HOST_DEVICE_INLINE void* alignUp(void const* x, size_t a) {
+   return reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(x) + a-1) & -uintptr_t(a));
+ }
+
+ // assumes second argument is a power of 2
+ template<typename X, typename Y, typename Z = decltype(X()+int())>
+ NCCL_HOST_DEVICE_INLINE constexpr Z alignDown(X x, Y a) {
+   return x & -Z(a);
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE T* alignDown(T* x, size_t a) {
+   static_assert(sizeof(T) == 1, "Only single byte types allowed.");
+   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a));
+ }
+ NCCL_HOST_DEVICE_INLINE void* alignDown(void const* x, size_t a) {
+   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(x) & -uintptr_t(a));
+ }
+
+ // Advance `base` by delta4G * 2^32 by bumping its upper 32 bits
+ // (assumes a 64-bit, little-endian representation of T).
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE T add4G(T base, int delta4G) {
+   union { uint32_t u32[2]; T tmp; };
+   tmp = base;
+   u32[1] += delta4G;
+   return tmp;
+ }
+
+ template<typename Int>
+ NCCL_HOST_DEVICE_INLINE constexpr bool isPow2(Int x) {
+   return (x & (x-1)) == 0;
+ }
+
+ // Produce the reciprocal of x for use in idivFast32/idivFast64 below.
+ NCCL_HOST_DEVICE_INLINE constexpr uint32_t idivRcp32(uint32_t x) {
+   return uint32_t(-1)/x + isPow2(x);
+ }
+ NCCL_HOST_DEVICE_INLINE constexpr uint64_t idivRcp64(uint64_t x) {
+   return uint64_t(-1)/x + isPow2(x);
+ }
+
+ NCCL_HOST_DEVICE_INLINE uint32_t mul32hi(uint32_t a, uint32_t b) {
+ #if __CUDA_ARCH__
+   return __umulhi(a, b);
+ #else
+   return uint64_t(a)*b >> 32;
+ #endif
+ }
+ NCCL_HOST_DEVICE_INLINE uint64_t mul64hi(uint64_t a, uint64_t b) {
+ #if __CUDA_ARCH__
+   return __umul64hi(a, b);
+ #else
+   return (uint64_t)(((unsigned __int128)a)*b >> 64);
+ #endif
+ }
+
+ // Produce the reciprocal of x*y given their respective reciprocals. This incurs
+ // no integer division on device.
+ NCCL_HOST_DEVICE_INLINE uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
+   if (xrcp == 0) return yrcp;
+   if (yrcp == 0) return xrcp;
+   uint32_t rcp = mul32hi(xrcp, yrcp);
+   uint32_t rem = -x*y*rcp;
+   if (x*y <= rem) rcp += 1;
+   return rcp;
+ }
+ NCCL_HOST_DEVICE_INLINE uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
+   if (xrcp == 0) return yrcp;
+   if (yrcp == 0) return xrcp;
+   uint64_t rcp = mul64hi(xrcp, yrcp);
+   uint64_t rem = -x*y*rcp;
+   if (x*y <= rem) rcp += 1;
+   return rcp;
+ }
+
+ // Fast unsigned integer division where divisor has precomputed reciprocal.
+ // idivFast(x, y, idivRcp(y)) == x/y
+ NCCL_HOST_DEVICE_INLINE void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
+   uint32_t q = yrcp == 0 ? x : mul32hi(x, yrcp);
+   uint32_t r = x - y*q;
+   if (r >= y) { q += 1; r -= y; }
+   *quo = q;
+   *rem = r;
+ }
+ NCCL_HOST_DEVICE_INLINE void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
+   uint64_t q = yrcp == 0 ? x : mul64hi(x, yrcp);
+   uint64_t r = x - y*q;
+   if (r >= y) { q += 1; r -= y; }
+   *quo = q;
+   *rem = r;
+ }
+
+ NCCL_HOST_DEVICE_INLINE uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+   uint32_t q, r;
+   idivmodFast32(&q, &r, x, y, yrcp);
+   return q;
+ }
+ NCCL_HOST_DEVICE_INLINE uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+   uint64_t q, r;
+   idivmodFast64(&q, &r, x, y, yrcp);
+   return q;
+ }
+
+ NCCL_HOST_DEVICE_INLINE uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+   uint32_t q, r;
+   idivmodFast32(&q, &r, x, y, yrcp);
+   return r;
+ }
+ NCCL_HOST_DEVICE_INLINE uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+   uint64_t q, r;
+   idivmodFast64(&q, &r, x, y, yrcp);
+   return r;
+ }
+
+ #if __CUDACC__
+ // Precomputed integer reciprocals for denominator values 1..64 inclusive.
+ // Pass these to idivFast64() for fast division on the GPU.
+ NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) {
+   static constexpr uint64_t table[65] = {
+     idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03),
+     idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07),
+     idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b),
+     idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f),
+     idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13),
+     idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17),
+     idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b),
+     idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f),
+     idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23),
+     idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27),
+     idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b),
+     idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f),
+     idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33),
+     idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37),
+     idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b),
+     idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f),
+     idivRcp64(0x40)
+   };
+   return table[x];
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE uint32_t idivRcp32_upto64(int x) {
+   return idivRcp64_upto64(x)>>32;
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void fenceAcquireGpu() {
+   static __device__ int dummy;
+   int tmp;
+   asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory");
+   dummy = tmp;
+ }
+ NCCL_DEVICE_INLINE void fenceReleaseGpu() {
+   cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE cuda::memory_order acquireOrderOf(cuda::memory_order ord) {
+   return ord == cuda::memory_order_release ? cuda::memory_order_relaxed :
+          ord == cuda::memory_order_acq_rel ? cuda::memory_order_acquire :
+          ord;
+ }
+ NCCL_DEVICE_INLINE cuda::memory_order releaseOrderOf(cuda::memory_order ord) {
+   return ord == cuda::memory_order_acquire ? cuda::memory_order_relaxed :
+          ord == cuda::memory_order_acq_rel ? cuda::memory_order_release :
+          ord;
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE int lane() {
+   int ret;
+   asm("mov.u32 %0, %%laneid;" : "=r"(ret));
+   return ret;
+ }
+ NCCL_DEVICE_INLINE unsigned int lanemask_lt() {
+   unsigned int ret;
+   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
+   return ret;
+ }
+ #endif
+
+ #if __CUDACC__
+ // Load anything, but cache it as if it were constant memory.
+ template<typename T>
+ NCCL_DEVICE_INLINE T loadConst(T const *p) {
+   if (alignof(T) == 1) {
+     union { uint8_t part[sizeof(T)]; T ret; };
+     for (int i=0; i < (int)sizeof(T); i++) part[i] = __ldg((uint8_t const*)p + i);
+     return ret;
+   } else if (alignof(T) == 2) {
+     union { uint16_t part[sizeof(T)/2]; T ret; };
+     for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = __ldg((uint16_t const*)p + i);
+     return ret;
+   } else if (alignof(T) == 4) {
+     union { uint32_t part[sizeof(T)/4]; T ret; };
+     for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = __ldg((uint32_t const*)p + i);
+     return ret;
+   } else if (alignof(T) == 8) {
+     union { uint64_t part[sizeof(T)/8]; T ret; };
+     for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = __ldg((uint64_t const*)p + i);
+     return ret;
+   } else { // alignof(T) >= 16
+     union { ulonglong2 part[sizeof(T)/16]; T ret; };
+     for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = __ldg((ulonglong2 const*)p + i);
+     return ret;
+   }
+ }
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Optional<T>: Holds a T that may or may not be constructed. An Optional
+ // constructed with a Present<Arg...> will have its T constructed via the
+ // T::T(Arg...) constructor. An Optional constructed with an Absent will not
+ // have its T constructed.
+
+ template<int ...vals>
+ struct IntSeq {};
+
+ template<int n, int m, int ...i>
+ struct IntSeqUpTo: IntSeqUpTo<n, m+1, i..., m> {};
+ template<int n, int ...i>
+ struct IntSeqUpTo<n, n, i...> { using Type = IntSeq<i...>; };
+
+ // Present<Arg...>: Packs a list of arguments together to be passed to Optional<T>.
+ template<typename ...Arg>
+ struct Present;
+ template<>
+ struct Present<> {};
+ template<typename H, typename ...T>
+ struct Present<H, T...> {
+   H h;
+   Present<T...> t;
+
+   NCCL_HOST_DEVICE_INLINE H get(IntSeq<0>) {
+     return static_cast<H>(h);
+   }
+   template<int i>
+   NCCL_HOST_DEVICE_INLINE decltype(auto) get(IntSeq<i>) {
+     return t.get(IntSeq<i-1>{});
+   }
+ };
+
+ NCCL_HOST_DEVICE_INLINE Present<> present() {
+   return Present<>{};
+ }
+ template<typename H, typename ...T>
+ NCCL_HOST_DEVICE_INLINE Present<H&&, T&&...> present(H&& h, T&& ...t) {
+   return Present<H&&, T&&...>{static_cast<H&&>(h), present(static_cast<T&&>(t)...)};
+ }
+
+ struct Absent {};
+
+ template<typename T>
+ struct Optional {
+   bool present; // Is `thing` constructed.
+   union { T thing; };
+
+   // Construct with absent thing:
+   NCCL_HOST_DEVICE_INLINE constexpr Optional(): present(false) {}
+   NCCL_HOST_DEVICE_INLINE constexpr Optional(Absent): present(false) {}
+
+   // Helper constructor
+   template<int ...i, typename ...Arg>
+   NCCL_HOST_DEVICE_INLINE Optional(Present<Arg...> args, IntSeq<i...>):
+     present(true),
+     thing{args.get(IntSeq<i>())...} {
+   }
+   // Construct with present thing:
+   template<typename ...Arg>
+   NCCL_HOST_DEVICE_INLINE Optional(Present<Arg...> args):
+     Optional(args, typename IntSeqUpTo<sizeof...(Arg), 0>::Type()) {
+   }
+
+   NCCL_HOST_DEVICE_INLINE ~Optional() {
+     if (present) thing.~T();
+   }
+ };
+
+ }} // namespace nccl::utility
+ #endif // __cplusplus
+ #endif
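The fast-division helpers above replace hardware integer division with one multiply by a precomputed reciprocal plus a single correction step, and Optional<T>/Present<...> give in-place, possibly-absent construction without <optional>. A hedged host-side smoke test (the include path and loop bound are assumptions; the header compiles for the host through the non-CUDA branch of NCCL_HOST_DEVICE_INLINE):

    #include <cassert>
    #include <cstdint>
    #include "nccl_device/utility.h"  // assumed include path for the header above
    using namespace nccl::utility;

    int main() {
      // Precompute the reciprocal once (e.g. at setup time)...
      uint32_t y = 48;
      uint32_t yrcp = idivRcp32(y);
      // ...then divide repeatedly without hardware integer division.
      for (uint32_t x = 0; x < (1u << 20); x++) {
        uint32_t q, r;
        idivmodFast32(&q, &r, x, y, yrcp);
        assert(q == x/y && r == x%y);
      }
      // Rounding helpers follow the usual conventions.
      assert(divUp(10, 4) == 3);       // ceil(10/4)
      assert(roundUp(10, 4) == 12);    // next multiple of 4
      assert(alignUp(10, 8) == 16);    // power-of-2 alignment
      // Optional<T>: the payload is constructed only when built from present(...).
      Optional<int> empty;             // no int constructed
      Optional<int> five{present(5)};  // int constructed in place from 5
      assert(!empty.present && five.present && five.thing == 5);
      return 0;
    }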
@@ -0,0 +1,15 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #include "nccl_device/impl/comm__funcs.h"
+ #include "nccl_device/coop.h"
+ #include "nccl_device/impl/core__funcs.h"
+ #include "nccl_device/impl/ll_a2a__funcs.h"
+ #include "nccl_device/impl/mem_barrier__funcs.h"
+ //#include "nccl_device/net_barrier__funcs.h"
+ //#include "nccl_device/net_scratch_a2a__funcs.h"
+ //#include "nccl_device/barrier__funcs.h"
+ #include "nccl_device/impl/ptr__funcs.h"
Binary file
@@ -0,0 +1,45 @@
+ Metadata-Version: 2.4
+ Name: nvidia-nccl-cu13
+ Version: 2.28.3
+ Summary: NVIDIA Collective Communication Library (NCCL) Runtime
+ Home-page: https://developer.nvidia.com/cuda-zone
+ Author: Nvidia CUDA Installer Team
+ Author-email: compute_installer@nvidia.com
+ License-Expression: LicenseRef-NVIDIA-Proprietary
+ Keywords: cuda,nvidia,runtime,machine learning,deep learning
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.5
+ Classifier: Programming Language :: Python :: 3.6
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Operating System :: Microsoft :: Windows
+ Classifier: Operating System :: POSIX :: Linux
+ Requires-Python: >=3
+ License-File: License.txt
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: license-expression
+ Dynamic: requires-python
+ Dynamic: summary
+
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets.
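For orientation, the collectives named in the description are exposed through the host API in nvidia/nccl/include/nccl.h. A minimal hedged sketch (communicator creation via ncclCommInitRank, buffer allocation, and error checking are elided; `comm` and `stream` are assumed to already exist):

    #include <nccl.h>
    #include <cuda_runtime.h>

    // Sum `count` floats across all ranks of `comm`, asynchronously on `stream`.
    void allReduceSum(const float* sendbuf, float* recvbuf, size_t count,
                      ncclComm_t comm, cudaStream_t stream) {
      ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream);
      cudaStreamSynchronize(stream);  // wait for the collective to complete
    }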
@@ -0,0 +1,25 @@
+ nvidia/nccl/include/nccl.h,sha256=IK7t8WAvdNaI5fZ2B-2XWbfEEU1VjzuLRR4bRWa5f9g,24962
+ nvidia/nccl/include/nccl_device.h,sha256=ezE--yGdfirMW4jOKjKLy4a60_bVwcHzohSrIcIK1jc,647
+ nvidia/nccl/include/nccl_device/comm.h,sha256=NI1zfO81NPABwxXDSNrXCSkXuVDomHTDbhW8HllDCxo,344
+ nvidia/nccl/include/nccl_device/coop.h,sha256=lDqtFaHmEarBqCOVdqVu3fOuzcmgwBZHCf30GQIoiDs,4558
+ nvidia/nccl/include/nccl_device/core.h,sha256=b4lIT2As3nv7Pb8S2SsYsb_kkRl7f69VTU6FKtCQcSQ,5775
+ nvidia/nccl/include/nccl_device/ll_a2a.h,sha256=rL8t8ygiBwuCcQyR9g5v-_2VQVJ81QsUcmij7pBQP8E,2059
+ nvidia/nccl/include/nccl_device/mem_barrier.h,sha256=tXAWPUKr545oOfTWoPivtXh97GC-c8of6kBl3nRuFcE,1400
+ nvidia/nccl/include/nccl_device/ptr.h,sha256=Ee2Osw362B4l6ro8hVmPAbVvHSYsnDSOcPisPzhNHZs,2401
+ nvidia/nccl/include/nccl_device/utility.h,sha256=FxKHc6FUeMriKOOH-q-zs2WpiDjVdkAX_aoog5tG8BE,11135
+ nvidia/nccl/include/nccl_device/impl/comm__funcs.h,sha256=CyFNrVnQJfH2XKEypZXNxgt770UOk9V-Hn6rrRz4j2U,396
+ nvidia/nccl/include/nccl_device/impl/comm__types.h,sha256=VlxpsIjoJvq7tTBXkndAm4CSln6RVgBEDmZXpl5t2HE,1015
+ nvidia/nccl/include/nccl_device/impl/core__funcs.h,sha256=XeInTurZfwAabc3ulv0OT34P31V9XivQ5yTsUHtQ0_8,7217
+ nvidia/nccl/include/nccl_device/impl/core__types.h,sha256=e0rNUgtoo9bMSlx5p8kvLz-eNCvSZvZ3xgCz7FYuPoc,690
+ nvidia/nccl/include/nccl_device/impl/ll_a2a__funcs.h,sha256=ngfj-F0SJuimkbm2LThw_fKGf6HzVRvLS-IopLJKXQ8,7613
+ nvidia/nccl/include/nccl_device/impl/ll_a2a__types.h,sha256=VaxEOmgP-HHSAzlPQibeSc1zuRyAA1sG1eodS5MuCVM,927
+ nvidia/nccl/include/nccl_device/impl/mem_barrier__funcs.h,sha256=ZvNkcxUhlViryJNl18YB9BIYDEx-6MC9YNd21175fHQ,4263
+ nvidia/nccl/include/nccl_device/impl/mem_barrier__types.h,sha256=ASicvg5QKNEpE8scE34YagS2WOBiN-E7ELou_FSGw9M,1401
+ nvidia/nccl/include/nccl_device/impl/ptr__funcs.h,sha256=9DCVRMfgmy6fo37LNgtAsNClXAr5n3krDEhfvLyVV_4,4872
+ nvidia/nccl/include/nccl_device/impl/ptr__types.h,sha256=VF55-p5iHnuSH8BaG7asLH3SbCzJI9NNMOO-ck_7Lx4,413
+ nvidia/nccl/lib/libnccl.so.2,sha256=zVPO2EjUZGSY92iMgpKlCPpa2Nv-8Stt5NxVPkj07u8,215851232
+ nvidia_nccl_cu13-2.28.3.dist-info/licenses/License.txt,sha256=DwF0prTgszrCY3W_cpUzB1sy9MUaW2gCo9dC19zcmnY,1895
+ nvidia_nccl_cu13-2.28.3.dist-info/METADATA,sha256=HW7-ITCQK2pNT1f4oUJ66lU-c5XFLlgQi1MAuBQmuyI,2026
+ nvidia_nccl_cu13-2.28.3.dist-info/WHEEL,sha256=AKe2MPjYnM9usl7274sMSObT8kQGfc080ndvOeoBBiQ,110
+ nvidia_nccl_cu13-2.28.3.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
+ nvidia_nccl_cu13-2.28.3.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-manylinux_2_18_aarch64
+
@@ -0,0 +1,39 @@
+
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
+ This code also includes files from the NVIDIA Tools Extension SDK project.
+
+ See:
+
+ https://github.com/NVIDIA/NVTX
+
+ for more information and license details.
@@ -0,0 +1 @@
+ nvidia