nvidia_nccl_cu13-2.28.3-py3-none-manylinux_2_18_aarch64.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as published in their respective public registries.
--- /dev/null
+++ b/nccl_device/impl/ll_a2a__funcs.h
@@ -0,0 +1,229 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A__FUNCS_H_
+ #define _NCCL_DEVICE_LL_A2A__FUNCS_H_
+ #include "ll_a2a__types.h"
+ #include "comm__types.h"
+ #include "../utility.h"
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLLA2ASession<Coop>::ncclLLA2ASession(
+     Coop coop, ncclDevComm const& comm, ncclTeam team,
+     ncclLLA2AHandle handle, uint32_t block, int maxElts,
+     bool multimem, ncclMultimemHandle mmHandle
+   ):
+   ncclLLA2ASession_internal<Coop>{
+     coop, comm, team, handle, (int)block, /*pitch=*/maxElts,
+     multimem, mmHandle, /*epoch=*/0, /*slotsOffset=*/0
+   } {
+   uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+   line += block*(1 + 2*handle.nSlots);
+   this->epoch = line->x + 2;
+   this->slotsOffset = this->calcSlotOffset();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLLA2ASession<Coop>::~ncclLLA2ASession() {
+   uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   line += this->block*(1 + 2*this->handle.nSlots);
+   if (this->coop.thread_rank() == 0) line->x = this->epoch - 2;
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::send(int peer, int elt, T data) {
+   using nccl::utility::divUp;
+   union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+   tmp = data;
+   uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, peer);
+   buf += this->slotsOffset + elt;
+   #pragma unroll
+   for (int u=0; u < divUp(sizeof(T), 8); u++) {
+     asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+       "l"(buf + u*this->pitch),
+       "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+     );
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::bcast(int elt, T data) {
+   using nccl::utility::divUp;
+   if (this->multimem) {
+     union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
+     tmp = data;
+     uint4* bufmc = (uint4*)ncclGetResourceBufferMultimemPointer(this->comm, this->handle.bufHandle, this->mmHandle);
+     bufmc += this->slotsOffset + elt;
+     #pragma unroll
+     for (int u=0; u < divUp(sizeof(T), 8); u++) {
+       asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+         "l"(bufmc + this->pitch*u),
+         "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+       );
+     }
+   } else {
+     union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+     tmp = data;
+     int dr = 0;
+     int r = this->team.rank;
+     #pragma unroll 1
+     for (; dr+8 <= this->team.nRanks; dr += 8) {
+       #pragma unroll
+       for (int ur=0; ur < 8; ur++) {
+         uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r);
+         buf += this->slotsOffset + elt;
+         #pragma unroll
+         for (int u=0; u < divUp(sizeof(T),8); u++) {
+           asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+             "l"(buf + u*this->pitch),
+             "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+           );
+         }
+         r += 1;
+         if (r == this->team.nRanks) r = 0;
+       }
+     }
+     #pragma unroll
+     for (int ur=0; ur < 8; ur++, dr++) {
+       if (dr == this->team.nRanks) break;
+       uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r);
+       buf += this->slotsOffset + elt;
+       #pragma unroll
+       for (int u=0; u < divUp(sizeof(T),8); u++) {
+         asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+           "l"(buf + u*this->pitch),
+           "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+         );
+       }
+       r += 1;
+       if (r == this->team.nRanks) r = 0;
+     }
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE T ncclLLA2ASession<Coop>::recv(int elt) {
+   T ret[1];
+   this->template recvUnrolled</*MinEltCount=*/1, /*MaxEltCount=*/1>(elt, 1, 0, ret);
+   return ret[0];
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<int MinEltCount, int MaxEltCount, typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::recvUnrolled(int eltStart, int eltCount, int eltStride, T(&elts)[MaxEltCount]) {
+   using nccl::utility::divUp;
+   uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   buf += this->slotsOffset + eltStart;
+
+   uint4 tmp[MaxEltCount][divUp(sizeof(T), 8)];
+   #pragma unroll 1
+   while (true) {
+     #pragma unroll
+     for (int u=0; u < MaxEltCount; u++) {
+       if (u < MinEltCount || u < eltCount) {
+         #pragma unroll
+         for (int v=0; v < divUp(sizeof(T), 8); v++) {
+           asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];"
+             : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w)
+             : "l"(buf + u*eltStride + v*this->pitch));
+         }
+       }
+     }
+     bool okAll = true;
+     #pragma unroll
+     for (int u=0; u < MaxEltCount; u++) {
+       #pragma unroll
+       for (int v=0; v < divUp(sizeof(T), 8); v++) {
+         if (u < MinEltCount || u < eltCount) {
+           bool ok = tmp[u][v].y == this->epoch &&
+                     tmp[u][v].w == this->epoch;
+           okAll &= ok;
+         }
+       }
+     }
+     if (__builtin_expect(okAll, true)) break;
+   }
+
+   #pragma unroll
+   for (int u=0; u < MaxEltCount; u++) {
+     if (MinEltCount <= u && u == eltCount) break;
+     union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+     #pragma unroll
+     for (int v=0; v < divUp(sizeof(T), 8); v++) {
+       u32[v][0] = tmp[u][v].x;
+       u32[v][1] = tmp[u][v].z;
+     }
+     elts[u] = val;
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<int Unroll, typename Elt, typename EltToAcc, typename Reduce>
+ NCCL_DEVICE_INLINE auto ncclLLA2ASession<Coop>::recvReduce(
+     int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce reduce
+   ) -> decltype(eltToAcc(nccl::utility::declval<Elt>())) {
+   using Acc = decltype(eltToAcc(nccl::utility::declval<Elt>()));
+   Acc acc;
+   int i = 0;
+   #pragma unroll 1
+   for (; i+Unroll <= eltCount; i += Unroll) {
+     Elt got[Unroll];
+     this->template recvUnrolled</*Min=*/Unroll>(eltStart + i*eltStride, Unroll, eltStride, got);
+     Acc acc0 = eltToAcc(got[0]);
+     acc = i==0 ? acc0 : reduce(acc, acc0);
+     #pragma unroll
+     for (int j=1; j < Unroll; j++) acc = reduce(acc, eltToAcc(got[j]));
+   }
+   if (i < eltCount) {
+     Elt got[Unroll];
+     this->template recvUnrolled</*Min=*/1>(eltStart + i*eltStride, eltCount-i, eltStride, got);
+     Acc acc0 = eltToAcc(got[0]);
+     acc = i==0 ? acc0 : reduce(acc, acc0);
+     #pragma unroll
+     for (int j=1; j < Unroll-1; j++) {
+       if (i+j < eltCount) acc = reduce(acc, eltToAcc(got[j]));
+     }
+   }
+   return acc;
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::endEpoch(Coop) {
+   if (__builtin_expect(this->epoch >= -2u, false)) {
+     this->coop.sync();
+     uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+     buf += this->slotsOffset;
+     #pragma unroll 4
+     for (int i=this->coop.thread_rank(); i < this->handle.nSlots; i += this->coop.size()) {
+       buf[i] = uint4{0, 0, 0, 0};
+     }
+   }
+   this->coop.sync();
+   this->epoch += (this->epoch == -1u) ? 3 : 1;
+   this->slotsOffset = this->calcSlotOffset();
+ }
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A__FUNCS_H_
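
To make the call pattern concrete, here is a hypothetical usage sketch (not part of the package). CtaCoop is a minimal stand-in for the cooperative wrapper the session expects; the implementations above only call thread_rank(), size(), and sync() on it. The sketch assumes the handle was created on the host with at least gridDim.x blocks and ncclLLA2ACalcSlots(nRanks, 8) slots, and that every rank launches the same kernel.

    // Minimal cooperative wrapper satisfying the interface used by these headers.
    struct CtaCoop {
      __device__ int  thread_rank() const { return threadIdx.x; }
      __device__ int  size() const { return blockDim.x; }
      __device__ void sync() const { __syncthreads(); }
    };

    // Each rank exchanges one uint64_t with every LSA peer, then closes the epoch.
    __global__ void a2aExchange(ncclDevComm comm, ncclLLA2AHandle h) {
      CtaCoop coop;
      ncclTeam team = ncclTeamLsa(comm);
      ncclLLA2ASession<CtaCoop> sess(coop, comm, team, h,
                                     /*block=*/blockIdx.x, /*maxElts=*/team.nRanks);
      int t = coop.thread_rank();
      if (t < team.nRanks) {
        // The element index is the sender's rank, so each sender lands in a
        // distinct slot on the receiver.
        sess.send(/*peer=*/t, /*elt=*/team.rank, (uint64_t)team.rank);
        uint64_t got = sess.recv<uint64_t>(/*elt=*/t); // spins until peer t's flags match the epoch
        (void)got;
      }
      sess.endEpoch(coop); // every peer was both sent to and received from
    }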
--- /dev/null
+++ b/nccl_device/impl/ll_a2a__types.h
@@ -0,0 +1,37 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A__TYPES_H_
+ #define _NCCL_DEVICE_LL_A2A__TYPES_H_
+ #include "../ll_a2a.h"
+ #include "core__types.h"
+
+ struct ncclLLA2AHandle {
+   ncclDevResourceHandle_t bufHandle;
+   uint32_t nSlots;
+ };
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLLA2ASession_internal {
+   Coop coop;
+   ncclDevComm const& comm;
+   ncclTeam team;
+   ncclLLA2AHandle handle;
+   int block;
+   int pitch;
+   bool multimem;
+   ncclMultimemHandle mmHandle;
+   uint32_t epoch;
+   uint32_t slotsOffset;
+
+   NCCL_DEVICE_INLINE uint32_t calcSlotOffset() const {
+     return block*(1 + 2*handle.nSlots) + 1 + (epoch & 1)*handle.nSlots;
+   }
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A__TYPES_H_
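
The slot math in calcSlotOffset implies a concrete buffer layout. The helper below is an inferred restatement, not shipped code: each block owns 1 + 2*nSlots uint4 lines, with line 0 persisting the epoch across sessions and two nSlots-line banks alternating on epoch parity, so endEpoch() can flip to a fresh bank without waiting for peers to drain the old one.

    // Inferred per-block layout of the LL A2A resource buffer (uint4 lines):
    //   [0]                       saved epoch (written back by the session destructor)
    //   [1 .. nSlots]             slot bank used on even epochs
    //   [1+nSlots .. 2*nSlots]    slot bank used on odd epochs
    static inline uint32_t llA2ASlotOffset(uint32_t block, uint32_t nSlots, uint32_t epoch) {
      return block*(1 + 2*nSlots) + 1 + (epoch & 1)*nSlots;
    }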
--- /dev/null
+++ b/nccl_device/impl/mem_barrier__funcs.h
@@ -0,0 +1,126 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+ #define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+ #include "mem_barrier__types.h"
+ #include "comm__types.h"
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+     Coop coop, ncclDevComm const& comm, ncclTeam team,
+     ncclLsaBarrierHandle handle, uint32_t index,
+     bool multimem, ncclMultimemHandle mmHandle
+   ):
+   ncclLsaBarrierSession_internal<Coop>{
+     coop, comm, team, handle, (int)index,
+ #if CUDART_VERSION >= 12060
+     multimem,
+ #else // WAR for an issue with ptxas in CTK < 12.6
+     /*multimem=*/false,
+ #endif
+     mmHandle, /*epoch=*/0
+   } {
+   uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+   this->epoch = state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index];
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+     Coop coop, ncclDevComm const& comm, ncclTeamTagLsa, uint32_t index, bool multimem
+   ): ncclLsaBarrierSession(
+     coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem
+   ) {
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::~ncclLsaBarrierSession() {
+   uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   if (this->coop.thread_rank() == 0) {
+ #if __CUDA_ARCH__ == 1200 && CUDART_VERSION < 13000
+     // WAR for a compiler issue with CTK < 13.0
+     if (this->index == 0)
+       state[(this->multimem ? 0 : 1)*this->handle.nBarriers] = this->epoch;
+     else
+ #endif
+     state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index] = this->epoch;
+   }
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::arrive(Coop, cuda::memory_order order) {
+   this->coop.sync();
+   if (this->multimem) {
+ #if __CUDA_ARCH__ >= 900
+     if (this->coop.thread_rank() == 0) {
+       uint32_t* inbox = this->mcInbox(/*multimem=*/true);
+       if (nccl::utility::releaseOrderOf(order) != cuda::memory_order_relaxed) {
+         asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
+       } else {
+         asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
+       }
+     }
+ #endif
+   } else {
+     #pragma unroll 1
+     for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+       int peer = i + (this->team.rank <= i ? 1 : 0);
+       cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(peer, this->team.rank));
+       inbox.store(this->epoch+1, nccl::utility::releaseOrderOf(order));
+     }
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::wait(Coop, cuda::memory_order order) {
+   if (this->multimem) {
+ #if __CUDA_ARCH__ >= 900
+     if (this->coop.thread_rank() == 0) {
+       cuda::atomic_ref<uint32_t> inbox(*this->mcInbox(/*multimem=*/false));
+       #pragma unroll 1
+       while (true) {
+         uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+         if (got - (this->epoch + this->team.nRanks) <= uint32_t(-1)>>1) break;
+       }
+       this->epoch += this->team.nRanks;
+     }
+ #endif
+   } else {
+     #pragma unroll 1
+     for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+       int peer = i + (this->team.rank <= i ? 1 : 0);
+       cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(this->team.rank, peer));
+       #pragma unroll 1
+       while (true) {
+         uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+         if (got - (this->epoch + 1) <= uint32_t(-1)>>1) break;
+       }
+     }
+     this->epoch += 1;
+   }
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::sync(Coop coop, cuda::memory_order order) {
+   this->arrive(coop, order);
+   this->wait(coop, order);
+ }
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
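
As a usage sketch (hypothetical kernel, not shipped in the package), reusing the CtaCoop wrapper from the earlier LL A2A sketch: the session is opened, used, and closed within one kernel, and it assumes the communicator's lsaBarrier resource was created with at least gridDim.x barriers so each CTA can synchronize independently.

    // Write to symmetric memory, barrier, then read peers' contributions.
    __global__ void producerConsumer(ncclDevComm comm) {
      CtaCoop coop;
      // One barrier per CTA over the LSA team; multimem defaults to false.
      ncclLsaBarrierSession<CtaCoop> bar(coop, comm, ncclTeamTagLsa{},
                                         /*index=*/blockIdx.x);
      // ... write this rank's contribution to symmetric memory ...
      bar.sync(coop, cuda::memory_order_acq_rel); // release our stores, acquire peers'
      // ... now safe to read peers' contributions ...
    }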
--- /dev/null
+++ b/nccl_device/impl/mem_barrier__types.h
@@ -0,0 +1,46 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+ #define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+ #include "../mem_barrier.h"
+ #include "core__types.h"
+
+ struct ncclLsaBarrierHandle {
+   ncclDevResourceHandle_t bufHandle;
+   int nBarriers;
+ };
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLsaBarrierSession_internal {
+   Coop coop;
+   ncclDevComm const& comm;
+   ncclTeam team;
+   ncclLsaBarrierHandle handle;
+   int index;
+   bool multimem;
+   ncclMultimemHandle mmHandle;
+   uint32_t epoch;
+
+   NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) {
+     uint32_t* state;
+     if (multimem) { // multicast
+       state = (uint32_t*)ncclGetResourceBufferMultimemPointer(comm, handle.bufHandle, mmHandle);
+     } else { // unicast
+       state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+     }
+     return state + 2*handle.nBarriers + index;
+   }
+
+   NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) {
+     uint32_t* state = (uint32_t*)ncclGetResourceBufferPeerPointer(comm, handle.bufHandle, team, owner);
+     return state + 3*handle.nBarriers + index*team.nRanks + peer;
+   }
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
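
Putting mcInbox, ucInbox, and the session constructor/destructor together gives the word layout of the barrier state buffer. The helper below is an inferred summary, not part of the package:

    // Inferred word layout of the LSA barrier buffer (one uint32_t per word):
    //   [0, nBarriers)               saved epochs, multimem path
    //   [nBarriers, 2*nBarriers)     saved epochs, unicast path
    //   [2*nBarriers, 3*nBarriers)   multimem arrival counters, one per barrier
    //   [3*nBarriers, 3*nBarriers + nBarriers*nRanks)
    //                                unicast inboxes, indexed by (barrier, sending rank)
    static inline size_t lsaBarrierStateWords(int nBarriers, int nRanks) {
      return (size_t)nBarriers*(3 + nRanks);
    }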
--- /dev/null
+++ b/nccl_device/impl/ptr__funcs.h
@@ -0,0 +1,157 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_PTR__FUNCS_H_
+ #define _NCCL_DEVICE_PTR__FUNCS_H_
+ #include "ptr__types.h"
+ #include "core__funcs.h"
+ #include "comm__types.h"
+
+ #if __cplusplus
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr<T>::ncclSymPtr(ncclWindow_t window, size_t offset):
+   window(window), offset(offset) {
+ }
+
+ template<typename T>
+ template<typename U>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>::operator ncclSymPtr<U>() const {
+   return {window, offset};
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::localPtr() const {
+   return (T*)ncclGetLocalPointer(window, offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::lsaPtr(int peer) const {
+   return (T*)ncclGetLsaPointer(window, offset, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::peerPtr(int peer) const {
+   return (T*)ncclGetPeerPointer(window, offset, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::peerPtr(ncclTeam team, int peer) const {
+   return (T*)ncclGetPeerPointer(window, offset, team, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::multimemPtr(ncclMultimemHandle mmHandle) const {
+   return (T*)ncclGetMultimemPointer(window, offset, mmHandle);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::lsaMultimemPtr(ncclDevComm const& comm) const {
+   return (T*)ncclGetLsaMultimemPointer(window, offset, comm);
+ }
+ #endif
+
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator+(ncclSymPtr<T> p, Int d) {
+   return p += d;
+ }
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator-(ncclSymPtr<T> p, Int d) {
+   return p -= d;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return reinterpret_cast<T*>(a.offset) - reinterpret_cast<T*>(b.offset);
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator==(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return a.window == b.window && a.offset == b.offset;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator!=(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return a.window != b.window || a.offset != b.offset;
+ }
+
+ #endif // __cplusplus
+ #endif // _NCCL_DEVICE_PTR__FUNCS_H_
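
A short illustration of the arithmetic above (the demo function itself is hypothetical; it assumes device code with a valid ncclWindow_t in hand): offset is a byte offset into the window, and the reinterpret_cast round-trip makes `p += d` advance it by d*sizeof(T) bytes, so an ncclSymPtr behaves like a raw T* that can be rebased onto any rank's mapping of the same window.

    __device__ void symPtrDemo(ncclWindow_t win) {
      ncclSymPtr<float> p(win, /*offset=*/0); // offset is in bytes
      p += 4;                                 // offset == 16, i.e. 4 * sizeof(float)
      ptrdiff_t d = (p + 4) - p;              // element difference: 4
      float* mine  = p.localPtr();            // this rank's mapping of the window
      float* peer1 = p.lsaPtr(1);             // peer 1's mapping within the LSA team
      (void)d; (void)mine; (void)peer1;
    }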
--- /dev/null
+++ b/nccl_device/impl/ptr__types.h
@@ -0,0 +1,11 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_PTR__TYPES_H_
+ #define _NCCL_DEVICE_PTR__TYPES_H_
+ #include "../ptr.h"
+ #include "core__types.h"
+ #endif // _NCCL_DEVICE_PTR__TYPES_H_
--- /dev/null
+++ b/nccl_device/ll_a2a.h
@@ -0,0 +1,53 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A_H_
+ #define _NCCL_DEVICE_LL_A2A_H_
+ #include "impl/core__types.h"
+
+ struct ncclLLA2AHandle;
+
+ NCCL_EXTERN_C __host__ int ncclLLA2ACalcSlots(int maxElts, int maxEltSize);
+
+ NCCL_EXTERN_C __host__ ncclResult_t ncclLLA2ACreateRequirement(int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLLA2ASession_internal;
+
+ template<typename Coop>
+ struct ncclLLA2ASession: ncclLLA2ASession_internal<Coop> {
+   NCCL_DEVICE_INLINE ncclLLA2ASession(Coop, ncclDevComm const&, ncclTeam, ncclLLA2AHandle, uint32_t block, int maxElts, bool multimem=false, ncclMultimemHandle mmHandle={});
+
+   NCCL_DEVICE_INLINE ~ncclLLA2ASession();
+
+   ncclLLA2ASession(ncclLLA2ASession const&) = delete; // Sessions are not copyable
+
+   template<typename T>
+   NCCL_DEVICE_INLINE void send(int peer, int slot, T data);
+
+   template<typename T>
+   NCCL_DEVICE_INLINE void bcast(int slot, T data);
+
+   template<typename T>
+   NCCL_DEVICE_INLINE T recv(int slot);
+
+   template<int MinEltCount, int MaxEltCount, typename T>
+   NCCL_DEVICE_INLINE void recvUnrolled(int eltStart, int eltCount, int eltStride, T(&vals)[MaxEltCount]);
+
+   template<int Unroll, typename Elt, typename EltToAcc, typename Reduce>
+   NCCL_DEVICE_INLINE auto recvReduce(int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce red)
+     -> decltype(eltToAcc(nccl::utility::declval<Elt>()));
+
+   // End an all-to-all epoch. For every peer in the team you must have done both
+   // of the following, each of which can be performed by any thread in coop:
+   // 1. Targeted that peer with at least one send().
+   // 2. Received from a slot targeted by that peer.
+   NCCL_DEVICE_INLINE void endEpoch(Coop);
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A_H_
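
For the host side, a minimal sizing sketch (hypothetical wiring: only the two declarations above are real; the 8-byte element size is an arbitrary choice for this example, and how the resulting requirement is consumed depends on the communicator-creation API):

    static ncclResult_t requestLLA2A(int nCtas, int nRanks,
                                     ncclLLA2AHandle_t* outHandle,
                                     ncclDevResourceRequirements_t* outReq) {
      // Enough slots for every rank to send one 8-byte element per epoch.
      int nSlots = ncclLLA2ACalcSlots(/*maxElts=*/nRanks, /*maxEltSize=*/8);
      return ncclLLA2ACreateRequirement(/*nBlocks=*/nCtas, nSlots, outHandle, outReq);
    }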
--- /dev/null
+++ b/nccl_device/mem_barrier.h
@@ -0,0 +1,35 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER_H_
+ #define _NCCL_DEVICE_MEM_BARRIER_H_
+ #include "impl/core__types.h"
+
+ struct ncclLsaBarrierHandle;
+
+ NCCL_EXTERN_C __host__ ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLsaBarrierSession_internal;
+
+ template<typename Coop>
+ struct ncclLsaBarrierSession: ncclLsaBarrierSession_internal<Coop> {
+   NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeam, ncclLsaBarrierHandle, uint32_t index, bool multimem=false, ncclMultimemHandle mmHandle={});
+
+   NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeamTagLsa, uint32_t index, bool multimem=false);
+
+   NCCL_DEVICE_INLINE ~ncclLsaBarrierSession();
+
+   ncclLsaBarrierSession(ncclLsaBarrierSession const&) = delete; // Sessions are not copyable
+
+   NCCL_DEVICE_INLINE void arrive(Coop, cuda::memory_order);
+   NCCL_DEVICE_INLINE void wait(Coop, cuda::memory_order);
+   NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order);
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER_H_
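
And a matching host-side sketch for the barrier resource (hypothetical; it requests one barrier per CTA so each block in the device-side sketch above can synchronize independently):

    static ncclResult_t requestLsaBarriers(ncclTeam_t team, int nCtas,
                                           ncclLsaBarrierHandle_t* outHandle,
                                           ncclDevResourceRequirements_t* outReq) {
      return ncclLsaBarrierCreateRequirement(team, /*nBarriers=*/nCtas, outHandle, outReq);
    }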