nvidia-nccl-cu13 2.28.3-py3-none-manylinux_2_18_aarch64.whl → 2.28.7-py3-none-manylinux_2_18_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. nvidia/nccl/include/nccl.h +14 -2
  2. nvidia/nccl/include/nccl_device/barrier.h +47 -0
  3. nvidia/nccl/include/nccl_device/coop.h +66 -7
  4. nvidia/nccl/include/nccl_device/core.h +19 -0
  5. nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki.h +214 -0
  6. nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h +36 -0
  7. nvidia/nccl/include/nccl_device/gin/gin_device_api.h +18 -0
  8. nvidia/nccl/include/nccl_device/gin/gin_device_common.h +120 -0
  9. nvidia/nccl/include/nccl_device/gin/gin_device_host_common.h +24 -0
  10. nvidia/nccl/include/nccl_device/gin/proxy/gin_proxy.h +235 -0
  11. nvidia/nccl/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h +125 -0
  12. nvidia/nccl/include/nccl_device/gin.h +207 -0
  13. nvidia/nccl/include/nccl_device/gin_barrier.h +37 -0
  14. nvidia/nccl/include/nccl_device/impl/barrier__funcs.h +94 -0
  15. nvidia/nccl/include/nccl_device/impl/barrier__types.h +29 -0
  16. nvidia/nccl/include/nccl_device/impl/comm__types.h +12 -1
  17. nvidia/nccl/include/nccl_device/impl/core__funcs.h +32 -0
  18. nvidia/nccl/include/nccl_device/impl/core__types.h +3 -1
  19. nvidia/nccl/include/nccl_device/impl/gin__funcs.h +407 -0
  20. nvidia/nccl/include/nccl_device/impl/gin__types.h +10 -0
  21. nvidia/nccl/include/nccl_device/impl/gin_barrier__funcs.h +66 -0
  22. nvidia/nccl/include/nccl_device/impl/gin_barrier__types.h +31 -0
  23. nvidia/nccl/include/nccl_device/impl/{mem_barrier__funcs.h → lsa_barrier__funcs.h} +1 -1
  24. nvidia/nccl/include/nccl_device/impl/{mem_barrier__types.h → lsa_barrier__types.h} +1 -1
  25. nvidia/nccl/include/nccl_device/ll_a2a.h +2 -2
  26. nvidia/nccl/include/nccl_device/net_device.h +38 -0
  27. nvidia/nccl/include/nccl_device/utility.h +62 -12
  28. nvidia/nccl/include/nccl_device.h +5 -5
  29. nvidia/nccl/lib/libnccl.so.2 +0 -0
  30. {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/METADATA +1 -1
  31. nvidia_nccl_cu13-2.28.7.dist-info/RECORD +42 -0
  32. nvidia_nccl_cu13-2.28.3.dist-info/RECORD +0 -25
  33. nvidia/nccl/include/nccl_device/{mem_barrier.h → lsa_barrier.h} +0 -0
  34. {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/WHEEL +0 -0
  35. {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/licenses/License.txt +0 -0
  36. {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/top_level.txt +0 -0
nvidia/nccl/include/nccl.h
@@ -18,10 +18,10 @@
 
 #define NCCL_MAJOR 2
 #define NCCL_MINOR 28
-#define NCCL_PATCH 3
+#define NCCL_PATCH 7
 #define NCCL_SUFFIX ""
 
-#define NCCL_VERSION_CODE 22803
+#define NCCL_VERSION_CODE 22807
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
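
The jump from 22803 to 22807 follows from the NCCL_VERSION macro above: with minor version 28 (> 8), the wide encoding applies. A quick standalone check in C, using the macro exactly as defined in the header:

#include <assert.h>
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
int main(void) {
  assert(NCCL_VERSION(2, 28, 7) == 22807); /* 2*10000 + 28*100 + 7 */
  return 0;
}
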
@@ -69,6 +69,9 @@ typedef enum { ncclSuccess = 0,
 #define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
 #define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
 
+/* ncclCommRevoke flags */
+#define NCCL_REVOKE_DEFAULT 0x00 /* reserved for future use; must be 0 */
+
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
 typedef struct ncclConfig_v22800 {
@@ -194,6 +197,15 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm);
 ncclResult_t ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 
+/* Revoke a communicator. ncclCommRevoke stops all in-flight operations
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is quiescent; then, management operations (destroy, split,
+ * shrink) can proceed safely. Calling ncclCommFinalize after revoke is invalid.
+ * Additionally, resource sharing via splitShare/shrinkShare is disabled while revoked.
+ * revokeFlags must be NCCL_REVOKE_DEFAULT (0). */
+ncclResult_t ncclCommRevoke(ncclComm_t comm, int revokeFlags);
+ncclResult_t pncclCommRevoke(ncclComm_t comm, int revokeFlags);
+
 /* Creates one or more communicators from an existing one.
  * Ranks with the same color will end up in the same communicator.
  * Within the new communicator, key will be used to order ranks.
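
The comment above implies a drain-then-destroy pattern. A minimal host-side sketch, assuming the usual ncclCommGetAsyncError polling loop is how quiescence is observed (the header does not spell this out), with error handling elided:

ncclResult_t state = ncclInProgress;
ncclCommRevoke(comm, NCCL_REVOKE_DEFAULT);   /* stop in-flight operations */
do {
  ncclCommGetAsyncError(comm, &state);       /* poll until the comm is quiescent */
} while (state == ncclInProgress);
ncclCommDestroy(comm);                       /* destroy/split/shrink are safe now; ncclCommFinalize is not */
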
nvidia/nccl/include/nccl_device/barrier.h (new file)
@@ -0,0 +1,47 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_BARRIER_H_
+#define _NCCL_DEVICE_BARRIER_H_
+#include "impl/core__types.h"
+#include "impl/lsa_barrier__types.h"
+#include "impl/gin_barrier__types.h"
+
+#if __CUDACC__
+template<typename Coop>
+struct ncclBarrierSession_internal;
+
+template<typename Coop>
+struct ncclBarrierSession: ncclBarrierSession_internal<Coop> {
+  // Full featured constructor:
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin,
+    ncclLsaBarrierHandle innerBarHandle,
+    ncclGinBarrierHandle outerBarHandle,
+    uint32_t index,
+    bool multimem=false, ncclMultimemHandle innerMmHandle={}
+  );
+  // Convenience constructors for baked in teams:
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagRail, ncclGin, uint32_t index
+  );
+
+  ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable
+
+  NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>& lsaBarrier();
+  NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>& ginBarrier();
+
+  NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel);
+};
+#endif
+
+#endif // _NCCL_DEVICE_BARRIER_H_
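
A hedged device-side sketch of the world-team convenience constructor; the ncclGin handle, barrier index, fence-level value, and the assumption that the tag types are default-constructible are all illustrative stand-ins for what host-side setup (see gin.h and core.h) actually provides:

// Sketch: CTA-wide barrier session over the world team.
__global__ void worldSync(ncclGin gin) {
  ncclCoopCta cta;  // the whole CTA cooperates
  ncclBarrierSession<ncclCoopCta> bar(cta, ncclTeamTagWorld{}, gin, /*index=*/0);
  // ... produce data that peers must observe ...
  bar.sync(cta, cuda::memory_order_acq_rel, ncclGinFenceLevel{} /* illustrative value */);
}
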
nvidia/nccl/include/nccl_device/coop.h
@@ -30,7 +30,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
     return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2);
   }
   NCCL_DEVICE_INLINE void sync() {
-    __syncwarp(laneMask());
+    if (nThreadsPow2 > 1) __syncwarp(laneMask());
   }
 };
 #endif
@@ -43,7 +43,7 @@ typedef ncclCoopTile<32> ncclCoopWarp;
 #if __CUDACC__
 struct ncclCoopLanes { // Some lanes of this warp.
   uint32_t lmask;
-
+
   NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {}
 
   NCCL_DEVICE_INLINE int thread_rank() const {
@@ -71,7 +71,7 @@ struct ncclCoopWarpSpan {
   NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
     warp0(warp0), nWarps(nWarps), id(id) {
   }
-
+
   NCCL_DEVICE_INLINE int thread_rank() const {
     return threadIdx.x - 32*warp0;
   }
@@ -100,16 +100,16 @@ struct ncclCoopCta {
 
 #if __CUDACC__
 template<int nThreadsPow2>
-NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile<nThreadsPow2> coop) {
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopTile<nThreadsPow2> coop) {
   return coop.laneMask();
 }
-NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) {
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopLanes coop) {
   return coop.lmask;
 }
-NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) {
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopWarpSpan coop) {
   return -1u;
 }
-NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) {
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopCta coop) {
   return -1u;
 }
 #endif
@@ -126,6 +126,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa
 NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
 #endif
 
+#if __CUDACC__
+template<int nThreads>
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile<nThreads>) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; }
+#endif
+
 #if __CUDACC__
 // Pick threads of our warp that are safe to use collectively.
 NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
@@ -149,4 +157,55 @@ NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThread
 }
 #endif
 
+#if __CUDACC__
+template<int nThreads, typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile<nThreads>, T value, int root, bool entrySync=true) {
+  constexpr int n = (sizeof(T)+4-1)/4;
+  union { uint32_t u[n]; T v; };
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads);
+  return v;
+}
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) {
+  uint32_t m = coop.lmask;
+  uint32_t r = root == 0 ? __ffs(m)-1 : __fns(m, 0, 1+root);
+  constexpr int n = (sizeof(T)+4-1)/4;
+  union { uint32_t u[n]; T v; };
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r);
+  return v;
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() {
+  __shared__ ulong2 stash[15];
+  return stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required");
+  if (entrySync) coop.sync();
+  if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value;
+  coop.sync();
+  return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id];
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() {
+  __shared__ ulong2 stash;
+  return &stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required");
+  if (entrySync) coop.sync();
+  if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value;
+  coop.sync();
+  return *(T*)ncclCoopBcast_Cta_stash();
+}
+#endif
+
 #endif
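
A hedged sketch of how these overloads compose generically; the enclosing kernel is assumed. Note the split that the ncclCoopWithinWarp trait above encodes: warp-contained groups broadcast via __shfl_sync registers, while ncclCoopWarpSpan and ncclCoopCta round-trip through the __shared__ stash.

// Sketch: every member of a cooperating group adopts thread 0's pointer.
template<typename Coop>
__device__ void* adoptRootBuffer(Coop coop, void* mine) {
  return ncclCoopBcast(coop, mine, /*root=*/0);  // overload chosen by Coop type
}
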
nvidia/nccl/include/nccl_device/core.h
@@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t;
 typedef uint32_t ncclDevResourceHandle;
 typedef ncclDevResourceHandle ncclDevResourceHandle_t;
 
+typedef uint32_t ncclGinSignal_t;
+typedef uint32_t ncclGinCounter_t;
+
 struct ncclLsaBarrierHandle;
 typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
 
+struct ncclGinBarrierHandle;
+typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t;
+
 struct ncclLLA2AHandle;
 typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
 
@@ -59,13 +65,26 @@ struct ncclDevCommRequirements {
 
   bool lsaMultimem; // Enable multimem on lsa team
 
+  int barrierCount;
   int lsaBarrierCount;
+  int railGinBarrierCount;
+
+  int lsaLLA2ABlockCount, lsaLLA2ASlotCount;
+
+  bool ginForceEnable;
+  int ginContextCount; // This is a hint, the actual context count in the devcomm may not match.
+  int ginSignalCount; // Guaranteed to start at id=0
+  int ginCounterCount; // Guaranteed to start at id=0
 };
 
 struct ncclDevResourceRequirements {
   ncclDevResourceRequirements_t* next;
   size_t bufferSize, bufferAlign;
   ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate.
+  int ginSignalCount;
+  int ginCounterCount;
+  ncclGinSignal_t* outGinSignalStart;
+  ncclGinCounter_t* outGinCounterStart;
 };
 
 struct ncclTeamRequirements {
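
A hedged host-side sketch of requesting the new GIN resources; the field names are from the header above, while the creation call these requirements feed into is assumed rather than shown:

ncclDevCommRequirements reqs = {};   // zero-init: all counts default to 0
reqs.barrierCount        = 1;        // combined barrier sessions
reqs.railGinBarrierCount = 1;        // GIN barriers on the rail team
reqs.ginSignalCount      = 8;        // signal ids 0..7 (guaranteed to start at 0)
reqs.ginCounterCount     = 4;        // counter ids 0..3
/* ... hand reqs to the devcomm creation API (e.g. ncclDevCommCreate) ... */
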
nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki.h (new file)
@@ -0,0 +1,214 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_H_
+#define _NCCL_DEVICE_GIN_GDAKI_H_
+
+#ifndef DOCA_VERBS_USE_CUDA_WRAPPER
+#define DOCA_VERBS_USE_CUDA_WRAPPER
+#endif
+
+#ifndef DOCA_VERBS_USE_NET_WRAPPER
+#define DOCA_VERBS_USE_NET_WRAPPER
+#endif
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1
+#endif
+
+#include "../gin_device_common.h"
+#include "gin_gdaki_device_host_common.h"
+#include "doca_gpunetio/doca_gpunetio_device.h"
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#include <stdio.h>
+#endif
+
+template <>
+struct ncclGinApi_Put<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;
+      doca_gpu_dev_verbs_qp* companion_qp;
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+      ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin;
+
+      doca_gpu_dev_verbs_addr raddr, laddr;
+      if (hasWins) {
+        raddr.addr = dstOff;
+        raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);
+        laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey);
+      }
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;
+        sig_raddr.addr = sizeof(uint64_t) * signalId;
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      doca_gpu_dev_verbs_addr counter_raddr, counter_laddr;
+      if (hasCounter) {
+        companion_qp = loadConst(&gdaki->companion_gdqp) + peer;
+        counter_raddr.addr = sizeof(uint64_t) * counterId;
+        counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank);
+        counter_laddr.addr = 0;
+        counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasWins) {
+        if (hasSignal && hasCounter) {
+          doca_gpu_dev_verbs_put_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+              qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr,
+              counter_laddr, 1);
+        } else if (hasSignal) {
+          doca_gpu_dev_verbs_put_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+              qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg);
+        } else if (hasCounter) {
+          doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr,
+                                         counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes);
+        }
+      } else {
+        if (hasCounter) {
+          doca_gpu_dev_verbs_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+              qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+              qp, sig_raddr, sig_laddr, signalOpArg);
+        }
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);
+      if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp);
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_PutValue<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcVal, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+
+      doca_gpu_dev_verbs_addr raddr;
+      raddr.addr = dstOff;
+      raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;
+        sig_raddr.addr = sizeof(uint64_t) * signalId;
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasSignal) {
+        doca_gpu_dev_verbs_p_signal<T, DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg);
+      } else {
+        doca_gpu_dev_verbs_p(qp, raddr, srcVal);
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_ResetCounter<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    loadConst(&gdaki->counters_table.buffer)[counterId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_ResetSignal<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    loadConst(&gdaki->signals_table.buffer)[signalId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_GetCounterPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    return loadConst(&gdaki->counters_table.buffer) + counterId;
+  }
+};
+
+template <>
+struct ncclGinApi_GetSignalPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    return loadConst(&gdaki->signals_table.buffer) + signalId;
+  }
+};
+
+template <>
+struct ncclGinApi_Flush<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    doca_gpu_dev_verbs_qp* qps = loadConst(&gdaki->gdqp);
+    #pragma unroll 1
+    for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size()) {
+      doca_gpu_dev_verbs_wait(qps + peer);
+    }
+  }
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */
nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h (new file)
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+
+#include <linux/types.h>
+
+#define NCCL_GIN_GDAKI_VERSION 100
+
+template <typename T>
+struct ncclGinGdakiGlobalGPUBufferTable {
+  T *buffer;
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+struct ncclGinGdakiGPUContext {
+  struct doca_gpu_dev_verbs_qp *gdqp;
+  struct doca_gpu_dev_verbs_qp *companion_gdqp;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> counters_table;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> signals_table;
+
+  // Local buffer we don't consume but is required for some operations.
+  __be32 sink_buffer_lkey;
+};
+
+struct ncclGinGdakiMemHandle {
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */
nvidia/nccl/include/nccl_device/gin/gin_device_api.h (new file)
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef _NCCL_GIN_DEVICE_API_H_
+#define _NCCL_GIN_DEVICE_API_H_
+
+#include "gin_device_common.h"
+
+#if NCCL_GIN_GDAKI_ENABLE
+#include "gdaki/gin_gdaki.h"
+#endif
+#if NCCL_GIN_PROXY_ENABLE
+#include "proxy/gin_proxy.h"
+#endif
+
+#endif
nvidia/nccl/include/nccl_device/gin/gin_device_common.h (new file)
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_COMMON_H_
+#define _NCCL_GIN_DEVICE_COMMON_H_
+
+#include "../net_device.h"
+#include "../utility.h"
+#include "gin_device_host_common.h"
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900
+#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+#ifndef NCCL_GIN_PROXY_ENABLE
+#define NCCL_GIN_PROXY_ENABLE 1
+#endif
+
+#ifndef NCCL_GIN_GDAKI_ENABLE
+#if CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700
+#define NCCL_GIN_GDAKI_ENABLE 1
+#else
+#define NCCL_GIN_GDAKI_ENABLE 0
+#endif
+#endif
+
+#define NCCL_GIN_BACKEND_MASK_ALL \
+  (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \
+   ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI)
+
+struct ncclGinCtx {
+  ncclNetDeviceType backend;
+  int rank;
+  int nRanks;
+  void* handle;
+};
+
+template <unsigned backendMask>
+struct ncclGinCtx_M : ncclGinCtx {};
+
+struct ncclGinDescriptorSmem {
+  alignas(16) char space[64];
+};
+
+#if __CUDACC__
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Put {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_PutValue {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcData, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetSignalPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinSignal_t signalId);
+};
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetCounterPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetSignal {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetCounter {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Flush {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord);
+};
+#endif
+
+#if __CUDACC__
+template <template <ncclNetDeviceType> typename ApiFn, unsigned beMask, typename... Arg>
+NCCL_DEVICE_INLINE static decltype(auto) ncclGinCall(ncclGinCtx_M<beMask> ctx, Arg&&... arg) {
+  bool singleton = (beMask & (beMask - 1)) == 0; // Only one bit set
+  switch (singleton ? __popc(beMask - 1) : (int)ctx.backend) {
+#if NCCL_GIN_PROXY_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_PROXY:
+      if (!(1 & (beMask >> (int)NCCL_NET_DEVICE_GIN_PROXY))) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_PROXY>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+#if NCCL_GIN_GDAKI_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_GDAKI:
+      if (!(1 & (beMask >> (int)NCCL_NET_DEVICE_GIN_GDAKI))) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_GDAKI>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+    default:
+      __builtin_unreachable();
+  }
+}
+#endif
+
+#endif
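
A hedged sketch of what the mask machinery buys: when beMask names a single backend, the switch key is a compile-time constant (for a power-of-two mask, __popc(beMask - 1) recovers the bit index), so ncclGinCall collapses to a direct call into that backend's specialization. The call site below is illustrative, not from the header:

// Compile-time single-backend context: dispatch inlines to the GDAKI
// specialization without ever reading ctx.backend at runtime.
using GdakiCtx = ncclGinCtx_M<1u << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI>;

__device__ void resetCounterZero(GdakiCtx ctx) {
  ncclGinCall<ncclGinApi_ResetCounter>(ctx, ncclGinCounter_t(0));
}
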
nvidia/nccl/include/nccl_device/gin/gin_device_host_common.h (new file)
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_HOST_COMMON_H_
+#define _NCCL_GIN_DEVICE_HOST_COMMON_H_
+
+#include <cuda.h>
+#include "../net_device.h"
+#include "../core.h" // for ncclGin{Signal|Counter}_t
+
+#define NCCL_GIN_MAX_CONTEXTS 4
+
+typedef struct ncclGinGpuCtx *ncclGinGpuCtx_t;
+typedef void *ncclGinWindow_t;
+
+typedef enum ncclGinSignalOp_t {
+  ncclGinSignalInc = 0,
+  ncclGinSignalAdd,
+} ncclGinSignalOp_t;
+
+#endif