nvidia-nccl-cu13 2.28.3-py3-none-manylinux_2_18_x86_64.whl → 2.28.7-py3-none-manylinux_2_18_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nvidia/nccl/include/nccl.h +14 -2
- nvidia/nccl/include/nccl_device/barrier.h +47 -0
- nvidia/nccl/include/nccl_device/coop.h +66 -7
- nvidia/nccl/include/nccl_device/core.h +19 -0
- nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki.h +214 -0
- nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h +36 -0
- nvidia/nccl/include/nccl_device/gin/gin_device_api.h +18 -0
- nvidia/nccl/include/nccl_device/gin/gin_device_common.h +120 -0
- nvidia/nccl/include/nccl_device/gin/gin_device_host_common.h +24 -0
- nvidia/nccl/include/nccl_device/gin/proxy/gin_proxy.h +235 -0
- nvidia/nccl/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h +125 -0
- nvidia/nccl/include/nccl_device/gin.h +207 -0
- nvidia/nccl/include/nccl_device/gin_barrier.h +37 -0
- nvidia/nccl/include/nccl_device/impl/barrier__funcs.h +94 -0
- nvidia/nccl/include/nccl_device/impl/barrier__types.h +29 -0
- nvidia/nccl/include/nccl_device/impl/comm__types.h +12 -1
- nvidia/nccl/include/nccl_device/impl/core__funcs.h +32 -0
- nvidia/nccl/include/nccl_device/impl/core__types.h +3 -1
- nvidia/nccl/include/nccl_device/impl/gin__funcs.h +407 -0
- nvidia/nccl/include/nccl_device/impl/gin__types.h +10 -0
- nvidia/nccl/include/nccl_device/impl/gin_barrier__funcs.h +66 -0
- nvidia/nccl/include/nccl_device/impl/gin_barrier__types.h +31 -0
- nvidia/nccl/include/nccl_device/impl/{mem_barrier__funcs.h → lsa_barrier__funcs.h} +1 -1
- nvidia/nccl/include/nccl_device/impl/{mem_barrier__types.h → lsa_barrier__types.h} +1 -1
- nvidia/nccl/include/nccl_device/ll_a2a.h +2 -2
- nvidia/nccl/include/nccl_device/net_device.h +38 -0
- nvidia/nccl/include/nccl_device/utility.h +62 -12
- nvidia/nccl/include/nccl_device.h +5 -5
- nvidia/nccl/lib/libnccl.so.2 +0 -0
- {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/METADATA +1 -1
- nvidia_nccl_cu13-2.28.7.dist-info/RECORD +42 -0
- nvidia_nccl_cu13-2.28.3.dist-info/RECORD +0 -25
- /nvidia/nccl/include/nccl_device/{mem_barrier.h → lsa_barrier.h} +0 -0
- {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/WHEEL +0 -0
- {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/licenses/License.txt +0 -0
- {nvidia_nccl_cu13-2.28.3.dist-info → nvidia_nccl_cu13-2.28.7.dist-info}/top_level.txt +0 -0
nvidia/nccl/include/nccl.h
CHANGED
@@ -18,10 +18,10 @@
 
 #define NCCL_MAJOR 2
 #define NCCL_MINOR 28
-#define NCCL_PATCH 3
+#define NCCL_PATCH 7
 #define NCCL_SUFFIX ""
 
-#define NCCL_VERSION_CODE 22803
+#define NCCL_VERSION_CODE 22807
 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
 
 #ifdef __cplusplus
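As a sanity check on the bumped constants: NCCL_VERSION(2,28,7) takes the second branch of the macro (the (X) <= 2 && (Y) <= 8 encoding only applies through 2.8), giving 2*10000 + 28*100 + 7 = 22807, which matches the new NCCL_VERSION_CODE.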
@@ -69,6 +69,9 @@ typedef enum { ncclSuccess = 0,
 #define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
 #define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */
 
+/* ncclCommRevoke flags */
+#define NCCL_REVOKE_DEFAULT 0x00 /* reserved for future use; must be 0 */
+
 /* Communicator configuration. Users can assign value to attributes to specify the
  * behavior of a communicator. */
 typedef struct ncclConfig_v22800 {
@@ -194,6 +197,15 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm);
 ncclResult_t ncclCommAbort(ncclComm_t comm);
 ncclResult_t pncclCommAbort(ncclComm_t comm);
 
+/* Revoke a communicator. ncclCommRevoke stops all in-flight operations
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is quiescent; then, management operations (destroy, split,
+ * shrink) can proceed safely. Calling ncclCommFinalize after revoke is invalid.
+ * Additionally, resource sharing via splitShare/shrinkShare is disabled while revoked.
+ * revokeFlags must be NCCL_REVOKE_DEFAULT (0). */
+ncclResult_t ncclCommRevoke(ncclComm_t comm, int revokeFlags);
+ncclResult_t pncclCommRevoke(ncclComm_t comm, int revokeFlags);
+
 /* Creates one or more communicators from an existing one.
  * Ranks with the same color will end up in the same communicator.
  * Within the new communicator, key will be used to order ranks.
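ncclCommRevoke is the headline host-API addition. Because it returns while teardown may still be in flight, a caller polls the communicator state before running management operations. A minimal host-side sketch (the helper name and polling loop are illustrative, not part of this diff; ncclCommGetAsyncError, ncclInProgress, and ncclCommDestroy are pre-existing NCCL API):

/* Hypothetical recovery path: revoke, wait for quiescence, then destroy. */
ncclResult_t revokeAndDestroy(ncclComm_t comm) {
  ncclResult_t res = ncclCommRevoke(comm, NCCL_REVOKE_DEFAULT);
  if (res != ncclSuccess) return res;

  /* Per the new header comment, the state stays ncclInProgress until all
   * in-flight operations have stopped; only then are destroy/split/shrink safe. */
  ncclResult_t state = ncclInProgress;
  while (state == ncclInProgress) {
    res = ncclCommGetAsyncError(comm, &state);
    if (res != ncclSuccess) return res;
  }

  /* Do NOT call ncclCommFinalize after a revoke; go straight to destroy. */
  return ncclCommDestroy(comm);
}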
nvidia/nccl/include/nccl_device/barrier.h
ADDED
@@ -0,0 +1,47 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_BARRIER_H_
+#define _NCCL_DEVICE_BARRIER_H_
+#include "impl/core__types.h"
+#include "impl/lsa_barrier__types.h"
+#include "impl/gin_barrier__types.h"
+
+#if __CUDACC__
+template<typename Coop>
+struct ncclBarrierSession_internal;
+
+template<typename Coop>
+struct ncclBarrierSession: ncclBarrierSession_internal<Coop> {
+  // Full featured constructor:
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin,
+    ncclLsaBarrierHandle innerBarHandle,
+    ncclGinBarrierHandle outerBarHandle,
+    uint32_t index,
+    bool multimem=false, ncclMultimemHandle innerMmHandle={}
+  );
+  // Convenience constructors for baked in teams:
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagRail, ncclGin, uint32_t index
+  );
+
+  ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable
+
+  NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>& lsaBarrier();
+  NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>& ginBarrier();
+
+  NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel);
+};
+#endif
+
+#endif // _NCCL_DEVICE_BARRIER_H_
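ncclBarrierSession is new in this release: it composes an intra-node (LSA) barrier with a network (GIN) barrier behind one sync() call. A device-side sketch of the world-team convenience constructor, assuming the tag types are default-constructible and that the kernel receives its ncclGin handle and fence level from the host (kernel name and index 0 are illustrative):

// Illustrative: one barrier session per CTA spanning all ranks (world team).
__global__ void barrierDemo(ncclGin gin, ncclGinFenceLevel fence) {
  ncclCoopCta cta;  // every thread of the CTA participates
  ncclBarrierSession<ncclCoopCta> bar(cta, ncclTeamTagWorld{}, gin, /*index=*/0);
  bar.sync(cta, cuda::memory_order_acq_rel, fence);
}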
nvidia/nccl/include/nccl_device/coop.h
CHANGED
@@ -30,7 +30,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
     return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2);
   }
   NCCL_DEVICE_INLINE void sync() {
-    __syncwarp(laneMask());
+    if (nThreadsPow2 > 1) __syncwarp(laneMask());
   }
 };
 #endif
@@ -43,7 +43,7 @@ typedef ncclCoopTile<32> ncclCoopWarp;
 #if __CUDACC__
 struct ncclCoopLanes { // Some lanes of this warp.
   uint32_t lmask;
-
+
   NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {}
 
   NCCL_DEVICE_INLINE int thread_rank() const {
@@ -71,7 +71,7 @@ struct ncclCoopWarpSpan {
   NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
     warp0(warp0), nWarps(nWarps), id(id) {
   }
-
+
   NCCL_DEVICE_INLINE int thread_rank() const {
     return threadIdx.x - 32*warp0;
   }
@@ -100,16 +100,16 @@ struct ncclCoopCta {
 
 #if __CUDACC__
 template<int nThreadsPow2>
-NCCL_DEVICE_INLINE uint32_t
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopTile<nThreadsPow2> coop) {
   return coop.laneMask();
 }
-NCCL_DEVICE_INLINE uint32_t
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopLanes coop) {
   return coop.lmask;
 }
-NCCL_DEVICE_INLINE uint32_t
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopWarpSpan coop) {
   return -1u;
 }
-NCCL_DEVICE_INLINE uint32_t
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopCta coop) {
   return -1u;
 }
 #endif
@@ -126,6 +126,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa
 NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
 #endif
 
+#if __CUDACC__
+template<int nThreads>
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile<nThreads>) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; }
+#endif
+
 #if __CUDACC__
 // Pick threads of our warp that are safe to use collectively.
 NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
@@ -149,4 +157,55 @@ NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThread
 }
 #endif
 
+#if __CUDACC__
+template<int nThreads, typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile<nThreads>, T value, int root, bool entrySync=true) {
+  constexpr int n = (sizeof(T)+4-1)/4;
+  union { uint32_t u[n]; T v; };
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads);
+  return v;
+}
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) {
+  uint32_t m = coop.lmask;
+  uint32_t r = root == 0 ? __ffs(m)-1 : __fns(m, 0, 1+root);
+  constexpr int n = (sizeof(T)+4-1)/4;
+  union { uint32_t u[n]; T v; };
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r);
+  return v;
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() {
+  __shared__ ulong2 stash[15];
+  return stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required");
+  if (entrySync) coop.sync();
+  if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value;
+  coop.sync();
+  return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id];
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() {
+  __shared__ ulong2 stash;
+  return &stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required");
+  if (entrySync) coop.sync();
+  if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value;
+  coop.sync();
+  return *(T*)ncclCoopBcast_Cta_stash();
+}
+#endif
+
 #endif
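Each ncclCoopBcast overload moves a value of at most 16 bytes (sizeof(ulong2), enforced by the static_asserts) from root to every thread of the coop: the warp-contained coops go through __shfl_sync one 32-bit word at a time, while ncclCoopWarpSpan and ncclCoopCta stage through a __shared__ stash with sync() on both sides. A minimal CTA-wide sketch (kernel and payload are illustrative):

// Illustrative: broadcast a 16-byte struct from thread 0 to the whole CTA.
struct Params { unsigned long long base; unsigned count, flags; };

__global__ void bcastDemo(Params* out) {
  ncclCoopCta cta;
  Params p{};                                   // only root's value matters
  if (cta.thread_rank() == 0) p = Params{ 0x1000ull, 42u, 1u };
  p = ncclCoopBcast(cta, p, /*root=*/0);        // entrySync=true by default
  if (cta.thread_rank() == 0) *out = p;
}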
nvidia/nccl/include/nccl_device/core.h
CHANGED
@@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t;
 typedef uint32_t ncclDevResourceHandle;
 typedef ncclDevResourceHandle ncclDevResourceHandle_t;
 
+typedef uint32_t ncclGinSignal_t;
+typedef uint32_t ncclGinCounter_t;
+
 struct ncclLsaBarrierHandle;
 typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
 
+struct ncclGinBarrierHandle;
+typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t;
+
 struct ncclLLA2AHandle;
 typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
 
@@ -59,13 +65,26 @@ struct ncclDevCommRequirements {
 
   bool lsaMultimem; // Enable multimem on lsa team
 
+  int barrierCount;
   int lsaBarrierCount;
+  int railGinBarrierCount;
+
+  int lsaLLA2ABlockCount, lsaLLA2ASlotCount;
+
+  bool ginForceEnable;
+  int ginContextCount; // This is a hint, the actual context count in the devcomm may not match.
+  int ginSignalCount; // Guaranteed to start at id=0
+  int ginCounterCount; // Guaranteed to start at id=0
 };
 
 struct ncclDevResourceRequirements {
   ncclDevResourceRequirements_t* next;
   size_t bufferSize, bufferAlign;
   ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate.
+  int ginSignalCount;
+  int ginCounterCount;
+  ncclGinSignal_t* outGinSignalStart;
+  ncclGinCounter_t* outGinCounterStart;
 };
 
 struct ncclTeamRequirements {
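For orientation, a host-side sketch of how the new GIN fields sit alongside the existing ones when filling out the requirements struct (the counts are arbitrary and the comments paraphrase the ones above; only the field names come from this diff):

ncclDevCommRequirements reqs = {};
reqs.lsaBarrierCount     = 8;   // pre-existing field
reqs.barrierCount        = 8;   // new in 2.28.7
reqs.railGinBarrierCount = 8;   // new: GIN barriers on the rail team
reqs.ginSignalCount      = 16;  // new: GIN signals, ids starting at 0
reqs.ginCounterCount     = 16;  // new: GIN counters, ids starting at 0
reqs.ginContextCount     = 2;   // hint only; actual context count may differ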
nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki.h
ADDED
@@ -0,0 +1,214 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_H_
+#define _NCCL_DEVICE_GIN_GDAKI_H_
+
+#ifndef DOCA_VERBS_USE_CUDA_WRAPPER
+#define DOCA_VERBS_USE_CUDA_WRAPPER
+#endif
+
+#ifndef DOCA_VERBS_USE_NET_WRAPPER
+#define DOCA_VERBS_USE_NET_WRAPPER
+#endif
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1
+#endif
+
+#include "../gin_device_common.h"
+#include "gin_gdaki_device_host_common.h"
+#include "doca_gpunetio/doca_gpunetio_device.h"
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#include <stdio.h>
+#endif
+
+template <>
+struct ncclGinApi_Put<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;
+      doca_gpu_dev_verbs_qp* companion_qp;
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+      ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin;
+
+      doca_gpu_dev_verbs_addr raddr, laddr;
+      if (hasWins) {
+        raddr.addr = dstOff;
+        raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);
+        laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey);
+      }
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;
+        sig_raddr.addr = sizeof(uint64_t) * signalId;
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      doca_gpu_dev_verbs_addr counter_raddr, counter_laddr;
+      if (hasCounter) {
+        companion_qp = loadConst(&gdaki->companion_gdqp) + peer;
+        counter_raddr.addr = sizeof(uint64_t) * counterId;
+        counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank);
+        counter_laddr.addr = 0;
+        counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasWins) {
+        if (hasSignal && hasCounter) {
+          doca_gpu_dev_verbs_put_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr,
+            counter_laddr, 1);
+        } else if (hasSignal) {
+          doca_gpu_dev_verbs_put_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg);
+        } else if (hasCounter) {
+          doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr,
+                                         counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes);
+        }
+      } else {
+        if (hasCounter) {
+          doca_gpu_dev_verbs_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, sig_raddr, sig_laddr, signalOpArg);
+        }
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);
+      if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp);
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_PutValue<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcVal, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+
+      doca_gpu_dev_verbs_addr raddr;
+      raddr.addr = dstOff;
+      raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;
+        sig_raddr.addr = sizeof(uint64_t) * signalId;
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasSignal) {
+        doca_gpu_dev_verbs_p_signal<T, DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+          qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg);
+      } else {
+        doca_gpu_dev_verbs_p(qp, raddr, srcVal);
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_ResetCounter<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    loadConst(&gdaki->counters_table.buffer)[counterId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_ResetSignal<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    loadConst(&gdaki->signals_table.buffer)[signalId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_GetCounterPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    return loadConst(&gdaki->counters_table.buffer) + counterId;
+  }
+};
+
+template <>
+struct ncclGinApi_GetSignalPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    return loadConst(&gdaki->signals_table.buffer) + signalId;
+  }
+};
+
+template <>
+struct ncclGinApi_Flush<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) {
+    using nccl::utility::loadConst;
+    ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle;
+    doca_gpu_dev_verbs_qp* qps = loadConst(&gdaki->gdqp);
+    #pragma unroll 1
+    for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size()) {
+      doca_gpu_dev_verbs_wait(qps + peer);
+    }
+  }
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */
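One subtlety in the Put paths above: libcu++ orders the thread_scope enumerators so that cuda::thread_scope_system is the lowest value (as the in-code comment notes), meaning wider scopes compare smaller. So required == thread_scope_system && given > required detects a caller whose prior writes were only released at a scope narrower than system, and the code compensates with an explicit system-scope release fence before the NIC is asked to read the data.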
nvidia/nccl/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
ADDED
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+
+#include <linux/types.h>
+
+#define NCCL_GIN_GDAKI_VERSION 100
+
+template <typename T>
+struct ncclGinGdakiGlobalGPUBufferTable {
+  T *buffer;
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+struct ncclGinGdakiGPUContext {
+  struct doca_gpu_dev_verbs_qp *gdqp;
+  struct doca_gpu_dev_verbs_qp *companion_gdqp;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> counters_table;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> signals_table;
+
+  // Local buffer we don't consume but is required for some operations.
+  __be32 sink_buffer_lkey;
+};
+
+struct ncclGinGdakiMemHandle {
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */
nvidia/nccl/include/nccl_device/gin/gin_device_api.h
ADDED
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef _NCCL_GIN_DEVICE_API_H_
+#define _NCCL_GIN_DEVICE_API_H_
+
+#include "gin_device_common.h"
+
+#if NCCL_GIN_GDAKI_ENABLE
+#include "gdaki/gin_gdaki.h"
+#endif
+#if NCCL_GIN_PROXY_ENABLE
+#include "proxy/gin_proxy.h"
+#endif
+
+#endif
nvidia/nccl/include/nccl_device/gin/gin_device_common.h
ADDED
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_COMMON_H_
+#define _NCCL_GIN_DEVICE_COMMON_H_
+
+#include "../net_device.h"
+#include "../utility.h"
+#include "gin_device_host_common.h"
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900
+#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+#ifndef NCCL_GIN_PROXY_ENABLE
+#define NCCL_GIN_PROXY_ENABLE 1
+#endif
+
+#ifndef NCCL_GIN_GDAKI_ENABLE
+#if CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700
+#define NCCL_GIN_GDAKI_ENABLE 1
+#else
+#define NCCL_GIN_GDAKI_ENABLE 0
+#endif
+#endif
+
+#define NCCL_GIN_BACKEND_MASK_ALL \
+  (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \
+   ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI)
+
+struct ncclGinCtx {
+  ncclNetDeviceType backend;
+  int rank;
+  int nRanks;
+  void* handle;
+};
+
+template <unsigned backendMask>
+struct ncclGinCtx_M : ncclGinCtx {};
+
+struct ncclGinDescriptorSmem {
+  alignas(16) char space[64];
+};
+
+#if __CUDACC__
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Put {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_PutValue {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcData, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetSignalPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinSignal_t signalId);
+};
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetCounterPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetSignal {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetCounter {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Flush {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord);
+};
+#endif
+
+#if __CUDACC__
+template <template <ncclNetDeviceType> typename ApiFn, unsigned beMask, typename... Arg>
+NCCL_DEVICE_INLINE static decltype(auto) ncclGinCall(ncclGinCtx_M<beMask> ctx, Arg&&... arg) {
+  bool singleton = (beMask & (beMask - 1)) == 0; // Only one bit set
+  switch (singleton ? __popc(beMask - 1) : (int)ctx.backend) {
+#if NCCL_GIN_PROXY_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_PROXY:
+      if (!(1 & (beMask >> (int)NCCL_NET_DEVICE_GIN_PROXY))) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_PROXY>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+#if NCCL_GIN_GDAKI_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_GDAKI:
+      if (!(1 & (beMask >> (int)NCCL_NET_DEVICE_GIN_GDAKI))) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_GDAKI>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+    default:
+      __builtin_unreachable();
+  }
+}
+#endif
+
+#endif
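A note on the ncclGinCall dispatcher above: when beMask is a power of two (the singleton test (beMask & (beMask - 1)) == 0), beMask - 1 has set bits exactly below the single set bit, so __popc(beMask - 1) computes that bit's index, e.g. beMask = 0b100 gives __popc(0b011) = 2. The switch selector is then a compile-time constant and, together with the __builtin_unreachable() guards, the whole dispatch collapses to a direct call to the only compiled-in backend.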
nvidia/nccl/include/nccl_device/gin/gin_device_host_common.h
ADDED
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_HOST_COMMON_H_
+#define _NCCL_GIN_DEVICE_HOST_COMMON_H_
+
+#include <cuda.h>
+#include "../net_device.h"
+#include "../core.h" // for ncclGin{Signal|Counter}_t
+
+#define NCCL_GIN_MAX_CONTEXTS 4
+
+typedef struct ncclGinGpuCtx *ncclGinGpuCtx_t;
+typedef void *ncclGinWindow_t;
+
+typedef enum ncclGinSignalOp_t {
+  ncclGinSignalInc = 0,
+  ncclGinSignalAdd,
+} ncclGinSignalOp_t;
+
+#endif