nvidia-nccl-cu13 2.28.3 (py3-none-manylinux_2_18_aarch64.whl)
This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- nvidia/nccl/include/nccl.h +571 -0
- nvidia/nccl/include/nccl_device/comm.h +10 -0
- nvidia/nccl/include/nccl_device/coop.h +152 -0
- nvidia/nccl/include/nccl_device/core.h +150 -0
- nvidia/nccl/include/nccl_device/impl/comm__funcs.h +10 -0
- nvidia/nccl/include/nccl_device/impl/comm__types.h +40 -0
- nvidia/nccl/include/nccl_device/impl/core__funcs.h +210 -0
- nvidia/nccl/include/nccl_device/impl/core__types.h +26 -0
- nvidia/nccl/include/nccl_device/impl/ll_a2a__funcs.h +229 -0
- nvidia/nccl/include/nccl_device/impl/ll_a2a__types.h +37 -0
- nvidia/nccl/include/nccl_device/impl/mem_barrier__funcs.h +126 -0
- nvidia/nccl/include/nccl_device/impl/mem_barrier__types.h +46 -0
- nvidia/nccl/include/nccl_device/impl/ptr__funcs.h +157 -0
- nvidia/nccl/include/nccl_device/impl/ptr__types.h +11 -0
- nvidia/nccl/include/nccl_device/ll_a2a.h +53 -0
- nvidia/nccl/include/nccl_device/mem_barrier.h +35 -0
- nvidia/nccl/include/nccl_device/ptr.h +61 -0
- nvidia/nccl/include/nccl_device/utility.h +352 -0
- nvidia/nccl/include/nccl_device.h +15 -0
- nvidia/nccl/lib/libnccl.so.2 +0 -0
- nvidia_nccl_cu13-2.28.3.dist-info/METADATA +45 -0
- nvidia_nccl_cu13-2.28.3.dist-info/RECORD +25 -0
- nvidia_nccl_cu13-2.28.3.dist-info/WHEEL +5 -0
- nvidia_nccl_cu13-2.28.3.dist-info/licenses/License.txt +39 -0
- nvidia_nccl_cu13-2.28.3.dist-info/top_level.txt +1 -0

nvidia/nccl/include/nccl_device/coop.h
@@ -0,0 +1,152 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_COOP_H_
+#define _NCCL_DEVICE_COOP_H_
+#include "utility.h"
+
+// ncclCoop[Foo]: NCCL's versions of CUDA's Cooperative Groups. They conform
+// to just this subset of the CUDA API:
+//   int Coop::thread_rank();
+//   int Coop::size();
+//   int Coop::num_threads(); // same as size()
+//   void Coop::sync();
+
+#if __CUDACC__
+template<int nThreadsPow2>
+struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
+  static_assert(nccl::utility::isPow2(nThreadsPow2) && nThreadsPow2 <= 32, "Condition required");
+
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return nccl::utility::lane() % nThreadsPow2;
+  }
+  NCCL_DEVICE_INLINE constexpr int size() const { return nThreadsPow2; }
+  NCCL_DEVICE_INLINE constexpr int num_threads() const { return nThreadsPow2; }
+
+  NCCL_DEVICE_INLINE uint32_t laneMask() const {
+    return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2);
+  }
+  NCCL_DEVICE_INLINE void sync() {
+    __syncwarp(laneMask());
+  }
+};
+#endif
+
+#if __CUDACC__
+typedef ncclCoopTile<1> ncclCoopThread;
+typedef ncclCoopTile<32> ncclCoopWarp;
+#endif
+
+#if __CUDACC__
+struct ncclCoopLanes { // Some lanes of this warp.
+  uint32_t lmask;
+
+  NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {}
+
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return __popc(lmask & nccl::utility::lanemask_lt());
+  }
+  NCCL_DEVICE_INLINE int size() const {
+    return __popc(lmask);
+  }
+  NCCL_DEVICE_INLINE int num_threads() const {
+    return __popc(lmask);
+  }
+  NCCL_DEVICE_INLINE void sync() {
+    __syncwarp(lmask);
+  }
+};
+#endif
+
+#if __CUDACC__
+// A set of consecutive warps that the user has also supplied with a unique
+// id from [0..15]. It is an error for two different warp spans with the same
+// id to be in a collective concurrently.
+struct ncclCoopWarpSpan {
+  uint32_t warp0:8, nWarps:8, id:8;
+
+  NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
+    warp0(warp0), nWarps(nWarps), id(id) {
+  }
+
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return threadIdx.x - 32*warp0;
+  }
+  NCCL_DEVICE_INLINE int size() const {
+    return 32*nWarps;
+  }
+  NCCL_DEVICE_INLINE int num_threads() const {
+    return 32*nWarps;
+  }
+
+  NCCL_DEVICE_INLINE void sync() {
+    //asm volatile("barrier.sync %0, %1;" :: "r"(1+id), "r"(32*nWarps) : "memory");
+    __barrier_sync_count(1+id, 32*nWarps);
+  }
+};
+#endif
+
+#if __CUDACC__
+struct ncclCoopCta {
+  NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x; }
+  NCCL_DEVICE_INLINE int size() const { return blockDim.x; }
+  NCCL_DEVICE_INLINE int num_threads() const { return blockDim.x; }
+  NCCL_DEVICE_INLINE void sync() { __syncthreads(); }
+};
+#endif
+
+#if __CUDACC__
+template<int nThreadsPow2>
+NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile<nThreadsPow2> coop) {
+  return coop.laneMask();
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) {
+  return coop.lmask;
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) {
+  return -1u;
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) {
+  return -1u;
+}
+#endif
+
+#if __CUDACC__
+// ncclCoopIsThread:
+// At compile time do we know the given coop is a single thread only.
+template<int nThreads>
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopTile<nThreads>) {
+  return nThreads == 1;
+}
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
+#endif
+
+#if __CUDACC__
+// Pick threads of our warp that are safe to use collectively.
+NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
+  return ncclCoopLanes{__activemask()};
+}
+#endif
+
+#if __CUDACC__
+// Pick threads of our warp that are safe to use collectively given that this
+// is a collective on the provided cooperative group.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclCoopTile<32> ncclCoopCoalesced(Coop) {
+  return ncclCoopTile<32>();
+}
+NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) {
+  return coop;
+}
+template<int nThreads>
+NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThreads> coop) {
+  return coop;
+}
+#endif
+
+#endif
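
The classes in coop.h deliberately expose only the thread_rank()/size()/num_threads()/sync() subset of CUDA Cooperative Groups. As a rough usage sketch (not part of the wheel; the kernel name and buffer are hypothetical, the include path is assumed to point at the package's include directory, and a single-warp launch is assumed so the whole warp reaches the call site together):

// Hypothetical usage sketch, not shipped in this package.
#include <nccl_device/coop.h>

__global__ void scaleByTwo(float* data, int n) {
  ncclCoopWarp warp;  // ncclCoopTile<32>: all 32 lanes of the calling warp
  for (int i = warp.thread_rank(); i < n; i += warp.num_threads()) {
    data[i] *= 2.0f;  // each lane walks a strided slice of the buffer
  }
  warp.sync();        // __syncwarp() over the tile's lane mask
}
// e.g. scaleByTwo<<<1, 32>>>(d_data, n);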

nvidia/nccl/include/nccl_device/core.h
@@ -0,0 +1,150 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_CORE_H_
+#define _NCCL_DEVICE_CORE_H_
+#include <nccl.h>
+#include "coop.h"
+#include "utility.h"
+
+struct ncclDevComm;
+typedef struct ncclDevComm ncclDevComm_t;
+
+struct ncclTeam;
+typedef struct ncclTeam ncclTeam_t;
+
+// typedef struct ncclWindow_vidmem* ncclWindow_t; // in nccl.h
+
+struct ncclMultimemHandle;
+typedef struct ncclMultimemHandle ncclMultimemHandle_t;
+
+typedef uint32_t ncclDevResourceHandle;
+typedef ncclDevResourceHandle ncclDevResourceHandle_t;
+
+struct ncclLsaBarrierHandle;
+typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
+
+struct ncclLLA2AHandle;
+typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
+
+struct ncclTeam {
+  int nRanks, rank, stride;
+};
+
+#if __cplusplus
+template<typename T> struct ncclSymPtr;
+#endif
+
+#if __cplusplus
+struct ncclTeamTagWorld {};
+struct ncclTeamTagLsa {};
+struct ncclTeamTagRail {};
+#endif
+
+struct ncclDevCommRequirements;
+typedef struct ncclDevCommRequirements ncclDevCommRequirements_t;
+
+struct ncclDevResourceRequirements;
+typedef struct ncclDevResourceRequirements ncclDevResourceRequirements_t;
+
+struct ncclTeamRequirements;
+typedef struct ncclTeamRequirements ncclTeamRequirements_t;
+
+struct ncclDevCommRequirements {
+  ncclDevResourceRequirements_t* resourceRequirementsList;
+  ncclTeamRequirements_t* teamRequirementsList;
+
+  bool lsaMultimem; // Enable multimem on lsa team
+
+  int lsaBarrierCount;
+};
+
+struct ncclDevResourceRequirements {
+  ncclDevResourceRequirements_t* next;
+  size_t bufferSize, bufferAlign;
+  ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate.
+};
+
+struct ncclTeamRequirements {
+  ncclTeamRequirements_t* next;
+  ncclTeam_t team;
+  bool multimem;
+  ncclMultimemHandle_t* outMultimemHandle; // If non-null, target assigned during ncclDevCommCreate.
+};
+
+NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommCreate(ncclComm_t, ncclDevCommRequirements_t const*, ncclDevComm_t* outDevComm);
+NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommDestroy(ncclComm_t, ncclDevComm_t const* devComm);
+
+////////////////////////////////////////////////////////////////////////////////
+// Team API:
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const&);
+#endif
+NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamWorld(ncclComm_t);
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const&);
+#endif
+NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamLsa(ncclComm_t);
+
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int bPeer);
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int bPeer);
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const&, ncclTeam, int rank);
+#endif
+NCCL_EXTERN_C __host__ int ncclTeamRankToWorld(ncclComm_t, ncclTeam_t, int rank);
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const&, ncclTeam, int rank);
+#endif
+NCCL_EXTERN_C __host__ int ncclTeamRankToLsa(ncclComm_t, ncclTeam_t, int rank);
+
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize);
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize);
+
+// Interpret each team as a set of ranks. This function assumes that `subset`
+// is a subset of `parent`. Thus the number of ranks in the set difference of
+// `parent` minus `subset` is `super.nRanks - subset.nRanks`. Given `index` this
+// function returns the index'th element of `parent` minus `subset`.
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index);
+
+// Equivalent to ncclTeamOuterFactor of lsa team.
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const&);
+#endif
+NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamRail(ncclComm_t);
+
+// Get offset of resource buffer within `comm.resourceWindow`.
+NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t);
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE ncclSymPtr<char> ncclGetResourceBuffer(ncclDevComm const&, ncclDevResourceHandle);
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Window API:
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset);
+NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer);
+NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer);
+NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer);
+NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mmHandle);
+NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const&);
+#endif
+
+#if __CUDACC__
+// Convenience for combining ncclGet***Pointer() with resource handle.
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const&, ncclDevResourceHandle);
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const&, ncclDevResourceHandle, int peer);
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const&, ncclDevResourceHandle, ncclTeam, int peer);
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const&, ncclDevResourceHandle, ncclMultimemHandle);
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const&, ncclDevResourceHandle);
+#endif
+
+#endif // _NCCL_DEVICE_CORE_H_
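
The requirements structs and ncclDevCommCreate/ncclDevCommDestroy above are the host-side entry point of this device API. The sketch below is not contained in the wheel; it assumes `comm` is an already-initialized ncclComm_t, the exact semantics of the requirement fields are inferred from their names and comments, and the size/alignment values are purely illustrative:

// Hypothetical host-side sketch, not shipped in this package.
ncclDevResourceHandle_t scratchHandle;
ncclDevResourceRequirements_t scratch = {};
scratch.bufferSize  = 1 << 20;             // request 1 MiB of symmetric scratch (illustrative)
scratch.bufferAlign = 128;                 // resource handles are 128-byte granular (see core__funcs.h below)
scratch.outBufferHandle = &scratchHandle;  // filled in during ncclDevCommCreate

ncclDevCommRequirements_t reqs = {};
reqs.resourceRequirementsList = &scratch;  // singly linked list chained via `next`
reqs.lsaBarrierCount = 1;                  // one LSA barrier for kernels to use

ncclDevComm_t devComm;
ncclResult_t res = ncclDevCommCreate(comm, &reqs, &devComm);
if (res != ncclSuccess) { /* handle error */ }
// ... launch kernels that consume devComm ...
ncclDevCommDestroy(comm, &devComm);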

nvidia/nccl/include/nccl_device/impl/comm__funcs.h
@@ -0,0 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_COMM__FUNCS_H_
+#define _NCCL_DEVICE_COMM__FUNCS_H_
+#include "comm__types.h"
+#endif // _NCCL_DEVICE_COMM__FUNCS_H_

nvidia/nccl/include/nccl_device/impl/comm__types.h
@@ -0,0 +1,40 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_COMM__TYPES_H_
+#define _NCCL_DEVICE_COMM__TYPES_H_
+#include "../comm.h"
+#include "core__types.h"
+#include "mem_barrier__types.h"
+#include "ll_a2a__types.h"
+
+struct ncclDevCommWindowTable;
+#if __cplusplus
+struct ncclDevCommWindowTable {
+  struct Entry {
+    uintptr_t base, size;
+    ncclWindow_t window;
+  } entries[32];
+  struct ncclDevCommWindowTable* next;
+};
+#endif
+
+struct ncclDevComm {
+  int rank, nRanks;
+  uint32_t nRanks_rcp32;
+  int lsaRank, lsaSize;
+  uint32_t lsaSize_rcp32;
+
+  struct ncclDevCommWindowTable* windowTable;
+
+  ncclWindow_t resourceWindow;
+  struct ncclWindow_vidmem resourceWindow_inlined;
+
+  ncclMultimemHandle_t lsaMultimem;
+  ncclLsaBarrierHandle_t lsaBarrier;
+};
+
+#endif // _NCCL_DEVICE_COMM__TYPES_H_
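
An assumption worth flagging: the nRanks_rcp32 and lsaSize_rcp32 fields appear to be precomputed fixed-point reciprocals consumed by nccl::utility::idivFast32 (used by ncclTeamRail in core__funcs.h below). utility.h's implementation is not reproduced in this diff, so the following is only a sketch of the usual multiply-and-shift scheme such fields enable, not NCCL's actual code:

// Hypothetical reciprocal-division sketch; NOT NCCL's implementation.
// Assumes 2 <= d; a real implementation must also handle d == 1.
#include <cstdint>

uint32_t rcp32(uint32_t d) {
  return (uint32_t)(((1ull << 32) + d - 1) / d);      // ceil(2^32 / d)
}
uint32_t idivApprox(uint32_t x, uint32_t d, uint32_t rcp) {
  uint32_t q = (uint32_t)(((uint64_t)x * rcp) >> 32); // q is floor(x/d) or floor(x/d)+1
  if ((uint64_t)q * d > x) q -= 1;                    // single fix-up step
  return q;
}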

nvidia/nccl/include/nccl_device/impl/core__funcs.h
@@ -0,0 +1,210 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_CORE__FUNCS_H_
+#define _NCCL_DEVICE_CORE__FUNCS_H_
+#include "core__types.h"
+#include "comm__types.h"
+#include "ptr__types.h"
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const &comm) {
+  ncclTeam ans;
+  ans.nRanks = comm.nRanks;
+  ans.rank = comm.rank;
+  ans.stride = 1;
+  return ans;
+}
+#endif
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const &comm) {
+  ncclTeam ans;
+  ans.nRanks = comm.lsaSize;
+  ans.rank = comm.lsaRank;
+  ans.stride = 1;
+  return ans;
+}
+#endif
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const& comm) {
+  ncclTeam ans;
+  ans.nRanks = nccl::utility::idivFast32(comm.nRanks, comm.lsaSize, comm.lsaSize_rcp32);
+  ans.rank = nccl::utility::idivFast32(comm.rank, comm.lsaSize, comm.lsaSize_rcp32);
+  ans.stride = comm.lsaSize;
+  return ans;
+}
+#endif
+
+NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int brank) {
+  int wrank = (brank - b.rank)*b.stride;
+  uint32_t adelta = wrank/a.stride;
+  uint32_t amod = wrank%a.stride;
+  int arank = a.rank + adelta;
+  return 0 <= arank && arank < a.nRanks && amod == 0;
+}
+
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int brank) {
+  int wrank = (brank - b.rank)*b.stride;
+  uint32_t adelta = wrank/a.stride;
+  //uint32_t amod = wrank%a.stride;
+  int arank = a.rank + adelta;
+  return arank;
+}
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const& comm, ncclTeam tm, int rank) {
+  return comm.rank + (rank - tm.rank)*tm.stride;
+}
+#endif
+
+#if __cplusplus
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const& comm, ncclTeam tm, int rank) {
+  return comm.lsaRank + (rank - tm.rank)*tm.stride;
+}
+#endif
+
+NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize) {
+  ncclTeam_t ans;
+  ans.nRanks = innerSize;
+  ans.rank = parent.rank%innerSize;
+  ans.stride = parent.stride;
+  return ans;
+}
+
+NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize) {
+  ncclTeam_t ans;
+  ans.nRanks = parent.nRanks/innerSize;
+  ans.rank = parent.rank/innerSize;
+  ans.stride = parent.stride*innerSize;
+  return ans;
+}
+
+NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index) {
+  int stride = subset.stride/parent.stride;
+  int below = parent.rank - subset.rank*stride;
+  if (stride < 0) {
+    stride = -stride;
+    below -= (subset.nRanks-1)*stride;
+  }
+  if (index < below) {
+    return index;
+  } else if (index-below < (subset.nRanks-1)*(stride-1)) {
+    return below + 1 + ((index-below)/(stride-1))*stride + (index-below)%(stride-1);
+  } else {
+    return below + 1 + (subset.nRanks-1)*stride + (index - below - (subset.nRanks-1)*(stride-1));
+  }
+}
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset) {
+  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+  int i = nccl::utility::loadConst(&w->lsaRank);
+  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer) {
+  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+  int i = peer;
+  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer) {
+  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+  int worldRank = nccl::utility::loadConst(&w->worldRank);
+  int lsaRank = nccl::utility::loadConst(&w->lsaRank);
+  int i = lsaRank + (peer - worldRank);
+  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer) {
+  char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+  uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+  int lsaRank = nccl::utility::loadConst(&w->lsaRank);
+  int i = lsaRank + (peer - tm.rank)*tm.stride;
+  return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mm) {
+  void* ptr = mm.mcBasePtr;
+  ptr = reinterpret_cast<char(*)[4096]>(ptr) + nccl::utility::loadConst(&w->mcOffset4K);
+  return (void*)((char*)ptr + offset);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const& comm) {
+  return ncclGetMultimemPointer(w, offset, comm.lsaMultimem);
+}
+#endif
+
+NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t h) {
+  return ((size_t)h)*128;
+}
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+  void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+  uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+  void* local = nccl::utility::add4G(lsaFlatBase, comm.lsaRank*stride4G);
+  return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const& comm, ncclDevResourceHandle h, int peer) {
+  int r = peer;
+  void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+  uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+  void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G);
+  return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclTeam team, int peer) {
+  int r = comm.lsaRank + (peer - team.rank)*team.stride;
+  void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+  uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+  void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G);
+  return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclMultimemHandle mm) {
+  void* ptr = mm.mcBasePtr;
+  ptr = reinterpret_cast<char(*)[4096]>(ptr) + comm.resourceWindow_inlined.mcOffset4K;
+  ptr = reinterpret_cast<char(*)[128]>(ptr) + h;
+  return ptr;
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+  return ncclGetResourceBufferMultimemPointer(comm, h, comm.lsaMultimem);
+}
+#endif
+
+#if __CUDACC__
+NCCL_DEVICE_INLINE ncclSymPtr<char> ncclGetResourceBuffer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+  return ncclSymPtr<char>(comm.resourceWindow, size_t(h)*128);
+}
+#endif
+
+#endif
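
Tying the team and window pieces together, the following device-side sketch is not shipped in the wheel. It assumes the include path points at the package's include directory, that the communicator struct is passed to the kernel by value, that `win` is a window registered on every rank, and that the window holds an int slot per LSA peer starting at `flagOffset`:

// Hypothetical device-side sketch, not shipped in this package.
#include <nccl_device.h>

__global__ void postFlags(ncclDevComm comm, ncclWindow_t win, size_t flagOffset) {
  ncclTeam lsa = ncclTeamLsa(comm);   // the LSA team of this communicator
  int peer = threadIdx.x;
  if (peer < lsa.nRanks) {
    // Each thread resolves a pointer into one LSA peer's copy of the window...
    int* flags = (int*)ncclGetLsaPointer(win, flagOffset, peer);
    // ...and writes into the slot reserved for our own LSA rank.
    flags[lsa.rank] = 1;
  }
}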

nvidia/nccl/include/nccl_device/impl/core__types.h
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_CORE__TYPES_H_
+#define _NCCL_DEVICE_CORE__TYPES_H_
+#include "../core.h"
+
+// nccl.h has: typedef ncclWindow_vidmem* ncclWindow_t;
+struct ncclWindow_vidmem {
+  void* winHost;
+  //ncclGinWindow_t ginWin;
+  char* lsaFlatBase; // pointer to first byte for rank 0 of lsa team
+  int lsaRank;
+  int worldRank;
+  uint32_t stride4G;
+  uint32_t mcOffset4K;
+};
+
+struct ncclMultimemHandle {
+  void* mcBasePtr;
+};
+
+#endif
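
The ncclWindow_vidmem fields combine with the accessors in core__funcs.h into simple address arithmetic. The sketch below restates that arithmetic outside the headers; it assumes, based on the naming and use of add4G/stride4G and mcOffset4K above, that stride4G counts 4 GiB units between consecutive LSA peers' mappings and mcOffset4K counts 4 KiB pages into the multicast mapping:

// Illustrative restatement only; relies on the stated unit assumptions and on the
// ncclWindow_vidmem / ncclMultimemHandle definitions from the headers above.
#include <cstdint>
#include <cstddef>

char* lsaPeerAddress(const ncclWindow_vidmem& w, int lsaPeer, size_t offset) {
  // ncclGetLsaPointer: add4G(lsaFlatBase, lsaPeer*stride4G) + offset
  return w.lsaFlatBase + ((uint64_t)lsaPeer * w.stride4G << 32) + offset;
}
char* multimemAddress(const ncclMultimemHandle& mm, const ncclWindow_vidmem& w, size_t offset) {
  // ncclGetMultimemPointer: mcBasePtr advanced by mcOffset4K pages of 4 KiB, plus offset
  return (char*)mm.mcBasePtr + (uint64_t)w.mcOffset4K * 4096 + offset;
}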