nvidia-nccl-cu13 2.28.3__py3-none-manylinux_2_18_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_COOP_H_
+ #define _NCCL_DEVICE_COOP_H_
+ #include "utility.h"
+
+ // ncclCoop[Foo]: NCCL's versions of CUDA's Cooperative Groups. They conform
+ // to just this subset of the CUDA API:
+ //   int Coop::thread_rank();
+ //   int Coop::size();
+ //   int Coop::num_threads(); // same as size()
+ //   void Coop::sync();
+
+ #if __CUDACC__
+ template<int nThreadsPow2>
+ struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
+   static_assert(nccl::utility::isPow2(nThreadsPow2) && nThreadsPow2 <= 32, "Condition required");
+
+   NCCL_DEVICE_INLINE int thread_rank() const {
+     return nccl::utility::lane() % nThreadsPow2;
+   }
+   NCCL_DEVICE_INLINE constexpr int size() const { return nThreadsPow2; }
+   NCCL_DEVICE_INLINE constexpr int num_threads() const { return nThreadsPow2; }
+
+   NCCL_DEVICE_INLINE uint32_t laneMask() const {
+     return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2);
+   }
+   NCCL_DEVICE_INLINE void sync() {
+     __syncwarp(laneMask());
+   }
+ };
+ #endif
+
+ #if __CUDACC__
+ typedef ncclCoopTile<1> ncclCoopThread;
+ typedef ncclCoopTile<32> ncclCoopWarp;
+ #endif
+
+ #if __CUDACC__
+ struct ncclCoopLanes { // Some lanes of this warp.
+   uint32_t lmask;
+
+   NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {}
+
+   NCCL_DEVICE_INLINE int thread_rank() const {
+     return __popc(lmask & nccl::utility::lanemask_lt());
+   }
+   NCCL_DEVICE_INLINE int size() const {
+     return __popc(lmask);
+   }
+   NCCL_DEVICE_INLINE int num_threads() const {
+     return __popc(lmask);
+   }
+   NCCL_DEVICE_INLINE void sync() {
+     __syncwarp(lmask);
+   }
+ };
+ #endif
+
+ #if __CUDACC__
+ // A set of consecutive warps that the user has also supplied with a unique
+ // id from [0..15]. It is an error for two different warp spans with the same
+ // id to be in a collective concurrently.
+ struct ncclCoopWarpSpan {
+   uint32_t warp0:8, nWarps:8, id:8;
+
+   NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
+     warp0(warp0), nWarps(nWarps), id(id) {
+   }
+
+   NCCL_DEVICE_INLINE int thread_rank() const {
+     return threadIdx.x - 32*warp0;
+   }
+   NCCL_DEVICE_INLINE int size() const {
+     return 32*nWarps;
+   }
+   NCCL_DEVICE_INLINE int num_threads() const {
+     return 32*nWarps;
+   }
+
+   NCCL_DEVICE_INLINE void sync() {
+     //asm volatile("barrier.sync %0, %1;" :: "r"(1+id), "r"(32*nWarps) : "memory");
+     __barrier_sync_count(1+id, 32*nWarps);
+   }
+ };
+ #endif
+
+ #if __CUDACC__
+ struct ncclCoopCta {
+   NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x; }
+   NCCL_DEVICE_INLINE int size() const { return blockDim.x; }
+   NCCL_DEVICE_INLINE int num_threads() const { return blockDim.x; }
+   NCCL_DEVICE_INLINE void sync() { __syncthreads(); }
+ };
+ #endif
+
+ #if __CUDACC__
+ template<int nThreadsPow2>
+ NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile<nThreadsPow2> coop) {
+   return coop.laneMask();
+ }
+ NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) {
+   return coop.lmask;
+ }
+ NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) {
+   return -1u;
+ }
+ NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) {
+   return -1u;
+ }
+ #endif
+
+ #if __CUDACC__
+ // ncclCoopIsThread:
+ // Do we know at compile time that the given coop is only a single thread?
+ template<int nThreads>
+ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopTile<nThreads>) {
+   return nThreads == 1;
+ }
+ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { return false; }
+ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return false; }
+ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
+ #endif
+
+ #if __CUDACC__
+ // Pick threads of our warp that are safe to use collectively.
+ NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
+   return ncclCoopLanes{__activemask()};
+ }
+ #endif
+
+ #if __CUDACC__
+ // Pick threads of our warp that are safe to use collectively given that this
+ // is a collective on the provided cooperative group.
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclCoopTile<32> ncclCoopCoalesced(Coop) {
+   return ncclCoopTile<32>();
+ }
+ NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) {
+   return coop;
+ }
+ template<int nThreads>
+ NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThreads> coop) {
+   return coop;
+ }
+ #endif
+
+ #endif
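
The comment at the top of this header pins down the cooperative-group subset every ncclCoop type implements: thread_rank(), size()/num_threads(), and sync(). A minimal sketch of how device code can stay generic over the concrete group follows; fillValue and fillKernel are illustrative names, not part of the package.

    // Hypothetical helper: works with ncclCoopWarp, ncclCoopWarpSpan, ncclCoopCta, ...
    // because each exposes thread_rank(), num_threads() and sync().
    template<typename Coop>
    __device__ void fillValue(Coop coop, int* buf, int n, int value) {
      for (int i = coop.thread_rank(); i < n; i += coop.num_threads())
        buf[i] = value;              // strided loop over the group's threads
      coop.sync();                   // barrier: the whole group sees the writes afterwards
    }

    __global__ void fillKernel(int* buf, int n) {
      fillValue(ncclCoopCta(), buf, n, 1);   // the whole thread block cooperates
    }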
@@ -0,0 +1,150 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_CORE_H_
+ #define _NCCL_DEVICE_CORE_H_
+ #include <nccl.h>
+ #include "coop.h"
+ #include "utility.h"
+
+ struct ncclDevComm;
+ typedef struct ncclDevComm ncclDevComm_t;
+
+ struct ncclTeam;
+ typedef struct ncclTeam ncclTeam_t;
+
+ // typedef struct ncclWindow_vidmem* ncclWindow_t; // in nccl.h
+
+ struct ncclMultimemHandle;
+ typedef struct ncclMultimemHandle ncclMultimemHandle_t;
+
+ typedef uint32_t ncclDevResourceHandle;
+ typedef ncclDevResourceHandle ncclDevResourceHandle_t;
+
+ struct ncclLsaBarrierHandle;
+ typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
+
+ struct ncclLLA2AHandle;
+ typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
+
+ struct ncclTeam {
+   int nRanks, rank, stride;
+ };
+
+ #if __cplusplus
+ template<typename T> struct ncclSymPtr;
+ #endif
+
+ #if __cplusplus
+ struct ncclTeamTagWorld {};
+ struct ncclTeamTagLsa {};
+ struct ncclTeamTagRail {};
+ #endif
+
+ struct ncclDevCommRequirements;
+ typedef struct ncclDevCommRequirements ncclDevCommRequirements_t;
+
+ struct ncclDevResourceRequirements;
+ typedef struct ncclDevResourceRequirements ncclDevResourceRequirements_t;
+
+ struct ncclTeamRequirements;
+ typedef struct ncclTeamRequirements ncclTeamRequirements_t;
+
+ struct ncclDevCommRequirements {
+   ncclDevResourceRequirements_t* resourceRequirementsList;
+   ncclTeamRequirements_t* teamRequirementsList;
+
+   bool lsaMultimem; // Enable multimem on lsa team
+
+   int lsaBarrierCount;
+ };
+
+ struct ncclDevResourceRequirements {
+   ncclDevResourceRequirements_t* next;
+   size_t bufferSize, bufferAlign;
+   ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate.
+ };
+
+ struct ncclTeamRequirements {
+   ncclTeamRequirements_t* next;
+   ncclTeam_t team;
+   bool multimem;
+   ncclMultimemHandle_t* outMultimemHandle; // If non-null, target assigned during ncclDevCommCreate.
+ };
+
+ NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommCreate(ncclComm_t, ncclDevCommRequirements_t const*, ncclDevComm_t* outDevComm);
+ NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommDestroy(ncclComm_t, ncclDevComm_t const* devComm);
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Team API:
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const&);
+ #endif
+ NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamWorld(ncclComm_t);
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const&);
+ #endif
+ NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamLsa(ncclComm_t);
+
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int bPeer);
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int bPeer);
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const&, ncclTeam, int rank);
+ #endif
+ NCCL_EXTERN_C __host__ int ncclTeamRankToWorld(ncclComm_t, ncclTeam_t, int rank);
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const&, ncclTeam, int rank);
+ #endif
+ NCCL_EXTERN_C __host__ int ncclTeamRankToLsa(ncclComm_t, ncclTeam_t, int rank);
+
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize);
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize);
+
+ // Interpret each team as a set of ranks. This function assumes that `subset`
+ // is a subset of `parent`, so the set difference `parent` minus `subset`
+ // contains `parent.nRanks - subset.nRanks` ranks. Given `index`, this
+ // function returns the index'th element of `parent` minus `subset`.
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index);
+
+ // Equivalent to ncclTeamOuterFactor of lsa team.
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const&);
+ #endif
+ NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamRail(ncclComm_t);
+
+ // Get offset of resource buffer within `comm.resourceWindow`.
+ NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t);
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE ncclSymPtr<char> ncclGetResourceBuffer(ncclDevComm const&, ncclDevResourceHandle);
+ #endif
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Window API:
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset);
+ NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer);
+ NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer);
+ NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer);
+ NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mmHandle);
+ NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const&);
+ #endif
+
+ #if __CUDACC__
+ // Convenience for combining ncclGet***Pointer() with resource handle.
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const&, ncclDevResourceHandle);
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const&, ncclDevResourceHandle, int peer);
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const&, ncclDevResourceHandle, ncclTeam, int peer);
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const&, ncclDevResourceHandle, ncclMultimemHandle);
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const&, ncclDevResourceHandle);
+ #endif
+
+ #endif // _NCCL_DEVICE_CORE_H_
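
The requirements structs declared above form singly linked lists that the host fills in before calling ncclDevCommCreate. A hedged host-side sketch, assuming a communicator comm already exists and that unused fields can be left zero-initialized:

    // Hypothetical setup; field meanings follow the comments in this header.
    ncclDevResourceHandle_t scratchHandle;
    ncclDevResourceRequirements_t scratch = {};
    scratch.bufferSize = 1 << 20;              // request a 1 MiB resource buffer
    scratch.bufferAlign = 128;
    scratch.outBufferHandle = &scratchHandle;  // assigned during ncclDevCommCreate

    ncclDevCommRequirements_t reqs = {};
    reqs.resourceRequirementsList = &scratch;  // list chained through scratch.next
    reqs.lsaBarrierCount = 1;                  // assumption: number of LSA barriers to provision

    ncclDevComm_t devComm;
    ncclResult_t res = ncclDevCommCreate(comm, &reqs, &devComm);
    // ... pass devComm to kernels, then tear it down:
    ncclDevCommDestroy(comm, &devComm);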
@@ -0,0 +1,10 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_COMM__FUNCS_H_
+ #define _NCCL_DEVICE_COMM__FUNCS_H_
+ #include "comm__types.h"
+ #endif // _NCCL_DEVICE_COMM__FUNCS_H_
@@ -0,0 +1,40 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_COMM__TYPES_H_
+ #define _NCCL_DEVICE_COMM__TYPES_H_
+ #include "../comm.h"
+ #include "core__types.h"
+ #include "mem_barrier__types.h"
+ #include "ll_a2a__types.h"
+
+ struct ncclDevCommWindowTable;
+ #if __cplusplus
+ struct ncclDevCommWindowTable {
+   struct Entry {
+     uintptr_t base, size;
+     ncclWindow_t window;
+   } entries[32];
+   struct ncclDevCommWindowTable* next;
+ };
+ #endif
+
+ struct ncclDevComm {
+   int rank, nRanks;
+   uint32_t nRanks_rcp32;
+   int lsaRank, lsaSize;
+   uint32_t lsaSize_rcp32;
+
+   struct ncclDevCommWindowTable* windowTable;
+
+   ncclWindow_t resourceWindow;
+   struct ncclWindow_vidmem resourceWindow_inlined;
+
+   ncclMultimemHandle_t lsaMultimem;
+   ncclLsaBarrierHandle_t lsaBarrier;
+ };
+
+ #endif // _NCCL_DEVICE_COMM__TYPES_H_
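
The 32-entry tables chained through next look like an address-range index from raw pointers back to their windows; that reading is an inference from the base/size/window fields, not documented here. A hypothetical device-side walk under that assumption:

    // Hypothetical lookup: return the window whose [base, base+size) range
    // contains addr, or nullptr if no entry matches.
    __device__ ncclWindow_t findWindow(ncclDevComm const& comm, void const* addr) {
      uintptr_t a = reinterpret_cast<uintptr_t>(addr);
      for (ncclDevCommWindowTable* t = comm.windowTable; t != nullptr; t = t->next) {
        for (int i = 0; i < 32; i++) {
          ncclDevCommWindowTable::Entry const& e = t->entries[i];
          if (e.window != nullptr && e.base <= a && a - e.base < e.size) return e.window;
        }
      }
      return nullptr;
    }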
@@ -0,0 +1,210 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_CORE__FUNCS_H_
+ #define _NCCL_DEVICE_CORE__FUNCS_H_
+ #include "core__types.h"
+ #include "comm__types.h"
+ #include "ptr__types.h"
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const &comm) {
+   ncclTeam ans;
+   ans.nRanks = comm.nRanks;
+   ans.rank = comm.rank;
+   ans.stride = 1;
+   return ans;
+ }
+ #endif
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const &comm) {
+   ncclTeam ans;
+   ans.nRanks = comm.lsaSize;
+   ans.rank = comm.lsaRank;
+   ans.stride = 1;
+   return ans;
+ }
+ #endif
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const& comm) {
+   ncclTeam ans;
+   ans.nRanks = nccl::utility::idivFast32(comm.nRanks, comm.lsaSize, comm.lsaSize_rcp32);
+   ans.rank = nccl::utility::idivFast32(comm.rank, comm.lsaSize, comm.lsaSize_rcp32);
+   ans.stride = comm.lsaSize;
+   return ans;
+ }
+ #endif
+
+ NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int brank) {
+   int wrank = (brank - b.rank)*b.stride;
+   uint32_t adelta = wrank/a.stride;
+   uint32_t amod = wrank%a.stride;
+   int arank = a.rank + adelta;
+   return 0 <= arank && arank < a.nRanks && amod == 0;
+ }
+
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int brank) {
+   int wrank = (brank - b.rank)*b.stride;
+   uint32_t adelta = wrank/a.stride;
+   //uint32_t amod = wrank%a.stride;
+   int arank = a.rank + adelta;
+   return arank;
+ }
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const& comm, ncclTeam tm, int rank) {
+   return comm.rank + (rank - tm.rank)*tm.stride;
+ }
+ #endif
+
+ #if __cplusplus
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const& comm, ncclTeam tm, int rank) {
+   return comm.lsaRank + (rank - tm.rank)*tm.stride;
+ }
+ #endif
+
+ NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize) {
+   ncclTeam_t ans;
+   ans.nRanks = innerSize;
+   ans.rank = parent.rank%innerSize;
+   ans.stride = parent.stride;
+   return ans;
+ }
+
+ NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize) {
+   ncclTeam_t ans;
+   ans.nRanks = parent.nRanks/innerSize;
+   ans.rank = parent.rank/innerSize;
+   ans.stride = parent.stride*innerSize;
+   return ans;
+ }
+
+ NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index) {
+   int stride = subset.stride/parent.stride;
+   int below = parent.rank - subset.rank*stride;
+   if (stride < 0) {
+     stride = -stride;
+     below -= (subset.nRanks-1)*stride;
+   }
+   if (index < below) {
+     return index;
+   } else if (index-below < (subset.nRanks-1)*(stride-1)) {
+     return below + 1 + ((index-below)/(stride-1))*stride + (index-below)%(stride-1);
+   } else {
+     return below + 1 + (subset.nRanks-1)*stride + (index - below - (subset.nRanks-1)*(stride-1));
+   }
+ }
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset) {
+   char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+   uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+   int i = nccl::utility::loadConst(&w->lsaRank);
+   return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer) {
+   char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+   uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+   int i = peer;
+   return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer) {
+   char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+   uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+   int worldRank = nccl::utility::loadConst(&w->worldRank);
+   int lsaRank = nccl::utility::loadConst(&w->lsaRank);
+   int i = lsaRank + (peer - worldRank);
+   return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer) {
+   char* base = nccl::utility::loadConst(&w->lsaFlatBase);
+   uint32_t stride4G = nccl::utility::loadConst(&w->stride4G);
+   int lsaRank = nccl::utility::loadConst(&w->lsaRank);
+   int i = lsaRank + (peer - tm.rank)*tm.stride;
+   return (void*)(nccl::utility::add4G(base, i*stride4G) + offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mm) {
+   void* ptr = mm.mcBasePtr;
+   ptr = reinterpret_cast<char(*)[4096]>(ptr) + nccl::utility::loadConst(&w->mcOffset4K);
+   return (void*)((char*)ptr + offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const& comm) {
+   return ncclGetMultimemPointer(w, offset, comm.lsaMultimem);
+ }
+ #endif
+
+ NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t h) {
+   return ((size_t)h)*128;
+ }
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+   void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+   uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+   void* local = nccl::utility::add4G(lsaFlatBase, comm.lsaRank*stride4G);
+   return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const& comm, ncclDevResourceHandle h, int peer) {
+   int r = peer;
+   void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+   uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+   void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G);
+   return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclTeam team, int peer) {
+   int r = comm.lsaRank + (peer - team.rank)*team.stride;
+   void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase;
+   uint32_t stride4G = comm.resourceWindow_inlined.stride4G;
+   void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G);
+   return (void*)(reinterpret_cast<char(*)[128]>(local) + h);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclMultimemHandle mm) {
+   void* ptr = mm.mcBasePtr;
+   ptr = reinterpret_cast<char(*)[4096]>(ptr) + comm.resourceWindow_inlined.mcOffset4K;
+   ptr = reinterpret_cast<char(*)[128]>(ptr) + h;
+   return ptr;
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+   return ncclGetResourceBufferMultimemPointer(comm, h, comm.lsaMultimem);
+ }
+ #endif
+
+ #if __CUDACC__
+ NCCL_DEVICE_INLINE ncclSymPtr<char> ncclGetResourceBuffer(ncclDevComm const& comm, ncclDevResourceHandle h) {
+   return ncclSymPtr<char>(comm.resourceWindow, size_t(h)*128);
+ }
+ #endif
+
+ #endif
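
The team helpers above are pure index arithmetic, so a worked example is enough to check them. Taking an 8-rank world team viewed from rank 5 and innerSize = 4 (the values below are hand-computed from the code, not additional API):

    // world team as seen from world rank 5
    ncclTeam_t world;  world.nRanks = 8;  world.rank = 5;  world.stride = 1;
    ncclTeam_t inner = ncclTeamInnerFactor(world, 4);  // {nRanks=4, rank=5%4=1, stride=1}
    ncclTeam_t outer = ncclTeamOuterFactor(world, 4);  // {nRanks=2, rank=5/4=1, stride=1*4=4}
    // With comm.rank == 5, ncclTeamRankToWorld(comm, outer, 0)
    //   = comm.rank + (0 - outer.rank)*outer.stride = 5 + (0-1)*4 = 1,
    // i.e. rank 0 of the outer team is world rank 1, four ranks below us.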
@@ -0,0 +1,26 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_CORE__TYPES_H_
+ #define _NCCL_DEVICE_CORE__TYPES_H_
+ #include "../core.h"
+
+ // nccl.h has: typedef ncclWindow_vidmem* ncclWindow_t;
+ struct ncclWindow_vidmem {
+   void* winHost;
+   //ncclGinWindow_t ginWin;
+   char* lsaFlatBase; // pointer to first byte for rank 0 of lsa team
+   int lsaRank;
+   int worldRank;
+   uint32_t stride4G;
+   uint32_t mcOffset4K;
+ };
+
+ struct ncclMultimemHandle {
+   void* mcBasePtr;
+ };
+
+ #endif
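
Judging by the pointer math in core__funcs.h (add4G and the char(*)[4096] cast), stride4G is a per-rank stride counted in 4 GiB units and mcOffset4K an offset counted in 4 KiB pages; this is an inference from that code, not a documented contract. Under that reading the window getters reduce to:

    // Assumed expansion, with add4G(base, n) taken to mean base + n * 4 GiB:
    // ncclGetLsaPointer(w, offset, peer)
    //   == w->lsaFlatBase + (size_t)peer * w->stride4G * (4ull << 30) + offset
    // ncclGetMultimemPointer(w, offset, mm)
    //   == (char*)mm.mcBasePtr + (size_t)w->mcOffset4K * 4096 + offset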