nvidia_nccl_cu13-2.28.3-py3-none-manylinux_2_18_aarch64.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as published in their respective public registries.
--- /dev/null
+++ b/nccl_device/impl/ll_a2a__funcs.h
@@ -0,0 +1,229 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A__FUNCS_H_
+ #define _NCCL_DEVICE_LL_A2A__FUNCS_H_
+ #include "ll_a2a__types.h"
+ #include "comm__types.h"
+ #include "../utility.h"
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLLA2ASession<Coop>::ncclLLA2ASession(
+     Coop coop, ncclDevComm const& comm, ncclTeam team,
+     ncclLLA2AHandle handle, uint32_t block, int maxElts,
+     bool multimem, ncclMultimemHandle mmHandle
+   ):
+   ncclLLA2ASession_internal<Coop>{
+     coop, comm, team, handle, (int)block, /*pitch=*/maxElts,
+     multimem, mmHandle, /*epoch=*/0, /*slotsOffset=*/0
+   } {
+   uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+   line += block*(1 + 2*handle.nSlots);
+   this->epoch = line->x + 2;
+   this->slotsOffset = this->calcSlotOffset();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLLA2ASession<Coop>::~ncclLLA2ASession() {
+   uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   line += this->block*(1 + 2*this->handle.nSlots);
+   if (this->coop.thread_rank() == 0) line->x = this->epoch - 2;
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::send(int peer, int elt, T data) {
+   using nccl::utility::divUp;
+   union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+   tmp = data;
+   uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, peer);
+   buf += this->slotsOffset + elt;
+   #pragma unroll
+   for (int u=0; u < divUp(sizeof(T), 8); u++) {
+     asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+       "l"(buf + u*this->pitch),
+       "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+     );
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::bcast(int elt, T data) {
+   using nccl::utility::divUp;
+   if (this->multimem) {
+     union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; };
+     tmp = data;
+     uint4* bufmc = (uint4*)ncclGetResourceBufferMultimemPointer(this->comm, this->handle.bufHandle, this->mmHandle);
+     bufmc += this->slotsOffset + elt;
+     #pragma unroll
+     for (int u=0; u < divUp(sizeof(T), 8); u++) {
+       asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+         "l"(bufmc + this->pitch*u),
+         "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+       );
+     }
+   } else {
+     union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+     tmp = data;
+     int dr = 0;
+     int r = this->team.rank;
+     #pragma unroll 1
+     for (; dr+8 <= this->team.nRanks; dr += 8) {
+       #pragma unroll
+       for (int ur=0; ur < 8; ur++) {
+         uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r);
+         buf += this->slotsOffset + elt;
+         #pragma unroll
+         for (int u=0; u < divUp(sizeof(T),8); u++) {
+           asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+             "l"(buf + u*this->pitch),
+             "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+           );
+         }
+         r += 1;
+         if (r == this->team.nRanks) r = 0;
+       }
+     }
+     #pragma unroll
+     for (int ur=0; ur < 8; ur++, dr++) {
+       if (dr == this->team.nRanks) break;
+       uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r);
+       buf += this->slotsOffset + elt;
+       #pragma unroll
+       for (int u=0; u < divUp(sizeof(T),8); u++) {
+         asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" ::
+           "l"(buf + u*this->pitch),
+           "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch)
+         );
+       }
+       r += 1;
+       if (r == this->team.nRanks) r = 0;
+     }
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<typename T>
+ NCCL_DEVICE_INLINE T ncclLLA2ASession<Coop>::recv(int elt) {
+   T ret[1];
+   this->template recvUnrolled</*MinEltCount=*/1, /*MaxEltCount=*/1>(elt, 1, 0, ret);
+   return ret[0];
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<int MinEltCount, int MaxEltCount, typename T>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::recvUnrolled(int eltStart, int eltCount, int eltStride, T(&elts)[MaxEltCount]) {
+   using nccl::utility::divUp;
+   uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   buf += this->slotsOffset + eltStart;
+
+   uint4 tmp[MaxEltCount][divUp(sizeof(T), 8)];
+   #pragma unroll 1
+   while (true) {
+     #pragma unroll
+     for (int u=0; u < MaxEltCount; u++) {
+       if (u < MinEltCount || u < eltCount) {
+         #pragma unroll
+         for (int v=0; v < divUp(sizeof(T), 8); v++) {
+           asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];"
+             : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w)
+             : "l"(buf + u*eltStride + v*this->pitch));
+         }
+       }
+     }
+     bool okAll = true;
+     #pragma unroll
+     for (int u=0; u < MaxEltCount; u++) {
+       #pragma unroll
+       for (int v=0; v < divUp(sizeof(T), 8); v++) {
+         if (u < MinEltCount || u < eltCount) {
+           bool ok = tmp[u][v].y == this->epoch &&
+                     tmp[u][v].w == this->epoch;
+           okAll &= ok;
+         }
+       }
+     }
+     if (__builtin_expect(okAll, true)) break;
+   }
+
+   #pragma unroll
+   for (int u=0; u < MaxEltCount; u++) {
+     if (MinEltCount <= u && u == eltCount) break;
+     union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; };
+     #pragma unroll
+     for (int v=0; v < divUp(sizeof(T), 8); v++) {
+       u32[v][0] = tmp[u][v].x;
+       u32[v][1] = tmp[u][v].z;
+     }
+     elts[u] = val;
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ template<int Unroll, typename Elt, typename EltToAcc, typename Reduce>
+ NCCL_DEVICE_INLINE auto ncclLLA2ASession<Coop>::recvReduce(
+     int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce reduce
+   ) -> decltype(eltToAcc(nccl::utility::declval<Elt>())) {
+   using Acc = decltype(eltToAcc(nccl::utility::declval<Elt>()));
+   Acc acc;
+   int i = 0;
+   #pragma unroll 1
+   for (; i+Unroll <= eltCount; i += Unroll) {
+     Elt got[Unroll];
+     this->template recvUnrolled</*Min=*/Unroll>(eltStart + i*eltStride, Unroll, eltStride, got);
+     Acc acc0 = eltToAcc(got[0]);
+     acc = i==0 ? acc0 : reduce(acc, acc0);
+     #pragma unroll
+     for (int j=1; j < Unroll; j++) acc = reduce(acc, eltToAcc(got[j]));
+   }
+   if (i < eltCount) {
+     Elt got[Unroll];
+     this->template recvUnrolled</*Min=*/1>(eltStart + i*eltStride, eltCount-i, eltStride, got);
+     Acc acc0 = eltToAcc(got[0]);
+     acc = i==0 ? acc0 : reduce(acc, acc0);
+     #pragma unroll
+     for (int j=1; j < Unroll-1; j++) {
+       if (i+j < eltCount) acc = reduce(acc, eltToAcc(got[j]));
+     }
+   }
+   return acc;
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLLA2ASession<Coop>::endEpoch(Coop) {
+   if (__builtin_expect(this->epoch >= -2u, false)) {
+     this->coop.sync();
+     uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+     buf += this->slotsOffset;
+     #pragma unroll 4
+     for (int i=this->coop.thread_rank(); i < this->handle.nSlots; i += this->coop.size()) {
+       buf[i] = uint4{0, 0, 0, 0};
+     }
+   }
+   this->coop.sync();
+   this->epoch += (this->epoch == -1u) ? 3 : 1;
+   this->slotsOffset = this->calcSlotOffset();
+ }
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A__FUNCS_H_
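
To make the call pattern concrete, here is a hypothetical usage sketch (not part of the package). CtaCoop is a minimal stand-in for the cooperative wrapper the session expects; the implementations above only call thread_rank(), size(), and sync() on it. The sketch assumes the handle was created on the host with at least gridDim.x blocks and ncclLLA2ACalcSlots(nRanks, 8) slots, and that every rank launches the same kernel.

    // Minimal cooperative wrapper satisfying the interface used by these headers.
    struct CtaCoop {
      __device__ int  thread_rank() const { return threadIdx.x; }
      __device__ int  size() const { return blockDim.x; }
      __device__ void sync() const { __syncthreads(); }
    };

    // Each rank exchanges one uint64_t with every LSA peer, then closes the epoch.
    __global__ void a2aExchange(ncclDevComm comm, ncclLLA2AHandle h) {
      CtaCoop coop;
      ncclTeam team = ncclTeamLsa(comm);
      ncclLLA2ASession<CtaCoop> sess(coop, comm, team, h,
                                     /*block=*/blockIdx.x, /*maxElts=*/team.nRanks);
      int t = coop.thread_rank();
      if (t < team.nRanks) {
        // The element index is the sender's rank, so each sender lands in a
        // distinct slot on the receiver.
        sess.send(/*peer=*/t, /*elt=*/team.rank, (uint64_t)team.rank);
        uint64_t got = sess.recv<uint64_t>(/*elt=*/t); // spins until peer t's flags match the epoch
        (void)got;
      }
      sess.endEpoch(coop); // every peer was both sent to and received from
    }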
--- /dev/null
+++ b/nccl_device/impl/ll_a2a__types.h
@@ -0,0 +1,37 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A__TYPES_H_
+ #define _NCCL_DEVICE_LL_A2A__TYPES_H_
+ #include "../ll_a2a.h"
+ #include "core__types.h"
+
+ struct ncclLLA2AHandle {
+   ncclDevResourceHandle_t bufHandle;
+   uint32_t nSlots;
+ };
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLLA2ASession_internal {
+   Coop coop;
+   ncclDevComm const& comm;
+   ncclTeam team;
+   ncclLLA2AHandle handle;
+   int block;
+   int pitch;
+   bool multimem;
+   ncclMultimemHandle mmHandle;
+   uint32_t epoch;
+   uint32_t slotsOffset;
+
+   NCCL_DEVICE_INLINE uint32_t calcSlotOffset() const {
+     return block*(1 + 2*handle.nSlots) + 1 + (epoch & 1)*handle.nSlots;
+   }
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A__TYPES_H_
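
The slot math in calcSlotOffset implies a concrete buffer layout. The helper below is an inferred restatement, not shipped code: each block owns 1 + 2*nSlots uint4 lines, with line 0 persisting the epoch across sessions and two nSlots-line banks alternating on epoch parity, so endEpoch() can flip to a fresh bank without waiting for peers to drain the old one.

    // Inferred per-block layout of the LL A2A resource buffer (uint4 lines):
    //   [0]                       saved epoch (written back by the session destructor)
    //   [1 .. nSlots]             slot bank used on even epochs
    //   [1+nSlots .. 2*nSlots]    slot bank used on odd epochs
    static inline uint32_t llA2ASlotOffset(uint32_t block, uint32_t nSlots, uint32_t epoch) {
      return block*(1 + 2*nSlots) + 1 + (epoch & 1)*nSlots;
    }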
--- /dev/null
+++ b/nccl_device/impl/mem_barrier__funcs.h
@@ -0,0 +1,126 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+ #define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+ #include "mem_barrier__types.h"
+ #include "comm__types.h"
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+     Coop coop, ncclDevComm const& comm, ncclTeam team,
+     ncclLsaBarrierHandle handle, uint32_t index,
+     bool multimem, ncclMultimemHandle mmHandle
+   ):
+   ncclLsaBarrierSession_internal<Coop>{
+     coop, comm, team, handle, (int)index,
+ #if CUDART_VERSION >= 12060
+     multimem,
+ #else // WAR for an issue with ptxas in CTK < 12.6
+     /*multimem=*/false,
+ #endif
+     mmHandle, /*epoch=*/0
+   } {
+   uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+   this->epoch = state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index];
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+     Coop coop, ncclDevComm const& comm, ncclTeamTagLsa, uint32_t index, bool multimem
+   ): ncclLsaBarrierSession(
+     coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem
+   ) {
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::~ncclLsaBarrierSession() {
+   uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+   if (this->coop.thread_rank() == 0) {
+ #if __CUDA_ARCH__ == 1200 && CUDART_VERSION < 13000
+     // WAR for a compiler issue with CTK < 13.0
+     if (this->index == 0)
+       state[(this->multimem ? 0 : 1)*this->handle.nBarriers] = this->epoch;
+     else
+ #endif
+     state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index] = this->epoch;
+   }
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::arrive(Coop, cuda::memory_order order) {
+   this->coop.sync();
+   if (this->multimem) {
+ #if __CUDA_ARCH__ >= 900
+     if (this->coop.thread_rank() == 0) {
+       uint32_t* inbox = this->mcInbox(/*multimem=*/true);
+       if (nccl::utility::releaseOrderOf(order) != cuda::memory_order_relaxed) {
+         asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
+       } else {
+         asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
+       }
+     }
+ #endif
+   } else {
+     #pragma unroll 1
+     for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+       int peer = i + (this->team.rank <= i ? 1 : 0);
+       cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(peer, this->team.rank));
+       inbox.store(this->epoch+1, nccl::utility::releaseOrderOf(order));
+     }
+   }
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::wait(Coop, cuda::memory_order order) {
+   if (this->multimem) {
+ #if __CUDA_ARCH__ >= 900
+     if (this->coop.thread_rank() == 0) {
+       cuda::atomic_ref<uint32_t> inbox(*this->mcInbox(/*multimem=*/false));
+       #pragma unroll 1
+       while (true) {
+         uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+         if (got - (this->epoch + this->team.nRanks) <= uint32_t(-1)>>1) break;
+       }
+       this->epoch += this->team.nRanks;
+     }
+ #endif
+   } else {
+     #pragma unroll 1
+     for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+       int peer = i + (this->team.rank <= i ? 1 : 0);
+       cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(this->team.rank, peer));
+       #pragma unroll 1
+       while (true) {
+         uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+         if (got - (this->epoch + 1) <= uint32_t(-1)>>1) break;
+       }
+     }
+     this->epoch += 1;
+   }
+   this->coop.sync();
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename Coop>
+ NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::sync(Coop coop, cuda::memory_order order) {
+   this->arrive(coop, order);
+   this->wait(coop, order);
+ }
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
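
As a usage sketch (hypothetical kernel, not shipped in the package), reusing the CtaCoop wrapper from the earlier LL A2A sketch: the session is opened, used, and closed within one kernel, and it assumes the communicator's lsaBarrier resource was created with at least gridDim.x barriers so each CTA can synchronize independently.

    // Write to symmetric memory, barrier, then read peers' contributions.
    __global__ void producerConsumer(ncclDevComm comm) {
      CtaCoop coop;
      // One barrier per CTA over the LSA team; multimem defaults to false.
      ncclLsaBarrierSession<CtaCoop> bar(coop, comm, ncclTeamTagLsa{},
                                         /*index=*/blockIdx.x);
      // ... write this rank's contribution to symmetric memory ...
      bar.sync(coop, cuda::memory_order_acq_rel); // release our stores, acquire peers'
      // ... now safe to read peers' contributions ...
    }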
--- /dev/null
+++ b/nccl_device/impl/mem_barrier__types.h
@@ -0,0 +1,46 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+ #define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+ #include "../mem_barrier.h"
+ #include "core__types.h"
+
+ struct ncclLsaBarrierHandle {
+   ncclDevResourceHandle_t bufHandle;
+   int nBarriers;
+ };
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLsaBarrierSession_internal {
+   Coop coop;
+   ncclDevComm const& comm;
+   ncclTeam team;
+   ncclLsaBarrierHandle handle;
+   int index;
+   bool multimem;
+   ncclMultimemHandle mmHandle;
+   uint32_t epoch;
+
+   NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) {
+     uint32_t* state;
+     if (multimem) { // multicast
+       state = (uint32_t*)ncclGetResourceBufferMultimemPointer(comm, handle.bufHandle, mmHandle);
+     } else { // unicast
+       state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+     }
+     return state + 2*handle.nBarriers + index;
+   }
+
+   NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) {
+     uint32_t* state = (uint32_t*)ncclGetResourceBufferPeerPointer(comm, handle.bufHandle, team, owner);
+     return state + 3*handle.nBarriers + index*team.nRanks + peer;
+   }
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
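
Putting mcInbox, ucInbox, and the session constructor/destructor together gives the word layout of the barrier state buffer. The helper below is an inferred summary, not part of the package:

    // Inferred word layout of the LSA barrier buffer (one uint32_t per word):
    //   [0, nBarriers)               saved epochs, multimem path
    //   [nBarriers, 2*nBarriers)     saved epochs, unicast path
    //   [2*nBarriers, 3*nBarriers)   multimem arrival counters, one per barrier
    //   [3*nBarriers, 3*nBarriers + nBarriers*nRanks)
    //                                unicast inboxes, indexed by (barrier, sending rank)
    static inline size_t lsaBarrierStateWords(int nBarriers, int nRanks) {
      return (size_t)nBarriers*(3 + nRanks);
    }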
--- /dev/null
+++ b/nccl_device/impl/ptr__funcs.h
@@ -0,0 +1,157 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_PTR__FUNCS_H_
+ #define _NCCL_DEVICE_PTR__FUNCS_H_
+ #include "ptr__types.h"
+ #include "core__funcs.h"
+ #include "comm__types.h"
+
+ #if __cplusplus
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr<T>::ncclSymPtr(ncclWindow_t window, size_t offset):
+   window(window), offset(offset) {
+ }
+
+ template<typename T>
+ template<typename U>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>::operator ncclSymPtr<U>() const {
+   return {window, offset};
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator+=(unsigned long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) + d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned int d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T>& ncclSymPtr<T>::operator-=(unsigned long long d) {
+   offset = reinterpret_cast<size_t>(reinterpret_cast<T*>(offset) - d);
+   return *this;
+ }
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::localPtr() const {
+   return (T*)ncclGetLocalPointer(window, offset);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::lsaPtr(int peer) const {
+   return (T*)ncclGetLsaPointer(window, offset, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::peerPtr(int peer) const {
+   return (T*)ncclGetPeerPointer(window, offset, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::peerPtr(ncclTeam team, int peer) const {
+   return (T*)ncclGetPeerPointer(window, offset, team, peer);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::multimemPtr(ncclMultimemHandle mmHandle) const {
+   return (T*)ncclGetMultimemPointer(window, offset, mmHandle);
+ }
+ #endif
+
+ #if __CUDACC__
+ template<typename T>
+ NCCL_DEVICE_INLINE T* ncclSymPtr<T>::lsaMultimemPtr(ncclDevComm const& comm) const {
+   return (T*)ncclGetLsaMultimemPointer(window, offset, comm);
+ }
+ #endif
+
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator+(ncclSymPtr<T> p, Int d) {
+   return p += d;
+ }
+ template<typename T, typename Int>
+ NCCL_HOST_DEVICE_INLINE ncclSymPtr<T> operator-(ncclSymPtr<T> p, Int d) {
+   return p -= d;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return reinterpret_cast<T*>(a.offset) - reinterpret_cast<T*>(b.offset);
+ }
+
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator==(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return a.window == b.window && a.offset == b.offset;
+ }
+ template<typename T>
+ NCCL_HOST_DEVICE_INLINE bool operator!=(ncclSymPtr<T> a, ncclSymPtr<T> b) {
+   return a.window != b.window || a.offset != b.offset;
+ }
+
+ #endif // __cplusplus
+ #endif // _NCCL_DEVICE_PTR__FUNCS_H_
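
A short illustration of the arithmetic above (the demo function itself is hypothetical; it assumes device code with a valid ncclWindow_t in hand): offset is a byte offset into the window, and the reinterpret_cast round-trip makes `p += d` advance it by d*sizeof(T) bytes, so an ncclSymPtr behaves like a raw T* that can be rebased onto any rank's mapping of the same window.

    __device__ void symPtrDemo(ncclWindow_t win) {
      ncclSymPtr<float> p(win, /*offset=*/0); // offset is in bytes
      p += 4;                                 // offset == 16, i.e. 4 * sizeof(float)
      ptrdiff_t d = (p + 4) - p;              // element difference: 4
      float* mine  = p.localPtr();            // this rank's mapping of the window
      float* peer1 = p.lsaPtr(1);             // peer 1's mapping within the LSA team
      (void)d; (void)mine; (void)peer1;
    }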
--- /dev/null
+++ b/nccl_device/impl/ptr__types.h
@@ -0,0 +1,11 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_PTR__TYPES_H_
+ #define _NCCL_DEVICE_PTR__TYPES_H_
+ #include "../ptr.h"
+ #include "core__types.h"
+ #endif // _NCCL_DEVICE_PTR__TYPES_H_
--- /dev/null
+++ b/nccl_device/ll_a2a.h
@@ -0,0 +1,53 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_LL_A2A_H_
+ #define _NCCL_DEVICE_LL_A2A_H_
+ #include "impl/core__types.h"
+
+ struct ncclLLA2AHandle;
+
+ NCCL_EXTERN_C __host__ int ncclLLA2ACalcSlots(int maxElts, int maxEltSize);
+
+ NCCL_EXTERN_C __host__ ncclResult_t ncclLLA2ACreateRequirement(int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLLA2ASession_internal;
+
+ template<typename Coop>
+ struct ncclLLA2ASession: ncclLLA2ASession_internal<Coop> {
+   NCCL_DEVICE_INLINE ncclLLA2ASession(Coop, ncclDevComm const&, ncclTeam, ncclLLA2AHandle, uint32_t block, int maxElts, bool multimem=false, ncclMultimemHandle mmHandle={});
+
+   NCCL_DEVICE_INLINE ~ncclLLA2ASession();
+
+   ncclLLA2ASession(ncclLLA2ASession const&) = delete; // Sessions are not copyable
+
+   template<typename T>
+   NCCL_DEVICE_INLINE void send(int peer, int slot, T data);
+
+   template<typename T>
+   NCCL_DEVICE_INLINE void bcast(int slot, T data);
+
+   template<typename T>
+   NCCL_DEVICE_INLINE T recv(int slot);
+
+   template<int MinEltCount, int MaxEltCount, typename T>
+   NCCL_DEVICE_INLINE void recvUnrolled(int eltStart, int eltCount, int eltStride, T(&vals)[MaxEltCount]);
+
+   template<int Unroll, typename Elt, typename EltToAcc, typename Reduce>
+   NCCL_DEVICE_INLINE auto recvReduce(int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce red)
+     -> decltype(eltToAcc(nccl::utility::declval<Elt>()));
+
+   // End an all-to-all epoch. For every peer in the team you must have done both
+   // of the following, each of which can be performed by any thread in coop:
+   // 1. Targeted that peer with at least one send().
+   // 2. Received from a slot targeted by that peer.
+   NCCL_DEVICE_INLINE void endEpoch(Coop);
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_LL_A2A_H_
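
For the host side, a minimal sizing sketch (hypothetical wiring: only the two declarations above are real; the 8-byte element size is an arbitrary choice for this example, and how the resulting requirement is consumed depends on the communicator-creation API):

    static ncclResult_t requestLLA2A(int nCtas, int nRanks,
                                     ncclLLA2AHandle_t* outHandle,
                                     ncclDevResourceRequirements_t* outReq) {
      // Enough slots for every rank to send one 8-byte element per epoch.
      int nSlots = ncclLLA2ACalcSlots(/*maxElts=*/nRanks, /*maxEltSize=*/8);
      return ncclLLA2ACreateRequirement(/*nBlocks=*/nCtas, nSlots, outHandle, outReq);
    }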
--- /dev/null
+++ b/nccl_device/mem_barrier.h
@@ -0,0 +1,35 @@
+ /*************************************************************************
+  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+  *
+  * See LICENSE.txt for license information
+  ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_MEM_BARRIER_H_
+ #define _NCCL_DEVICE_MEM_BARRIER_H_
+ #include "impl/core__types.h"
+
+ struct ncclLsaBarrierHandle;
+
+ NCCL_EXTERN_C __host__ ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+ #if __CUDACC__
+ template<typename Coop>
+ struct ncclLsaBarrierSession_internal;
+
+ template<typename Coop>
+ struct ncclLsaBarrierSession: ncclLsaBarrierSession_internal<Coop> {
+   NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeam, ncclLsaBarrierHandle, uint32_t index, bool multimem=false, ncclMultimemHandle mmHandle={});
+
+   NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeamTagLsa, uint32_t index, bool multimem=false);
+
+   NCCL_DEVICE_INLINE ~ncclLsaBarrierSession();
+
+   ncclLsaBarrierSession(ncclLsaBarrierSession const&) = delete; // Sessions are not copyable
+
+   NCCL_DEVICE_INLINE void arrive(Coop, cuda::memory_order);
+   NCCL_DEVICE_INLINE void wait(Coop, cuda::memory_order);
+   NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order);
+ };
+ #endif
+
+ #endif // _NCCL_DEVICE_MEM_BARRIER_H_
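
And a matching host-side sketch for the barrier resource (hypothetical; it requests one barrier per CTA so each block in the device-side sketch above can synchronize independently):

    static ncclResult_t requestLsaBarriers(ncclTeam_t team, int nCtas,
                                           ncclLsaBarrierHandle_t* outHandle,
                                           ncclDevResourceRequirements_t* outReq) {
      return ncclLsaBarrierCreateRequirement(team, /*nBarriers=*/nCtas, outHandle, outReq);
    }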