nvidia_nccl_cu13-2.28.3-py3-none-manylinux_2_18_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,571 @@
+ /*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+ #ifndef NCCL_H_
+ #define NCCL_H_
+
+ #include <cuda_runtime.h>
+ #include <cuda_fp16.h>
+ #if CUDART_VERSION >= 11000
+ #include <cuda_bf16.h>
+ #endif
+ #if __cplusplus && CUDART_VERSION >= 11080
+ #include <cuda_fp8.h>
+ #endif
+
+ #define NCCL_MAJOR 2
+ #define NCCL_MINOR 28
+ #define NCCL_PATCH 3
+ #define NCCL_SUFFIX ""
+
+ #define NCCL_VERSION_CODE 22803
+ #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
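/* Worked example: NCCL_VERSION(2,28,3) = 2*10000 + 28*100 + 3 = 22803, which
 * matches NCCL_VERSION_CODE above; pre-2.9 versions take the first branch,
 * e.g. NCCL_VERSION(2,8,4) = 2*1000 + 8*100 + 4 = 2804. */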
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <limits.h>
+
+ /* Opaque handle to communicator */
+ typedef struct ncclComm* ncclComm_t;
+ typedef struct ncclWindow_vidmem* ncclWindow_t;
+ #define NCCL_COMM_NULL NULL
+
+ #define NCCL_UNIQUE_ID_BYTES 128
+ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+ /* Error type */
+ typedef enum { ncclSuccess = 0,
+ ncclUnhandledCudaError = 1,
+ ncclSystemError = 2,
+ ncclInternalError = 3,
+ ncclInvalidArgument = 4,
+ ncclInvalidUsage = 5,
+ ncclRemoteError = 6,
+ ncclInProgress = 7,
+ ncclNumResults = 8 } ncclResult_t;
+
+ #define NCCL_CONFIG_UNDEF_INT INT_MIN
+ #define NCCL_CONFIG_UNDEF_PTR NULL
+ #define NCCL_SPLIT_NOCOLOR -1
+ #define NCCL_UNDEF_FLOAT -1.0f
+
+ /* Window Registration flags */
+ #define NCCL_WIN_DEFAULT 0x00
+ #define NCCL_WIN_COLL_SYMMETRIC 0x01
+
+ #define NCCL_WIN_REQUIRED_ALIGNMENT 4096
+
+ /* NCCL performance policy */
+ #define NCCL_CTA_POLICY_DEFAULT 0x00
+ #define NCCL_CTA_POLICY_EFFICIENCY 0x01
+ #define NCCL_CTA_POLICY_ZERO 0x02
+
+ /* ncclCommShrink flags */
+ #define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
+ #define NCCL_SHRINK_ABORT 0x01 /* first terminate ongoing parent operations, then shrink the parent communicator */
+
+ /* Communicator configuration. Users can assign values to attributes to specify the
+ * behavior of a communicator. */
+ typedef struct ncclConfig_v22800 {
+ /* attributes that users should never touch. */
+ size_t size;
+ unsigned int magic;
+ unsigned int version;
+ /* attributes that users are able to customize. */
+ int blocking;
+ int cgaClusterSize;
+ int minCTAs;
+ int maxCTAs;
+ const char *netName;
+ int splitShare;
+ int trafficClass;
+ const char *commName;
+ int collnetEnable;
+ int CTAPolicy;
+ int shrinkShare;
+ int nvlsCTAs;
+ int nChannelsPerNetPeer;
+ int nvlinkCentricSched;
+ } ncclConfig_t;
+
+ /* The config initializer must be assigned when a config structure is created.
+ * An uninitialized config will result in an NCCL error. */
+ #define NCCL_CONFIG_INITIALIZER { \
+ sizeof(ncclConfig_t), /* size */ \
+ 0xcafebeef, /* magic */ \
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
+ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
+ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
+ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
+ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
+ NCCL_CONFIG_UNDEF_PTR, /* netName */ \
+ NCCL_CONFIG_UNDEF_INT, /* splitShare */ \
+ NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \
+ NCCL_CONFIG_UNDEF_PTR, /* commName */ \
+ NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \
+ NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \
+ NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \
+ NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \
+ NCCL_CONFIG_UNDEF_INT, /* nChannelsPerNetPeer */ \
+ NCCL_CONFIG_UNDEF_INT, /* nvlinkCentricSched */ \
+ }
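/* Usage sketch (assumes rank, nranks and an ncclUniqueId already distributed
 * out of band): only the customized fields change behavior; fields left at
 * their NCCL_CONFIG_UNDEF_* defaults keep NCCL's built-in behavior. With
 * blocking = 0 the init call may return ncclInProgress, in which case
 * ncclCommGetAsyncError() is polled until the communicator leaves that state. */
#include "nccl.h"

static ncclResult_t init_with_config(int nranks, int rank, ncclUniqueId id,
                                     ncclComm_t* comm) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;  /* mandatory initializer */
  config.blocking = 0;    /* nonblocking communicator */
  config.minCTAs  = 4;    /* illustrative tuning values, not NCCL defaults */
  config.maxCTAs  = 16;
  return ncclCommInitRankConfig(comm, nranks, id, rank, &config);
}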
+
+ /* This struct will be used by the ncclGroupSimulateEnd() API to query information about simulation. */
+ typedef struct ncclSimInfo_v22200 {
+ size_t size;
+ unsigned int magic;
+ unsigned int version;
+ float estimatedTime;
+ } ncclSimInfo_t;
+
+ /* NCCL_SIM_INFO_INITIALIZER must be assigned when a simInfo structure is created.
+ * An uninitialized simInfo will result in an NCCL error. */
+ #define NCCL_SIM_INFO_INITIALIZER { \
+ sizeof(ncclSimInfo_t), /* size */ \
+ 0x74685283, /* magic */ \
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
+ NCCL_UNDEF_FLOAT /* estimated time */ \
+ }
+
+ /* NCCL malloc and free functions for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirements. */
+ ncclResult_t ncclMemAlloc(void** ptr, size_t size);
+ ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ ncclResult_t ncclMemFree(void *ptr);
+ ncclResult_t pncclMemFree(void *ptr);
+
+ /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library.
+ */
+ ncclResult_t ncclGetVersion(int *version);
+ ncclResult_t pncclGetVersion(int *version);
+
+ /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+ /* Create a new communicator (multi thread/process version) with a configuration
+ * set by users. */
+ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+
+ /* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated with a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
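/* Typical multi-process setup (sketch): rank 0 creates the unique id and the
 * application distributes it to every rank through its own out-of-band channel
 * (e.g. MPI or sockets); that transport is assumed and only hinted at below. */
#include <cuda_runtime.h>
#include "nccl.h"

ncclResult_t setup_comm(int nranks, int rank, int cuda_device, ncclComm_t* comm) {
  ncclUniqueId id;
  if (rank == 0) ncclGetUniqueId(&id);
  /* ... distribute 'id' to every rank via the application's own out-of-band
   * channel before continuing ... */
  cudaSetDevice(cuda_device);             /* bind this rank to its GPU first */
  return ncclCommInitRank(comm, nranks, id, rank);
  /* teardown later: ncclCommFinalize(*comm); then ncclCommDestroy(*comm); */
}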
+
+ /* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+ /* Finalize a communicator. ncclCommFinalize flushes all issued communications,
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is globally quiescent and related resources are freed; then,
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+ * itself) without blocking. */
+ ncclResult_t ncclCommFinalize(ncclComm_t comm);
+ ncclResult_t pncclCommFinalize(ncclComm_t comm);
+
+ /* Frees local resources associated with the communicator object. */
+ ncclResult_t ncclCommDestroy(ncclComm_t comm);
+ ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+ /* Frees resources associated with the communicator object and aborts any operations
+ * that might still be running on the device. */
+ ncclResult_t ncclCommAbort(ncclComm_t comm);
+ ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+ /* Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color indicates that the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration. */
+ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
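/* Split sketch (assumes 'comm' is an initialized communicator and 'rank' is
 * this rank's index in it): even and odd ranks end up in two new
 * communicators; using the old rank as key preserves relative ordering. */
#include "nccl.h"

ncclResult_t split_by_parity(ncclComm_t comm, int rank, ncclComm_t* halfcomm) {
  int color = rank % 2;                    /* 0 = even ranks, 1 = odd ranks */
  /* NULL config: the child inherits the parent's configuration */
  return ncclCommSplit(comm, color, /*key=*/rank, halfcomm, NULL);
}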
+
+ /* Shrinks an existing communicator.
+ * Ranks in excludeRanksList will be removed from the existing communicator.
+ * Within the new communicator, ranks will be re-ordered to fill the gaps left by removed ones.
+ * If config is NULL, the new communicator will inherit the original communicator's configuration.
+ * The flag enables NCCL to adapt to various states of the parent communicator; see the NCCL_SHRINK flags. */
+ ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
+ ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
+
+ /* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
+ * Allows using more than one ncclUniqueId (up to one per rank), as indicated by nId, to accelerate the init operation.
+ * The number of ncclUniqueIds and their order must be the same for every rank.
+ */
+ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+ ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+
+ /* Returns a string for each error code. */
+ const char* ncclGetErrorString(ncclResult_t result);
+ const char* pncclGetErrorString(ncclResult_t result);
+
+ /* Returns a human-readable message of the last error that occurred. */
+ const char* ncclGetLastError(ncclComm_t comm);
+ const char* pncclGetLastError(ncclComm_t comm);
+
+ /* Reload environment variables that determine logging. */
+ __attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future")))
+ void ncclResetDebugInit();
+ __attribute__ ((deprecated("pncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future")))
+ void pncclResetDebugInit();
+
+ /* Checks whether the comm has encountered any asynchronous errors */
+ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+ /* Gets the number of ranks in the communicator clique. */
+ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
+ ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+ /* Returns the CUDA device number associated with the communicator. */
+ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
+ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+ /* Returns the user-ordered "rank" associated with the communicator. */
+ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
+ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+ /* Register a CUDA buffer for zero-copy operation */
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
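/* Buffer-registration sketch (assumes an initialized communicator): allocate
 * through ncclMemAlloc so the buffer satisfies NCCL's granularity needs,
 * register it once, reuse it across collectives, then tear it down. */
#include "nccl.h"

ncclResult_t register_buffer(ncclComm_t comm, size_t bytes,
                             void** buff, void** handle) {
  ncclResult_t res = ncclMemAlloc(buff, bytes);
  if (res != ncclSuccess) return res;
  return ncclCommRegister(comm, *buff, bytes, handle);
  /* cleanup path (not shown): ncclCommDeregister(comm, *handle);
   * then ncclMemFree(*buff); */
}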
+
+ /* Deregister CUDA buffer */
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+ /* Register memory window */
+ ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+ ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+
+ /* Deregister symmetric memory */
+ ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+ ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+
+ /* Reduction operation selector */
+ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
+ typedef enum { ncclSum = 0,
+ ncclProd = 1,
+ ncclMax = 2,
+ ncclMin = 3,
+ ncclAvg = 4,
+ /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+ * serves as the least possible value for dynamic ncclRedOp_t's
+ * as constructed by ncclRedOpCreate*** functions. */
+ ncclNumOps = 5,
+ /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+ * It is defined to be the largest signed value (since compilers
+ * are permitted to use signed enums) that won't grow
+ * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
+ * maintain ABI compatibility. */
+ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+ } ncclRedOp_t;
+
+ /* Data types */
+ typedef enum { ncclInt8 = 0, ncclChar = 0,
+ ncclUint8 = 1,
+ ncclInt32 = 2, ncclInt = 2,
+ ncclUint32 = 3,
+ ncclInt64 = 4,
+ ncclUint64 = 5,
+ ncclFloat16 = 6, ncclHalf = 6,
+ ncclFloat32 = 7, ncclFloat = 7,
+ ncclFloat64 = 8, ncclDouble = 8,
+ ncclBfloat16 = 9,
+ ncclFloat8e4m3 = 10,
+ ncclFloat8e5m2 = 11,
+ ncclNumTypes = 12
+ } ncclDataType_t;
+
+ /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
+ typedef enum {
+ /* ncclScalarDevice: The scalar is in device-visible memory and will be
+ * dereferenced while the collective is running. */
+ ncclScalarDevice = 0,
+
+ /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+ * dereferenced before the ncclRedOpCreate***() function returns. */
+ ncclScalarHostImmediate = 1
+ } ncclScalarResidence_t;
+
+ /*
+ * ncclRedOpCreatePreMulSum
+ *
+ * Creates a new reduction operator which pre-multiplies input values by a given
+ * scalar locally before reducing them with peer values via summation. For use
+ * only with collectives launched against *comm* and *datatype*. The
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
+ * will be dereferenced. Upon return, the newly created operator's handle
+ * is stored in *op*.
+ */
+ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+ ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
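/* Pre-multiplied sum sketch (assumes an initialized communicator and float
 * device buffers of 'count' elements): scale every local contribution by 1/n
 * before summation, which yields an average; the host-immediate scalar is
 * consumed before the create call returns. */
#include "nccl.h"

ncclResult_t average_with_premulsum(ncclComm_t comm, const float* sendbuff,
                                    float* recvbuff, size_t count, int nranks,
                                    cudaStream_t stream) {
  float scale = 1.0f / (float)nranks;
  ncclRedOp_t op;
  ncclResult_t res = ncclRedOpCreatePreMulSum(&op, &scale, ncclFloat32,
                                              ncclScalarHostImmediate, comm);
  if (res != ncclSuccess) return res;
  res = ncclAllReduce(sendbuff, recvbuff, count, ncclFloat32, op, comm, stream);
  ncclRedOpDestroy(op, comm);  /* safe once the enqueuing call has returned */
  return res;
}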
+
+ /*
+ * ncclRedOpDestroy
+ *
+ * Destroys the reduction operator *op*. The operator must have been created by
+ * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
+ * destroyed as soon as the last NCCL function which is given that operator returns.
+ */
+ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+
+ /*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or needs to use Group Semantics (see
+ * below).
+ */
+
+ /*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except on the root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of the result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
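/* All-reduce sketch (assumes 'buff' is a device buffer of 'count' floats owned
 * by this rank): passing the same pointer for send and receive performs the
 * reduction in place; the call only enqueues work, so synchronize the stream
 * before reading the result. */
#include <cuda_runtime.h>
#include "nccl.h"

ncclResult_t sum_in_place(ncclComm_t comm, float* buff, size_t count,
                          cudaStream_t stream) {
  ncclResult_t res = ncclAllReduce(buff, buff, count, ncclFloat32, ncclSum,
                                   comm, stream);
  if (res != ncclSuccess) return res;
  cudaStreamSynchronize(stream);   /* wait for the reduction to complete */
  return ncclSuccess;
}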
+
+ /*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+ cudaStream_t stream);
+ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+ cudaStream_t stream);
+
+ /*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
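/* Buffer-sizing sketch (assumes float device buffers and a communicator of
 * nranks ranks): reduce-scatter consumes nranks*recvcount input elements and
 * produces recvcount per rank; the matching all-gather rebuilds the full
 * nranks*recvcount result on every rank. */
#include "nccl.h"

ncclResult_t reduce_then_regather(ncclComm_t comm, const float* full_in,
                                  float* shard, float* full_out,
                                  size_t recvcount, cudaStream_t stream) {
  /* full_in:  nranks*recvcount elements, shard: recvcount elements,
   * full_out: nranks*recvcount elements */
  ncclResult_t res = ncclReduceScatter(full_in, shard, recvcount, ncclFloat32,
                                       ncclSum, comm, stream);
  if (res != ncclSuccess) return res;
  return ncclAllGather(shard, full_out, recvcount, ncclFloat32, comm, stream);
}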
+
+ /*
+ * All-to-All
+ *
+ * Each device sends count values to all other devices and receives count values
+ * from all other devices. Data to send to destination rank j is taken from
+ * sendbuff+j*count and data received from source rank i is placed at
+ * recvbuff+i*count.
+ */
+ ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * Gather
+ *
+ * Each rank sends count elements from sendbuff to the root rank.
+ * On the root rank, data from rank i is placed at recvbuff + i*count.
+ * On non-root ranks, recvbuff is not used.
+ * root is the rank where data will be gathered.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + root * count.
+ */
+ ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * Scatter
+ *
+ * On the root rank, count elements from sendbuff+i*count are sent to rank i.
+ * On non-root ranks, sendbuff is not used.
+ * Each rank receives count elements into recvbuff.
+ * root is the rank that will distribute the data.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + root * count.
+ */
+ ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within an ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+
+ /*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within an ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
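/* Point-to-point sketch (assumes device buffers and a communicator of nranks
 * ranks, with 'rank' this rank's index): every rank sends to its right
 * neighbor and receives from its left neighbor; grouping lets both operations
 * progress concurrently instead of deadlocking. */
#include "nccl.h"

ncclResult_t ring_exchange(ncclComm_t comm, int rank, int nranks,
                           const float* sendbuf, float* recvbuf, size_t count,
                           cudaStream_t stream) {
  int right = (rank + 1) % nranks;
  int left  = (rank + nranks - 1) % nranks;
  ncclGroupStart();
  ncclSend(sendbuf, count, ncclFloat32, right, comm, stream);
  ncclRecv(recvbuf, count, ncclFloat32, left,  comm, stream);
  return ncclGroupEnd();
}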
+
+ /*
+ * Group semantics
+ *
+ * Since NCCL collective calls may perform inter-CPU synchronization, managing
+ * multiple GPUs from a single thread requires "grouping" calls for different
+ * ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also allow fusing multiple operations on the same device
+ * to improve performance (for aggregated collective calls), or to permit
+ * concurrent progress of multiple send/receive operations.
+ */
+
+ /*
+ * Group Start
+ *
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd.
+ */
+ ncclResult_t ncclGroupStart();
+ ncclResult_t pncclGroupStart();
+
+ /*
+ * Group End
+ *
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+ * need to be called after ncclGroupEnd.
+ */
+ ncclResult_t ncclGroupEnd();
+ ncclResult_t pncclGroupEnd();
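/* Single-thread, multi-GPU sketch (assumes 'comms' came from ncclCommInitAll
 * over ndev local devices, with one buffer and one stream per device): the
 * group makes the per-communicator all-reduce calls act as one collective,
 * avoiding the deadlock that sequential blocking calls would cause. */
#include <cuda_runtime.h>
#include "nccl.h"

ncclResult_t allreduce_all_local_gpus(ncclComm_t* comms, int ndev,
                                      float** bufs, size_t count,
                                      cudaStream_t* streams) {
  ncclGroupStart();
  for (int i = 0; i < ndev; i++) {
    cudaSetDevice(i);  /* assumes comms[i] was created on device i */
    ncclAllReduce(bufs[i], bufs[i], count, ncclFloat32, ncclSum,
                  comms[i], streams[i]);
  }
  return ncclGroupEnd();   /* work starts on the streams at group end */
}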
+
+ /*
+ * Group Simulate End
+ *
+ * Simulate an ncclGroupEnd() call and return NCCL's simulation info in a struct.
+ */
+ ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
+ ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
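/* Simulation sketch (assumes the calls between ncclGroupStart and this point
 * describe the workload of interest): the simInfo struct must be initialized
 * with NCCL_SIM_INFO_INITIALIZER before use, and estimatedTime is the field
 * queried afterwards. */
#include "nccl.h"

float estimate_group_time(ncclComm_t comm, float* buf, size_t count,
                          cudaStream_t stream) {
  ncclSimInfo_t info = NCCL_SIM_INFO_INITIALIZER;
  ncclGroupStart();
  ncclAllReduce(buf, buf, count, ncclFloat32, ncclSum, comm, stream);
  ncclGroupSimulateEnd(&info);  /* returns NCCL's simulation info for the group */
  return info.estimatedTime;
}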
+
+ #ifdef __cplusplus
+ } // end extern "C"
+ #endif
+
+ #endif // end include guard
@@ -0,0 +1,10 @@
+ /*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+ #ifndef _NCCL_DEVICE_COMM_H_
+ #define _NCCL_DEVICE_COMM_H_
+ #include "core.h"
+ #endif