nvidia-nccl-cu12 2.23.4__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nvidia/__init__.py ADDED
File without changes
File without changes
File without changes
@@ -0,0 +1,472 @@
1
+ /*************************************************************************
2
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * See LICENSE.txt for license information
5
+ ************************************************************************/
6
+
7
+ #ifndef NCCL_H_
8
+ #define NCCL_H_
9
+
10
+ #include <cuda_runtime.h>
11
+ #include <cuda_fp16.h>
12
+ #if CUDART_VERSION >= 11000
13
+ #include <cuda_bf16.h>
14
+ #endif
15
+
16
+ #define NCCL_MAJOR 2
17
+ #define NCCL_MINOR 23
18
+ #define NCCL_PATCH 4
19
+ #define NCCL_SUFFIX ""
20
+
21
+ #define NCCL_VERSION_CODE 22304
22
+ #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
23
+
24
+ #ifdef __cplusplus
25
+ extern "C" {
26
+ #endif
27
+
28
+ #include <limits.h>
29
+ /* Opaque handle to communicator */
30
+ typedef struct ncclComm* ncclComm_t;
31
+ #define NCCL_COMM_NULL NULL
32
+
33
+ #define NCCL_UNIQUE_ID_BYTES 128
34
+ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
35
+
36
+ /* Error type */
37
+ typedef enum { ncclSuccess = 0,
38
+ ncclUnhandledCudaError = 1,
39
+ ncclSystemError = 2,
40
+ ncclInternalError = 3,
41
+ ncclInvalidArgument = 4,
42
+ ncclInvalidUsage = 5,
43
+ ncclRemoteError = 6,
44
+ ncclInProgress = 7,
45
+ ncclNumResults = 8 } ncclResult_t;
46
+
47
+ #define NCCL_CONFIG_UNDEF_INT INT_MIN
48
+ #define NCCL_CONFIG_UNDEF_PTR NULL
49
+ #define NCCL_SPLIT_NOCOLOR -1
50
+ #define NCCL_UNDEF_FLOAT -1.0f
51
+
52
+ /* Communicator configuration. Users assign values to the customizable attributes
53
+ * below to specify the behavior of a communicator. */
54
+ typedef struct ncclConfig_v21700 {
55
+ /* Attributes that users should never touch; NCCL_CONFIG_INITIALIZER fills them in. */
56
+ size_t size; /* sizeof(ncclConfig_t), set by the initializer */
57
+ unsigned int magic; /* 0xcafebeef, set by the initializer */
58
+ unsigned int version; /* NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), set by the initializer */
59
+ /* Attributes that users are able to customize. NCCL_CONFIG_INITIALIZER leaves each of them undefined (NCCL_CONFIG_UNDEF_INT, or NCCL_CONFIG_UNDEF_PTR for netName). */
60
+ int blocking;
61
+ int cgaClusterSize;
62
+ int minCTAs;
63
+ int maxCTAs;
64
+ const char *netName;
65
+ int splitShare;
66
+ } ncclConfig_t;
67
+
68
+ /* Config initializer must be assigned to initialize config structure when it is created.
69
+ * An uninitialized config will result in an NCCL error. */
70
+ #define NCCL_CONFIG_INITIALIZER { \
71
+ sizeof(ncclConfig_t), /* size */ \
72
+ 0xcafebeef, /* magic */ \
73
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
74
+ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
75
+ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
76
+ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
77
+ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
78
+ NCCL_CONFIG_UNDEF_PTR, /* netName */ \
79
+ NCCL_CONFIG_UNDEF_INT /* splitShare */ \
80
+ }
81
+
82
+ /* This struct is used by the ncclGroupSimulateEnd() API to query information about a simulation. */
83
+ typedef struct ncclSimInfo_v22200 {
84
+ size_t size; /* sizeof(ncclSimInfo_t), set by NCCL_SIM_INFO_INITIALIZER */
85
+ unsigned int magic; /* 0x74685283, set by NCCL_SIM_INFO_INITIALIZER */
86
+ unsigned int version; /* NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), set by NCCL_SIM_INFO_INITIALIZER */
87
+ float estimatedTime; /* initialized to NCCL_UNDEF_FLOAT; NOTE(review): the time unit is not stated in this header - confirm */
88
+ } ncclSimInfo_t;
89
+
90
+ /* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
91
+ * An uninitialized simInfo will result in an NCCL error. */
92
+ #define NCCL_SIM_INFO_INITIALIZER { \
93
+ sizeof(ncclSimInfo_t), /* size */ \
94
+ 0x74685283, /* magic */ \
95
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
96
+ NCCL_UNDEF_FLOAT /* estimated time */ \
97
+ }
98
+
99
+ /* NCCL malloc and free function for all types of NCCL optimizations
100
+ * (e.g. user buffer registration). The actual allocated size might
101
+ * be larger than requested due to granularity requirement. */
102
+ ncclResult_t ncclMemAlloc(void** ptr, size_t size);
103
+ ncclResult_t pncclMemAlloc(void** ptr, size_t size);
104
+
105
+ ncclResult_t ncclMemFree(void *ptr);
106
+ ncclResult_t pncclMemFree(void *ptr);
107
+
108
+ /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
109
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
110
+ * NCCL library
111
+ */
112
+ ncclResult_t ncclGetVersion(int *version);
113
+ ncclResult_t pncclGetVersion(int *version);
114
+
115
+ /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
116
+ * called once and the Id should be distributed to all ranks in the
117
+ * communicator before calling ncclCommInitRank. */
118
+ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
119
+ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
120
+
121
+ /* Create a new communicator (multi thread/process version) with a configuration
122
+ * set by users. */
123
+ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
124
+ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
125
+
126
+ /* Creates a new communicator (multi thread/process version).
127
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
128
+ * Each rank is associated to a CUDA device, which has to be set before calling
129
+ * ncclCommInitRank.
130
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
131
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
132
+ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
133
+ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
134
+
135
+ /* Creates a clique of communicators (single process version).
136
+ * This is a convenience function to create a single-process communicator clique.
137
+ * Returns an array of ndev newly initialized communicators in comm.
138
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
139
+ * If devlist is NULL, the first ndev CUDA devices are used.
140
+ * Order of devlist defines user-order of processors within the communicator. */
141
+ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
142
+ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
143
+
144
+ /* Finalize a communicator. ncclCommFinalize flushes all issued communications,
145
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
146
+ * when the communicator is globally quiescent and related resources are freed; then,
147
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
148
+ * itself) without blocking. */
149
+ ncclResult_t ncclCommFinalize(ncclComm_t comm);
150
+ ncclResult_t pncclCommFinalize(ncclComm_t comm);
151
+
152
+ /* Frees local resources associated with communicator object. */
153
+ ncclResult_t ncclCommDestroy(ncclComm_t comm);
154
+ ncclResult_t pncclCommDestroy(ncclComm_t comm);
155
+
156
+ /* Frees resources associated with communicator object and aborts any operations
157
+ * that might still be running on the device. */
158
+ ncclResult_t ncclCommAbort(ncclComm_t comm);
159
+ ncclResult_t pncclCommAbort(ncclComm_t comm);
160
+
161
+ /* Creates one or more communicators from an existing one.
162
+ * Ranks with the same color will end up in the same communicator.
163
+ * Within the new communicator, key will be used to order ranks.
164
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
165
+ * and will therefore return a NULL communicator.
166
+ * If config is NULL, the new communicator will inherit the original communicator's
167
+ * configuration. */
168
+ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
169
+ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
170
+
171
+ /* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
172
+ * Allows using more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
173
+ * The number of ncclUniqueIds and their order must be the same for every rank.
174
+ */
175
+ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
176
+ ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
177
+
178
+ /* Returns a string for each error code. */
179
+ const char* ncclGetErrorString(ncclResult_t result);
180
+ const char* pncclGetErrorString(ncclResult_t result);
181
+
182
+ /* Returns a human-readable message of the last error that occurred. */
183
+ const char* ncclGetLastError(ncclComm_t comm);
184
+ const char* pncclGetLastError(ncclComm_t comm);
185
+
186
+ /* Checks whether the comm has encountered any asynchronous errors */
187
+ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
188
+ ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
189
+
190
+ /* Gets the number of ranks in the communicator clique. */
191
+ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
192
+ ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
193
+
194
+ /* Returns the cuda device number associated with the communicator. */
195
+ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
196
+ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
197
+
198
+ /* Returns the user-ordered "rank" associated with the communicator. */
199
+ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
200
+ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
201
+
202
+ /* Register CUDA buffer for zero-copy operation */
203
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
204
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
205
+
206
+ /* Deregister CUDA buffer */
207
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
208
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
209
+
210
+ /* Reduction operation selector */
211
+ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
212
+ typedef enum { ncclSum = 0,
213
+ ncclProd = 1,
214
+ ncclMax = 2,
215
+ ncclMin = 3,
216
+ ncclAvg = 4,
217
+ /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
218
+ * serves as the least possible value for dynamic ncclRedOp_t's
219
+ * as constructed by ncclRedOpCreate*** functions. */
220
+ ncclNumOps = 5,
221
+ /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
222
+ * It is defined to be the largest signed value (since compilers
223
+ * are permitted to use signed enums) that won't grow
224
+ * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
225
+ * maintain ABI compatibility. */
226
+ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
227
+ } ncclRedOp_t;
228
+
229
+ /* Data types */
230
+ typedef enum { ncclInt8 = 0, ncclChar = 0,
231
+ ncclUint8 = 1,
232
+ ncclInt32 = 2, ncclInt = 2,
233
+ ncclUint32 = 3,
234
+ ncclInt64 = 4,
235
+ ncclUint64 = 5,
236
+ ncclFloat16 = 6, ncclHalf = 6,
237
+ ncclFloat32 = 7, ncclFloat = 7,
238
+ ncclFloat64 = 8, ncclDouble = 8,
239
+ #if defined(__CUDA_BF16_TYPES_EXIST__)
240
+ ncclBfloat16 = 9,
241
+ ncclNumTypes = 10
242
+ #else
243
+ ncclNumTypes = 9
244
+ #endif
245
+ } ncclDataType_t;
246
+
247
+ /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
248
+ typedef enum {
249
+ /* ncclScalarDevice: The scalar is in device-visible memory and will be
250
+ * dereferenced while the collective is running. */
251
+ ncclScalarDevice = 0,
252
+
253
+ /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
254
+ * dereferenced before the ncclRedOpCreate***() function returns. */
255
+ ncclScalarHostImmediate = 1
256
+ } ncclScalarResidence_t;
257
+
258
+ /*
259
+ * ncclRedOpCreatePreMulSum
260
+ *
261
+ * Creates a new reduction operator which pre-multiplies input values by a given
262
+ * scalar locally before reducing them with peer values via summation. For use
263
+ * only with collectives launched against *comm* and *datatype*. The
264
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
265
+ * will be dereferenced. Upon return, the newly created operator's handle
266
+ * is stored in *op*.
267
+ */
268
+ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
269
+ ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
270
+
271
+ /*
272
+ * ncclRedOpDestroy
273
+ *
274
+ * Destroys the reduction operator *op*. The operator must have been created by
275
+ * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
276
+ * destroyed as soon as the last NCCL function which is given that operator returns.
277
+ */
278
+ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
279
+ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
280
+
281
+ /*
282
+ * Collective communication operations
283
+ *
284
+ * Collective communication operations must be called separately for each
285
+ * communicator in a communicator clique.
286
+ *
287
+ * They return when operations have been enqueued on the CUDA stream.
288
+ *
289
+ * Since they may perform inter-CPU synchronization, each call has to be done
290
+ * from a different thread or process, or need to use Group Semantics (see
291
+ * below).
292
+ */
293
+
294
+ /*
295
+ * Reduce
296
+ *
297
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
298
+ * operation.
299
+ * recvbuff may be NULL on all calls except for root device.
300
+ * root is the rank (not the CUDA device) where data will reside after the
301
+ * operation is complete.
302
+ *
303
+ * In-place operation will happen if sendbuff == recvbuff.
304
+ */
305
+ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
306
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
307
+ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
308
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
309
+
310
+ /*
311
+ * (deprecated) Broadcast (in-place)
312
+ *
313
+ * Copies count values from root to all other devices.
314
+ * root is the rank (not the CUDA device) where data resides before the
315
+ * operation is started.
316
+ *
317
+ * This operation is implicitly in place.
318
+ */
319
+ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
320
+ ncclComm_t comm, cudaStream_t stream);
321
+ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
322
+ ncclComm_t comm, cudaStream_t stream);
323
+
324
+ /*
325
+ * Broadcast
326
+ *
327
+ * Copies count values from root to all other devices.
328
+ * root is the rank (not the CUDA device) where data resides before the
329
+ * operation is started.
330
+ *
331
+ * In-place operation will happen if sendbuff == recvbuff.
332
+ */
333
+ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
334
+ ncclComm_t comm, cudaStream_t stream);
335
+ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
336
+ ncclComm_t comm, cudaStream_t stream);
337
+
338
+ /*
339
+ * All-Reduce
340
+ *
341
+ * Reduces data arrays of length count in sendbuff using op operation, and
342
+ * leaves identical copies of result on each recvbuff.
343
+ *
344
+ * In-place operation will happen if sendbuff == recvbuff.
345
+ */
346
+ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
347
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
348
+ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
349
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
350
+
351
+ /*
352
+ * Reduce-Scatter
353
+ *
354
+ * Reduces data in sendbuff using op operation and leaves reduced result
355
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
356
+ * block of the result.
357
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
358
+ * should have a size of at least nranks*recvcount elements.
359
+ *
360
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
361
+ */
362
+ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
363
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
364
+ cudaStream_t stream);
365
+ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
366
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
367
+ cudaStream_t stream);
368
+
369
+ /*
370
+ * All-Gather
371
+ *
372
+ * Each device gathers sendcount values from other GPUs into recvbuff,
373
+ * receiving data from rank i at offset i*sendcount.
374
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
375
+ * should have a size of at least nranks*sendcount elements.
376
+ *
377
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
378
+ */
379
+ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
380
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
381
+ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
382
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
383
+
384
+ /*
385
+ * Send
386
+ *
387
+ * Send data from sendbuff to rank peer.
388
+ *
389
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
390
+ * rank.
391
+ *
392
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
393
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
394
+ * ncclGroupEnd section.
395
+ */
396
+ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
397
+ ncclComm_t comm, cudaStream_t stream);
398
+ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
399
+ ncclComm_t comm, cudaStream_t stream);
400
+
401
+ /*
402
+ * Receive
403
+ *
404
+ * Receives data sent by rank peer and stores it in recvbuff.
405
+ *
406
+ * The peer rank must issue a matching ncclSend to this rank, using the same
407
+ * datatype and the same count.
408
+ *
409
+ * The GPU blocks on this operation. When several ncclSend and ncclRecv operations
410
+ * have to make progress concurrently in order to complete, fuse them inside a
411
+ * ncclGroupStart/ncclGroupEnd section.
412
+ */
413
+ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
414
+ ncclComm_t comm, cudaStream_t stream);
415
+ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
416
+ ncclComm_t comm, cudaStream_t stream);
417
+
418
+ /*
419
+ * Group semantics
420
+ *
421
+ * When managing multiple GPUs from a single thread, and since NCCL collective
422
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
423
+ * different ranks/devices into a single call.
424
+ *
425
+ * Grouping NCCL calls as being part of the same collective operation is done
426
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
427
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
428
+ * to be complete. Note that for collective communication, ncclGroupEnd only
429
+ * guarantees that the operations are enqueued on the streams, not that
430
+ * the operation is effectively done.
431
+ *
432
+ * Both collective communication and ncclCommInitRank can be used in conjunction
433
+ * with ncclGroupStart/ncclGroupEnd, but not together.
434
+ *
435
+ * Group semantics also allow fusing multiple operations on the same device
436
+ * to improve performance (for aggregated collective calls), or to permit
437
+ * concurrent progress of multiple send/receive operations.
438
+ */
439
+
440
+ /*
441
+ * Group Start
442
+ *
443
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
444
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
445
+ * ncclGroupEnd. Takes no arguments and returns an ncclResult_t status code.
446
+ */
447
+ ncclResult_t ncclGroupStart(void);
448
+ ncclResult_t pncclGroupStart(void);
449
+
450
+ /*
451
+ * Group End
452
+ *
453
+ * End a group call. Start a fused NCCL operation consisting of all calls since
454
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
455
+ * need to be called after ncclGroupEnd. Takes no arguments and returns an ncclResult_t status code.
456
+ */
457
+ ncclResult_t ncclGroupEnd(void);
458
+ ncclResult_t pncclGroupEnd(void);
459
+
460
+ /*
461
+ * Group Simulate End
462
+ *
463
+ * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
464
+ */
465
+ ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
466
+ ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
467
+
468
+ #ifdef __cplusplus
469
+ } // end extern "C"
470
+ #endif
471
+
472
+ #endif // end include guard
@@ -0,0 +1,456 @@
1
+ /*************************************************************************
2
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * See LICENSE.txt for license information
5
+ ************************************************************************/
6
+
7
+ #ifndef NCCL_NET_H_
8
+ #define NCCL_NET_H_
9
+
10
+ #include "nccl.h"
11
+ #include "nccl_common.h"
12
+ #include "net_device.h"
13
+ #include <stdint.h>
14
+
15
+ #define NCCL_NET_HANDLE_MAXSIZE 128
16
+
17
+ #define NCCL_PTR_HOST 0x1
18
+ #define NCCL_PTR_CUDA 0x2
19
+ #define NCCL_PTR_DMABUF 0x4
20
+
21
+ // Maximum number of requests per comm object
22
+ #define NCCL_NET_MAX_REQUESTS 32
23
+
24
+ typedef struct {
25
+ char* name; // Used mostly for logging.
26
+ char* pciPath; // Path to the PCI device in /sys.
27
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
28
+ // cards with multiple PCI functions (Physical or virtual).
29
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
30
+ int regIsGlobal; // regMr is not tied to a particular comm
31
+ int speed; // Port speed in Mbps.
32
+ int port; // Port number.
33
+ float latency; // Network latency
34
+ int maxComms; // Maximum number of comms we can create
35
+ int maxRecvs; // Maximum number of grouped receives.
36
+ ncclNetDeviceType netDeviceType; // Network offload type
37
+ int netDeviceVersion; // Version number for network offload
38
+ } ncclNetProperties_v8_t;
39
+
40
+ typedef ncclNetProperties_v8_t ncclNetProperties_t;
41
+
42
+ typedef struct {
43
+ // Name of the network (mainly for logs)
44
+ const char* name;
45
+ // Initialize the network.
46
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
47
+ // Return the number of adapters.
48
+ ncclResult_t (*devices)(int* ndev);
49
+ // Get various device properties.
50
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
51
+ // Create a receiving object and provide a handle to connect to it. The
52
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
53
+ // between ranks to create a connection.
54
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
55
+ // Connect to a handle and return a sending comm object for that peer.
56
+ // This call must not block for the connection to be established, and instead
57
+ // should return successfully with sendComm == NULL with the expectation that
58
+ // it will be called again until sendComm != NULL.
59
+ // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
60
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
61
+ // Finalize connection establishment after remote peer has called connect.
62
+ // This call must not block for the connection to be established, and instead
63
+ // should return successfully with recvComm == NULL with the expectation that
64
+ // it will be called again until recvComm != NULL.
65
+ // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
66
+ ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
67
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
68
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
69
+ ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
70
+ /* DMA-BUF support */
71
+ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
72
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
73
+ // Asynchronous send to a peer.
74
+ // May return request == NULL if the call cannot be performed (or would block)
75
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
76
+ // Asynchronous recv from a peer.
77
+ // May return request == NULL if the call cannot be performed (or would block)
78
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
79
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
80
+ // visible to the GPU
81
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
82
+ // Test whether a request is complete. If sizes is not NULL, it returns the
83
+ // number of bytes sent/received.
84
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
85
+ // Close and free send/recv comm objects
86
+ ncclResult_t (*closeSend)(void* sendComm);
87
+ ncclResult_t (*closeRecv)(void* recvComm);
88
+ ncclResult_t (*closeListen)(void* listenComm);
89
+
90
+ // Copy the given mhandle to a dptr in a format usable by this plugin's device code
91
+ ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
92
+
93
+ // Notify the plugin that a recv has completed by the device
94
+ ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
95
+ } ncclNet_v8_t;
96
+
97
+ typedef ncclNet_v8_t ncclNet_t;
98
+
99
+ #define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
100
+
101
+ typedef struct {
102
+ void* mhandle;
103
+ void* address;
104
+ uint32_t size;
105
+ } ncclNetSGE_v8_t;
106
+
107
+ typedef struct {
108
+ // Name of the collective network (mainly for logs)
109
+ const char* name;
110
+ // Initialize the collective network.
111
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
112
+ // Return the number of adapters capable of doing collective operations.
113
+ // If ndev returns 0, all other functions might be set to NULL.
114
+ ncclResult_t (*devices)(int* ndev);
115
+ // Get various device properties.
116
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
117
+ // Create a receiving object and provide a handle to connect to it. The
118
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
119
+ // between ranks to create connections.
120
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
121
+ // Create a group for collective operations. handles have been created
122
+ // using listen() above. rank indicates caller's rank in the collective network.
123
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
124
+ // Returns whether a reduction operation on a data type is supported.
125
+ // 1 for supported, 0 otherwise.
126
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
127
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
128
+ ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
129
+ /* DMA-BUF support */
130
+ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
131
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
132
+ // Performs an asynchronous allreduce operation on the collective group.
133
+ // May return request == NULL if the call cannot be performed (or would block).
134
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
135
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
136
+ ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
137
+ size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
138
+ void* sendMhandle, void** request);
139
+ ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
140
+ size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
141
+ ncclDataType_t dataType, ncclRedOp_t redOp,
142
+ void* recvMhandle, void** request);
143
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
144
+ // visible to the GPU
145
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
146
+ // Test whether a request is complete. If size is not NULL, it returns the
147
+ // number of bytes sent/received.
148
+ ncclResult_t (*test)(void* request, int* done, int* size);
149
+ // Close and free collective comm objects
150
+ ncclResult_t (*closeColl)(void* collComm);
151
+ ncclResult_t (*closeListen)(void* listenComm);
152
+ } ncclCollNet_v8_t;
153
+
154
+ typedef ncclCollNet_v8_t ncclCollNet_t;
155
+
156
+ #define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
157
+
158
+ typedef struct {
159
+ char* name; // Used mostly for logging.
160
+ char* pciPath; // Path to the PCI device in /sys.
161
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
162
+ // cards with multiple PCI functions (Physical or virtual).
163
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
164
+ int speed; // Port speed in Mbps.
165
+ int port; // Port number.
166
+ float latency; // Network latency
167
+ int maxComms; // Maximum number of comms we can create
168
+ int maxRecvs; // Maximum number of grouped receives.
169
+ ncclNetDeviceType netDeviceType; // Network offload type
170
+ int netDeviceVersion; // Version number for network offload
171
+ } ncclNetProperties_v7_t;
172
+ // v7 point-to-point network interface: a name plus a table of callbacks, each reporting status through its ncclResult_t return value.
172
+ typedef struct {
173
+ // Name of the network (mainly for logs)
174
+ const char* name;
175
+ // Initialize the network.
176
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
177
+ // Return the number of adapters.
178
+ ncclResult_t (*devices)(int* ndev);
179
+ // Get various device properties.
180
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
181
+ // Create a receiving object and provide a handle to connect to it. The
182
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
183
+ // between ranks to create a connection.
184
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
185
+ // Connect to a handle and return a sending comm object for that peer.
186
+ // This call must not block for the connection to be established, and instead
187
+ // should return successfully with sendComm == NULL with the expectation that
188
+ // it will be called again until sendComm != NULL.
189
+ // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
190
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
191
+ // Finalize connection establishment after remote peer has called connect.
192
+ // This call must not block for the connection to be established, and instead
193
+ // should return successfully with recvComm == NULL with the expectation that
194
+ // it will be called again until recvComm != NULL.
195
+ // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
196
+ ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
197
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
198
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
199
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
200
+ /* DMA-BUF support: variant of regMr that takes a size_t size plus an offset and an fd (presumably the DMA-BUF file descriptor; confirm). */
201
+ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
202
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
203
+ // Asynchronous send to a peer.
204
+ // May return request == NULL if the call cannot be performed (or would block)
205
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
206
+ // Asynchronous recv from a peer. data, sizes, tags and mhandles presumably hold n entries each (confirm).
207
+ // May return request == NULL if the call cannot be performed (or would block)
208
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
209
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
210
+ // visible to the GPU
211
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
212
+ // Test whether a request is complete. If sizes is not NULL, it returns the
213
+ // number of bytes sent/received.
214
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
215
+ // Close and free send/recv comm objects
216
+ ncclResult_t (*closeSend)(void* sendComm);
217
+ ncclResult_t (*closeRecv)(void* recvComm);
218
+ ncclResult_t (*closeListen)(void* listenComm);
219
+
220
+ // Copy the given mhandle to a dptr in a format usable by this plugin's device code
221
+ ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
222
+
223
+ // Notify the plugin that a recv has been completed by the device
224
+ ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
225
+ } ncclNet_v7_t;
227
+ // v7 collective network interface: a name plus a table of callbacks, each reporting status through its ncclResult_t return value.
227
+ typedef struct {
228
+ // Name of the collective network (mainly for logs)
229
+ const char* name;
230
+ // Initialize the collective network.
231
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
232
+ // Return the number of adapters capable of doing collective operations.
233
+ // If devices() reports 0 adapters through ndev, all other functions might be set to NULL.
234
+ ncclResult_t (*devices)(int* ndev);
235
+ // Get various device properties.
236
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
237
+ // Create a receiving object and provide a handle to connect to it. The
238
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
239
+ // between ranks to create connections.
240
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
241
+ // Create a group for collective operations. handles have been created
242
+ // using listen() above. rank indicates caller's rank in the collective network.
243
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
244
+ // Returns whether a reduction operation on a data type is supported.
245
+ // The answer is written to *supported: 1 for supported, 0 otherwise.
246
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
247
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
248
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
249
+ /* DMA-BUF support: variant of regMr that takes a size_t size plus an offset and an fd (presumably the DMA-BUF file descriptor; confirm). */
250
+ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
251
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
252
+ // Performs an asynchronous allreduce operation on the collective group.
253
+ // May return request == NULL if the call cannot be performed (or would block).
254
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
255
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
256
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
257
+ // visible to the GPU
258
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
259
+ // Test whether a request is complete. If size is not NULL, it returns the
260
+ // number of bytes sent/received.
261
+ ncclResult_t (*test)(void* request, int* done, int* size);
262
+ // Close and free collective comm objects
263
+ ncclResult_t (*closeColl)(void* collComm);
264
+ ncclResult_t (*closeListen)(void* listenComm);
265
+ } ncclCollNet_v7_t;
267
+ // NOTE(review): what this v6 limit of 8 applies to (per comm, per device, ...) is not stated in the visible part of this header; confirm before relying on it.
268
+ #define NCCL_NET_MAX_REQUESTS_V6 8
269
+
270
+ // v6 device properties, kept for backwards compatibility. Same fields as ncclNetProperties_v7_t minus netDeviceType and netDeviceVersion; also used by the v5 interfaces.
271
+ typedef struct {
272
+ char* name; // Used mostly for logging.
273
+ char* pciPath; // Path to the PCI device in /sys.
274
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
275
+ // cards with multiple PCI functions (Physical or virtual).
276
+ int ptrSupport; // Supported pointer types: [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
277
+ int speed; // Port speed in Mbps.
278
+ int port; // Port number.
279
+ float latency; // Network latency (unit not stated in this header; confirm before comparing values).
280
+ int maxComms; // Maximum number of comms we can create
281
+ int maxRecvs; // Maximum number of grouped receives (presumably bounds the n argument of irecv; confirm).
282
+ } ncclNetProperties_v6_t;
283
+ // v6 point-to-point network interface, kept for backwards compatibility. Compared with ncclNet_v7_t: connect/accept take no device-offload handle, and getDeviceMr/irecvConsumed are absent.
284
+ typedef struct {
285
+ // Name of the network (mainly for logs)
286
+ const char* name;
287
+ // Initialize the network.
288
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
289
+ // Return the number of adapters.
290
+ ncclResult_t (*devices)(int* ndev);
291
+ // Get various device properties.
292
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
293
+ // Create a receiving object and provide a handle to connect to it. The
294
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
295
+ // between ranks to create a connection.
296
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
297
+ // Connect to a handle and return a sending comm object for that peer.
298
+ // This call must not block for the connection to be established, and instead
299
+ // should return successfully with sendComm == NULL with the expectation that
300
+ // it will be called again until sendComm != NULL.
301
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
302
+ // Finalize connection establishment after remote peer has called connect.
303
+ // This call must not block for the connection to be established, and instead
304
+ // should return successfully with recvComm == NULL with the expectation that
305
+ // it will be called again until recvComm != NULL.
306
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
307
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
308
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
309
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
310
+ /* DMA-BUF support: variant of regMr that takes a size_t size plus an offset and an fd (presumably the DMA-BUF file descriptor; confirm). */
311
+ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
312
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
313
+ // Asynchronous send to a peer.
314
+ // May return request == NULL if the call cannot be performed (or would block)
315
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
316
+ // Asynchronous recv from a peer. data, sizes, tags and mhandles presumably hold n entries each (confirm).
317
+ // May return request == NULL if the call cannot be performed (or would block)
318
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
319
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
320
+ // visible to the GPU
321
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
322
+ // Test whether a request is complete. If sizes is not NULL, it returns the
323
+ // number of bytes sent/received.
324
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
325
+ // Close and free send/recv comm objects
326
+ ncclResult_t (*closeSend)(void* sendComm);
327
+ ncclResult_t (*closeRecv)(void* recvComm);
328
+ ncclResult_t (*closeListen)(void* listenComm);
329
+ } ncclNet_v6_t;
330
+ // v6 collective network interface, kept for backwards compatibility. Same callbacks as ncclCollNet_v7_t, but getProperties takes ncclNetProperties_v6_t.
331
+ typedef struct {
332
+ // Name of the collective network (mainly for logs)
333
+ const char* name;
334
+ // Initialize the collective network.
335
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
336
+ // Return the number of adapters capable of doing collective operations.
337
+ // If devices() reports 0 adapters through ndev, all other functions might be set to NULL.
338
+ ncclResult_t (*devices)(int* ndev);
339
+ // Get various device properties.
340
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
341
+ // Create a receiving object and provide a handle to connect to it. The
342
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
343
+ // between ranks to create connections.
344
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
345
+ // Create a group for collective operations. handles have been created
346
+ // using listen() above. rank indicates caller's rank in the collective network.
347
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
348
+ // Returns whether a reduction operation on a data type is supported.
349
+ // The answer is written to *supported: 1 for supported, 0 otherwise.
350
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
351
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
352
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
353
+ /* DMA-BUF support: variant of regMr that takes a size_t size plus an offset and an fd (presumably the DMA-BUF file descriptor; confirm). */
354
+ ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
355
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
356
+ // Performs an asynchronous allreduce operation on the collective group.
357
+ // May return request == NULL if the call cannot be performed (or would block).
358
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
359
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
360
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
361
+ // visible to the GPU
362
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
363
+ // Test whether a request is complete. If size is not NULL, it returns the
364
+ // number of bytes sent/received.
365
+ ncclResult_t (*test)(void* request, int* done, int* size);
366
+ // Close and free collective comm objects
367
+ ncclResult_t (*closeColl)(void* collComm);
368
+ ncclResult_t (*closeListen)(void* listenComm);
369
+ } ncclCollNet_v6_t;
370
+
371
+ // v5 point-to-point network interface, kept for backwards compatibility. Same callbacks as ncclNet_v6_t except that regMrDmaBuf is absent.
372
+ typedef struct {
373
+ // Name of the network (mainly for logs)
374
+ const char* name;
375
+ // Initialize the network.
376
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
377
+ // Return the number of adapters.
378
+ ncclResult_t (*devices)(int* ndev);
379
+ // Get various device properties.
380
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
381
+ // Create a receiving object and provide a handle to connect to it. The
382
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
383
+ // between ranks to create a connection.
384
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
385
+ // Connect to a handle and return a sending comm object for that peer.
386
+ // This call must not block for the connection to be established, and instead
387
+ // should return successfully with sendComm == NULL with the expectation that
388
+ // it will be called again until sendComm != NULL.
389
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
390
+ // Finalize connection establishment after remote peer has called connect.
391
+ // This call must not block for the connection to be established, and instead
392
+ // should return successfully with recvComm == NULL with the expectation that
393
+ // it will be called again until recvComm != NULL.
394
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
395
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
396
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
397
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
398
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
399
+ // Asynchronous send to a peer.
400
+ // May return request == NULL if the call cannot be performed (or would block)
401
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
402
+ // Asynchronous recv from a peer. data, sizes, tags and mhandles presumably hold n entries each (confirm).
403
+ // May return request == NULL if the call cannot be performed (or would block)
404
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
405
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
406
+ // visible to the GPU
407
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
408
+ // Test whether a request is complete. If sizes is not NULL, it returns the
409
+ // number of bytes sent/received.
410
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
411
+ // Close and free send/recv comm objects
412
+ ncclResult_t (*closeSend)(void* sendComm);
413
+ ncclResult_t (*closeRecv)(void* recvComm);
414
+ ncclResult_t (*closeListen)(void* listenComm);
415
+ } ncclNet_v5_t;
416
+
417
+ // v5 collective network interface, kept for backwards compatibility. Same callbacks as ncclCollNet_v6_t except that regMrDmaBuf is absent.
418
+ typedef struct {
419
+ // Name of the collective network (mainly for logs)
420
+ const char* name;
421
+ // Initialize the collective network.
422
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
423
+ // Return the number of adapters capable of doing collective operations.
424
+ // If devices() reports 0 adapters through ndev, all other functions might be set to NULL.
425
+ ncclResult_t (*devices)(int* ndev);
426
+ // Get various device properties.
427
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
428
+ // Create a receiving object and provide a handle to connect to it. The
429
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
430
+ // between ranks to create connections.
431
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
432
+ // Create a group for collective operations. handles have been created
433
+ // using listen() above. rank indicates caller's rank in the collective network.
434
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
435
+ // Returns whether a reduction operation on a data type is supported.
436
+ // The answer is written to *supported: 1 for supported, 0 otherwise.
437
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
438
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
439
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
440
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
441
+ // Performs an asynchronous allreduce operation on the collective group.
442
+ // May return request == NULL if the call cannot be performed (or would block).
443
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
444
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
445
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
446
+ // visible to the GPU
447
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
448
+ // Test whether a request is complete. If size is not NULL, it returns the
449
+ // number of bytes sent/received.
450
+ ncclResult_t (*test)(void* request, int* done, int* size);
451
+ // Close and free collective comm objects
452
+ ncclResult_t (*closeColl)(void* collComm);
453
+ ncclResult_t (*closeListen)(void* listenComm);
454
+ } ncclCollNet_v5_t;
455
+
456
+ #endif // end include guard
File without changes
Binary file
@@ -0,0 +1,39 @@
1
+
2
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in the
11
+ documentation and/or other materials provided with the distribution.
12
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
13
+ Laboratory, the U.S. Department of Energy, nor the names of their
14
+ contributors may be used to endorse or promote products derived
15
+ from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
18
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ The U.S. Department of Energy funded the development of this software
30
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
31
+
32
+
33
+ This code also includes files from the NVIDIA Tools Extension SDK project.
34
+
35
+ See:
36
+
37
+ https://github.com/NVIDIA/NVTX
38
+
39
+ for more information and license details.
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.1
2
+ Name: nvidia-nccl-cu12
3
+ Version: 2.23.4
4
+ Summary: NVIDIA Collective Communication Library (NCCL) Runtime
5
+ Home-page: https://developer.nvidia.com/cuda-zone
6
+ Author: Nvidia CUDA Installer Team
7
+ Author-email: compute_installer@nvidia.com
8
+ License: BSD-3-Clause
9
+ Keywords: cuda,nvidia,runtime,machine learning,deep learning
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Education
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: Other/Proprietary License
15
+ Classifier: Natural Language :: English
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.5
18
+ Classifier: Programming Language :: Python :: 3.6
19
+ Classifier: Programming Language :: Python :: 3.7
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3 :: Only
25
+ Classifier: Topic :: Scientific/Engineering
26
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Topic :: Software Development
29
+ Classifier: Topic :: Software Development :: Libraries
30
+ Classifier: Operating System :: Microsoft :: Windows
31
+ Classifier: Operating System :: POSIX :: Linux
32
+ Requires-Python: >=3
33
+ License-File: License.txt
34
+
35
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on any platform using PCIe, NVLink, NVSwitch, as well as networking using InfiniBand Verbs or TCP/IP sockets.
@@ -0,0 +1,12 @@
1
+ nvidia/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ nvidia/nccl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ nvidia/nccl/include/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ nvidia/nccl/include/nccl.h,sha256=-xnqDygmn2YmOkYW3bakiRfIsnvwUAptTJeZoMWsXdM,20225
5
+ nvidia/nccl/include/nccl_net.h,sha256=Q2yMEZBE6uKkX_nduRb3TaU64hC1jjOMstCypTKGSdk,25896
6
+ nvidia/nccl/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ nvidia/nccl/lib/libnccl.so.2,sha256=cpNUzYaGHk_iB-IyLjS-_cVxLENE_6MKeZS-bGDm3zU,267885488
8
+ nvidia_nccl_cu12-2.23.4.dist-info/License.txt,sha256=DwF0prTgszrCY3W_cpUzB1sy9MUaW2gCo9dC19zcmnY,1895
9
+ nvidia_nccl_cu12-2.23.4.dist-info/METADATA,sha256=gInA2V7kgbe5Q2WGriCuxDlUeVD70aQvto0S5vPqd8o,1822
10
+ nvidia_nccl_cu12-2.23.4.dist-info/WHEEL,sha256=SFSBTBkI2Eso5pk64VndQikelT62YOGkTuu55QuBsyk,109
11
+ nvidia_nccl_cu12-2.23.4.dist-info/top_level.txt,sha256=fTkAtiFuL16nUrB9ytDDtpytz2t0B4NvYTnRzwAhO14,7
12
+ nvidia_nccl_cu12-2.23.4.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (74.1.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-manylinux2014_aarch64
5
+
@@ -0,0 +1 @@
1
+ nvidia