megatron-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,716 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ # Parts of the code here are adapted from PyTorch
4
+ # repo: https://github.com/pytorch/pytorch
5
+
6
+ import math
7
+ import os
8
+ from typing import Optional
9
+ import warnings
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ import torch.nn.init as init
14
+ from torch.nn.parameter import Parameter
15
+
16
+ from torch.cuda.amp import custom_fwd, custom_bwd
17
+
18
+ from megatron.core.parallel_state import (
19
+ get_tensor_model_parallel_rank,
20
+ get_tensor_model_parallel_world_size,
21
+ get_tensor_model_parallel_group,
22
+ get_global_memory_buffer,
23
+ )
24
+ from .mappings import (
25
+ copy_to_tensor_model_parallel_region,
26
+ gather_from_tensor_model_parallel_region,
27
+ gather_from_sequence_parallel_region,
28
+ reduce_from_tensor_model_parallel_region,
29
+ scatter_to_tensor_model_parallel_region,
30
+ reduce_scatter_to_sequence_parallel_region,
31
+ )
32
+
33
+ from .random import get_cuda_rng_tracker
34
+ from .utils import (
35
+ divide,
36
+ split_tensor_along_last_dim,
37
+ VocabUtility,
38
+ )
39
+
40
+ _grad_accum_fusion_available = True
41
+ try:
42
+ import fused_weight_gradient_mlp_cuda
43
+ except ImportError:
44
+ _grad_accum_fusion_available = False
45
+
46
+ _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
47
+ 'partition_dim': -1,
48
+ 'partition_stride': 1}
49
+
50
+ def param_is_not_tensor_parallel_duplicate(param):
51
+ return (hasattr(param, 'tensor_model_parallel') and
52
+ param.tensor_model_parallel) or (
53
+ get_tensor_model_parallel_rank() == 0)
54
+
55
+
56
+ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
57
+ # Make sure the attributes are not set.
58
+ for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
59
+ assert not hasattr(tensor, attribute)
60
+ # Set the attributes.
61
+ setattr(tensor, 'tensor_model_parallel', is_parallel)
62
+ setattr(tensor, 'partition_dim', dim)
63
+ setattr(tensor, 'partition_stride', stride)
64
+
65
+
66
+ def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
67
+ def maybe_set(attribute, value):
68
+ if not hasattr(tensor, attribute):
69
+ setattr(tensor, attribute, value)
70
+ for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
71
+ maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
72
+
73
+
74
+ def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
75
+ def maybe_copy(attribute):
76
+ if hasattr(source_tensor, attribute):
77
+ setattr(destination_tensor, attribute,
78
+ getattr(source_tensor, attribute))
79
+ for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
80
+ maybe_copy(attribute)
81
+
82
+
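For illustration, a minimal sketch of how the attribute helpers above behave on ordinary parameters; it assumes this module is importable as megatron.core.tensor_parallel.layers and is not part of the packaged file:

import torch
from torch.nn.parameter import Parameter
from megatron.core.tensor_parallel.layers import (  # assumed import path
    set_tensor_model_parallel_attributes,
    set_defaults_if_not_set_tensor_model_parallel_attributes,
    copy_tensor_model_parallel_attributes,
)

# Tag a sharded weight: partitioned along dim 0 with stride 1.
w = Parameter(torch.empty(8, 4))
set_tensor_model_parallel_attributes(w, is_parallel=True, dim=0, stride=1)

# An untagged parameter picks up the defaults (not parallel, partition_dim=-1).
b = Parameter(torch.empty(8))
set_defaults_if_not_set_tensor_model_parallel_attributes(b)
assert b.tensor_model_parallel is False and b.partition_dim == -1

# Copying carries the metadata over to a new tensor, e.g. a cloned or fp32 copy.
w_copy = Parameter(torch.empty(8, 4))
copy_tensor_model_parallel_attributes(w_copy, w)
assert w_copy.tensor_model_parallel and w_copy.partition_dim == 0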
83
+ def _initialize_affine_weight_gpu(weight, init_method,
84
+ partition_dim, stride=1):
85
+ """Initialize affine weight for model parallel on GPU."""
86
+
87
+ set_tensor_model_parallel_attributes(tensor=weight,
88
+ is_parallel=True,
89
+ dim=partition_dim,
90
+ stride=stride)
91
+
92
+ with get_cuda_rng_tracker().fork():
93
+ init_method(weight)
94
+
95
+
96
+ def _initialize_affine_weight_cpu(weight, output_size, input_size,
97
+ per_partition_size, partition_dim,
98
+ init_method, stride=1,
99
+ return_master_weight=False,
100
+ *, params_dtype=torch.float32):
101
+ """Initialize affine weight for model parallel.
102
+
103
+ Build the master weight on all processes and scatter
104
+ the relevant chunk."""
105
+
106
+ set_tensor_model_parallel_attributes(tensor=weight,
107
+ is_parallel=True,
108
+ dim=partition_dim,
109
+ stride=stride)
110
+
111
+ # Initialize master weight
112
+ master_weight = torch.empty(output_size, input_size,
113
+ dtype=torch.float,
114
+ requires_grad=False)
115
+ init_method(master_weight)
116
+ master_weight = master_weight.to(dtype=params_dtype)
117
+
118
+ # Split and copy
119
+ per_partition_per_stride_size = divide(per_partition_size, stride)
120
+ weight_list = torch.split(master_weight, per_partition_per_stride_size,
121
+ dim=partition_dim)
122
+ rank = get_tensor_model_parallel_rank()
123
+ world_size = get_tensor_model_parallel_world_size()
124
+ my_weight_list = weight_list[rank::world_size]
125
+
126
+ with torch.no_grad():
127
+ torch.cat(my_weight_list, dim=partition_dim, out=weight)
128
+ if return_master_weight:
129
+ return master_weight
130
+ return None
131
+
132
+
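To make the split-and-scatter above concrete, here is a standalone sketch of the chunk selection, with the tensor-parallel state replaced by explicit rank and world-size values (illustrative only):

import torch

output_size, input_size = 8, 4
world_size, rank, stride = 2, 0, 1
per_partition_size = output_size // world_size              # rows owned by each rank
per_partition_per_stride_size = per_partition_size // stride

master_weight = torch.arange(output_size * input_size, dtype=torch.float32)
master_weight = master_weight.view(output_size, input_size)

# Same selection as _initialize_affine_weight_cpu: split into stride-sized
# chunks along the partition dim, then take every world_size-th chunk.
weight_list = torch.split(master_weight, per_partition_per_stride_size, dim=0)
my_weight_list = weight_list[rank::world_size]
weight = torch.cat(my_weight_list, dim=0)
print(weight.shape)  # torch.Size([4, 4]) -- this rank's shard of the (8, 4) master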
133
+ class VocabParallelEmbedding(torch.nn.Module):
134
+ """Embedding parallelized in the vocabulary dimension.
135
+
136
+ This is mainly adapted from torch.nn.Embedding and all the default
137
+ values are kept.
138
+ Arguments:
139
+ num_embeddings: vocabulary size.
140
+ embedding_dim: size of hidden state.
141
+
142
+ Keyword Arguments:
143
+ init_method: method to initialize weights.
144
+ params_dtype
145
+ use_cpu_initialization
146
+ perform_initialization
147
+ """
148
+
149
+ def __init__(self, num_embeddings: int, embedding_dim: int, *,
150
+ init_method=init.xavier_normal_,
151
+ params_dtype: torch.dtype=torch.float32,
152
+ use_cpu_initialization: bool=False,
153
+ perform_initialization: bool=True):
154
+ super(VocabParallelEmbedding, self).__init__()
155
+ # Keep the input dimensions.
156
+ self.num_embeddings = num_embeddings
157
+ self.embedding_dim = embedding_dim
158
+ # Set the defaults for compatibility.
159
+ self.padding_idx = None
160
+ self.max_norm = None
161
+ self.norm_type = 2.
162
+ self.scale_grad_by_freq = False
163
+ self.sparse = False
164
+ self._weight = None
165
+ self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
166
+ # Divide the weight matrix along the vocabulary dimension.
167
+ self.vocab_start_index, self.vocab_end_index = \
168
+ VocabUtility.vocab_range_from_global_vocab_size(
169
+ self.num_embeddings, get_tensor_model_parallel_rank(),
170
+ self.tensor_model_parallel_size)
171
+ self.num_embeddings_per_partition = self.vocab_end_index - \
172
+ self.vocab_start_index
173
+
174
+ # Allocate weights and initialize.
175
+ if use_cpu_initialization:
176
+ self.weight = Parameter(torch.empty(
177
+ self.num_embeddings_per_partition, self.embedding_dim,
178
+ dtype=params_dtype))
179
+ if perform_initialization:
180
+ _initialize_affine_weight_cpu(
181
+ self.weight, self.num_embeddings, self.embedding_dim,
182
+ self.num_embeddings_per_partition, 0, init_method,
183
+ params_dtype=params_dtype)
184
+ else:
185
+ self.weight = Parameter(torch.empty(
186
+ self.num_embeddings_per_partition, self.embedding_dim,
187
+ device=torch.cuda.current_device(), dtype=params_dtype))
188
+ if perform_initialization:
189
+ _initialize_affine_weight_gpu(self.weight, init_method,
190
+ partition_dim=0, stride=1)
191
+
192
+ def forward(self, input_):
193
+ if self.tensor_model_parallel_size > 1:
194
+ # Build the mask.
195
+ input_mask = (input_ < self.vocab_start_index) | \
196
+ (input_ >= self.vocab_end_index)
197
+ # Mask the input.
198
+ masked_input = input_.clone() - self.vocab_start_index
199
+ masked_input[input_mask] = 0
200
+ else:
201
+ masked_input = input_
202
+ # Get the embeddings.
203
+ output_parallel = F.embedding(masked_input, self.weight,
204
+ self.padding_idx, self.max_norm,
205
+ self.norm_type, self.scale_grad_by_freq,
206
+ self.sparse)
207
+ # Mask the output embedding.
208
+ if self.tensor_model_parallel_size > 1:
209
+ output_parallel[input_mask, :] = 0.0
210
+ # Reduce across all the model parallel GPUs.
211
+ output = reduce_from_tensor_model_parallel_region(output_parallel)
212
+ return output
213
+
214
+
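The forward pass above relies on masking out-of-range token ids and letting the cross-rank all-reduce fill in the missing rows. A small self-contained sketch of that masking, with the parallel state replaced by explicit values (illustrative only):

import torch
import torch.nn.functional as F

# Pretend this is rank 1 of 4 with a global vocabulary of 16 tokens.
vocab_start_index, vocab_end_index = 4, 8       # this rank owns ids [4, 8)
weight = torch.randn(4, 8)                      # local shard: 4 rows, hidden size 8

input_ = torch.tensor([[2, 5, 7, 12]])
input_mask = (input_ < vocab_start_index) | (input_ >= vocab_end_index)
masked_input = input_.clone() - vocab_start_index   # shift into local row indices
masked_input[input_mask] = 0                        # out-of-range ids point at row 0 ...

output_parallel = F.embedding(masked_input, weight)
output_parallel[input_mask, :] = 0.0                # ... and their rows are zeroed here
# In the real layer, reduce_from_tensor_model_parallel_region() then sums the
# partial results across ranks, so every position receives its true embedding.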
215
+ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
216
+ """See linear_with_grad_accumulation_and_async_allreduce"""
217
+
218
+ @staticmethod
219
+ @custom_fwd
220
+ def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
221
+ async_grad_allreduce, sequence_parallel):
222
+ ctx.save_for_backward(input, weight)
223
+ ctx.use_bias = bias is not None
224
+ ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
225
+ ctx.async_grad_allreduce = async_grad_allreduce
226
+ ctx.sequence_parallel = sequence_parallel
227
+
228
+ if sequence_parallel:
229
+ world_size = get_tensor_model_parallel_world_size()
230
+ dim_size = list(input.size())
231
+ dim_size[0] = dim_size[0] * world_size
232
+
233
+ all_gather_buffer = \
234
+ get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
235
+ torch.distributed._all_gather_base(
236
+ all_gather_buffer,
237
+ input,
238
+ group=get_tensor_model_parallel_group())
239
+ total_input = all_gather_buffer
240
+ else:
241
+ total_input = input
242
+
243
+ output = torch.matmul(total_input, weight.t())
244
+ if bias is not None:
245
+ output = output + bias
246
+ return output
247
+
248
+ @staticmethod
249
+ @custom_bwd
250
+ def backward(ctx, grad_output):
251
+ input, weight = ctx.saved_tensors
252
+ use_bias = ctx.use_bias
253
+
254
+ if ctx.sequence_parallel:
255
+ world_size = get_tensor_model_parallel_world_size()
256
+ dim_size = list(input.size())
257
+ dim_size[0] = dim_size[0] * world_size
258
+
259
+ all_gather_buffer = \
260
+ get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
261
+ handle = torch.distributed._all_gather_base(
262
+ all_gather_buffer,
263
+ input,
264
+ group=get_tensor_model_parallel_group(), async_op=True)
265
+
266
+ # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
267
+ # gather is scheduled before the input gradient computation
268
+ total_input = all_gather_buffer
269
+ else:
270
+ total_input = input
271
+ grad_input = grad_output.matmul(weight)
272
+
273
+ if ctx.sequence_parallel:
274
+ handle.wait()
275
+
276
+ # Doing gather + slicing during the NeMo forward pass can make this tensor
277
+ # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
278
+ # clones it if it's not contiguous:
279
+ # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
280
+ grad_output = grad_output.contiguous()
281
+ # Convert the tensor shapes to 2D for execution compatibility
282
+ grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
283
+ grad_output.shape[2])
284
+ total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
285
+ total_input.shape[2])
286
+
287
+ if ctx.async_grad_allreduce:
288
+ # Asynchronous all-reduce
289
+ handle = torch.distributed.all_reduce(
290
+ grad_input, group=get_tensor_model_parallel_group(), async_op=True)
291
+ # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
292
+ # all-reduce is scheduled before the weight gradient computation
293
+
294
+ if ctx.sequence_parallel:
295
+ assert not ctx.async_grad_allreduce
296
+ dim_size = list(input.size())
297
+ sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
298
+ device=torch.cuda.current_device(),
299
+ requires_grad=False)
300
+ # reduce_scatter
301
+ handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input,
302
+ group=get_tensor_model_parallel_group(),
303
+ async_op=True)
304
+ # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
305
+ # reduce scatter is scheduled before the weight gradient computation
306
+
307
+
308
+ if ctx.gradient_accumulation_fusion:
309
+ if weight.main_grad.dtype == torch.float32:
310
+ fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
311
+ elif weight.main_grad.dtype == torch.float16:
312
+ fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, weight.main_grad)
313
+ else:
314
+ raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
315
+ grad_weight = None
316
+ else:
317
+ grad_weight = grad_output.t().matmul(total_input)
318
+ grad_bias = grad_output.sum(dim=0) if use_bias else None
319
+
320
+ if ctx.sequence_parallel:
321
+ handle.wait()
322
+ return sub_grad_input, grad_weight, grad_bias, None, None, None
323
+
324
+ if ctx.async_grad_allreduce:
325
+ handle.wait()
326
+
327
+ return grad_input, grad_weight, grad_bias, None, None, None
328
+
329
+ def linear_with_grad_accumulation_and_async_allreduce(
330
+ input: torch.Tensor,
331
+ weight: torch.Tensor,
332
+ bias: Optional[torch.Tensor],
333
+ gradient_accumulation_fusion: bool,
334
+ async_grad_allreduce: bool,
335
+ sequence_parallel_enabled: bool,
336
+ ) -> torch.Tensor:
337
+ """Linear layer execution with asynchronous communication and
338
+ gradient accumulation fusion in backprop.
339
+
340
+ This has the option to accumulate the result of backprop
341
+ calculation into an existing gradient buffer, preventing the need
342
+ to do an additional addition kernel after the gradient
343
+ calculation.
344
+
345
+ Additionally, the tensor parallel all reduce of the input
346
+ gradients can be done asynchronously with the calculation of
347
+ the weight gradients.
348
+
349
+ In the case of sequence parallelism, the reduce scatter of the
350
+ input gradients is done asynchronously with the calculation of the
351
+ weight gradients.
352
+
353
+ Use of this module requires that the environment variable
354
+ CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective
355
+ operations, noted in the code, that should be scheduled before
356
+ compute kernels to overlap the communication with the computation,
357
+ which is necessary for a speedup but not for correctness so that
358
+ ordering isn't imposed by the scheduler. Setting
359
+ CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled
360
+ in the order they are called.
361
+
362
+ Arguments:
363
+
364
+ input (torch.Tensor required): input like torch.nn.functional.linear
365
+
366
+ weight (torch.Tensor required): weight like torch.nn.functional.linear
367
+
368
+ bias (torch.Tensor optional): bias like torch.nn.functional.linear
369
+
370
+ gradient_accumulation_fusion (bool required): Perform the gradient
371
+ accumulation fusion, requires the custom CUDA extension
372
+ fused_weight_gradient_mlp_cuda module. To use
373
+ gradient_accumulation_fusion you must install APEX with
374
+ --cpp_ext and --cuda_ext. For example: "pip install
375
+ --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\"
376
+ " Note that the extension requires CUDA>=11. Otherwise, you
377
+ must turn off gradient accumulation fusion.
378
+
379
+ async_grad_allreduce (bool required): Do the allreduce of input
380
+ gradients asynchronously with the computation of weight
381
+ gradients. If sequence_parallel_enabled is True, this must be
382
+ False, as no all reduce is performed.
383
+
384
+ sequence_parallel_enabled (bool required): Indicates that sequence
385
+ parallelism is used and thus in the forward pass the input is
386
+ all gathered, and in the backward pass the input gradients are
387
+ reduce scattered.
388
+ """
389
+ args = [
390
+ input,
391
+ weight,
392
+ bias,
393
+ gradient_accumulation_fusion,
394
+ async_grad_allreduce,
395
+ sequence_parallel_enabled,
396
+ ]
397
+
398
+ if not linear_with_grad_accumulation_and_async_allreduce.warned:
399
+ if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
400
+ if sequence_parallel_enabled:
401
+ warnings.warn(
402
+ "When using sequence parallelism it is recommended to set the "
403
+ "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
404
+ "maximum speedup")
405
+ linear_with_grad_accumulation_and_async_allreduce.warned = True
406
+
407
+ if async_grad_allreduce:
408
+ warnings.warn(
409
+ "When using async grad allreduce it is recommended to set the "
410
+ "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
411
+ "maximum speedup")
412
+ linear_with_grad_accumulation_and_async_allreduce.warned = True
413
+
414
+ return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)
415
+
416
+ linear_with_grad_accumulation_and_async_allreduce.warned = False
417
+
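A minimal single-process sketch of calling the wrapper above; with async_grad_allreduce and sequence parallelism disabled no collectives are issued, so no distributed setup is assumed (CUDA_DEVICE_MAX_CONNECTIONS only matters once those features are enabled):

import os
import torch
from megatron.core.tensor_parallel.layers import (  # assumed import path
    linear_with_grad_accumulation_and_async_allreduce,
)

# Recommended whenever the async all-reduce or sequence parallelism is used,
# so communication kernels are scheduled before the overlapping compute kernels.
os.environ.setdefault("CUDA_DEVICE_MAX_CONNECTIONS", "1")

seq, batch, hidden, out = 16, 2, 32, 64
x = torch.randn(seq, batch, hidden, requires_grad=True)   # [sequence, batch, hidden]
w = torch.randn(out, hidden, requires_grad=True)

y = linear_with_grad_accumulation_and_async_allreduce(
    input=x,
    weight=w,
    bias=None,
    gradient_accumulation_fusion=False,   # no fused_weight_gradient_mlp_cuda required
    async_grad_allreduce=False,           # single process: nothing to all-reduce
    sequence_parallel_enabled=False,
)
y.sum().backward()                        # gradients flow through the custom Function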
418
+ class ColumnParallelLinear(torch.nn.Module):
419
+ """Linear layer with column parallelism.
420
+
421
+ The linear layer is defined as Y = XA + b. A is parallelized along
422
+ its second dimension as A = [A_1, ..., A_p].
423
+
424
+ Arguments:
425
+ input_size: first dimension of matrix A.
426
+ output_size: second dimension of matrix A.
427
+
428
+ Keyword Arguments:
429
+ bias: If true, add bias
430
+ gather_output: If true, call all-gather on output and make Y available
431
+ to all GPUs, otherwise, every GPU will have its output
432
+ which is Y_i = XA_i
433
+ init_method: method to initialize weights. Note that bias is always set
434
+ to zero.
435
+ stride: For the strided linear layers.
436
+ keep_master_weight_for_test: This was added for testing and should be
437
+ set to False. It returns the master weights
438
+ used for initialization.
439
+ skip_bias_add: This was added to enable performance optimizations where bias
441
+ can be fused with other elementwise operations. We skip
441
+ adding bias but instead return it.
442
+ async_tensor_model_parallel_allreduce:
443
+ params_dtype:
444
+ use_cpu_initialization:
445
+ gradient_accumulation_fusion:
446
+ sequence_parallel_enabled:
447
+ """
448
+
449
+ def __init__(self, input_size, output_size, *,
450
+ bias=True, gather_output=True,
451
+ init_method=init.xavier_normal_, stride=1,
452
+ keep_master_weight_for_test=False,
453
+ skip_bias_add=False,
454
+ async_tensor_model_parallel_allreduce=True,
455
+ params_dtype=torch.float32,
456
+ use_cpu_initialization=False,
457
+ perform_initialization=True,
458
+ gradient_accumulation_fusion=False,
459
+ sequence_parallel_enabled: bool = False,
460
+ ):
461
+ super(ColumnParallelLinear, self).__init__()
462
+
463
+ # Keep input parameters
464
+ self.input_size = input_size
465
+ self.output_size = output_size
466
+ self.gather_output = gather_output
467
+ # Divide the weight matrix along the last dimension.
468
+ world_size = get_tensor_model_parallel_world_size()
469
+ self.output_size_per_partition = divide(output_size, world_size)
470
+ self.skip_bias_add = skip_bias_add
471
+
472
+ # Parameters.
473
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result
474
+ # we allocate the transpose.
475
+ # Initialize weight.
476
+ if use_cpu_initialization:
477
+ self.weight = Parameter(torch.empty(self.output_size_per_partition,
478
+ self.input_size,
479
+ dtype=params_dtype))
480
+ if perform_initialization:
481
+ self.master_weight = _initialize_affine_weight_cpu(
482
+ self.weight, self.output_size, self.input_size,
483
+ self.output_size_per_partition, 0, init_method,
484
+ stride=stride, return_master_weight=keep_master_weight_for_test)
485
+ else:
486
+ self.weight = Parameter(torch.empty(
487
+ self.output_size_per_partition, self.input_size,
488
+ device=torch.cuda.current_device(), dtype=params_dtype))
489
+ if perform_initialization:
490
+ _initialize_affine_weight_gpu(self.weight, init_method,
491
+ partition_dim=0, stride=stride)
492
+
493
+ if bias:
494
+ if use_cpu_initialization:
495
+ self.bias = Parameter(torch.empty(
496
+ self.output_size_per_partition, dtype=params_dtype))
497
+ else:
498
+ self.bias = Parameter(torch.empty(
499
+ self.output_size_per_partition,
500
+ device=torch.cuda.current_device(),
501
+ dtype=params_dtype))
502
+ set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
503
+ # Always initialize bias to zero.
504
+ with torch.no_grad():
505
+ self.bias.zero_()
506
+ else:
507
+ self.register_parameter('bias', None)
508
+
509
+ self.async_tensor_model_parallel_allreduce = (
510
+ async_tensor_model_parallel_allreduce and
511
+ world_size > 1)
512
+ if sequence_parallel_enabled:
513
+ if world_size <= 1:
514
+ warnings.warn(
515
+ f"`sequence_parallel_enabled` is set to `True`, but tensor model parallel size is {world_size}. "
516
+ f"Disabling sequence parallel."
517
+ )
518
+ sequence_parallel_enabled = False
519
+ self.sequence_parallel_enabled = sequence_parallel_enabled
520
+
521
+ if gradient_accumulation_fusion:
522
+ if not _grad_accum_fusion_available:
523
+ raise RuntimeError(
524
+ "ColumnParallelLinear was called with gradient_accumulation_fusion set "
525
+ "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
526
+ "module is not found. To use gradient_accumulation_fusion you must "
527
+ "install APEX with --cpp_ext and --cuda_ext. For example: "
528
+ "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
529
+ "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
530
+ "gradient accumulation fusion."
531
+ )
532
+ self.gradient_accumulation_fusion = gradient_accumulation_fusion
533
+
534
+ if self.async_tensor_model_parallel_allreduce and self.sequence_parallel_enabled:
535
+ raise RuntimeError(
536
+ "`async_tensor_model_parallel_allreduce` and `sequence_parallel_enabled` "
537
+ "cannot be enabled at the same time."
538
+ )
539
+
540
+
541
+ def forward(self, input_):
542
+ """Forward of ColumnParallelLinear
543
+
544
+ Args:
545
+ input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
546
+
547
+ Returns:
548
+ - output
549
+ - bias
550
+ """
551
+ bias = self.bias if not self.skip_bias_add else None
552
+
553
+ if self.async_tensor_model_parallel_allreduce or \
554
+ self.sequence_parallel_enabled:
555
+ input_parallel = input_
556
+ else:
557
+ input_parallel = copy_to_tensor_model_parallel_region(input_)
558
+ # Matrix multiply.
559
+ output_parallel = linear_with_grad_accumulation_and_async_allreduce(
560
+ input=input_parallel,
561
+ weight=self.weight,
562
+ bias=bias,
563
+ gradient_accumulation_fusion=self.gradient_accumulation_fusion,
564
+ async_grad_allreduce=self.async_tensor_model_parallel_allreduce,
565
+ sequence_parallel_enabled=self.sequence_parallel_enabled,
566
+ )
567
+ if self.gather_output:
568
+ # All-gather across the partitions.
569
+ assert not self.sequence_parallel_enabled
570
+ output = gather_from_tensor_model_parallel_region(output_parallel)
571
+ else:
572
+ output = output_parallel
573
+ output_bias = self.bias if self.skip_bias_add else None
574
+ return output, output_bias
575
+
576
+
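A hedged end-to-end sketch of constructing ColumnParallelLinear; it assumes a single-process gloo group and CPU initialization so that no GPU is needed, and the exact initialize_model_parallel signature may differ between megatron-core versions:

import torch
import torch.distributed as dist
from megatron.core import parallel_state
from megatron.core.tensor_parallel.layers import ColumnParallelLinear  # assumed path

# Single-process setup purely for illustration; real use spans several ranks.
dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500",
                        rank=0, world_size=1)
parallel_state.initialize_model_parallel()    # tensor parallel size defaults to 1

layer = ColumnParallelLinear(
    input_size=32, output_size=64,
    gather_output=True,                        # all-gather the partitioned output
    use_cpu_initialization=True,               # keep the sketch CPU-only
)

x = torch.randn(16, 2, 32)                     # [sequence, batch, hidden]
output, output_bias = layer(x)
print(output.shape, output_bias)               # torch.Size([16, 2, 64]) None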
577
+ class RowParallelLinear(torch.nn.Module):
578
+ """Linear layer with row parallelism.
579
+
580
+ The linear layer is defined as Y = XA + b. A is parallelized along
581
+ its first dimension and X along its second dimension as:
582
+ - -
583
+ | A_1 |
584
+ | . |
585
+ A = | . | X = [X_1, ..., X_p]
586
+ | . |
587
+ | A_p |
588
+ - -
589
+ Arguments:
590
+ input_size: first dimension of matrix A.
591
+ output_size: second dimension of matrix A.
592
+
593
+ Keyword Arguments:
594
+ bias: If true, add bias. Note that bias is not parallelized.
595
+ input_is_parallel: If true, we assume that the input is already
596
+ split across the GPUs and we do not split
597
+ again.
598
+ init_method: method to initialize weights. Note that bias is always set
599
+ to zero.
600
+ stride: For the strided linear layers.
601
+ keep_master_weight_for_test: This was added for testing and should be
602
+ set to False. It returns the master weights
603
+ used for initialization.
604
+ skip_bias_add: This was added to enable performance optimization where bias
605
+ can be fused with other elementwise operations. We skip
606
+ adding bias but instead return it.
607
+ params_dtype:
608
+ use_cpu_initialization:
609
+ perform_initialization:
610
+ gradient_accumulation_fusion:
611
+ sequence_parallel_enabled:
612
+ """
613
+
614
+ def __init__(self, input_size, output_size, *,
615
+ bias=True, input_is_parallel=False,
616
+ init_method=init.xavier_normal_, stride=1,
617
+ keep_master_weight_for_test=False,
618
+ skip_bias_add=False,
619
+ params_dtype=torch.float32,
620
+ use_cpu_initialization=False,
621
+ perform_initialization=True,
622
+ gradient_accumulation_fusion=False,
623
+ sequence_parallel_enabled: bool = False,
624
+ ):
625
+ super(RowParallelLinear, self).__init__()
626
+
627
+ # Keep input parameters
628
+ self.input_size = input_size
629
+ self.output_size = output_size
630
+ self.input_is_parallel = input_is_parallel
631
+ # Divide the weight matrix along the last dimension.
632
+ world_size = get_tensor_model_parallel_world_size()
633
+ self.input_size_per_partition = divide(input_size, world_size)
634
+ self.skip_bias_add = skip_bias_add
635
+ self.gradient_accumulation_fusion = gradient_accumulation_fusion
636
+ self.sequence_parallel_enabled = sequence_parallel_enabled
637
+ if self.sequence_parallel_enabled and not self.input_is_parallel:
638
+ raise RuntimeError("To enable `sequence_parallel_enabled`, `input_is_parallel` must be `True`")
639
+
640
+ # Parameters.
641
+ # Note: torch.nn.functional.linear performs XA^T + b and as a result
642
+ # we allocate the transpose.
643
+ # Initialize weight.
644
+ if use_cpu_initialization:
645
+ self.weight = Parameter(torch.empty(self.output_size,
646
+ self.input_size_per_partition,
647
+ dtype=params_dtype))
648
+ if perform_initialization:
649
+ self.master_weight = _initialize_affine_weight_cpu(
650
+ self.weight, self.output_size, self.input_size,
651
+ self.input_size_per_partition, 1, init_method,
652
+ stride=stride, return_master_weight=keep_master_weight_for_test,
653
+ params_dtype=params_dtype)
654
+ else:
655
+ self.weight = Parameter(torch.empty(
656
+ self.output_size, self.input_size_per_partition,
657
+ device=torch.cuda.current_device(), dtype=params_dtype))
658
+ if perform_initialization:
659
+ _initialize_affine_weight_gpu(self.weight, init_method,
660
+ partition_dim=1, stride=stride)
661
+ if bias:
662
+ if use_cpu_initialization:
663
+ self.bias = Parameter(torch.empty(self.output_size,
664
+ dtype=params_dtype))
665
+ else:
666
+ self.bias = Parameter(torch.empty(
667
+ self.output_size, device=torch.cuda.current_device(),
668
+ dtype=params_dtype))
669
+ setattr(self.bias, 'sequence_parallel', sequence_parallel_enabled)
670
+
671
+ # Always initialize bias to zero.
672
+ with torch.no_grad():
673
+ self.bias.zero_()
674
+ else:
675
+ self.register_parameter('bias', None)
676
+
677
+
678
+
679
+ def forward(self, input_):
680
+ """Forward of RowParallelLinear
681
+
682
+ Args:
683
+ input_: 3D tensor whose order of dimension is [sequence, batch, hidden]
684
+
685
+ Returns:
686
+ - output
687
+ - bias
688
+ """
689
+ # Set up backprop all-reduce.
690
+ if self.input_is_parallel:
691
+ input_parallel = input_
692
+ else:
693
+ assert not self.sequence_parallel_enabled
694
+ input_parallel = scatter_to_tensor_model_parallel_region(input_)
695
+ # Matrix multiply.
696
+ output_parallel = linear_with_grad_accumulation_and_async_allreduce(
697
+ input=input_parallel,
698
+ weight=self.weight,
699
+ bias=None,
700
+ gradient_accumulation_fusion=self.gradient_accumulation_fusion,
701
+ async_grad_allreduce=False,
702
+ sequence_parallel_enabled=False,
703
+ )
704
+
705
+ # All-reduce across the partitions (reduce-scatter under sequence parallelism).
706
+ if self.sequence_parallel_enabled:
707
+ output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
708
+ else:
709
+ output_ = reduce_from_tensor_model_parallel_region(output_parallel)
710
+ if not self.skip_bias_add:
711
+ output = output_ + self.bias if self.bias is not None else output_
712
+ output_bias = None
713
+ else:
714
+ output = output_
715
+ output_bias = self.bias
716
+ return output, output_bias
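ColumnParallelLinear and RowParallelLinear are designed to be composed: a column-parallel layer with gather_output=False feeds a row-parallel layer with input_is_parallel=True, so the only communication in the MLP forward pass is the final all-reduce (or reduce-scatter under sequence parallelism). A hedged sketch of that pairing, reusing the single-process setup from the previous example:

import torch
import torch.nn.functional as F
from megatron.core.tensor_parallel.layers import (  # assumed import path
    ColumnParallelLinear,
    RowParallelLinear,
)

# Assumes torch.distributed and parallel_state are already initialized as above.
hidden, ffn_hidden = 32, 128

fc1 = ColumnParallelLinear(hidden, ffn_hidden,
                           gather_output=False,       # keep the partitioned output
                           use_cpu_initialization=True)
fc2 = RowParallelLinear(ffn_hidden, hidden,
                        input_is_parallel=True,       # consume fc1's shard directly
                        skip_bias_add=True,           # return the bias for later fusion
                        use_cpu_initialization=True)

x = torch.randn(16, 2, hidden)                        # [sequence, batch, hidden]
intermediate, _ = fc1(x)
intermediate = F.gelu(intermediate)
output, output_bias = fc2(intermediate)               # bias returned, not added
print(output.shape, output_bias.shape)                # (16, 2, 32) and (32,)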