megatron-core 0.1.0 (megatron_core-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- megatron/core/__init__.py +12 -0
- megatron/core/enums.py +7 -0
- megatron/core/package_info.py +23 -0
- megatron/core/parallel_state.py +570 -0
- megatron/core/pipeline_parallel/__init__.py +1 -0
- megatron/core/pipeline_parallel/p2p_communication.py +456 -0
- megatron/core/pipeline_parallel/schedules.py +1050 -0
- megatron/core/tensor_parallel/__init__.py +65 -0
- megatron/core/tensor_parallel/cross_entropy.py +143 -0
- megatron/core/tensor_parallel/data.py +105 -0
- megatron/core/tensor_parallel/layers.py +716 -0
- megatron/core/tensor_parallel/mappings.py +279 -0
- megatron/core/tensor_parallel/random.py +253 -0
- megatron/core/tensor_parallel/utils.py +108 -0
- megatron/core/utils.py +137 -0
- megatron_core-0.1.0.dist-info/LICENSE +376 -0
- megatron_core-0.1.0.dist-info/METADATA +35 -0
- megatron_core-0.1.0.dist-info/RECORD +20 -0
- megatron_core-0.1.0.dist-info/WHEEL +5 -0
- megatron_core-0.1.0.dist-info/top_level.txt +1 -0
megatron/core/tensor_parallel/layers.py

@@ -0,0 +1,716 @@

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

import math
import os
from typing import Optional
import warnings

import torch
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter

from torch.cuda.amp import custom_fwd, custom_bwd

from megatron.core.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
    get_tensor_model_parallel_group,
    get_global_memory_buffer,
)
from .mappings import (
    copy_to_tensor_model_parallel_region,
    gather_from_tensor_model_parallel_region,
    gather_from_sequence_parallel_region,
    reduce_from_tensor_model_parallel_region,
    scatter_to_tensor_model_parallel_region,
    reduce_scatter_to_sequence_parallel_region,
)

from .random import get_cuda_rng_tracker
from .utils import (
    divide,
    split_tensor_along_last_dim,
    VocabUtility,
)

_grad_accum_fusion_available = True
try:
    import fused_weight_gradient_mlp_cuda
except ImportError:
    _grad_accum_fusion_available = False

_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                                      'partition_dim': -1,
                                      'partition_stride': 1}


def param_is_not_tensor_parallel_duplicate(param):
    return (hasattr(param, 'tensor_model_parallel') and
            param.tensor_model_parallel) or (
                get_tensor_model_parallel_rank() == 0)


def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
    # Make sure the attributes are not set.
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        assert not hasattr(tensor, attribute)
    # Set the attributes.
    setattr(tensor, 'tensor_model_parallel', is_parallel)
    setattr(tensor, 'partition_dim', dim)
    setattr(tensor, 'partition_stride', stride)


def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
    def maybe_set(attribute, value):
        if not hasattr(tensor, attribute):
            setattr(tensor, attribute, value)
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])


def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
    def maybe_copy(attribute):
        if hasattr(source_tensor, attribute):
            setattr(destination_tensor, attribute,
                    getattr(source_tensor, attribute))
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        maybe_copy(attribute)


def _initialize_affine_weight_gpu(weight, init_method,
                                  partition_dim, stride=1):
    """Initialize affine weight for model parallel on GPU."""

    set_tensor_model_parallel_attributes(tensor=weight,
                                         is_parallel=True,
                                         dim=partition_dim,
                                         stride=stride)

    with get_cuda_rng_tracker().fork():
        init_method(weight)


def _initialize_affine_weight_cpu(weight, output_size, input_size,
                                  per_partition_size, partition_dim,
                                  init_method, stride=1,
                                  return_master_weight=False,
                                  *, params_dtype=torch.float32):
    """Initialize affine weight for model parallel.

    Build the master weight on all processes and scatter
    the relevant chunk."""

    set_tensor_model_parallel_attributes(tensor=weight,
                                         is_parallel=True,
                                         dim=partition_dim,
                                         stride=stride)

    # Initialize master weight
    master_weight = torch.empty(output_size, input_size,
                                dtype=torch.float,
                                requires_grad=False)
    init_method(master_weight)
    master_weight = master_weight.to(dtype=params_dtype)

    # Split and copy
    per_partition_per_stride_size = divide(per_partition_size, stride)
    weight_list = torch.split(master_weight, per_partition_per_stride_size,
                              dim=partition_dim)
    rank = get_tensor_model_parallel_rank()
    world_size = get_tensor_model_parallel_world_size()
    my_weight_list = weight_list[rank::world_size]

    with torch.no_grad():
        torch.cat(my_weight_list, dim=partition_dim, out=weight)
    if return_master_weight:
        return master_weight
    return None
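The attribute helpers above only tag tensors with bookkeeping metadata, so they can be exercised without any distributed setup. A minimal sketch of my own (not part of the package) might look like this:

```python
# Hypothetical illustration of the attribute helpers; no process group is needed.
import torch
from torch.nn.parameter import Parameter

from megatron.core.tensor_parallel.layers import (
    set_tensor_model_parallel_attributes,
    set_defaults_if_not_set_tensor_model_parallel_attributes,
    copy_tensor_model_parallel_attributes,
)

# Mark a weight as a tensor-parallel shard, split along dim 0 with stride 1.
w = Parameter(torch.empty(8, 4))
set_tensor_model_parallel_attributes(w, is_parallel=True, dim=0, stride=1)
assert w.tensor_model_parallel and w.partition_dim == 0

# A plain parameter just receives the defaults (not parallel, partition_dim -1).
b = Parameter(torch.empty(8))
set_defaults_if_not_set_tensor_model_parallel_attributes(b)
assert not b.tensor_model_parallel and b.partition_dim == -1

# Propagate the markers onto a derived parameter, e.g. an fp32 master copy.
w_master = Parameter(w.detach().clone())
copy_tensor_model_parallel_attributes(w_master, w)
assert w_master.partition_dim == 0
```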
class VocabParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the vocabulary dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.

    Keyword Arguments:
        init_method: method to initialize weights.
        params_dtype
        use_cpu_initialization
        perform_initialization
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, *,
                 init_method=init.xavier_normal_,
                 params_dtype: torch.dtype=torch.float32,
                 use_cpu_initialization: bool=False,
                 perform_initialization: bool=True):
        super(VocabParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set the defaults for compatibility.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
        # Divide the weight matrix along the vocabulary dimension.
        self.vocab_start_index, self.vocab_end_index = \
            VocabUtility.vocab_range_from_global_vocab_size(
                self.num_embeddings, get_tensor_model_parallel_rank(),
                self.tensor_model_parallel_size)
        self.num_embeddings_per_partition = self.vocab_end_index - \
            self.vocab_start_index

        # Allocate weights and initialize.
        if use_cpu_initialization:
            self.weight = Parameter(torch.empty(
                self.num_embeddings_per_partition, self.embedding_dim,
                dtype=params_dtype))
            if perform_initialization:
                _initialize_affine_weight_cpu(
                    self.weight, self.num_embeddings, self.embedding_dim,
                    self.num_embeddings_per_partition, 0, init_method,
                    params_dtype=params_dtype)
        else:
            self.weight = Parameter(torch.empty(
                self.num_embeddings_per_partition, self.embedding_dim,
                device=torch.cuda.current_device(), dtype=params_dtype))
            if perform_initialization:
                _initialize_affine_weight_gpu(self.weight, init_method,
                                              partition_dim=0, stride=1)

    def forward(self, input_):
        if self.tensor_model_parallel_size > 1:
            # Build the mask.
            input_mask = (input_ < self.vocab_start_index) | \
                         (input_ >= self.vocab_end_index)
            # Mask the input.
            masked_input = input_.clone() - self.vocab_start_index
            masked_input[input_mask] = 0
        else:
            masked_input = input_
        # Get the embeddings.
        output_parallel = F.embedding(masked_input, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        # Mask the output embedding.
        if self.tensor_model_parallel_size > 1:
            output_parallel[input_mask, :] = 0.0
        # Reduce across all the model parallel GPUs.
        output = reduce_from_tensor_model_parallel_region(output_parallel)
        return output
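A minimal end-to-end sketch of `VocabParallelEmbedding` (my own example, not shipped in the wheel) under two-way tensor parallelism. It assumes that `initialize_model_parallel` in `megatron.core.parallel_state` and `model_parallel_cuda_manual_seed` in `megatron.core.tensor_parallel.random` behave as in this release, and that the script (hypothetically `embed_demo.py`) is launched with something like `torchrun --nproc_per_node=2 embed_demo.py` on two GPUs:

```python
# Hypothetical demo script; requires 2 GPUs and a torchrun-style launcher.
import os
import torch
from megatron.core import parallel_state
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.tensor_parallel.layers import VocabParallelEmbedding

torch.distributed.init_process_group(backend="nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
parallel_state.initialize_model_parallel(tensor_model_parallel_size=2)
model_parallel_cuda_manual_seed(1234)  # seeds the RNG tracker used by the GPU init path

# The 50304-row table is split into two 25152-row shards, one per rank,
# and each shard is allocated directly on the current CUDA device.
embedding = VocabParallelEmbedding(50304, 1024)

tokens = torch.randint(0, 50304, (8, 4), device="cuda")  # [sequence, batch]
hidden = embedding(tokens)  # every rank ends up with the full [8, 4, 1024] output
print(parallel_state.get_tensor_model_parallel_rank(), hidden.shape)
```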
class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
    """See linear_with_grad_accumulation_and_async_allreduce"""

    @staticmethod
    @custom_fwd
    def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
                async_grad_allreduce, sequence_parallel):
        ctx.save_for_backward(input, weight)
        ctx.use_bias = bias is not None
        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
        ctx.async_grad_allreduce = async_grad_allreduce
        ctx.sequence_parallel = sequence_parallel

        if sequence_parallel:
            world_size = get_tensor_model_parallel_world_size()
            dim_size = list(input.size())
            dim_size[0] = dim_size[0] * world_size

            all_gather_buffer = \
                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
            torch.distributed._all_gather_base(
                all_gather_buffer,
                input,
                group=get_tensor_model_parallel_group())
            total_input = all_gather_buffer
        else:
            total_input = input

        output = torch.matmul(total_input, weight.t())
        if bias is not None:
            output = output + bias
        return output

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        use_bias = ctx.use_bias

        if ctx.sequence_parallel:
            world_size = get_tensor_model_parallel_world_size()
            dim_size = list(input.size())
            dim_size[0] = dim_size[0] * world_size

            all_gather_buffer = \
                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
            handle = torch.distributed._all_gather_base(
                all_gather_buffer,
                input,
                group=get_tensor_model_parallel_group(), async_op=True)

            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # gather is scheduled before the input gradient computation
            total_input = all_gather_buffer
        else:
            total_input = input
        grad_input = grad_output.matmul(weight)

        if ctx.sequence_parallel:
            handle.wait()

        # Doing gather + slicing during the NeMo forward pass can make this tensor
        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
        # clones it if it's not contiguous:
        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
        grad_output = grad_output.contiguous()
        # Convert the tensor shapes to 2D for execution compatibility
        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                                       grad_output.shape[2])
        total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
                                       total_input.shape[2])

        if ctx.async_grad_allreduce:
            # Asynchronous all-reduce
            handle = torch.distributed.all_reduce(
                grad_input, group=get_tensor_model_parallel_group(), async_op=True)
            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # all-reduce is scheduled before the weight gradient computation

        if ctx.sequence_parallel:
            assert not ctx.async_grad_allreduce
            dim_size = list(input.size())
            sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
                                         device=torch.cuda.current_device(),
                                         requires_grad=False)
            # reduce_scatter
            handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input,
                                                            group=get_tensor_model_parallel_group(),
                                                            async_op=True)
            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # reduce scatter is scheduled before the weight gradient computation

        if ctx.gradient_accumulation_fusion:
            if weight.main_grad.dtype == torch.float32:
                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
            elif weight.main_grad.dtype == torch.float16:
                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, weight.main_grad)
            else:
                raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
            grad_weight = None
        else:
            grad_weight = grad_output.t().matmul(total_input)
        grad_bias = grad_output.sum(dim=0) if use_bias else None

        if ctx.sequence_parallel:
            handle.wait()
            return sub_grad_input, grad_weight, grad_bias, None, None, None

        if ctx.async_grad_allreduce:
            handle.wait()

        return grad_input, grad_weight, grad_bias, None, None, None
def linear_with_grad_accumulation_and_async_allreduce(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor],
    gradient_accumulation_fusion: bool,
    async_grad_allreduce: bool,
    sequence_parallel_enabled: bool,
) -> torch.Tensor:
    """Linear layer execution with asynchronous communication and
    gradient accumulation fusion in backprop.

    This has the option to accumulate the result of backprop
    calculation into an existing gradient buffer, preventing the need
    to do an additional addition kernel after the gradient
    calculation.

    Additionally, the tensor parallel all reduce of the input
    gradients can be done asynchronously with the calculation of
    the weight gradients.

    In the case of sequence parallelism, the reduce scatter of the
    input gradients is done asynchronously with the calculation of the
    weight gradients.

    Use of this module requires that the environment variable
    CUDA_DEVICE_MAX_CONNECTIONS=1. There are a few collective
    operations, noted in the code, that should be scheduled before
    compute kernels to overlap the communication with the computation,
    which is necessary for a speedup but not for correctness so that
    ordering isn't imposed by the scheduler. Setting
    CUDA_DEVICE_MAX_CONNECTIONS=1 forces the kernels to be scheduled
    in the order they are called.

    Arguments:

    input (torch.Tensor required): input like torch.nn.functional.linear

    weight (torch.Tensor required): weight like torch.nn.functional.linear

    bias (torch.Tensor optional): bias like torch.nn.functional.linear

    gradient_accumulation_fusion (bool required): Perform the gradient
        accumulation fusion, requires the custom CUDA extension
        fused_weight_gradient_mlp_cuda module. To use
        gradient_accumulation_fusion you must install APEX with
        --cpp_ext and --cuda_ext. For example: "pip install
        --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\"
        " Note that the extension requires CUDA>=11. Otherwise, you
        must turn off gradient accumulation fusion."

    async_grad_allreduce (bool required): Do the allreduce of input
        gradients asynchronously with the computation of weight
        gradients. If sequence_parallel_enabled is True, this must be
        False, as no all reduce is performed.

    sequence_parallel_enabled (bool required): Indicates that sequence
        parallelism is used and thus in the forward pass the input is
        all gathered, and the backward pass the input gradients are
        reduce scattered.
    """
    args = [
        input,
        weight,
        bias,
        gradient_accumulation_fusion,
        async_grad_allreduce,
        sequence_parallel_enabled,
    ]

    if not linear_with_grad_accumulation_and_async_allreduce.warned:
        if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
            if sequence_parallel_enabled:
                warnings.warn(
                    "When using sequence parallelism it is recommended to set the "
                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
                    "maximum speedup")
                linear_with_grad_accumulation_and_async_allreduce.warned = True

            if async_grad_allreduce:
                warnings.warn(
                    "When using async grad allreduce it is recommended to set the "
                    "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
                    "maximum speedup")
                linear_with_grad_accumulation_and_async_allreduce.warned = True

    return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)

linear_with_grad_accumulation_and_async_allreduce.warned = False
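As the docstring above notes, the asynchronous overlap only pays off when `CUDA_DEVICE_MAX_CONNECTIONS=1`; the variable is generally read when the CUDA context is created, so it needs to be set before CUDA is initialized. A brief sketch of my own (the script name is hypothetical) of the two usual ways to do that:

```python
# Option 1: set it in the launcher environment, e.g.
#   CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=2 train_demo.py
#
# Option 2: set it at the very top of the entry script, before importing
# anything that initializes CUDA, so the warning above is never triggered.
import os
os.environ.setdefault("CUDA_DEVICE_MAX_CONNECTIONS", "1")

import torch  # noqa: E402  (imported only after the environment variable is set)
```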
class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: This was added to enable performance optimizations where bias
                       can be fused with other elementwise operations. We skip
                       adding bias but instead return it.
        async_tensor_model_parallel_allreduce:
        params_dtype:
        use_cpu_initialization:
        gradient_accumulation_fusion:
        sequence_parallel_enabled:
    """

    def __init__(self, input_size, output_size, *,
                 bias=True, gather_output=True,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False,
                 skip_bias_add=False,
                 async_tensor_model_parallel_allreduce=True,
                 params_dtype=torch.float32,
                 use_cpu_initialization=False,
                 perform_initialization=True,
                 gradient_accumulation_fusion=False,
                 sequence_parallel_enabled: bool = False,
                 ):
        super(ColumnParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        world_size = get_tensor_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, world_size)
        self.skip_bias_add = skip_bias_add

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        # Initialize weight.
        if use_cpu_initialization:
            self.weight = Parameter(torch.empty(self.output_size_per_partition,
                                                self.input_size,
                                                dtype=params_dtype))
            if perform_initialization:
                self.master_weight = _initialize_affine_weight_cpu(
                    self.weight, self.output_size, self.input_size,
                    self.output_size_per_partition, 0, init_method,
                    stride=stride, return_master_weight=keep_master_weight_for_test)
        else:
            self.weight = Parameter(torch.empty(
                self.output_size_per_partition, self.input_size,
                device=torch.cuda.current_device(), dtype=params_dtype))
            if perform_initialization:
                _initialize_affine_weight_gpu(self.weight, init_method,
                                              partition_dim=0, stride=stride)

        if bias:
            if use_cpu_initialization:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition, dtype=params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=params_dtype))
            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

        self.async_tensor_model_parallel_allreduce = (
            async_tensor_model_parallel_allreduce and
            world_size > 1)
        if sequence_parallel_enabled:
            if world_size <= 1:
                warnings.warn(
                    f"`sequence_parallel_enabled` is set to `True`, but tensor model parallel size is {world_size}. "
                    f"Disabling sequence parallel."
                )
                sequence_parallel_enabled = False
        self.sequence_parallel_enabled = sequence_parallel_enabled

        if gradient_accumulation_fusion:
            if not _grad_accum_fusion_available:
                raise RuntimeError(
                    "ColumnParallelLinear was called with gradient_accumulation_fusion set "
                    "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
                    "module is not found. To use gradient_accumulation_fusion you must "
                    "install APEX with --cpp_ext and --cuda_ext. For example: "
                    "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
                    "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
                    "gradient accumulation fusion."
                )
        self.gradient_accumulation_fusion = gradient_accumulation_fusion

        if self.async_tensor_model_parallel_allreduce and self.sequence_parallel_enabled:
            raise RuntimeError(
                "`async_tensor_model_parallel_allreduce` and `sequence_parallel_enabled` "
                "cannot be enabled at the same time."
            )


    def forward(self, input_):
        """Forward of ColumnParallelLinear

        Args:
            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]

        Returns:
            - output
            - bias
        """
        bias = self.bias if not self.skip_bias_add else None

        if self.async_tensor_model_parallel_allreduce or \
                self.sequence_parallel_enabled:
            input_parallel = input_
        else:
            input_parallel = copy_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = linear_with_grad_accumulation_and_async_allreduce(
            input=input_parallel,
            weight=self.weight,
            bias=bias,
            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
            async_grad_allreduce=self.async_tensor_model_parallel_allreduce,
            sequence_parallel_enabled=self.sequence_parallel_enabled,
        )
        if self.gather_output:
            # All-gather across the partitions.
            assert not self.sequence_parallel_enabled
            output = gather_from_tensor_model_parallel_region(output_parallel)
        else:
            output = output_parallel
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
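Continuing the hypothetical two-way setup from the embedding sketch (same process group, model-parallel state, and imports), a `ColumnParallelLinear` that keeps its output shard and returns the bias separately, so the bias add can be fused into a later elementwise kernel, might be used like this:

```python
# Assumes the initialized 2-way tensor-parallel setup from the earlier sketch.
from megatron.core.tensor_parallel.layers import ColumnParallelLinear

proj = ColumnParallelLinear(
    1024, 4096,
    gather_output=False,   # keep the per-rank shard: [s, b, 4096 // tp_size]
    skip_bias_add=True,    # return the bias instead of adding it here
)

x = torch.randn(8, 4, 1024, device="cuda")   # [sequence, batch, hidden]
y_shard, bias_shard = proj(x)                # y_shard: [8, 4, 2048] on each rank
```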
class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments:
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: This was added to enable performance optimization where bias
                       can be fused with other elementwise operations. We skip
                       adding bias but instead return it.
        params_dtype:
        use_cpu_initialization:
        perform_initialization:
        gradient_accumulation_fusion:
        sequence_parallel_enabled:
    """

    def __init__(self, input_size, output_size, *,
                 bias=True, input_is_parallel=False,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False,
                 skip_bias_add=False,
                 params_dtype=torch.float32,
                 use_cpu_initialization=False,
                 perform_initialization=True,
                 gradient_accumulation_fusion=False,
                 sequence_parallel_enabled: bool = False,
                 ):
        super(RowParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        # Divide the weight matrix along the last dimension.
        world_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, world_size)
        self.skip_bias_add = skip_bias_add
        self.gradient_accumulation_fusion = gradient_accumulation_fusion
        self.sequence_parallel_enabled = sequence_parallel_enabled
        if self.sequence_parallel_enabled and not self.input_is_parallel:
            raise RuntimeError("To enable `sequence_parallel_enabled`, `input_is_parallel` must be `True`")

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        # Initialize weight.
        if use_cpu_initialization:
            self.weight = Parameter(torch.empty(self.output_size,
                                                self.input_size_per_partition,
                                                dtype=params_dtype))
            if perform_initialization:
                self.master_weight = _initialize_affine_weight_cpu(
                    self.weight, self.output_size, self.input_size,
                    self.input_size_per_partition, 1, init_method,
                    stride=stride, return_master_weight=keep_master_weight_for_test,
                    params_dtype=params_dtype)
        else:
            self.weight = Parameter(torch.empty(
                self.output_size, self.input_size_per_partition,
                device=torch.cuda.current_device(), dtype=params_dtype))
            if perform_initialization:
                _initialize_affine_weight_gpu(self.weight, init_method,
                                              partition_dim=1, stride=stride)
        if bias:
            if use_cpu_initialization:
                self.bias = Parameter(torch.empty(self.output_size,
                                                  dtype=params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size, device=torch.cuda.current_device(),
                    dtype=params_dtype))
            setattr(self.bias, 'sequence_parallel', sequence_parallel_enabled)

            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)


    def forward(self, input_):
        """Forward of RowParallelLinear

        Args:
            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]

        Returns:
            - output
            - bias
        """
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            assert not self.sequence_parallel_enabled
            input_parallel = scatter_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = linear_with_grad_accumulation_and_async_allreduce(
            input=input_parallel,
            weight=self.weight,
            bias=None,
            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
            async_grad_allreduce=False,
            sequence_parallel_enabled=False,
        )

        # All-reduce across all the partitions.
        if self.sequence_parallel_enabled:
            output_ = reduce_scatter_to_sequence_parallel_region(output_parallel)
        else:
            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None
        else:
            output = output_
            output_bias = self.bias
        return output, output_bias
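Finally, the usual pairing of the two layers: a column-parallel projection feeding a row-parallel one, as in a transformer MLP block. This is again my own sketch under the same assumed two-way setup; the full output is reconstructed by the single all-reduce inside `RowParallelLinear.forward`:

```python
# Assumes the initialized 2-way tensor-parallel setup from the sketches above.
import torch.nn.functional as F
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear

up = ColumnParallelLinear(1024, 4096, gather_output=False)     # shards the 4096 dim
down = RowParallelLinear(4096, 1024, input_is_parallel=True)   # consumes that shard

x = torch.randn(8, 4, 1024, device="cuda")  # [sequence, batch, hidden]
h, _ = up(x)                                # per-rank shard [8, 4, 2048], bias already added
y, y_bias = down(F.gelu(h))                 # full [8, 4, 1024]; y_bias is None because
                                            # skip_bias_add defaults to False
```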