megatron-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic. Click here for more details.
- megatron/core/__init__.py +12 -0
- megatron/core/enums.py +7 -0
- megatron/core/package_info.py +23 -0
- megatron/core/parallel_state.py +570 -0
- megatron/core/pipeline_parallel/__init__.py +1 -0
- megatron/core/pipeline_parallel/p2p_communication.py +456 -0
- megatron/core/pipeline_parallel/schedules.py +1050 -0
- megatron/core/tensor_parallel/__init__.py +65 -0
- megatron/core/tensor_parallel/cross_entropy.py +143 -0
- megatron/core/tensor_parallel/data.py +105 -0
- megatron/core/tensor_parallel/layers.py +716 -0
- megatron/core/tensor_parallel/mappings.py +279 -0
- megatron/core/tensor_parallel/random.py +253 -0
- megatron/core/tensor_parallel/utils.py +108 -0
- megatron/core/utils.py +137 -0
- megatron_core-0.1.0.dist-info/LICENSE +376 -0
- megatron_core-0.1.0.dist-info/METADATA +35 -0
- megatron_core-0.1.0.dist-info/RECORD +20 -0
- megatron_core-0.1.0.dist-info/WHEEL +5 -0
- megatron_core-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Public API of ``megatron.core.tensor_parallel``.

Re-exports the tensor-model-parallel building blocks implemented in the
submodules (cross entropy, data broadcast, parallel layers, communication
mappings, RNG tracking and tensor-splitting utilities).
"""

from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data

from .layers import (
    ColumnParallelLinear,
    RowParallelLinear,
    VocabParallelEmbedding,
    set_tensor_model_parallel_attributes,
    set_defaults_if_not_set_tensor_model_parallel_attributes,
    copy_tensor_model_parallel_attributes,
    param_is_not_tensor_parallel_duplicate,
    linear_with_grad_accumulation_and_async_allreduce

)

from .mappings import (
    copy_to_tensor_model_parallel_region,
    gather_from_tensor_model_parallel_region,
    gather_from_sequence_parallel_region,
    scatter_to_tensor_model_parallel_region,
    scatter_to_sequence_parallel_region,
)

from .random import (
    checkpoint,
    get_cuda_rng_tracker,
    model_parallel_cuda_manual_seed,
)

from .utils import (
    split_tensor_along_last_dim,
    split_tensor_into_1d_equal_chunks,
    gather_split_1d_tensor,
)

# Explicit public interface of the package.
__all__ = [
    # cross_entropy.py
    "vocab_parallel_cross_entropy",
    # data.py
    "broadcast_data",
    # layers.py
    "ColumnParallelLinear",
    "RowParallelLinear",
    "VocabParallelEmbedding",
    "set_tensor_model_parallel_attributes",
    "set_defaults_if_not_set_tensor_model_parallel_attributes",
    "copy_tensor_model_parallel_attributes",
    "param_is_not_tensor_parallel_duplicate",
    "linear_with_grad_accumulation_and_async_allreduce",
    # mappings.py
    "copy_to_tensor_model_parallel_region",
    "gather_from_tensor_model_parallel_region",
    "gather_from_sequence_parallel_region",
    # "reduce_from_tensor_model_parallel_region",
    "scatter_to_tensor_model_parallel_region",
    "scatter_to_sequence_parallel_region",
    # random.py
    "checkpoint",
    "get_cuda_rng_tracker",
    "model_parallel_cuda_manual_seed",
    # utils.py
    "split_tensor_along_last_dim",
    "split_tensor_into_1d_equal_chunks",
    "gather_split_1d_tensor",
]
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
from megatron.core.parallel_state import (
|
|
6
|
+
get_tensor_model_parallel_group,
|
|
7
|
+
get_tensor_model_parallel_rank,
|
|
8
|
+
get_tensor_model_parallel_world_size
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from .utils import VocabUtility
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class _VocabParallelCrossEntropy(torch.autograd.Function):
    """Cross-entropy loss for logits whose vocab dimension is sharded across
    tensor-model-parallel ranks.

    Each rank holds a contiguous slice of the vocabulary; collectives over the
    tensor-model-parallel group reconstruct the full-vocab max, target logit
    and partition function without ever materializing the full logits.
    """

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
        # Maximum value along the vocab dimension across all GPUs.
        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
        torch.distributed.all_reduce(logits_max,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_tensor_model_parallel_group())
        # Subtract the maximum value for numerical stability of exp().
        vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1)

        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked,
        # i.e. the target lives on some other rank's partition).
        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        masked_target[target_mask] = 0

        # Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
                                 device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
        predicted_logits = predicted_logits_1d.view_as(target)
        # Zero out entries whose target is off-partition so the SUM
        # all-reduce below leaves exactly one rank's contribution per token.
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(predicted_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Sum of exponential of logits along vocab dimension across all GPUs.
        # exp() is done in place on the (already copied) shifted logits.
        exp_logits = vocab_parallel_logits
        torch.exp(vocab_parallel_logits, out=exp_logits)
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(sum_exp_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

        # Normalize: exp_logits now holds the (partition-local) softmax.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))

        vocab_size = exp_logits.size(-1)
        if label_smoothing > 0:
            # We'd like to assign 1 / (K - 1) probability mass to every index
            # that is not the ground truth:
            #   = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt})
            #   = (1 - alpha) * y_gt + (alpha / (K - 1)) * sum_{i != gt} y_i
            #   = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt
            #     + (alpha / (K - 1)) * sum_{i != gt} y_i
            #   = (K * (1 - alpha) - 1) / (K - 1)) * y_gt
            #     + (alpha / (K - 1)) * sum_{i} y_i
            #   = (1 - (alpha * K) / (K - 1)) * y_gt
            #     + ((alpha * K) / (K - 1)) * sum_{i} y_i / K
            # From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py
            assert 1.0 > label_smoothing > 0.0
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)

            # Exp logits at this point are normalized probabilities, so we
            # can just take the log to get log-probs.
            log_probs = torch.log(exp_logits)
            mean_log_probs = log_probs.mean(dim=-1)
            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs

        ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size

        # Store softmax, target-mask and masked-target for backward pass.
        # NOTE: the original code called save_for_backward twice with the
        # same tensors; only the last call is kept by autograd, so the
        # duplicate was dead code and has been removed.
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors
        label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size

        # All the inputs have softmax as their gradient.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
                                 device=grad_2d.device)

        # 1.0 where the target is on this partition, 0.0 elsewhere.
        softmax_update = 1.0 - target_mask.view(-1).float()

        if label_smoothing > 0:
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
            grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update
            average_grad = 1 / vocab_size
            grad_2d[arange_1d, :] -= smoothing * average_grad
        else:
            grad_2d[arange_1d, masked_target_1d] -= softmax_update

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))

        return grad_input, None, None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
    """
    Performs cross entropy loss when logits are split across tensor parallel ranks

    Arguments:
        vocab_parallel_logits: logits split across tensor parallel ranks
                               dimension is [sequence_length, batch_size,
                               partition_vocab_size] where the last dimension
                               is this rank's slice of the vocabulary

        target: correct vocab ids of dimension [sequence_length, micro_batch_size]

        label_smoothing: smoothing factor, must be in range [0.0, 1.0)
                         default is no smoothing (=0.0)
    """
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
from megatron.core.parallel_state import (
|
|
6
|
+
get_tensor_model_parallel_group,
|
|
7
|
+
get_tensor_model_parallel_rank,
|
|
8
|
+
get_tensor_model_parallel_src_rank,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
_MAX_DATA_DIM = 5
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _check_data_types(keys, data, target_dtype):
|
|
16
|
+
"""Check that all the keys have the same target data type."""
|
|
17
|
+
for key in keys:
|
|
18
|
+
assert data[key].dtype == target_dtype, '{} has data type {} which '\
|
|
19
|
+
'is different than {}'.format(key, data[key].dtype, target_dtype)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _build_key_size_numel_dictionaries(keys, data):
    """Build the per-key tensor sizes on rank 0 and broadcast them.

    Packs each tensor's shape into a fixed-width (``_MAX_DATA_DIM``) slot of a
    flat list on rank 0, broadcasts it over the tensor-model-parallel group,
    then unpacks on every rank.

    Returns:
        (key_size, key_numel, total_numel) where key_size maps key -> list of
        ints (the shape), key_numel maps key -> int element count, and
        total_numel is the sum over all keys. All values are plain Python
        ints (not 0-dim tensors).
    """
    max_dim = _MAX_DATA_DIM
    # One max_dim-wide slot of zeros per key; trailing zeros in a slot act as
    # the end-of-shape sentinel when unpacking below.
    sizes = [0 for _ in range(max_dim) for _ in keys]

    # Pack the sizes on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        offset = 0
        for key in keys:
            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
            size = data[key].size()
            for i, s in enumerate(size):
                sizes[i + offset] = s
            offset += max_dim

    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
    key_size = {}
    key_numel = {}
    total_numel = 0
    offset = 0
    for key in keys:
        i = 0
        size = []
        numel = 1
        while sizes_cpu[offset + i] > 0:
            # .item() converts the 0-dim LongTensor element to a Python int;
            # without it the returned sizes and counts would be tensors.
            this_size = sizes_cpu[offset + i].item()
            size.append(this_size)
            numel *= this_size
            i += 1
        key_size[key] = size
        key_numel[key] = numel
        total_numel += numel
        offset += max_dim

    return key_size, key_numel, total_numel
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcasted
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.
    """
    # Per-key shapes and element counts, plus the grand total; identical on
    # every rank after the size broadcast inside the helper.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
        keys, data)

    if get_tensor_model_parallel_rank() == 0:
        # Source rank: validate dtypes, then pack all tensors into a single
        # flat GPU buffer so one collective covers every key.
        _check_data_types(keys, data, datatype)
        flat = torch.cat(
            [data[k].contiguous().view(-1) for k in keys], dim=0).cuda()
    else:
        # Receiving ranks: allocate an empty flat buffer of matching size.
        flat = torch.empty(total_numel,
                           device=torch.cuda.current_device(),
                           dtype=datatype)

    # Single broadcast of the packed buffer.
    torch.distributed.broadcast(flat, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Slice the flat buffer back into per-key views.
    output = {}
    cursor = 0
    for k in keys:
        count = key_numel[k]
        output[k] = flat.narrow(0, cursor, count).view(key_size[k])
        cursor += count

    return output
|