megatron-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

@@ -0,0 +1,65 @@
1
from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data

from .layers import (
    ColumnParallelLinear,
    RowParallelLinear,
    VocabParallelEmbedding,
    set_tensor_model_parallel_attributes,
    set_defaults_if_not_set_tensor_model_parallel_attributes,
    copy_tensor_model_parallel_attributes,
    param_is_not_tensor_parallel_duplicate,
    linear_with_grad_accumulation_and_async_allreduce,
)

from .mappings import (
    copy_to_tensor_model_parallel_region,
    gather_from_tensor_model_parallel_region,
    gather_from_sequence_parallel_region,
    scatter_to_tensor_model_parallel_region,
    scatter_to_sequence_parallel_region,
)

from .random import (
    checkpoint,
    get_cuda_rng_tracker,
    model_parallel_cuda_manual_seed,
)

from .utils import (
    split_tensor_along_last_dim,
    split_tensor_into_1d_equal_chunks,
    gather_split_1d_tensor,
)

# Public API of the tensor-parallel package, grouped by defining module.
__all__ = [
    # cross_entropy.py
    "vocab_parallel_cross_entropy",
    # data.py
    "broadcast_data",
    # layers.py
    "ColumnParallelLinear",
    "RowParallelLinear",
    "VocabParallelEmbedding",
    "set_tensor_model_parallel_attributes",
    "set_defaults_if_not_set_tensor_model_parallel_attributes",
    "copy_tensor_model_parallel_attributes",
    "param_is_not_tensor_parallel_duplicate",
    "linear_with_grad_accumulation_and_async_allreduce",
    # mappings.py
    "copy_to_tensor_model_parallel_region",
    "gather_from_tensor_model_parallel_region",
    "gather_from_sequence_parallel_region",
    "scatter_to_tensor_model_parallel_region",
    "scatter_to_sequence_parallel_region",
    # random.py
    "checkpoint",
    "get_cuda_rng_tracker",
    "model_parallel_cuda_manual_seed",
    # utils.py
    "split_tensor_along_last_dim",
    "split_tensor_into_1d_equal_chunks",
    "gather_split_1d_tensor",
]
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ import torch
4
+
5
+ from megatron.core.parallel_state import (
6
+ get_tensor_model_parallel_group,
7
+ get_tensor_model_parallel_rank,
8
+ get_tensor_model_parallel_world_size
9
+ )
10
+
11
+ from .utils import VocabUtility
12
+
13
+
14
class _VocabParallelCrossEntropy(torch.autograd.Function):
    """Cross entropy over logits whose vocab dimension is split across
    tensor-model-parallel ranks.

    Forward combines the per-partition results with all-reduces over the
    tensor-model-parallel group; backward converts the saved softmax into
    the input gradient in place.
    """

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
        """Compute the per-token loss.

        Arguments:
            vocab_parallel_logits: logits with the vocab (last) dimension
                split across tensor-parallel ranks.
            target: correct vocab ids; values outside this rank's vocab
                partition are masked out and recovered via the all-reduce.
            label_smoothing: smoothing factor in [0.0, 1.0); 0.0 disables it.
        """
        # Maximum value along vocab dimension across all GPUs
        # (subtracted below for a numerically stable softmax).
        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
        torch.distributed.all_reduce(logits_max,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_tensor_model_parallel_group())
        # Subtract the maximum value.
        vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1)

        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked).
        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        masked_target[target_mask] = 0

        # Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
                                 device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
        predicted_logits = predicted_logits_1d.view_as(target)
        # Zero out positions owned by other ranks; the SUM all-reduce below
        # fills them in from the rank that owns each target id.
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(predicted_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Sum of exponential of logits along vocab dimension across all GPUs.
        # NOTE: computed in place, so the (max-shifted) logits buffer is
        # reused as `exp_logits` from here on.
        exp_logits = vocab_parallel_logits
        torch.exp(vocab_parallel_logits, out=exp_logits)
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(sum_exp_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

        # Normalize; exp_logits now holds the softmax probabilities.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))

        vocab_size = exp_logits.size(-1)
        if label_smoothing > 0:
            # We'd like to assign 1 / (K - 1) probability mass to every index
            # that is not the ground truth:
            #   = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt})
            #   = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i
            #   = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt
            #     + (alpha / (K - 1)) * \sum_{i != gt} y_i
            #   = (K * (1 - alpha) - 1) / (K - 1)) * y_gt
            #     + (alpha / (K - 1)) * \sum_{i} y_i
            #   = (1 - (alpha * K) / (K - 1)) * y_gt
            #     + ((alpha * K) / (K - 1)) * \sum_{i} y_i / K
            # From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py
            assert 1.0 > label_smoothing > 0.0
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)

            # Exp logits at this point are normalized probabilities. So we can
            # just take the log to get log-probs.
            log_probs = torch.log(exp_logits)
            mean_log_probs = log_probs.mean(dim=-1)
            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs

        ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size

        # Store softmax, target-mask and masked-target for backward pass.
        # save_for_backward must be called exactly once: a second call
        # overwrites the first, so the duplicate call that was here before
        # was dead code and has been removed.
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """Gradient of the loss w.r.t. the vocab-parallel logits.

        Returns None for `target` and `label_smoothing`, which are
        non-differentiable.
        """
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors
        label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size

        # All the inputs have softmax as their gradient; the saved softmax
        # buffer is modified in place below.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
                                 device=grad_2d.device)

        # 1.0 where this rank owns the target id, 0.0 where it was masked.
        softmax_update = 1.0 - target_mask.view(-1).float()

        if label_smoothing > 0:
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
            grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update
            average_grad = 1 / vocab_size
            grad_2d[arange_1d, :] -= smoothing * average_grad
        else:
            grad_2d[arange_1d, masked_target_1d] -= softmax_update

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))

        return grad_input, None, None
128
+
129
+
130
def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
    """
    Performs cross entropy loss when logits are split across tensor parallel ranks

    Arguments:
        vocab_parallel_logits: logits split across tensor parallel ranks;
                               dimension is [sequence_length, batch_size,
                               per-partition vocab size]

        target: correct vocab ids of dimension [sequence_length, micro_batch_size]

        label_smoothing: smoothing factor, must be in range [0.0, 1.0);
                         default is no smoothing (=0.0)
    """
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
@@ -0,0 +1,105 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ import torch
4
+
5
+ from megatron.core.parallel_state import (
6
+ get_tensor_model_parallel_group,
7
+ get_tensor_model_parallel_rank,
8
+ get_tensor_model_parallel_src_rank,
9
+ )
10
+
11
+
12
+ _MAX_DATA_DIM = 5
13
+
14
+
15
+ def _check_data_types(keys, data, target_dtype):
16
+ """Check that all the keys have the same target data type."""
17
+ for key in keys:
18
+ assert data[key].dtype == target_dtype, '{} has data type {} which '\
19
+ 'is different than {}'.format(key, data[key].dtype, target_dtype)
20
+
21
+
22
def _build_key_size_numel_dictionaries(keys, data):
    """Collect tensor sizes on tensor-parallel rank 0 and broadcast them.

    Returns a tuple (key_size, key_numel, total_numel): per-key shape list,
    per-key element count, and the total element count over all keys.
    """
    max_dim = _MAX_DATA_DIM
    # One slot per (key, dimension); a zero marks "no more dimensions".
    sizes = [0] * (max_dim * len(keys))

    # Pack the sizes on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        offset = 0
        for key in keys:
            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
            for dim_index, dim_size in enumerate(data[key].size()):
                sizes[dim_index + offset] = dim_size
            offset += max_dim

    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
    key_size = {}
    key_numel = {}
    total_numel = 0
    offset = 0
    for key in keys:
        shape = []
        numel = 1
        i = 0
        # Consume dimensions until the zero sentinel.
        while sizes_cpu[offset + i] > 0:
            dim_size = sizes_cpu[offset + i]
            shape.append(dim_size)
            numel *= dim_size
            i += 1
        key_size[key] = shape
        key_numel[key] = numel
        total_numel += numel
        offset += max_dim

    return key_size, key_numel, total_numel
63
+
64
+
65
def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcasted
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.

    Returns a dictionary mapping each key to a GPU tensor view (of the one
    broadcast buffer) reshaped to that key's original size.
    """
    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    # This is itself a collective, so every rank learns the sizes even
    # though only rank 0 has the real data.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
                                                                          data)

    # Pack on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)
        # Flatten the data associated with the keys
        flatten_data = torch.cat(
            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        # Non-source ranks allocate an empty receive buffer of matching size.
        flatten_data = torch.empty(total_numel,
                                   device=torch.cuda.current_device(),
                                   dtype=datatype)

    # Broadcast (single collective for all keys).
    torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Unpack: narrow() returns views, so no extra copies are made here.
    output = {}
    offset = 0
    for key in keys:
        size = key_size[key]
        numel = key_numel[key]
        output[key] = flatten_data.narrow(0, offset, numel).view(size)
        offset += numel

    return output