megatron-core 0.15.0rc5__tar.gz → 0.15.0rc6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.15.0rc5/megatron_core.egg-info → megatron_core-0.15.0rc6}/PKG-INFO +1 -1
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel.py +7 -12
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/finalize_model_grads.py +10 -12
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +14 -3
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +1 -1
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/param_and_grad_buffer.py +5 -2
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine.py +96 -6
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fp8_utils.py +22 -17
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/__init__.py +20 -2
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/distrib_optimizer.py +6 -3
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer_config.py +5 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/package_info.py +1 -1
- megatron_core-0.15.0rc6/megatron/core/pipeline_parallel/bridge_communicator.py +399 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_layer.py +32 -21
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/layers.py +13 -10
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/cuda_graphs.py +92 -49
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/mlp.py +5 -2
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/module.py +172 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/experts.py +32 -27
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_utils.py +17 -8
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/router.py +13 -1
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_config.py +4 -2
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_layer.py +114 -172
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6/megatron_core.egg-info}/PKG-INFO +1 -1
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron_core.egg-info/SOURCES.txt +1 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/pyproject.toml +21 -7
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/LICENSE +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/MANIFEST.in +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/README.md +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/README.md +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/activations.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/enums.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_context.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/jit.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/timers.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/utils.py +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/setup.cfg +0 -0
- {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/setup.py +0 -0

{megatron_core-0.15.0rc5/megatron_core.egg-info → megatron_core-0.15.0rc6}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.15.0rc5
+Version: 0.15.0rc6
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel.py
RENAMED

@@ -519,8 +519,11 @@ class DistributedDataParallel(_BaseDataParallel):
             param_slice = bucket.param_data.view(-1)[param_start:param_end]
             param.data.copy_(param_slice.view(param.data.shape))
             # All-gathered params are not needed after being copied to param.data.
-            # Zero out the
-
+            # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
+            # We cannot zero out the entire grad buffer because one grad buffer may
+            # correspond to multiple param buffers. If we zero out the entire grad buffer,
+            # it would clear the data of those param buffers that have not yet completed AG.
+            bucket.param_data.zero_()
 
     def start_grad_sync(self, *unused):
         """
@@ -562,16 +565,8 @@ class DistributedDataParallel(_BaseDataParallel):
         # to True, and there will be a double-GA.
         for param in self.params_with_grad:
             param.grad_added_to_main_grad = False
-
-
-        # The grad buffer is zeroed by "bucket.grad_data.zero_()" in the "finish_param_sync" stage
-        # after the param all-gather.
-        if not (
-            self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
-            and self.ddp_config.overlap_param_gather
-        ):
-            for buffer in self.buffers + self.expert_parallel_buffers:
-                buffer.reset()
+        for buffer in self.buffers + self.expert_parallel_buffers:
+            buffer.reset()
         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
             bucket_group.reset()
 
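Note: the comment block added above (and mirrored in param_and_grad_buffer.py further down) encodes an aliasing constraint that is easy to miss. Below is a toy sketch with plain tensors and made-up names, not Megatron's actual buffer classes, showing why only the per-bucket param_data view can be zeroed safely after its all-gather.

    # Toy illustration of the aliasing problem: several per-bucket param views can
    # alias one grad buffer, and buckets finish their all-gathers at different times.
    import torch

    grad_buffer = torch.zeros(8)            # one grad buffer shared by two param buckets
    bucket0_param = grad_buffer[0:4]        # view: params of bucket 0 land here after AG
    bucket1_param = grad_buffer[4:8]        # view: params of bucket 1 land here after AG

    bucket1_param.copy_(torch.arange(4.0))  # bucket 1 has already been all-gathered

    # Correct: zero only the bucket whose params were just copied out to param.data.
    bucket0_param.zero_()
    assert bucket1_param.sum() != 0         # bucket 1's gathered params are untouched

    # Incorrect alternative: grad_buffer.zero_() would also wipe bucket1_param,
    # clearing the data of a bucket whose all-gather results are still needed.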

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/finalize_model_grads.py
RENAMED

@@ -267,13 +267,18 @@ def _allreduce_position_embedding_grads(
     )
 
 
-def
+def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.nn.Module]):
     """
-    Reset the
+    Reset the temporary tensors of the model.
     """
     for model_chunk in model:
         for module in get_attr_wrapped_model(model_chunk, 'modules')():
-            if hasattr(module, '
+            if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
+                module.local_tokens_per_expert.zero_()
+            if (
+                config.moe_router_load_balancing_type == "global_aux_loss"
+                or "global_aux_loss" in config.moe_router_load_balancing_type
+            ) and hasattr(module, 'reset_global_aux_loss_tracker'):
                 module.reset_global_aux_loss_tracker()
 
 
@@ -298,10 +303,7 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
         stacked_tokens_per_expert, stacked_expert_bias, config.moe_router_bias_update_rate
     )
 
-    for
-        tokens_per_expert_list, expert_bias_list, stacked_updated_expert_bias
-    ):
-        tokens_per_expert.zero_()
+    for expert_bias, updated_expert_bias in zip(expert_bias_list, stacked_updated_expert_bias):
         expert_bias.copy_(updated_expert_bias)
 
 
@@ -465,11 +467,7 @@ def finalize_model_grads(
     if config.moe_router_enable_expert_bias:
         _update_router_expert_bias(model, config)
 
-    if (
-        config.moe_router_load_balancing_type == "global_aux_loss"
-        or "global_aux_loss" in config.moe_router_load_balancing_type
-    ):
-        _reset_global_aux_loss_tracker(model)
+    reset_model_temporary_tensors(config, model)
 
     # normalize gradients for per-token loss normalization.
     # if we are using by the number of tokens, then we use that as a divisor. this number
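Note: the new reset_model_temporary_tensors() helper relies on duck typing (hasattr checks) rather than on module types. The sketch below is a minimal, self-contained restatement of that pattern; ToyRouter and reset_temporary_tensors are illustrative stand-ins, not Megatron's router or the helper itself.

    import torch

    class ToyRouter(torch.nn.Module):
        def __init__(self, num_experts: int):
            super().__init__()
            self.register_buffer("expert_bias", torch.zeros(num_experts))
            self.register_buffer("local_tokens_per_expert", torch.zeros(num_experts))

        def reset_global_aux_loss_tracker(self):
            # A real router would clear its accumulated aux-loss statistics here.
            pass

    def reset_temporary_tensors(modules, enable_expert_bias: bool, load_balancing_type: str):
        for module in modules:
            if enable_expert_bias and hasattr(module, "expert_bias"):
                module.local_tokens_per_expert.zero_()
            if "global_aux_loss" in load_balancing_type and hasattr(
                module, "reset_global_aux_loss_tracker"
            ):
                module.reset_global_aux_loss_tracker()

    router = ToyRouter(num_experts=4)
    router.local_tokens_per_expert += 10.0
    reset_temporary_tensors([router], enable_expert_bias=True, load_balancing_type="global_aux_loss")
    assert router.local_tokens_per_expert.sum() == 0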

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py
RENAMED

@@ -224,7 +224,7 @@ class MegatronFSDP(torch.nn.Module):
         # step of the model will reduce all gradients and gather all parameters
         # for synchronized operations such as distributed optimization and
         # distributed checkpointing particularly sharding with HSDP / DP-Outer.
-        self.
+        self.set_model_auto_sync(sync_model_each_microbatch)
 
         # Check if the module contains (Megatron-Core) expert parallel parameters or DTensors.
         has_expert_parameters = self._check_module_parameter_types()
@@ -307,8 +307,11 @@ class MegatronFSDP(torch.nn.Module):
             expert_gradient_scaling_factor = None
         else:
             if self.ddp_config.average_in_collective:
-
-
+                gradient_scaling_factor = 1.0
+                expert_gradient_scaling_factor = (
+                    self.dist_index.get_dp_group(is_expert_parallel=True).size()
+                    / self.dist_index.get_dp_group().size()
+                )
             else:
                 data_parallel_world_size = self.dist_index.get_dp_group().size()
                 gradient_scaling_factor = 1.0 / data_parallel_world_size
@@ -426,6 +429,14 @@ class MegatronFSDP(torch.nn.Module):
             bucket_id = self.param_and_grad_buffer.param_to_param_group[param]
             ag_pipeline.wait_bucket_ready(bucket_id)
 
+        for param in params:
+            # This setting is needed to make FSDP store the weight object when used
+            # with TE's activation offloading for the first global batch.
+            param.grad_added_to_main_grad = False
+            # This setting is needed to have this attribute present after every
+            # un-shard of the FSDP params.
+            param.__fsdp_param__ = True
+
     def _register_fsdp_hooks(self, root_module):
         """Register necessary hooks for Fully Sharded Data Parallel (FSDP) execution on the model.
 
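Note: a plausible reading of the new expert_gradient_scaling_factor is that, with average_in_collective enabled, the expert all-reduce averages over the smaller expert data-parallel group, so pre-scaling expert gradients by (expert-DP size / DP size) makes their effective divisor match the full data-parallel world size used for regular gradients. The arithmetic below uses toy group sizes, not values taken from a real process-group setup.

    dp_size = 8            # size of the regular data-parallel group
    expert_dp_size = 2     # size of the expert data-parallel group

    gradient_scaling_factor = 1.0
    expert_gradient_scaling_factor = expert_dp_size / dp_size   # 0.25

    # The expert all-reduce averages over expert_dp_size ranks, i.e. divides by 2.
    # Combined with the 0.25 pre-scale, the effective divisor is 8, matching the
    # divisor applied to non-expert gradients averaged over the full DP group.
    effective_divisor = expert_dp_size / expert_gradient_scaling_factor
    assert effective_divisor == dp_size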

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/distributed/param_and_grad_buffer.py
RENAMED

@@ -313,8 +313,11 @@ class _ParamAndGradBucketGroup:
             param_slice = bucket.param_data.view(-1)[param_start:param_end]
             param.data.copy_(param_slice.view(param.data.shape))
             # All-gathered params are not needed after being copied to param.data.
-            # Zero out the
-
+            # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
+            # We cannot zero out the entire grad buffer because one grad buffer may
+            # correspond to multiple param buffers. If we zero out the entire grad buffer,
+            # it would clear the data of those param buffers that have not yet completed AG.
+            bucket.param_data.zero_()
 
     def start_grad_sync(self):
         """

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine.py
RENAMED

@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
 import dataclasses
+import inspect
 import io
 import os
 import pickle
@@ -1591,21 +1592,21 @@ if HAVE_TE and is_te_min_version("1.13.0"):
         if self.linear_fc2.config.tp_comm_overlap and self.linear_fc2.ub_name is not None:
             userbuffers_options = {"comm_name": self.linear_fc2.ub_name}
         op = te.pytorch.ops.BasicLinear(
-            weight.size(1)
+            weight.size(1),
             weight.size(0),
             device="meta",
             dtype=weight.dtype,
-            tensor_parallel_mode="row" if tp_world_size > 1 else None,
-            tensor_parallel_group=tp_group,
-            sequence_parallel=self.linear_fc2.sequence_parallel,
             rng_state_tracker_function=rng_state_tracker_function,
             accumulate_into_main_grad=self.linear_fc2.fuse_wgrad_accumulation,
             userbuffers_options=userbuffers_options,
         )
         op.weight = weight
         fused_impl.append(op)
-        if tp_world_size > 1
-
+        if tp_world_size > 1:
+            if self.linear_fc2.sequence_parallel:
+                fused_impl.append(te.pytorch.ops.ReduceScatter(tp_group))
+            else:
+                fused_impl.append(te.pytorch.ops.AllReduce(tp_group))
 
         # FC2 bias op
         if not self.linear_fc2.te_return_bias:
@@ -1617,6 +1618,9 @@ if HAVE_TE and is_te_min_version("1.13.0"):
             op.bias = bias
             fused_impl.append(op)
 
+        # Emulate submodule forward hooks if needed
+        self._register_hooks_on_fused_impl(fused_impl)
+
         return fused_impl
 
     def _make_activation_op(
@@ -1655,6 +1659,92 @@ if HAVE_TE and is_te_min_version("1.13.0"):
             kwargs["cache_quantized_input"] = cache_quantized_input
         return op_type(**kwargs)
 
+    def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -> None:
+        """Attempt to emulate submodule callback hooks.
+
+        This is not always possible because Transformer Engine's
+        op fuser does not expose intermediate tensors. Depending
+        on what kernel fusions the op fuser chooses, the
+        intermediate tensors may not even exist. Hooks that modify
+        tensors will result in incorrect behavior.
+
+        """
+
+        # Get submodule hooks
+        forward_pre_hooks = []
+        forward_post_hooks = []
+        backward_pre_hooks = []
+        backward_post_hooks = []
+        for submodule in self.modules():
+            for hook in submodule._forward_pre_hooks.values():
+                forward_pre_hooks.append((submodule, hook))
+            for hook in submodule._forward_hooks.values():
+                forward_post_hooks.append((submodule, hook))
+            for hook in submodule._backward_pre_hooks.values():
+                backward_pre_hooks.append((submodule, hook))
+            for hook in submodule._backward_hooks.values():
+                backward_post_hooks.append((submodule, hook))
+
+        # Pre-forward hooks
+        # Note: DDP pre-forward hooks are safe since they do not
+        # interact with input tensor.
+        if forward_pre_hooks:
+            from megatron.core.distributed import distributed_data_parallel
+
+            if any(
+                inspect.getmodule(hook) != distributed_data_parallel
+                for _, hook in forward_pre_hooks
+            ):
+                warnings.warn(
+                    "TEFusedMLP module has a submodule with a pre-forward hook. "
+                    "TEFusedMLP module does not expose intermediate tensors, "
+                    "so the hook may have incorrect behavior if it attempts to "
+                    "access the input tensor."
+                )
+
+            def forward_pre_hook(module, *_) -> None:
+                for submodule, hook in forward_pre_hooks:
+                    # Assume that hook does not interact with input
+                    ret = hook(submodule, None)
+                    if ret is not None:
+                        raise RuntimeError(
+                            "TEFusedMLP module does not expose intermediate tensors, but "
+                            "submodule has pre-forward hook that modifies input tensor."
+                        )
+
+            fused_impl.register_forward_pre_hook(forward_pre_hook)
+
+        # Post-forward hooks
+        if forward_post_hooks:
+            warnings.warn(
+                "TEFusedMLP module has a submodule with a post-forward hook. "
+                "TEFusedMLP module does not expose intermediate tensors, "
+                "so the hook may have incorrect behavior if it attempts to "
+                "access the input or output tensors."
+            )
+
+            def forward_post_hook(module, *_) -> None:
+                for submodule, hook in forward_post_hooks:
+                    # Assume that hook does not interact with input or output
+                    ret = hook(submodule, None, None)
+                    if ret is not None:
+                        raise RuntimeError(
+                            "TEFusedMLP module does not expose intermediate tensors, but "
+                            "submodule has post-forward hook that modifies output tensor."
+                        )
+
+            fused_impl.register_forward_hook(forward_post_hook)
+
+        # Backward hooks
+        if backward_pre_hooks:
+            raise RuntimeError(
+                "TEFusedMLP module does not support submodules with pre-backward hooks"
+            )
+        if backward_post_hooks:
+            raise RuntimeError(
+                "TEFusedMLP module does not support submodules with post-backward hooks"
+            )
+
     def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:
         """Forward."""
 
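Note: the hook emulation added above can only replay hooks that observe; it cannot honor hooks that ask PyTorch to swap a tensor, because the fused op sequence never materializes the intermediate tensors. The sketch below uses plain torch modules (no Transformer Engine) to show which hook shapes fall on which side of that contract.

    import torch

    fused = torch.nn.Identity()   # stand-in for the fused op sequence

    def logging_pre_hook(module, inputs):
        return None               # observe only -> tolerated, the return value is ignored

    def rewriting_pre_hook(module, inputs):
        return (inputs[0] * 2,)   # asks PyTorch to swap the input -> cannot be emulated

    for hook in (logging_pre_hook, rewriting_pre_hook):
        ret = hook(fused, (torch.ones(2),))
        if ret is not None:
            # Mirrors the RuntimeError raised by the wrapper when a submodule hook
            # tries to modify a tensor that the fused implementation does not expose.
            print("hook would be rejected:", hook.__name__)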

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/fp8_utils.py
RENAMED

@@ -406,6 +406,25 @@ def correct_amax_history_if_needed(model: List[torch.nn.Module]):
     _correct_amax_history_if_needed_impl(model)
 
 
+def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int):
+    """Check if the layer is in bf16."""
+    num_bf16_layers_at_start = (
+        config.num_layers_at_start_in_bf16 if config.first_last_layers_bf16 else 0
+    )
+    num_bf16_layers_at_end = (
+        config.num_layers_at_end_in_bf16 if config.first_last_layers_bf16 else 0
+    )
+    # Since layer_no is a global layer index, additional checks on whether
+    # we are in the first or last pipeline-parallel rank are not needed.
+    is_first_layer = layer_no < num_bf16_layers_at_start
+    is_last_layer = layer_no >= config.num_layers - num_bf16_layers_at_end
+
+    if layer_no >= 0 and config.first_last_layers_bf16 and (is_first_layer or is_last_layer):
+        return True
+    else:
+        return False
+
+
 if HAVE_TE:
     from megatron.core import parallel_state
     from megatron.core.extensions.transformer_engine import TEDelayedScaling
@@ -437,7 +456,7 @@ if HAVE_TE:
         )
     elif config.fp8_recipe == Fp8Recipe.tensorwise and is_te_min_version("2.2.0.dev0"):
         fp8_recipe = transformer_engine.common.recipe.Float8CurrentScaling(
-            fp8_format=fp8_format
+            fp8_format=fp8_format, fp8_dpa=config.fp8_dot_product_attention
         )
     elif config.fp8_recipe == Fp8Recipe.blockwise and is_te_min_version("2.3.0.dev0"):
         fp8_recipe = transformer_engine.common.recipe.Float8BlockScaling(
@@ -483,24 +502,10 @@ if HAVE_TE:
         that needs to be trained in bf16.
         """
 
-        num_bf16_layers_at_start = (
-            config.num_layers_at_start_in_bf16 if config.first_last_layers_bf16 else 0
-        )
-        num_bf16_layers_at_end = (
-            config.num_layers_at_end_in_bf16 if config.first_last_layers_bf16 else 0
-        )
-        # Since layer_no is a global layer index, additional checks on whether
-        # we are in the first or last pipeline-parallel rank are not needed.
-        is_first_layer = layer_no < num_bf16_layers_at_start
-        is_last_layer = layer_no >= config.num_layers - num_bf16_layers_at_end
-
         need_fp8_context = config.fp8 if not is_init else config.fp8_param
 
-        if not need_fp8_context:
-            # bf16 training
-            fp8_context = nullcontext()
-        elif layer_no >= 0 and config.first_last_layers_bf16 and (is_first_layer or is_last_layer):
-            # fp8 training but this layer_no should be bf16
+        if not need_fp8_context or is_first_last_bf16_layer(config, layer_no):
+            # bf16 training or bf16 layer in fp8 training
             fp8_context = nullcontext()
         else:
             # fp8 training and this layer_no is in fp8
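Note: is_first_last_bf16_layer() now centralizes the boundary arithmetic that was previously inlined in the fp8 context helper. The standalone check below uses a hypothetical config (12 layers, one bf16 layer kept at each end, first_last_layers_bf16 assumed enabled) to show which global layer indices fall back to bf16.

    num_layers = 12
    num_layers_at_start_in_bf16 = 1
    num_layers_at_end_in_bf16 = 1

    def in_bf16(layer_no: int) -> bool:
        is_first = layer_no < num_layers_at_start_in_bf16
        is_last = layer_no >= num_layers - num_layers_at_end_in_bf16
        return layer_no >= 0 and (is_first or is_last)

    assert [l for l in range(num_layers) if in_bf16(l)] == [0, 11]
    assert in_bf16(-1) is False   # layer_no = -1 (no per-layer index) keeps the fp8 context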

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/__init__.py
RENAMED

@@ -10,10 +10,14 @@ from torch.optim import AdamW as CPUAdam
 try:
     from transformer_engine.pytorch.optimizers import FusedAdam as Adam
     from transformer_engine.pytorch.optimizers import FusedSGD as SGD
+
+    USING_PYTORCH_OPTIMIZER = False
 except ImportError:
     try:
         from apex.optimizers import FusedAdam as Adam
         from apex.optimizers import FusedSGD as SGD
+
+        USING_PYTORCH_OPTIMIZER = False
     except ImportError:
         warnings.warn(
             f'Transformer Engine and Apex are not installed. Falling back to Torch optimizers.'
@@ -22,7 +26,10 @@ except ImportError:
         # Apex's FusedAdam is a drop-in replacement for torch's AdamW.
         # pylint: disable-next=line-too-long.
         # See https://github.com/NVIDIA/apex/blob/7b73b12361068a10b0f44844534613f252a5ea75/apex/optimizers/fused_adam.py#L16.
-        from torch.optim import
+        from torch.optim import SGD
+        from torch.optim import AdamW as Adam
+
+        USING_PYTORCH_OPTIMIZER = True
 
 from megatron.core import parallel_state
 from megatron.core.optimizer.cpu_offloading.hybrid_optimizer import HybridDeviceOptimizer
@@ -305,6 +312,9 @@ def _get_megatron_optimizer_based_on_param_groups(
             "CPU offload is recommended for PyTorch >= 2.3.0, "
             "untested versions below this may have convergence issues."
         )
+        assert (
+            config.decoupled_weight_decay
+        ), "CPU offloading only supported with decoupled_weight_decay enabled (AdamW mode)."
         gpu_optimizer_cls = Adam if config.optimizer == 'adam' else SGD
         cpu_optimizer_cls = CPUAdam if config.optimizer == 'adam' else CPUSGD
         if config.use_torch_optimizer_for_cpu_offload:
@@ -347,6 +357,14 @@ def _get_megatron_optimizer_based_on_param_groups(
             "eps": config.adam_eps,
         }
 
+        # set Adam class and weight decay mode depending
+        # on source of optimizer (Torch or TE/Apex)
+        if USING_PYTORCH_OPTIMIZER:
+            adam_cls = torch.optim.AdamW if config.decoupled_weight_decay else torch.optim.Adam
+        else:
+            kwargs["adam_w_mode"] = config.decoupled_weight_decay
+            adam_cls = Adam
+
         if config.use_precision_aware_optimizer:
             kwargs.update(
                 {
@@ -371,7 +389,7 @@ def _get_megatron_optimizer_based_on_param_groups(
         if is_te_min_version("2.1.0.dev0"):
             kwargs.update({"store_param_remainders": config.store_param_remainders})
 
-        optimizer =
+        optimizer = adam_cls(**kwargs)
 
     def init_state_fn(opt, config=None):
         for group in opt.param_groups:
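Note: the USING_PYTORCH_OPTIMIZER flag decides how decoupled_weight_decay is applied: pure-PyTorch builds switch between the AdamW and Adam classes, while TE/Apex FusedAdam takes an adam_w_mode argument instead (as in the hunk above). The function below is a condensed restatement of that branch, not a drop-in for _get_megatron_optimizer_based_on_param_groups.

    import torch

    def pick_adam(decoupled_weight_decay: bool, using_pytorch_optimizer: bool, kwargs: dict):
        if using_pytorch_optimizer:
            # Pure-PyTorch fallback: AdamW vs. Adam are separate classes.
            return torch.optim.AdamW if decoupled_weight_decay else torch.optim.Adam
        # TE/Apex FusedAdam exposes both behaviours behind a single flag instead.
        kwargs["adam_w_mode"] = decoupled_weight_decay
        return None  # caller keeps using the imported fused Adam class

    kwargs = {}
    assert pick_adam(True, True, kwargs) is torch.optim.AdamW
    assert pick_adam(False, False, kwargs) is None and kwargs["adam_w_mode"] is False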

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/distrib_optimizer.py
RENAMED

@@ -28,7 +28,7 @@ except ImportError:
 
     USING_APEX_OPTIMIZER = True
 except ImportError:
-    from torch.optim import
+    from torch.optim import Adam as Adam
 
     HAVE_APEX_OR_TE = False
 
@@ -507,7 +507,10 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         assert self.ddp_config == model_chunk.ddp_config
         self.distributed_optimizer_instance_id = distributed_optimizer_instance_id
 
-        assert
+        assert (
+            isinstance(optimizer, (Adam, torch.optim.AdamW, HybridDeviceOptimizer))
+            or optimizer is None
+        ), (
             "Only Adam and HybridDeviceOptimizer currently supported, "
             "due to checkpointing requirements."
         )
@@ -637,7 +640,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         elif isinstance(self.optimizer, HybridDeviceOptimizer):
             step = None
             for optimizer in self.optimizer.sub_optimizers:
-                if isinstance(optimizer, torch.optim.AdamW):
+                if isinstance(optimizer, (torch.optim.Adam, torch.optim.AdamW)):
                     if len(optimizer.state) == 0:
                         continue
                     steps = list(set([s["step"].item() for s in optimizer.state.values()]))

{megatron_core-0.15.0rc5 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer_config.py
RENAMED

@@ -115,6 +115,11 @@ class OptimizerConfig:
     adam_eps: float = 1e-08
     """Term added to the denominator to improve numerical stability in Adam optimizer."""
 
+    decoupled_weight_decay: bool = True
+    """If true, decouples weight decay from the gradient update, equivalent to AdamW. If false,
+    original Adam update rule will be used. Defaults to True.
+    """
+
     # SGD.
     sgd_momentum: float = 0.9
     """Momentum factor for SGD optimizer."""
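Note: a minimal usage sketch of the new OptimizerConfig field. Only decoupled_weight_decay comes from the hunk above; the surrounding field values are arbitrary examples.

    from megatron.core.optimizer import OptimizerConfig

    config = OptimizerConfig(
        optimizer="adam",
        lr=3e-4,
        weight_decay=0.1,
        adam_beta1=0.9,
        adam_beta2=0.95,
        decoupled_weight_decay=False,   # classic Adam update; the default True keeps AdamW behaviour
    )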