megatron-core 0.15.0rc5__tar.gz → 0.15.0rc7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.
Files changed (353)
  1. {megatron_core-0.15.0rc5/megatron_core.egg-info → megatron_core-0.15.0rc7}/PKG-INFO +4 -3
  2. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/async_utils.py +3 -0
  3. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +17 -9
  4. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +13 -2
  5. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/torch.py +0 -1
  6. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/distributed_data_parallel.py +8 -17
  7. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/finalize_model_grads.py +10 -12
  8. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +14 -3
  9. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +1 -1
  10. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/param_and_grad_buffer.py +5 -2
  11. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/extensions/transformer_engine.py +96 -6
  12. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fp8_utils.py +22 -17
  13. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/async_stream.py +1 -1
  14. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/contexts/dynamic_context.py +267 -104
  15. megatron_core-0.15.0rc7/megatron/core/inference/contexts/fused_kv_append_kernel.py +174 -0
  16. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/contexts/static_context.py +3 -1
  17. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/data_parallel_inference_coordinator.py +73 -16
  18. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/engines/dynamic_engine.py +229 -80
  19. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/engines/static_engine.py +7 -6
  20. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/inference_request.py +17 -1
  21. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/sampling_params.py +3 -0
  22. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/scheduler.py +12 -12
  23. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +22 -17
  24. megatron_core-0.15.0rc7/megatron/core/inference/unified_memory.py +89 -0
  25. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/utils.py +7 -0
  26. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +37 -5
  27. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/gpt_model.py +90 -19
  28. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/__init__.py +20 -2
  29. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/distrib_optimizer.py +6 -3
  30. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/optimizer_config.py +5 -0
  31. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/package_info.py +1 -1
  32. megatron_core-0.15.0rc7/megatron/core/pipeline_parallel/bridge_communicator.py +922 -0
  33. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mamba_layer.py +32 -21
  34. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/layers.py +13 -10
  35. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/attention.py +119 -37
  36. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/cuda_graphs.py +92 -49
  37. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/mlp.py +5 -2
  38. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/module.py +172 -0
  39. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/experts.py +32 -27
  40. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/moe_utils.py +17 -8
  41. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/router.py +15 -3
  42. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/multi_latent_attention.py +2 -0
  43. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/multi_token_prediction.py +13 -10
  44. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/transformer_block.py +6 -0
  45. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/transformer_config.py +7 -19
  46. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/transformer_layer.py +120 -172
  47. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/utils.py +16 -4
  48. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7/megatron_core.egg-info}/PKG-INFO +4 -3
  49. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron_core.egg-info/SOURCES.txt +3 -0
  50. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/pyproject.toml +24 -9
  51. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/LICENSE +0 -0
  52. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/MANIFEST.in +0 -0
  53. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/README.md +0 -0
  54. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/README.md +0 -0
  55. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/__init__.py +0 -0
  56. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/activations.py +0 -0
  57. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/config.py +0 -0
  58. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/config_logger.py +0 -0
  59. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/__init__.py +0 -0
  60. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/bert_dataset.py +0 -0
  61. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/blended_dataset.py +0 -0
  62. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  63. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  64. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/gpt_dataset.py +0 -0
  65. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/helpers.cpp +0 -0
  66. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/helpers.py +0 -0
  67. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/indexed_dataset.py +0 -0
  68. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/masked_dataset.py +0 -0
  69. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/megatron_dataset.py +0 -0
  70. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  71. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/multimodal_dataset.py +0 -0
  72. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/object_storage_utils.py +0 -0
  73. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/__init__.py +0 -0
  74. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/config/__init__.py +0 -0
  75. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  76. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/config/config.py +0 -0
  77. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  78. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  79. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/db/__init__.py +0 -0
  80. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/db/build.py +0 -0
  81. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/db/dataset.py +0 -0
  82. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/db/utils.py +0 -0
  83. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/external_libs.py +0 -0
  84. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/__init__.py +0 -0
  85. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/build.py +0 -0
  86. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/factory.py +0 -0
  87. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/index.py +0 -0
  88. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  89. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  90. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  91. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/utils.py +0 -0
  92. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/index/validate.py +0 -0
  93. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/__init__.py +0 -0
  94. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  95. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  96. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/query.py +0 -0
  97. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  98. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/query/utils.py +0 -0
  99. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/retro/utils.py +0 -0
  100. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/t5_dataset.py +0 -0
  101. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/utils.py +0 -0
  102. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/utils_object_storage.py +0 -0
  103. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/datasets/utils_s3.py +0 -0
  104. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/__init__.py +0 -0
  105. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/core.py +0 -0
  106. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  107. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  108. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/mapping.py +0 -0
  109. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  110. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/serialization.py +0 -0
  111. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  112. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  113. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  114. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  115. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  116. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  117. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  118. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  119. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  120. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  121. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  122. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  123. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/utils.py +0 -0
  124. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/dist_checkpointing/validation.py +0 -0
  125. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/__init__.py +0 -0
  126. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/data_parallel_base.py +0 -0
  127. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  128. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/__init__.py +0 -0
  129. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  130. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  131. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  132. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  133. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  134. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  135. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  136. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  137. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  138. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  139. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/energy_monitor.py +0 -0
  140. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/enums.py +0 -0
  141. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/__init__.py +0 -0
  142. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/data_type.py +0 -0
  143. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/export_config.py +0 -0
  144. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/model_type.py +0 -0
  145. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/__init__.py +0 -0
  146. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  147. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  148. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  149. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  150. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  151. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  152. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  153. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  154. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  155. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  156. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  157. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  158. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/extensions/__init__.py +0 -0
  159. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/extensions/kitchen.py +0 -0
  160. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  161. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fp4_utils.py +0 -0
  162. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/full_cuda_graph.py +0 -0
  163. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/__init__.py +0 -0
  164. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  165. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  166. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  167. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  168. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  169. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_indices_converter.py +0 -0
  170. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_layer_norm.py +0 -0
  171. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  172. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  173. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_softmax.py +0 -0
  174. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  175. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/hyper_comm_grid.py +0 -0
  176. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/__init__.py +0 -0
  177. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/common_inference_params.py +0 -0
  178. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/communication_utils.py +0 -0
  179. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/contexts/__init__.py +0 -0
  180. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/contexts/base_context.py +0 -0
  181. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  182. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/engines/__init__.py +0 -0
  183. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/engines/abstract_engine.py +0 -0
  184. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/engines/mcore_engine.py +0 -0
  185. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/headers.py +0 -0
  186. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/inference_client.py +0 -0
  187. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  188. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  189. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  190. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  191. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  192. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  193. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  194. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  195. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  196. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  197. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  198. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  199. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/inference_params.py +0 -0
  200. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/jit.py +0 -0
  201. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/model_parallel_config.py +0 -0
  202. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/T5/__init__.py +0 -0
  203. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/T5/t5_model.py +0 -0
  204. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/T5/t5_spec.py +0 -0
  205. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/__init__.py +0 -0
  206. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/backends.py +0 -0
  207. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/bert/__init__.py +0 -0
  208. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  209. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/bert/bert_lm_head.py +0 -0
  210. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/bert/bert_model.py +0 -0
  211. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/bert/pooler.py +0 -0
  212. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/__init__.py +0 -0
  213. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/__init__.py +0 -0
  214. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  215. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  216. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  217. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  218. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/language_module/__init__.py +0 -0
  219. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/language_module/language_module.py +0 -0
  220. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  221. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/vision_module/__init__.py +0 -0
  222. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  223. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/__init__.py +0 -0
  224. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  225. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  226. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  227. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  228. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/huggingface/__init__.py +0 -0
  229. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/huggingface/clip_model.py +0 -0
  230. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/huggingface/module.py +0 -0
  231. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/huggingface/qwen_model.py +0 -0
  232. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mamba/__init__.py +0 -0
  233. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  234. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mamba/mamba_model.py +0 -0
  235. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/__init__.py +0 -0
  236. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/config/__init__.py +0 -0
  237. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/config/base_configs.py +0 -0
  238. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/model/__init__.py +0 -0
  239. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/model/base.py +0 -0
  240. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/submodules/audio.py +0 -0
  241. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/submodules/base.py +0 -0
  242. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/mimo/submodules/vision.py +0 -0
  243. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/multimodal/__init__.py +0 -0
  244. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/multimodal/context_parallel.py +0 -0
  245. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/multimodal/llava_model.py +0 -0
  246. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/multimodal/llava_spec.py +0 -0
  247. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/__init__.py +0 -0
  248. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/base_attention.py +0 -0
  249. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/config.py +0 -0
  250. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/decoder_attention.py +0 -0
  251. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/decoder_spec.py +0 -0
  252. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/encoder_attention.py +0 -0
  253. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/encoder_spec.py +0 -0
  254. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/model.py +0 -0
  255. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/retro/utils.py +0 -0
  256. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/vision/__init__.py +0 -0
  257. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/vision/clip_vit_model.py +0 -0
  258. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/vision/multimodal_projector.py +0 -0
  259. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/vision/radio.py +0 -0
  260. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  261. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/msc_utils.py +0 -0
  262. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/nccl_allocator.py +0 -0
  263. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/num_microbatches_calculator.py +0 -0
  264. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/clip_grads.py +0 -0
  265. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  266. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  267. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/grad_scaler.py +0 -0
  268. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer/optimizer.py +0 -0
  269. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/optimizer_param_scheduler.py +0 -0
  270. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/packed_seq_params.py +0 -0
  271. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/parallel_state.py +0 -0
  272. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/pipeline_parallel/__init__.py +0 -0
  273. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  274. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  275. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/pipeline_parallel/schedules.py +0 -0
  276. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/pipeline_parallel/utils.py +0 -0
  277. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/__init__.py +0 -0
  278. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/__init__.py +0 -0
  279. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  280. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  281. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  282. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/layers.py +0 -0
  283. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  284. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  285. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/process_groups_config.py +0 -0
  286. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/quantization/__init__.py +0 -0
  287. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/quantization/quant_config.py +0 -0
  288. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/quantization/utils.py +0 -0
  289. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/requirements.txt +0 -0
  290. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/rerun_state_machine.py +0 -0
  291. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/safe_globals.py +0 -0
  292. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/__init__.py +0 -0
  293. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mamba_block.py +0 -0
  294. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  295. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  296. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mamba_mixer.py +0 -0
  297. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/mlp_layer.py +0 -0
  298. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/ssm/triton_cache_manager.py +0 -0
  299. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/__init__.py +0 -0
  300. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  301. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/data.py +0 -0
  302. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/mappings.py +0 -0
  303. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/random.py +0 -0
  304. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tensor_parallel/utils.py +0 -0
  305. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/timers.py +0 -0
  306. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/__init__.py +0 -0
  307. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  308. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  309. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/__init__.py +0 -0
  310. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  311. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  312. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  313. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  314. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  315. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  316. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  317. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  318. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  319. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  320. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  321. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  322. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  323. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  324. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  325. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  326. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  327. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  328. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/__init__.py +0 -0
  329. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  330. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  331. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/dot_product_attention.py +0 -0
  332. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/enums.py +0 -0
  333. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  334. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  335. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  336. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/identity_op.py +0 -0
  337. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/__init__.py +0 -0
  338. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  339. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  340. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/moe_layer.py +0 -0
  341. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/shared_experts.py +0 -0
  342. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  343. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  344. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  345. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/spec_utils.py +0 -0
  346. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/torch_layer_norm.py +0 -0
  347. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/torch_norm.py +0 -0
  348. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron/core/transformer/utils.py +0 -0
  349. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron_core.egg-info/dependency_links.txt +0 -0
  350. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron_core.egg-info/requires.txt +0 -0
  351. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/megatron_core.egg-info/top_level.txt +0 -0
  352. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/setup.cfg +0 -0
  353. {megatron_core-0.15.0rc5 → megatron_core-0.15.0rc7}/setup.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.15.0rc5
+Version: 0.15.0rc7
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -17,8 +17,9 @@ Classifier: License :: OSI Approved :: BSD License
 Classifier: Natural Language :: English
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Image Recognition
 Classifier: Topic :: Scientific/Engineering :: Mathematics
@@ -564,6 +564,9 @@ class AsyncCallsQueue:
         Returns:
             List[int]: list of indices (as returned by `schedule_async_request`)
                 of async calls that have been successfully finalized.
+        Raises:
+            CheckpointException: if any rank(s) raised an exception during checkpoint
+                writing, the exceptions are wrapped and raised on all ranks.
         """
         call_idx_finalized = []
         while self.async_calls:
@@ -19,6 +19,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from torch import multiprocessing as mp
 from torch.distributed.checkpoint import FileSystemWriter
+from torch.distributed.checkpoint.api import WRAPPED_EXCEPTION, _wrap_exception
 from torch.distributed.checkpoint.filesystem import DEFAULT_SUFFIX, _StoragePrefix, _write_item
 from torch.distributed.checkpoint.metadata import Metadata
@@ -420,14 +421,14 @@ class FileSystemWriterAsync(FileSystemWriter):
         """Write all items from ``plan``."""
         raise NotImplementedError("write_data not implemented for FileSystemWriterAsync")

-    def retrieve_write_results(self) -> List[WriteResult]:
+    def retrieve_write_results(self) -> Union[List[WriteResult], WRAPPED_EXCEPTION]:
         """
         Turn the latest dict including write results from `self.results_queue`
         into a single results lists. Includes error check.

-        Returns (List[WriteResult]): the list of write results
-            from all local processes performing the save.
-
+        Returns (Union(List[WriteResult], WRAPPED_EXCEPTION): the list of write results
+            from all local processes performing the save, or a WRAPPED_EXCEPTION if
+            an exception was raised during the writing process.
         """
         assert self.write_buckets is not None
@@ -437,15 +438,22 @@ class FileSystemWriterAsync(FileSystemWriter):
         try:
             write_results_or_exc = self.results_queue.get_nowait()
         except queue.Empty:
-            raise RuntimeError("results_queue should not be empty")
+            return _wrap_exception(RuntimeError("results_queue should not be empty"))

         if isinstance(write_results_or_exc, Exception):
-            raise RuntimeError(f"Worker failure: {write_results_or_exc}") from write_results_or_exc
+            try:
+                raise RuntimeError(
+                    f"Worker failure: {write_results_or_exc}"
+                ) from write_results_or_exc
+            except Exception as e:
+                return _wrap_exception(e)
         write_results: dict = write_results_or_exc
         if len(write_results) != len(self.write_buckets):
-            raise RuntimeError(
-                f"Incomplete worker results (expected {len(self.write_buckets)},"
-                f" got {len(write_results)}. This probably indicates a worker failure."
+            return _wrap_exception(
+                RuntimeError(
+                    f"Incomplete worker results (expected {len(self.write_buckets)},"
+                    f" got {len(write_results)}. This probably indicates a worker failure."
+                )
             )
         return list(chain.from_iterable(write_results.values()))
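Note on this hunk: failures are now returned as wrapped exceptions instead of raised, so every rank still reaches the collective failure check (see the state_dict_saver.py hunk below), where a CheckpointException is raised everywhere. A minimal sketch of the pattern; do_local_write is a hypothetical stand-in for the per-rank write step:

from typing import List, Union

from torch.distributed.checkpoint.api import WRAPPED_EXCEPTION, _wrap_exception


def do_local_write() -> List[dict]:
    # Hypothetical per-rank write step; raises to exercise the wrapping path.
    raise RuntimeError("disk full")


def collect_write_results() -> Union[List[dict], WRAPPED_EXCEPTION]:
    try:
        return do_local_write()
    except Exception as exc:
        # _wrap_exception keeps the traceback so the coordinator can surface
        # it later inside a CheckpointException instead of crashing one rank.
        return _wrap_exception(exc)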
@@ -243,5 +243,16 @@ def save_state_dict_async_finalize(
         storage_writer.finish(global_metadata, all_results)
         write_end = time()
         logger.debug(f"{write_end}, metadata_write: {write_end - write_start}")
-    else:
-        raise CheckpointException("write", node_failures)
+    else:
+        node_failures = {}
+
+    # Broadcast failure status to all ranks to raise exceptions everywhere if needed.
+    # The failure details are only raised on the coordinator.
+    failures_occurred = torch.tensor(
+        [int(len(node_failures) > 0)], dtype=torch.int, device=torch.cuda.current_device()
+    )
+    torch.distributed.broadcast(
+        failures_occurred, src=dist_wrapper.coordinator_rank, group=dist_wrapper.group
+    )
+    if failures_occurred:
+        raise CheckpointException("write", node_failures)
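The pattern above makes failure handling collective: only the coordinator holds the failure details, but a one-element flag tensor is broadcast so that every rank learns whether to raise and none deadlocks in a later collective. A standalone sketch, assuming an initialized NCCL process group; broadcast_failure_flag is an illustrative name:

import torch
import torch.distributed as dist


def broadcast_failure_flag(node_failures: dict, coordinator_rank: int, group) -> bool:
    # Every rank allocates the flag; only the coordinator's value matters.
    flag = torch.tensor(
        [int(len(node_failures) > 0)], dtype=torch.int, device=torch.cuda.current_device()
    )
    dist.broadcast(flag, src=coordinator_rank, group=group)
    # Non-coordinator ranks raise with empty details; the coordinator
    # carries the actual per-node failure information.
    return bool(flag.item())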
@@ -830,7 +830,6 @@ class TorchDistSaveShardedStrategy(AsyncSaveShardedStrategy):

         def finalize_fn():
             save_state_dict_async_finalize(*save_state_dict_ret)
-            torch.distributed.barrier()

         return AsyncRequest(save_fn, save_args, [finalize_fn], preload_fn=preload_fn)
@@ -51,8 +51,6 @@ class DistributedDataParallel(_BaseDataParallel):
         if has_config_logger_enabled(config):
             log_config_to_disk(config, locals(), prefix=type(self).__name__)

-        self.module = module
-
         # If bucket_size is not provided as an input, use sane default.
         # If using very large dp_sizes, make buckets larger to ensure that chunks used in NCCL
         # ring-reduce implementations are large enough to remain bandwidth-bound rather than
@@ -121,9 +119,7 @@ class DistributedDataParallel(_BaseDataParallel):
             pp_rank = self.pp_group[0].rank()
         else:
             pp_rank = self.pp_group.rank()
-        if pp_rank > 0:
-            self.bucket_size = None
-        if disable_bucketing:
+        if disable_bucketing or pp_rank > 0:
             self.bucket_size = None

         self.param_to_bucket_group = {}
@@ -519,8 +515,11 @@ class DistributedDataParallel(_BaseDataParallel):
                 param_slice = bucket.param_data.view(-1)[param_start:param_end]
                 param.data.copy_(param_slice.view(param.data.shape))
             # All-gathered params are not needed after being copied to param.data.
-            # Zero out the grad buffer (shared with param buffer) for gradient accumulation.
-            bucket.grad_data.zero_()
+            # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
+            # We cannot zero out the entire grad buffer because one grad buffer may
+            # correspond to multiple param buffers. If we zero out the entire grad buffer,
+            # it would clear the data of those param buffers that have not yet completed AG.
+            bucket.param_data.zero_()

     def start_grad_sync(self, *unused):
         """
@@ -562,16 +561,8 @@ class DistributedDataParallel(_BaseDataParallel):
         # to True, and there will be a double-GA.
         for param in self.params_with_grad:
             param.grad_added_to_main_grad = False
-        # In the case of "reuse_grad_buf_for_mxfp8_param_ag=True & overlap_param_gather=True",
-        # the grad buffer is not reset here because the grad buffer is shared with the param buffer.
-        # The grad buffer is zeroed by "bucket.grad_data.zero_()" in the "finish_param_sync" stage
-        # after the param all-gather.
-        if not (
-            self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
-            and self.ddp_config.overlap_param_gather
-        ):
-            for buffer in self.buffers + self.expert_parallel_buffers:
-                buffer.reset()
+        for buffer in self.buffers + self.expert_parallel_buffers:
+            buffer.reset()
         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
             bucket_group.reset()
@@ -267,13 +267,18 @@ def _allreduce_position_embedding_grads(
     )


-def _reset_global_aux_loss_tracker(model: List[torch.nn.Module]):
+def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.nn.Module]):
     """
-    Reset the global aux loss tracker.
+    Reset the temporary tensors of the model.
     """
     for model_chunk in model:
         for module in get_attr_wrapped_model(model_chunk, 'modules')():
-            if hasattr(module, 'reset_global_aux_loss_tracker'):
+            if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
+                module.local_tokens_per_expert.zero_()
+            if (
+                config.moe_router_load_balancing_type == "global_aux_loss"
+                or "global_aux_loss" in config.moe_router_load_balancing_type
+            ) and hasattr(module, 'reset_global_aux_loss_tracker'):
                 module.reset_global_aux_loss_tracker()
@@ -298,10 +303,7 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
         stacked_tokens_per_expert, stacked_expert_bias, config.moe_router_bias_update_rate
     )

-    for tokens_per_expert, expert_bias, updated_expert_bias in zip(
-        tokens_per_expert_list, expert_bias_list, stacked_updated_expert_bias
-    ):
-        tokens_per_expert.zero_()
+    for expert_bias, updated_expert_bias in zip(expert_bias_list, stacked_updated_expert_bias):
         expert_bias.copy_(updated_expert_bias)
@@ -465,11 +467,7 @@ def finalize_model_grads(
     if config.moe_router_enable_expert_bias:
         _update_router_expert_bias(model, config)

-    if (
-        config.moe_router_load_balancing_type == "global_aux_loss"
-        or "global_aux_loss" in config.moe_router_load_balancing_type
-    ):
-        _reset_global_aux_loss_tracker(model)
+    reset_model_temporary_tensors(config, model)

     # normalize gradients for per-token loss normalization.
     # if we are using by the number of tokens, then we use that as a divisor. this number
@@ -224,7 +224,7 @@ class MegatronFSDP(torch.nn.Module):
         # step of the model will reduce all gradients and gather all parameters
         # for synchronized operations such as distributed optimization and
         # distributed checkpointing particularly sharding with HSDP / DP-Outer.
-        self.model_auto_sync = self.set_model_auto_sync(sync_model_each_microbatch)
+        self.set_model_auto_sync(sync_model_each_microbatch)

         # Check if the module contains (Megatron-Core) expert parallel parameters or DTensors.
         has_expert_parameters = self._check_module_parameter_types()
@@ -307,8 +307,11 @@ class MegatronFSDP(torch.nn.Module):
             expert_gradient_scaling_factor = None
         else:
             if self.ddp_config.average_in_collective:
-                # FIXME(@jianbinc): Will fix this issue based on Parallel Folding's EDP patch MR.
-                raise Exception("Not supported")
+                gradient_scaling_factor = 1.0
+                expert_gradient_scaling_factor = (
+                    self.dist_index.get_dp_group(is_expert_parallel=True).size()
+                    / self.dist_index.get_dp_group().size()
+                )
             else:
                 data_parallel_world_size = self.dist_index.get_dp_group().size()
                 gradient_scaling_factor = 1.0 / data_parallel_world_size
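Note on the previously unsupported branch: with average_in_collective the reduction over the full data-parallel group already divides by its size, so non-expert gradients keep a factor of 1.0, while expert gradients, reduced over the smaller expert-DP group, pick up edp_size / dp_size to end up averaged over the same global batch. A worked example with illustrative sizes (not taken from the source):

# Illustrative sizes: 8 data-parallel ranks, expert-DP group of 2.
dp_size, edp_size = 8, 2
expert_gradient_scaling_factor = edp_size / dp_size  # 0.25

grad_sum = 16.0                            # sum of a gradient over the edp group
avg_in_collective = grad_sum / edp_size    # collective averages over edp ranks
rescaled = avg_in_collective * expert_gradient_scaling_factor
assert rescaled == grad_sum / dp_size      # matches an average over all dp ranks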
@@ -426,6 +429,14 @@ class MegatronFSDP(torch.nn.Module):
             bucket_id = self.param_and_grad_buffer.param_to_param_group[param]
             ag_pipeline.wait_bucket_ready(bucket_id)

+        for param in params:
+            # This setting is needed to make FSDP store the weight object when used
+            # with TE's activation offloading for the first global batch.
+            param.grad_added_to_main_grad = False
+            # This setting is needed to have this attribute present after every
+            # un-shard of the FSDP params.
+            param.__fsdp_param__ = True
+
     def _register_fsdp_hooks(self, root_module):
         """Register necessary hooks for Fully Sharded Data Parallel (FSDP) execution on the model.
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 1
 PATCH = 0
-PRE_RELEASE = 'rc3'
+PRE_RELEASE = 'rc5'

 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -313,8 +313,11 @@ class _ParamAndGradBucketGroup:
                 param_slice = bucket.param_data.view(-1)[param_start:param_end]
                 param.data.copy_(param_slice.view(param.data.shape))
             # All-gathered params are not needed after being copied to param.data.
-            # Zero out the grad buffer (shared with param buffer) for gradient accumulation.
-            bucket.grad_data.zero_()
+            # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
+            # We cannot zero out the entire grad buffer because one grad buffer may
+            # correspond to multiple param buffers. If we zero out the entire grad buffer,
+            # it would clear the data of those param buffers that have not yet completed AG.
+            bucket.param_data.zero_()

     def start_grad_sync(self):
         """
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

 import dataclasses
+import inspect
 import io
 import os
 import pickle
@@ -1591,21 +1592,21 @@ if HAVE_TE and is_te_min_version("1.13.0"):
             if self.linear_fc2.config.tp_comm_overlap and self.linear_fc2.ub_name is not None:
                 userbuffers_options = {"comm_name": self.linear_fc2.ub_name}
             op = te.pytorch.ops.BasicLinear(
-                weight.size(1) * tp_world_size,
+                weight.size(1),
                 weight.size(0),
                 device="meta",
                 dtype=weight.dtype,
-                tensor_parallel_mode="row" if tp_world_size > 1 else None,
-                tensor_parallel_group=tp_group,
-                sequence_parallel=self.linear_fc2.sequence_parallel,
                 rng_state_tracker_function=rng_state_tracker_function,
                 accumulate_into_main_grad=self.linear_fc2.fuse_wgrad_accumulation,
                 userbuffers_options=userbuffers_options,
             )
             op.weight = weight
             fused_impl.append(op)
-            if tp_world_size > 1 and self.linear_fc2.sequence_parallel:
-                fused_impl.append(te.pytorch.ops.ReduceScatter(tp_group))
+            if tp_world_size > 1:
+                if self.linear_fc2.sequence_parallel:
+                    fused_impl.append(te.pytorch.ops.ReduceScatter(tp_group))
+                else:
+                    fused_impl.append(te.pytorch.ops.AllReduce(tp_group))

             # FC2 bias op
             if not self.linear_fc2.te_return_bias:
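In the hunk above, removing the tensor_parallel_* arguments from BasicLinear means the op now computes only the local partial product, and the communication is appended explicitly: reduce-scatter under sequence parallelism, all-reduce otherwise. A toy single-process sketch of why a row-parallel matmul needs that reduction (plain PyTorch, not the TE ops API):

import torch

# Two "ranks" each hold half the input features and the matching weight rows.
x = torch.randn(3, 8)
w = torch.randn(8, 4)
partial_0 = x[:, :4] @ w[:4]   # rank 0's partial output
partial_1 = x[:, 4:] @ w[4:]   # rank 1's partial output

# The partials must be summed (all-reduce across TP ranks) to recover the
# full product; with sequence parallelism the sum is instead combined with
# a scatter over the sequence dimension (reduce-scatter).
assert torch.allclose(partial_0 + partial_1, x @ w, atol=1e-5)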
@@ -1617,6 +1618,9 @@ if HAVE_TE and is_te_min_version("1.13.0"):
                 op.bias = bias
                 fused_impl.append(op)

+            # Emulate submodule forward hooks if needed
+            self._register_hooks_on_fused_impl(fused_impl)
+
             return fused_impl

         def _make_activation_op(
@@ -1655,6 +1659,92 @@ if HAVE_TE and is_te_min_version("1.13.0"):
                 kwargs["cache_quantized_input"] = cache_quantized_input
             return op_type(**kwargs)

+        def _register_hooks_on_fused_impl(self, fused_impl: torch.nn.Module) -> None:
+            """Attempt to emulate submodule callback hooks.
+
+            This is not always possible because Transformer Engine's
+            op fuser does not expose intermediate tensors. Depending
+            on what kernel fusions the op fuser chooses, the
+            intermediate tensors may not even exist. Hooks that modify
+            tensors will result in incorrect behavior.
+
+            """
+
+            # Get submodule hooks
+            forward_pre_hooks = []
+            forward_post_hooks = []
+            backward_pre_hooks = []
+            backward_post_hooks = []
+            for submodule in self.modules():
+                for hook in submodule._forward_pre_hooks.values():
+                    forward_pre_hooks.append((submodule, hook))
+                for hook in submodule._forward_hooks.values():
+                    forward_post_hooks.append((submodule, hook))
+                for hook in submodule._backward_pre_hooks.values():
+                    backward_pre_hooks.append((submodule, hook))
+                for hook in submodule._backward_hooks.values():
+                    backward_post_hooks.append((submodule, hook))
+
+            # Pre-forward hooks
+            # Note: DDP pre-forward hooks are safe since they do not
+            # interact with input tensor.
+            if forward_pre_hooks:
+                from megatron.core.distributed import distributed_data_parallel
+
+                if any(
+                    inspect.getmodule(hook) != distributed_data_parallel
+                    for _, hook in forward_pre_hooks
+                ):
+                    warnings.warn(
+                        "TEFusedMLP module has a submodule with a pre-forward hook. "
+                        "TEFusedMLP module does not expose intermediate tensors, "
+                        "so the hook may have incorrect behavior if it attempts to "
+                        "access the input tensor."
+                    )
+
+                def forward_pre_hook(module, *_) -> None:
+                    for submodule, hook in forward_pre_hooks:
+                        # Assume that hook does not interact with input
+                        ret = hook(submodule, None)
+                        if ret is not None:
+                            raise RuntimeError(
+                                "TEFusedMLP module does not expose intermediate tensors, but "
+                                "submodule has pre-forward hook that modifies input tensor."
+                            )
+
+                fused_impl.register_forward_pre_hook(forward_pre_hook)
+
+            # Post-forward hooks
+            if forward_post_hooks:
+                warnings.warn(
+                    "TEFusedMLP module has a submodule with a post-forward hook. "
+                    "TEFusedMLP module does not expose intermediate tensors, "
+                    "so the hook may have incorrect behavior if it attempts to "
+                    "access the input or output tensors."
+                )
+
+                def forward_post_hook(module, *_) -> None:
+                    for submodule, hook in forward_post_hooks:
+                        # Assume that hook does not interact with input or output
+                        ret = hook(submodule, None, None)
+                        if ret is not None:
+                            raise RuntimeError(
+                                "TEFusedMLP module does not expose intermediate tensors, but "
+                                "submodule has post-forward hook that modifies output tensor."
+                            )
+
+                fused_impl.register_forward_hook(forward_post_hook)
+
+            # Backward hooks
+            if backward_pre_hooks:
+                raise RuntimeError(
+                    "TEFusedMLP module does not support submodules with pre-backward hooks"
+                )
+            if backward_post_hooks:
+                raise RuntimeError(
+                    "TEFusedMLP module does not support submodules with post-backward hooks"
+                )
+
         def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:
             """Forward."""
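The emulation above leans on a PyTorch hook convention: a forward or pre-forward hook that returns a non-None value is requesting to replace the input or output, which a fused module with no exposed intermediate tensors cannot honor. A small self-contained demonstration of the convention on a plain torch.nn.Linear (illustrative, unrelated to TEFusedMLP internals):

import torch

lin = torch.nn.Linear(4, 4)


def observing_hook(module, inputs):
    return None  # observe only: safe to replay against the fused module


def mutating_hook(module, inputs):
    return (inputs[0] * 2,)  # returns a replacement input: not emulatable


lin.register_forward_pre_hook(observing_hook)
lin.register_forward_pre_hook(mutating_hook)
out = lin(torch.randn(2, 4))
# Plain PyTorch honors the returned replacement above; the fused emulation
# instead raises RuntimeError whenever a replayed hook returns non-None.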
@@ -406,6 +406,25 @@ def correct_amax_history_if_needed(model: List[torch.nn.Module]):
     _correct_amax_history_if_needed_impl(model)


+def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int):
+    """Check if the layer is in bf16."""
+    num_bf16_layers_at_start = (
+        config.num_layers_at_start_in_bf16 if config.first_last_layers_bf16 else 0
+    )
+    num_bf16_layers_at_end = (
+        config.num_layers_at_end_in_bf16 if config.first_last_layers_bf16 else 0
+    )
+    # Since layer_no is a global layer index, additional checks on whether
+    # we are in the first or last pipeline-parallel rank are not needed.
+    is_first_layer = layer_no < num_bf16_layers_at_start
+    is_last_layer = layer_no >= config.num_layers - num_bf16_layers_at_end
+
+    if layer_no >= 0 and config.first_last_layers_bf16 and (is_first_layer or is_last_layer):
+        return True
+    else:
+        return False
+
+
 if HAVE_TE:
     from megatron.core import parallel_state
     from megatron.core.extensions.transformer_engine import TEDelayedScaling
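The hunk above hoists the predicate out of get_fp8_context (compare the fp8_utils.py hunk further down) so it can be reused. A worked example with illustrative numbers: with 12 layers, 1 bf16 layer at the start, and 2 at the end, global layer indices 0, 10, and 11 stay in bf16. A standalone replica of the logic, with the config reduced to a dict for the sketch:

def is_first_last_bf16_layer(config: dict, layer_no: int) -> bool:
    # Standalone replica of the predicate; config reduced to a dict.
    if layer_no < 0 or not config["first_last_layers_bf16"]:
        return False
    is_first = layer_no < config["num_layers_at_start_in_bf16"]
    is_last = layer_no >= config["num_layers"] - config["num_layers_at_end_in_bf16"]
    return is_first or is_last


cfg = dict(
    first_last_layers_bf16=True,
    num_layers=12,
    num_layers_at_start_in_bf16=1,
    num_layers_at_end_in_bf16=2,
)
assert [n for n in range(12) if is_first_last_bf16_layer(cfg, n)] == [0, 10, 11]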
@@ -437,7 +456,7 @@ if HAVE_TE:
            )
        elif config.fp8_recipe == Fp8Recipe.tensorwise and is_te_min_version("2.2.0.dev0"):
            fp8_recipe = transformer_engine.common.recipe.Float8CurrentScaling(
-                fp8_format=fp8_format
+                fp8_format=fp8_format, fp8_dpa=config.fp8_dot_product_attention
            )
        elif config.fp8_recipe == Fp8Recipe.blockwise and is_te_min_version("2.3.0.dev0"):
            fp8_recipe = transformer_engine.common.recipe.Float8BlockScaling(
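The tensorwise recipe now forwards config.fp8_dot_product_attention as fp8_dpa, enabling FP8 dot-product attention under current scaling. A hedged construction sketch (TE >= 2.2.0 per the version gate above; Format.HYBRID is an illustrative choice, not taken from this diff):

from transformer_engine.common.recipe import Float8CurrentScaling, Format

# fp8_dpa=True additionally runs dot-product attention in FP8, mirroring
# fp8_dpa=config.fp8_dot_product_attention in the hunk above.
fp8_recipe = Float8CurrentScaling(fp8_format=Format.HYBRID, fp8_dpa=True)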
@@ -483,24 +502,10 @@ if HAVE_TE:
        that needs to be trained in bf16.
        """

-        num_bf16_layers_at_start = (
-            config.num_layers_at_start_in_bf16 if config.first_last_layers_bf16 else 0
-        )
-        num_bf16_layers_at_end = (
-            config.num_layers_at_end_in_bf16 if config.first_last_layers_bf16 else 0
-        )
-        # Since layer_no is a global layer index, additional checks on whether
-        # we are in the first or last pipeline-parallel rank are not needed.
-        is_first_layer = layer_no < num_bf16_layers_at_start
-        is_last_layer = layer_no >= config.num_layers - num_bf16_layers_at_end
-
        need_fp8_context = config.fp8 if not is_init else config.fp8_param

-        if not need_fp8_context:
-            # bf16 training
-            fp8_context = nullcontext()
-        elif layer_no >= 0 and config.first_last_layers_bf16 and (is_first_layer or is_last_layer):
-            # fp8 training but this layer_no should be bf16
+        if not need_fp8_context or is_first_last_bf16_layer(config, layer_no):
+            # bf16 training or bf16 layer in fp8 training
            fp8_context = nullcontext()
        else:
            # fp8 training and this layer_no is in fp8
@@ -20,7 +20,7 @@ class AsyncStream:
    Adopted from https://github.com/vllm-project/vllm/blob/eb881ed006ca458b052905e33f0d16dbb428063a/vllm/v1/engine/async_stream.py # pylint: disable=line-too-long
    """

-    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
+    def __init__(self, request_id: int, cancel: Callable[[str], None]) -> None:
        self._request_id = request_id
        self._cancel = cancel
        self._queue: asyncio.Queue = asyncio.Queue()