PyPI - megatron-core - Versions diffs - 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz - Mend

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megatron-core might be problematic. Click here for more details.

Files changed (353)

{megatron_core-0.15.0rc4/megatron_core.egg-info → megatron_core-0.15.0rc6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.15.0rc4
+Version: 0.15.0rc6
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/indexed_dataset.py RENAMED Viewed

@@ -12,6 +12,7 @@ import shutil
 import struct
 import time
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from enum import Enum
 from functools import lru_cache
 from itertools import accumulate
@@ -172,9 +173,9 @@ class _IndexWriter(object):
     def write(
         self,
-        sequence_lengths: List[int],
-        sequence_modes: Optional[List[int]],
-        document_indices: List[int],
+        sequence_lengths: Iterable[Union[int, numpy.integer]],
+        sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
+        document_indices: Iterable[Union[int, numpy.integer]],
     ) -> None:
         """Write the index (.idx) file
@@ -208,7 +209,9 @@ class _IndexWriter(object):
         if sequence_modes is not None:
             self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
-    def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
+    def _sequence_pointers(
+        self, sequence_lengths: Iterable[Union[int, numpy.integer]]
+    ) -> List[int]:
         """Build the sequence pointers per the sequence lengths and dtype size
         Args:
@@ -217,11 +220,11 @@ class _IndexWriter(object):
         Returns:
             List[int]: The pointer to the beginning of each sequence
         """
-        itemsize = DType.size(self.dtype)
-        curr_ptr = 0
+        itemsize = numpy.int64(DType.size(self.dtype))
+        curr_ptr = numpy.int64(0)
         list_ptr = []
         for length in sequence_lengths:
-            list_ptr.append(curr_ptr)
+            list_ptr.append(curr_ptr.item())
             curr_ptr += length * itemsize
         return list_ptr

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel.py RENAMED Viewed

@@ -519,8 +519,11 @@ class DistributedDataParallel(_BaseDataParallel):
                         param_slice = bucket.param_data.view(-1)[param_start:param_end]
                         param.data.copy_(param_slice.view(param.data.shape))
                     # All-gathered params are not needed after being copied to param.data.
-                    # Zero out the grad buffer (shared with param buffer) for gradient accumulation.
-                    bucket.grad_data.zero_()
+                    # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
+                    # We cannot zero out the entire grad buffer because one grad buffer may
+                    # correspond to multiple param buffers. If we zero out the entire grad buffer,
+                    # it would clear the data of those param buffers that have not yet completed AG.
+                    bucket.param_data.zero_()
     def start_grad_sync(self, *unused):
         """
@@ -562,16 +565,8 @@ class DistributedDataParallel(_BaseDataParallel):
             # to True, and there will be a double-GA.
             for param in self.params_with_grad:
                 param.grad_added_to_main_grad = False
-        # In the case of "reuse_grad_buf_for_mxfp8_param_ag=True & overlap_param_gather=True",
-        # the grad buffer is not reset here because the grad buffer is shared with the param buffer.
-        # The grad buffer is zeroed by "bucket.grad_data.zero_()" in the "finish_param_sync" stage
-        # after the param all-gather.
-        if not (
-            self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
-            and self.ddp_config.overlap_param_gather
-        ):
-            for buffer in self.buffers + self.expert_parallel_buffers:
-                buffer.reset()
+        for buffer in self.buffers + self.expert_parallel_buffers:
+            buffer.reset()
         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
             bucket_group.reset()

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/finalize_model_grads.py RENAMED Viewed

@@ -267,13 +267,18 @@ def _allreduce_position_embedding_grads(
     )
-def _reset_global_aux_loss_tracker(model: List[torch.nn.Module]):
+def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.nn.Module]):
     """
-    Reset the global aux loss tracker.
+    Reset the temporary tensors of the model.
     """
     for model_chunk in model:
         for module in get_attr_wrapped_model(model_chunk, 'modules')():
-            if hasattr(module, 'reset_global_aux_loss_tracker'):
+            if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
+                module.local_tokens_per_expert.zero_()
+            if (
+                config.moe_router_load_balancing_type == "global_aux_loss"
+                or "global_aux_loss" in config.moe_router_load_balancing_type
+            ) and hasattr(module, 'reset_global_aux_loss_tracker'):
                 module.reset_global_aux_loss_tracker()
@@ -298,10 +303,7 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
         stacked_tokens_per_expert, stacked_expert_bias, config.moe_router_bias_update_rate
     )
-    for tokens_per_expert, expert_bias, updated_expert_bias in zip(
-        tokens_per_expert_list, expert_bias_list, stacked_updated_expert_bias
-    ):
-        tokens_per_expert.zero_()
+    for expert_bias, updated_expert_bias in zip(expert_bias_list, stacked_updated_expert_bias):
         expert_bias.copy_(updated_expert_bias)
@@ -465,11 +467,7 @@ def finalize_model_grads(
     if config.moe_router_enable_expert_bias:
         _update_router_expert_bias(model, config)
-    if (
-        config.moe_router_load_balancing_type == "global_aux_loss"
-        or "global_aux_loss" in config.moe_router_load_balancing_type
-    ):
-        _reset_global_aux_loss_tracker(model)
+    reset_model_temporary_tensors(config, model)
     # normalize gradients for per-token loss normalization.
     # if we are using by the number of tokens, then we use that as a divisor. this number

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py RENAMED Viewed

@@ -158,7 +158,7 @@ class FullyShardedDataParallel(_BaseDataParallel):
                 dp_cp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=True
                 )
-                inter_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
+                outer_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
                 hybrid_fsdp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=False
                 )
@@ -166,17 +166,17 @@ class FullyShardedDataParallel(_BaseDataParallel):
                 dp_cp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=False
                 )
-                inter_fsdp_group = None
+                outer_fsdp_group = None
                 hybrid_fsdp_group = None
         else:
             tp_group = getattr(pg_collection, 'tp', None)
             if enable_hsdp:
                 dp_cp_group = pg_collection.intra_dp_cp
-                inter_fsdp_group = pg_collection.inter_dist_opt
+                outer_fsdp_group = pg_collection.inter_dist_opt
                 hybrid_fsdp_group = pg_collection.dp_cp
             else:
                 dp_cp_group = pg_collection.dp_cp
-                inter_fsdp_group = None
+                outer_fsdp_group = None
                 hybrid_fsdp_group = None
         if tp_group is None:
@@ -184,17 +184,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
             tp_group = single_rank_group
         if enable_hsdp:
-            mesh = _get_hsdp_tp_mesh(inter_fsdp_group, dp_cp_group, tp_group)
+            mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
             dist_index = FSDPDistributedIndex(
-                use_hybrid_fsdp=True,
                 hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
                 device_mesh=DeviceMesh.from_group(
-                    [inter_fsdp_group, dp_cp_group, tp_group],
+                    [outer_fsdp_group, dp_cp_group, tp_group],
                     device_type="cuda",
                     mesh=mesh.tolist(),
-                    mesh_dim_names=["inter_fsdp_dp", "dp_cp", "tp"],
+                    mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"],
                 ),
-                dp_inter_dim="inter_fsdp_dp",
+                dp_outer_dim="outer_fsdp_dp",  # Use Hybrid FSDP!
                 dp_shard_dim="dp_cp",
                 tp_dim="tp",
                 hybrid_fsdp_group=hybrid_fsdp_group,
@@ -222,20 +221,20 @@ class FullyShardedDataParallel(_BaseDataParallel):
         self.module.synchronize_param_gather()
-def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
+def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
     assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
     world_size = dist.get_world_size()
     mesh = einops.rearrange(
         torch.arange(world_size),
-        "(inter_fsdp_dp fsdp tp) -> inter_fsdp_dp fsdp tp",
-        inter_fsdp_dp=inter_fsdp_dp_group.size(),
+        "(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp",
+        outer_fsdp_dp=outer_fsdp_dp_group.size(),
         tp=tp_group.size(),
     )
     mesh_fsdp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp tp) fsdp',
+        'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
@@ -247,7 +246,7 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
     mesh_tp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp fsdp) tp',
+        'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
@@ -257,18 +256,18 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
         f"do not match the ranks in the TP group {tp_group_ranks}."
     )
-    mesh_inter_fsdp_dp_ranks = einops.rearrange(
+    mesh_outer_fsdp_dp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (fsdp tp) inter_fsdp_dp',
+        'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
-    inter_fsdp_dp_group_ranks = dist.get_process_group_ranks(inter_fsdp_dp_group)
+    outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group)
     assert _check_mesh_ranks_and_group_ranks_are_consistent(
-        mesh_inter_fsdp_dp_ranks, inter_fsdp_dp_group_ranks
+        mesh_outer_fsdp_dp_ranks, outer_fsdp_dp_group_ranks
     ), (
-        f"[Megatron-FSDP] Inter FSDP Data Parallel ranks in the mesh {mesh_inter_fsdp_dp_ranks} "
-        f"do not match the ranks in the Inter FSDP DP group {inter_fsdp_dp_group_ranks}."
+        f"[Megatron-FSDP] Outer FSDP Data Parallel ranks in the mesh {mesh_outer_fsdp_dp_ranks} "
+        f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}."
     )
     return mesh

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py RENAMED Viewed

@@ -13,6 +13,7 @@
 # limitations under the License.
 from .distributed_data_parallel_config import DistributedDataParallelConfig
+from .fully_shard import fully_shard, fully_shard_model, fully_shard_optimizer
 from .megatron_fsdp import MegatronFSDP
 from .package_info import (
     __contact_emails__,
@@ -29,16 +30,13 @@ from .package_info import (
 )
 from .utils import FSDPDistributedIndex
-try:
-    from .fully_shard import fully_shard
-except ImportError as e:
-    print(f"Failed to import fully_shard: {e}")
 __all__ = [
     "DistributedDataParallelConfig",
     "MegatronFSDP",
     "FSDPDistributedIndex",
     "fully_shard",
+    "fully_shard_model",
+    "fully_shard_optimizer",
     "__contact_emails__",
     "__contact_names__",
     "__description__",

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py RENAMED Viewed

@@ -117,13 +117,12 @@ class DistributedDataParallelConfig:
       This option will cause additional memory overhead, however, it is necessary for
       to register user buffer (nccl_ub=True) for the Megatron FSDP.
       This option will be automatically set to True when nccl_ub=True.
-   """
+    """
     outer_dp_sharding_strategy: str = 'no_shard'
     """
     Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
-    Valid values are 'no_shard', 'optim', 'optim_grads', 'optim_grads_params'.
-    This option is only effective when Hybrid FSDP is enabled.
+    Valid values are 'no_shard', 'optim'. This option is only effective when Hybrid FSDP is enabled.
     """
     disable_symmetric_registration: bool = False

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz

Potentially problematic release.

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz