megatron-core 0.14.0rc6__tar.gz → 0.14.0rc7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.14.0rc6/megatron_core.egg-info → megatron_core-0.14.0rc7}/PKG-INFO +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/__init__.py +6 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/mapping.py +0 -6
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/common.py +6 -6
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/__init__.py +1 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/distributed_data_parallel_config.py +20 -6
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/finalize_model_grads.py +27 -14
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/__init__.py +3 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +317 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/__init__.py +13 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +22 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +141 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +387 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +1107 -0
- {megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp → megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp}/param_and_grad_buffer.py +1658 -522
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +458 -0
- megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +908 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/param_and_grad_buffer.py +6 -7
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +8 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/extensions/transformer_engine.py +14 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/language_module/language_module.py +19 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/huggingface/clip_model.py +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/huggingface/qwen_model.py +1 -1
- megatron_core-0.14.0rc7/megatron/core/nccl_allocator.py +249 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/__init__.py +3 -22
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/clip_grads.py +15 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/distrib_optimizer.py +155 -129
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/optimizer.py +3 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/optimizer_config.py +6 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/package_info.py +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/parallel_state.py +6 -3
- megatron_core-0.14.0rc7/megatron/core/safe_globals.py +33 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/layers.py +8 -8
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/cuda_graphs.py +318 -7
- megatron_core-0.14.0rc7/megatron/core/transformer/fsdp_dtensor_checkpoint.py +195 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/experts.py +1 -25
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/transformer_config.py +5 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/transformer_layer.py +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/utils.py +0 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/utils.py +4 -41
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7/megatron_core.egg-info}/PKG-INFO +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron_core.egg-info/SOURCES.txt +13 -3
- megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp/__init__.py +0 -3
- megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -835
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/LICENSE +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/README.md +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/activations.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/enums.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/contexts/dynamic_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/pyproject.toml +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/setup.cfg +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/setup.py +0 -0

{megatron_core-0.14.0rc6/megatron_core.egg-info → megatron_core-0.14.0rc7}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc6
+Version: 0.14.0rc7
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/__init__.py
RENAMED
@@ -20,6 +20,7 @@ from megatron.core.package_info import (
     __version__,
 )
 from megatron.core.timers import Timers
+from megatron.core.utils import is_torch_min_version
 
 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
@@ -33,3 +34,8 @@ __all__ = [
     "ModelParallelConfig",
     "Timers",
 ]
+
+from .safe_globals import register_safe_globals
+
+if is_torch_min_version("2.6a0"):
+    register_safe_globals()
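
Context for the hunk above: the new megatron/core/safe_globals.py module (listed earlier as +33 lines) is only activated on PyTorch >= 2.6, where torch.load defaults to weights_only=True. A minimal sketch of what such a registration typically looks like follows; the allow-listed classes are illustrative assumptions, not the actual contents of safe_globals.py.

# Hypothetical sketch of a safe-globals registration. On PyTorch >= 2.6,
# torch.load defaults to weights_only=True, so non-tensor objects stored in
# checkpoints must be allow-listed before they can be deserialized.
import argparse
import io

import torch


def register_safe_globals():
    # torch.serialization.add_safe_globals is available since PyTorch 2.4;
    # the exact classes Megatron-Core registers are an assumption here.
    torch.serialization.add_safe_globals([argparse.Namespace, io.BytesIO])
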
{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/mapping.py
RENAMED
@@ -136,12 +136,6 @@ class ShardedTensor(ShardedBase):
         )
 
         for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
-            # NOTE: In custom FSDP, we have a case where a new parameter shard is created locally.
-            # For example, consider parameters [p0, p1, p2] sharded across GPU0 and GPU1.
-            # GPU0 receives p0 and a portion of p1, while GPU1 receives the
-            # remaining portion of p1 and p2.
-            # As a result, there is no parameter shard of p2 on GPU0, and
-            # the shape of p2 on GPU0 is zero.
             if sh != 0 and off % sh != 0:
                 raise CheckpointingException(
                     f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."

{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/dist_checkpointing/strategies/common.py
RENAMED
@@ -84,9 +84,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
         try:
             if MultiStorageClientFeature.is_enabled():
                 msc = MultiStorageClientFeature.import_package()
-                return msc.torch.load(load_path, map_location='cpu'
+                return msc.torch.load(load_path, map_location='cpu')
             else:
-                return torch.load(load_path, map_location='cpu'
+                return torch.load(load_path, map_location='cpu')
         except FileNotFoundError as e:
             err_msg = f'Common file {load_path} does not exist'
             if MultiStorageClientFeature.is_enabled():
@@ -118,9 +118,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
         try:
             if MultiStorageClientFeature.is_enabled():
                 msc = MultiStorageClientFeature.import_package()
-                loaded_obj = msc.torch.load(load_path
+                loaded_obj = msc.torch.load(load_path)
             else:
-                loaded_obj = torch.load(load_path
+                loaded_obj = torch.load(load_path)
         except FileNotFoundError as e:
             # Backward compatible logic: previously the save format was incorrect
             base, _ = os.path.splitext(sh_obj.unique_key)
@@ -128,9 +128,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
             try:
                 if MultiStorageClientFeature.is_enabled():
                     msc = MultiStorageClientFeature.import_package()
-                    loaded_obj = msc.torch.load(old_load_path
+                    loaded_obj = msc.torch.load(old_load_path)
                 else:
-                    loaded_obj = torch.load(old_load_path
+                    loaded_obj = torch.load(old_load_path)
             except FileNotFoundError:
                 err_msg = f'Object shard {load_path} not found'
                 obj_subdir = os.path.join(checkpoint_dir, sh_obj.key)

{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/__init__.py
RENAMED
@@ -8,5 +8,6 @@ except ImportError:
 from .distributed_data_parallel import DistributedDataParallel
 from .distributed_data_parallel_config import DistributedDataParallelConfig
 from .finalize_model_grads import finalize_model_grads
+from .fsdp.mcore_fsdp_adapter import FullyShardedDataParallel
 from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel
 from .torch_fully_sharded_data_parallel_config import TorchFullyShardedDataParallelConfig

{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/distributed_data_parallel_config.py
RENAMED
@@ -61,9 +61,16 @@ class DistributedDataParallelConfig:
     """If true, reuse the grad buffer for param AG when using mxfp8 recipe. Should be
     set to True only when fp8_recipe is mxfp8 and fp8_param_gather is True."""
 
-    use_custom_fsdp: bool = False
+    use_megatron_fsdp: bool = False
     """If true, use the FSDP code path for DDP."""
 
+    use_custom_fsdp: bool = False
+    """
+    NOTE: The flag `use_custom_fsdp` is deprecated and will be removed in future versions.
+    Please use `use_megatron_fsdp` instead, as all functionality will be migrated there.
+    Future updates will drop support for `use_custom_fsdp` to avoid confusion.
+    """
+
     data_parallel_sharding_strategy: str = 'no_shard'
     """Sharding strategy for FSDP. Valid values are 'no_shard', 'optim',
     'optim_grads', 'optim_grads_params'."""
@@ -80,10 +87,10 @@ class DistributedDataParallelConfig:
     based on your system's memory and performance requirements."""
 
     preserve_fp32_weights: bool = True
-    """If true, preserve fp32 weights in the
+    """If true, preserve fp32 weights in the Megatron FSDP ParamAndGradBuffer."""
 
-
-    """If true, keep the fp8 transpose cache when using
+    keep_fp8_transpose_cache: bool = False
+    """If true, keep the fp8 transpose cache when using Megatron FSDP."""
 
     nccl_ub: bool = False
     """If true, allocate and register NCCL userbuffer for param and grad buffer.
@@ -106,12 +113,19 @@ class DistributedDataParallelConfig:
 
     fsdp_double_buffer: bool = False
     """If true, use persistently allocated double buffers for the
-    temporary memory needed in the
+    temporary memory needed in the Megatron FSDP communications.
     This option will cause additional memory overhead, however, it is necessary for
-    to register user buffer (nccl_ub=True) for the
+    to register user buffer (nccl_ub=True) for the Megatron FSDP.
     This option will be automatically set to True when nccl_ub=True.
     """
 
+    outer_dp_sharding_strategy: str = 'no_shard'
+    """
+    Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
+    Valid values are 'no_shard', 'optim', 'optim_grads', 'optim_grads_params'.
+    This option is only effective when Hybrid FSDP is enabled.
+    """
+
     def __post_init__(self):
         import os
 
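
For orientation, a short sketch of how the flags added above would be set; the field names come from the hunk, while the specific values chosen here are illustrative and all other fields keep their defaults.

# Illustrative: enable the Megatron FSDP code path through the updated config.
from megatron.core.distributed import DistributedDataParallelConfig

ddp_config = DistributedDataParallelConfig(
    use_megatron_fsdp=True,  # replaces the deprecated use_custom_fsdp flag
    data_parallel_sharding_strategy="optim_grads_params",
    # Only consulted when Hybrid Sharded Data Parallel (HSDP) is enabled.
    outer_dp_sharding_strategy="no_shard",
)
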

{megatron_core-0.14.0rc6 → megatron_core-0.14.0rc7}/megatron/core/distributed/finalize_model_grads.py
RENAMED
@@ -31,9 +31,7 @@ from ..utils import (
 )
 
 
-def _get_main_grad_attr(param: torch.nn.Parameter,
-    if use_custom_fsdp:
-        return "fsdp_managed_main_grad"
+def _get_main_grad_attr(param: torch.nn.Parameter, use_megatron_fsdp: bool = False):
     if hasattr(param, "main_grad"):
         return "main_grad"
     return "grad"
@@ -241,8 +239,10 @@ def _allreduce_embedding_grad(
     if weight is None and skip_if_none:
         return
 
-    grad_attr = _get_main_grad_attr(weight, ddp_config.
+    grad_attr = _get_main_grad_attr(weight, ddp_config.use_megatron_fsdp)
     orig_grad = getattr(weight, grad_attr)
+    if ddp_config.use_megatron_fsdp:
+        orig_grad = orig_grad._local_tensor if orig_grad is not None else None
     grad = _unshard_if_dtensor(orig_grad)
     # When the embedding is frozen, the grad is None.
     if grad is None and skip_if_none:
@@ -320,20 +320,30 @@ def _allreduce_non_tensor_model_parallel_grads(
         if param.requires_grad:
             # Check if this param needs average reduction (average_gradients_across_tp_domain)
             if getattr(param, "average_gradients_across_tp_domain", False):
-
-                grad_attr = _get_main_grad_attr(param, ddp_config.use_custom_fsdp)
+                grad_attr = _get_main_grad_attr(param, ddp_config.use_megatron_fsdp)
                 grad = getattr(param, grad_attr)
-                grad
-
+                if grad is None:
+                    continue
+                params_avg.append(param)
+                if ddp_config.use_megatron_fsdp:
+                    grads_avg.append(grad._local_tensor.data)
+                else:
+                    grad = _unshard_if_dtensor(grad)
+                    grads_avg.append(grad.data)
             # Check if this param needs sum reduction (sequence parallel or qk_layernorm)
             elif (config.sequence_parallel and getattr(param, "sequence_parallel", False)) or (
                 config.qk_layernorm and ("q_layernorm" in name or "k_layernorm" in name)
             ):
-
-                grad_attr = _get_main_grad_attr(param, ddp_config.use_custom_fsdp)
+                grad_attr = _get_main_grad_attr(param, ddp_config.use_megatron_fsdp)
                 grad = getattr(param, grad_attr)
-                grad
-
+                if grad is None:
+                    continue
+                params_sum.append(param)
+                if ddp_config.use_megatron_fsdp:
+                    grads_sum.append(grad._local_tensor.data)
+                else:
+                    grad = _unshard_if_dtensor(grad)
+                    grads_sum.append(grad.data)
 
     # Loop grads and perform correct all-reduce
     for params, grads, all_reduce_op in zip(
@@ -348,9 +358,12 @@ def _allreduce_non_tensor_model_parallel_grads(
            params, grads, _unflatten_dense_tensors(coalesced, grads)
        ):
            buf.copy_(synced)
-           grad_attr = _get_main_grad_attr(param, ddp_config.
+           grad_attr = _get_main_grad_attr(param, ddp_config.use_megatron_fsdp)
            orig_grad = getattr(param, grad_attr)
-
+           if ddp_config.use_megatron_fsdp:
+               setattr(param, grad_attr, orig_grad)
+           else:
+               setattr(param, grad_attr, _reshard_if_dtensor(buf, orig_grad))
 
 
 """
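
The Megatron FSDP branches above read grad._local_tensor, i.e. the shard of a DTensor gradient owned by the current rank. A rough sketch of the same idea using the public DTensor API (PyTorch 2.5+); the helper below is hypothetical and not part of Megatron-Core.

# Hypothetical helper: return the rank-local shard of a (possibly distributed) gradient.
import torch
from torch.distributed.tensor import DTensor


def local_grad(grad: torch.Tensor) -> torch.Tensor:
    if isinstance(grad, DTensor):
        # Public equivalent of the private `_local_tensor` attribute used in the diff.
        return grad.to_local()
    return grad
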

megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py
ADDED
@@ -0,0 +1,317 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import List, Optional
+
+try:
+    import einops
+
+    HAVE_EINOPS = True
+except ImportError:
+    HAVE_EINOPS = False
+
+import torch
+import torch.distributed as dist
+
+try:
+    from torch.distributed import DeviceMesh
+
+    HAVE_DTENSOR = True
+except ImportError:
+    HAVE_DTENSOR = False
+
+from megatron.core import parallel_state
+from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
+from megatron.core.distributed.data_parallel_base import _BaseDataParallel
+from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig
+from megatron.core.process_groups_config import GradCommProcessGroups, ModelCommProcessGroups
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.transformer_layer import TransformerLayer
+from megatron.core.utils import log_single_rank
+
+try:
+    from megatron.core.distributed.fsdp.src.megatron_fsdp import FSDPDistributedIndex, MegatronFSDP
+
+    HAVE_MEGATRON_FSDP = True
+except ImportError as import_megatron_fsdp_error:
+    IMPORT_MEGATRON_FSDP_ERROR = import_megatron_fsdp_error
+    HAVE_MEGATRON_FSDP = False
+
+logger = logging.getLogger(__name__)
+
+
+class FullyShardedDataParallel(_BaseDataParallel):
+    """
+    Fully Sharded Data Parallel (FSDP) wrapper for the Megatron model.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        ddp_config: DistributedDataParallelConfig,
+        module: torch.nn.Module,
+        fsdp_unit_modules: Optional[List[torch.nn.Module]] = None,
+        disable_bucketing: bool = False,
+        device: Optional[torch.device] = None,
+        grad_comm_pgs: Optional[GradCommProcessGroups] = None,
+        model_comm_pgs: Optional[ModelCommProcessGroups] = None,
+    ):
+        if not HAVE_MEGATRON_FSDP:
+            raise IMPORT_MEGATRON_FSDP_ERROR
+
+        if has_config_logger_enabled(config):
+            log_config_to_disk(config, locals(), prefix=type(self).__name__)
+
+        self.ddp_config = ddp_config
+        log_single_rank(
+            logger,
+            logging.INFO,
+            f'Setting up DistributedDataParallel with config {self.ddp_config}',
+        )
+        self.megatron_fsdp_dist_index = self._init_dist_index(grad_comm_pgs, model_comm_pgs)
+
+        self.bucket_size = self.ddp_config.bucket_size
+        if disable_bucketing:
+            self.bucket_size = None
+        self.device = device if device else torch.device(f'cuda:{torch.cuda.current_device()}')
+
+        if fsdp_unit_modules is not None:
+            self.fsdp_unit_modules = fsdp_unit_modules
+        else:
+            if self.ddp_config.data_parallel_sharding_strategy == "optim_grads_params":
+                self.fsdp_unit_modules = [TransformerLayer]
+            else:
+                self.fsdp_unit_modules = []
+
+        super().__init__(
+            config=config,
+            module=MegatronFSDP(
+                ddp_config=ddp_config,
+                module=module,
+                fsdp_unit_modules=self.fsdp_unit_modules,
+                disable_bucketing=disable_bucketing,
+                device=self.device,
+                dist_index=self.megatron_fsdp_dist_index,
+                calculate_per_token_loss=config.calculate_per_token_loss,
+                init_model_with_meta_device=config.init_model_with_meta_device,
+            ),
+        )
+        self.param_and_grad_buffer = self.module.param_and_grad_buffer
+        self.no_sync = self.module.no_sync
+        self.start_param_sync = self.module.start_param_sync
+        self.start_grad_sync = self.module.start_grad_sync
+        self.finish_grad_sync = self.module.finish_grad_sync
+        self.scale_gradients = self.module.scale_gradients
+        self.zero_grad_buffer = self.module.zero_grad_buffer
+        self.broadcast_params = self.module.broadcast_params
+        self.module.state_dict_for_save_checkpoint = self.module.state_dict
+        self.state_dict_for_save_checkpoint = self.state_dict
+
+    def load_state_dict(self, state_dict, strict=True):
+        """
+        Load the state dictionary into the module.
+        """
+        custom_state_dict = {}
+        for key, value in state_dict.items():
+            if self.config.fp8 and key.endswith('._extra_state'):
+                # Skip extra state keys
+                continue
+            custom_state_dict[f"module.{key}"] = value
+
+        if self.config.fp8 or self.config.gated_linear_unit:
+            strict = False
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                "Loading state_dict with strict=False due to fp8 configuration. "
+                "This is expected as some keys may not match exactly.",
+            )
+
+        self.module.load_state_dict(custom_state_dict, strict=strict)
+
+    def _init_dist_index(self, grad_comm_pgs, model_comm_pgs):
+        """
+        Initialize the distributed index for the module.
+        """
+        if not HAVE_DTENSOR:
+            raise ImportError(
+                "This module requires PyTorch with DTensor support. "
+                "Please install a compatible version of PyTorch."
+            )
+
+        enable_hsdp = self.ddp_config.num_distributed_optimizer_instances > 1
+        if grad_comm_pgs is None and model_comm_pgs is None:
+            tp_group = parallel_state.get_tensor_model_parallel_group()
+            if enable_hsdp:
+                dp_cp_group = parallel_state.get_data_parallel_group(
+                    with_context_parallel=True, partial_data_parallel=True
+                )
+                inter_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
+                hybrid_fsdp_group = parallel_state.get_data_parallel_group(
+                    with_context_parallel=True, partial_data_parallel=False
+                )
+            else:
+                dp_cp_group = parallel_state.get_data_parallel_group(
+                    with_context_parallel=True, partial_data_parallel=False
+                )
+                inter_fsdp_group = None
+                hybrid_fsdp_group = None
+        elif grad_comm_pgs is not None and model_comm_pgs is not None:
+            tp_group = getattr(model_comm_pgs, 'tp', None)
+            if enable_hsdp:
+                dp_cp_group = grad_comm_pgs.intra_dp_cp
+                inter_fsdp_group = grad_comm_pgs.inter_dist_opt
+                hybrid_fsdp_group = grad_comm_pgs.dp_cp
+            else:
+                dp_cp_group = grad_comm_pgs.dp_cp
+                inter_fsdp_group = None
+                hybrid_fsdp_group = None
+        else:
+            raise ValueError(
+                "Both grad_comm_pgs and model_comm_pgs must be either None or provided together."
+            )
+
+        if tp_group is None:
+            single_rank_group = dist.new_group(ranks=[dist.get_rank()])
+            tp_group = single_rank_group
+
+        if enable_hsdp:
+            mesh = _get_hsdp_tp_mesh(inter_fsdp_group, dp_cp_group, tp_group)
+            dist_index = FSDPDistributedIndex(
+                use_hybrid_fsdp=True,
+                hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
+                device_mesh=DeviceMesh.from_group(
+                    [inter_fsdp_group, dp_cp_group, tp_group],
+                    device_type="cuda",
+                    mesh=mesh.tolist(),
+                    mesh_dim_names=["inter_fsdp_dp", "dp_cp", "tp"],
+                ),
+                dp_inter_dim="inter_fsdp_dp",
+                dp_shard_dim="dp_cp",
+                tp_dim="tp",
+                hybrid_fsdp_group=hybrid_fsdp_group,
+            )
+        else:
+            mesh = _get_dp_tp_mesh(dp_cp_group, tp_group)
+            dist_index = FSDPDistributedIndex(
+                device_mesh=DeviceMesh.from_group(
+                    [dp_cp_group, tp_group],
+                    device_type="cuda",
+                    mesh=mesh.tolist(),
+                    mesh_dim_names=["dp_cp", "tp"],
+                ),
+                dp_shard_dim="dp_cp",
+                tp_dim="tp",
+            )
+
+        return dist_index
+
+    def stop_communication(self):
+        """
+        Stop communication for the module.
+        """
+        self.module.synchronize_gradient_reduce()
+        self.module.synchronize_param_gather()
+
+
+def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
+    assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
+    world_size = dist.get_world_size()
+
+    mesh = einops.rearrange(
+        torch.arange(world_size),
+        "(inter_fsdp_dp fsdp tp) -> inter_fsdp_dp fsdp tp",
+        inter_fsdp_dp=inter_fsdp_dp_group.size(),
+        tp=tp_group.size(),
+    )
+
+    mesh_fsdp_ranks = einops.rearrange(
+        mesh,
+        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp tp) fsdp',
+        tp=tp_group.size(),
+        fsdp=dp_cp_group.size(),
+    )
+    fsdp_group_ranks = dist.get_process_group_ranks(dp_cp_group)
+    assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_fsdp_ranks, fsdp_group_ranks), (
+        f"[Megatron-FSDP] FSDP ranks in the mesh {mesh_fsdp_ranks} "
+        f"do not match the ranks in the FSDP group {fsdp_group_ranks}."
+    )
+
+    mesh_tp_ranks = einops.rearrange(
+        mesh,
+        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp fsdp) tp',
+        tp=tp_group.size(),
+        fsdp=dp_cp_group.size(),
+    )
+    tp_group_ranks = dist.get_process_group_ranks(tp_group)
+    assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), (
+        f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} "
+        f"do not match the ranks in the TP group {tp_group_ranks}."
+    )
+
+    mesh_inter_fsdp_dp_ranks = einops.rearrange(
+        mesh,
+        'inter_fsdp_dp fsdp tp -> (fsdp tp) inter_fsdp_dp',
+        tp=tp_group.size(),
+        fsdp=dp_cp_group.size(),
+    )
+    inter_fsdp_dp_group_ranks = dist.get_process_group_ranks(inter_fsdp_dp_group)
+    assert _check_mesh_ranks_and_group_ranks_are_consistent(
+        mesh_inter_fsdp_dp_ranks, inter_fsdp_dp_group_ranks
+    ), (
+        f"[Megatron-FSDP] Inter FSDP Data Parallel ranks in the mesh {mesh_inter_fsdp_dp_ranks} "
+        f"do not match the ranks in the Inter FSDP DP group {inter_fsdp_dp_group_ranks}."
+    )
+
+    return mesh
+
+
+def _get_dp_tp_mesh(dp_cp_group, tp_group):
+    assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
+    world_size = dist.get_world_size()
+
+    tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1
+    # TODO: Supports configurable (dp, cp, tp) order.
+    mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size)
+
+    mesh_dp_ranks = einops.rearrange(mesh, 'dp_cp tp -> tp dp_cp', tp=tp_size)
+    dp_cp_group_ranks = dist.get_process_group_ranks(dp_cp_group)
+    assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_dp_ranks, dp_cp_group_ranks), (
+        f"[Megatron-FSDP] Data Parallel ranks in the mesh {mesh_dp_ranks} "
+        f"do not match the ranks in the DP group {dp_cp_group_ranks}."
+    )
+
+    mesh_tp_ranks = einops.rearrange(mesh, 'dp_cp tp -> (dp_cp) tp', tp=tp_size)
+    tp_group_ranks = dist.get_process_group_ranks(tp_group)
+    assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), (
+        f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} "
+        f"do not match the ranks in the TP group {tp_group_ranks}."
+    )
+
+    return mesh
+
+
+def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks):
+    current_rank = dist.get_rank()
+    current_ranks = list(filter(lambda ranks: current_rank in ranks, mesh_ranks.tolist()))
+    assert len(current_ranks) == 1, (
+        f"[Megatron-FSDP] Current rank {current_rank} is not unique in "
+        f"the mesh ranks {mesh_ranks.tolist()}."
+    )
+    assert sorted(current_ranks[0]) == sorted(group_ranks), (
+        f"[Megatron-FSDP] Current rank {current_rank} in the mesh ranks "
+        f"{mesh_ranks.tolist()} does not match the group ranks {group_ranks}."
+    )
+    return sorted(current_ranks[0]) == sorted(group_ranks)
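
Given the constructor above, wrapping a model with the new adapter looks roughly like the following sketch; model and config stand in for an existing Megatron module and its TransformerConfig, and Megatron's parallel state is assumed to be initialized already.

# Sketch only: wrap a Megatron module with the new Megatron FSDP adapter.
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel

ddp_config = DistributedDataParallelConfig(
    use_megatron_fsdp=True,
    data_parallel_sharding_strategy="optim_grads_params",
)
# With fsdp_unit_modules left as None and the strategy above, each
# TransformerLayer becomes its own FSDP unit (see __init__ above).
fsdp_model = FullyShardedDataParallel(
    config=config,          # TransformerConfig of the wrapped model
    ddp_config=ddp_config,
    module=model,
)
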

megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/__init__.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

megatron_core-0.14.0rc7/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py
ADDED
@@ -0,0 +1,22 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .distributed_data_parallel_config import DistributedDataParallelConfig
+from .megatron_fsdp import MegatronFSDP
+from .utils import FSDPDistributedIndex
+
+try:
+    from .fully_shard import fully_shard
+except ImportError as e:
+    print(f"Failed to import fully_shard: {e}")