PyPI - megatron-fsdp - Versions diffs - 0.2.0.dev101882__tar.gz → 0.2.0.dev102301__tar.gz - Mend

megatron-fsdp 0.2.0.dev101882__tar.gz → 0.2.0.dev102301__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-fsdp
-Version: 0.2.0.dev101882
+Version: 0.2.0.dev102301
 Summary: **Megatron-FSDP** is an NVIDIA-developed PyTorch extension that provides a high-performance implementation of Fully Sharded Data Parallelism (FSDP)
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/distributed_data_parallel_config.py RENAMED Viewed

@@ -137,6 +137,14 @@ class DistributedDataParallelConfig:
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
+            # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN.
+            # TODO: Remove this assertion when the issue is fixed.
+            assert (
+                self.overlap_param_gather
+            ), "--overlap-param-gather is required when using mxfp8 params"
+            assert (
+                self.overlap_grad_reduce
+            ), "--overlap-grad-reduce is required when using mxfp8 params"
         if self.nccl_ub:
             if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/megatron_fsdp.py RENAMED Viewed

@@ -898,9 +898,10 @@ class MegatronFSDP(torch.nn.Module):
         # Register pre state_dict hook to ensure that the module parameters are
         # distributed before saving the state_dict.
-        self._state_dict_pre_hook = self.module.register_state_dict_pre_hook(
-            lambda *args, **kwargs: self._replace_param_with_distributed_if_needed()
-        )
+        for name, module in self.named_modules():
+            module.register_state_dict_pre_hook(
+                lambda *args, **kwargs: self._replace_param_with_distributed_if_needed()
+            )
     @contextmanager
     def no_sync(self):

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/package_info.py RENAMED Viewed

@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 2
 PATCH = 0
-PRE_RELEASE = '0.dev101882'
+PRE_RELEASE = '0.dev102301'
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/param_and_grad_buffer.py RENAMED Viewed

@@ -2782,6 +2782,9 @@ class GradReducePipeline:
             outer_fsdp_group_grad_reduce (bool, optional): Whether to reduce gradients
                 across outer-DP groups. Defaults to False.
         """
+        # Sort parameters by their bucket IDs to ensure a deterministic processing order.
+        # Performing reduce-scatter operations out of order can lead to hangs.
+        params = sorted(list(params), key=lambda x: self.buffer.param_to_param_group[x])
         for param in params:
             bucket_id = self.buffer.param_to_param_group[param]
             param_group = self.buffer.parameter_groups[bucket_id]

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/uneven_dtensor.py RENAMED Viewed

@@ -25,6 +25,8 @@ from torch.distributed.checkpoint.metadata import (
 from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
 from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard
+from .utils import get_mesh_names
 def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
     """
@@ -272,7 +274,14 @@ def gather_uneven_dtensor_to_full_tensor(
     if not device_mesh.mesh_dim_names:
         process_group = device_mesh.get_group()
     else:
-        process_group = device_mesh._flatten().get_group()
+        # Check if the fully-flattened mesh exists first.
+        full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
+        if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
+            # Retrieve the existing flattened DeviceMesh ProcessGroup.
+            process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
+        else:
+            # Create the _-separated flattened DeviceMesh ProcessGroup.
+            process_group = device_mesh._flatten().get_group()
     # Collect chunk metadata for uneven shards (update if missing)
     if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/utils.py RENAMED Viewed

@@ -167,13 +167,10 @@ def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
         submesh_dim_name
         for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
         for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-        if root_mesh == device_mesh
+        # Add flattened or other unaccounted for children of the root mesh.
+        if root_mesh == device_mesh and submesh_dim_name not in mesh_dim_names
     ]
-    # Combine without duplicate dimensions.
-    for dim_name in submesh_dim_names:
-        if dim_name not in mesh_dim_names:
-            mesh_dim_names.append(dim_name)
-    return mesh_dim_names
+    return mesh_dim_names + submesh_dim_names
 def contains_submesh(

{megatron_fsdp-0.2.0.dev101882 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-fsdp
-Version: 0.2.0.dev101882
+Version: 0.2.0.dev102301
 Summary: **Megatron-FSDP** is an NVIDIA-developed PyTorch extension that provides a high-performance implementation of Fully Sharded Data Parallelism (FSDP)
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>