megatron-fsdp 0.2.0.dev101858__tar.gz → 0.2.0.dev102301__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/PKG-INFO +1 -1
  2. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/distributed_data_parallel_config.py +8 -0
  3. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/package_info.py +1 -1
  4. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/uneven_dtensor.py +10 -1
  5. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/utils.py +3 -6
  6. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/PKG-INFO +1 -1
  7. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/README.md +0 -0
  8. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/__init__.py +0 -0
  9. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/fully_shard.py +0 -0
  10. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/megatron_fsdp.py +0 -0
  11. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp/param_and_grad_buffer.py +0 -0
  12. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/SOURCES.txt +0 -0
  13. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/dependency_links.txt +0 -0
  14. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/requires.txt +0 -0
  15. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/megatron_fsdp.egg-info/top_level.txt +0 -0
  16. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/pyproject.toml +0 -0
  17. {megatron_fsdp-0.2.0.dev101858 → megatron_fsdp-0.2.0.dev102301}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-fsdp
3
- Version: 0.2.0.dev101858
3
+ Version: 0.2.0.dev102301
4
4
  Summary: **Megatron-FSDP** is an NVIDIA-developed PyTorch extension that provides a high-performance implementation of Fully Sharded Data Parallelism (FSDP)
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -137,6 +137,14 @@ class DistributedDataParallelConfig:
137
137
  """Check the validity of the config."""
138
138
  if self.reuse_grad_buf_for_mxfp8_param_ag:
139
139
  assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
140
+ # Using mxfp8 param without overlap param gather and overlap grad reduce will cause NaN.
141
+ # TODO: Remove this assertion when the issue is fixed.
142
+ assert (
143
+ self.overlap_param_gather
144
+ ), "--overlap-param-gather is required when using mxfp8 params"
145
+ assert (
146
+ self.overlap_grad_reduce
147
+ ), "--overlap-grad-reduce is required when using mxfp8 params"
140
148
 
141
149
  if self.nccl_ub:
142
150
  if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
@@ -4,7 +4,7 @@
4
4
  MAJOR = 0
5
5
  MINOR = 2
6
6
  PATCH = 0
7
- PRE_RELEASE = '0.dev101858'
7
+ PRE_RELEASE = '0.dev102301'
8
8
 
9
9
  # Use the following formatting: (major, minor, patch, pre-release)
10
10
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -25,6 +25,8 @@ from torch.distributed.checkpoint.metadata import (
25
25
  from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
26
26
  from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard
27
27
 
28
+ from .utils import get_mesh_names
29
+
28
30
 
29
31
  def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
30
32
  """
@@ -272,7 +274,14 @@ def gather_uneven_dtensor_to_full_tensor(
272
274
  if not device_mesh.mesh_dim_names:
273
275
  process_group = device_mesh.get_group()
274
276
  else:
275
- process_group = device_mesh._flatten().get_group()
277
+ # Check if the fully-flattened mesh exists first.
278
+ full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
279
+ if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
280
+ # Retrieve the existing flattened DeviceMesh ProcessGroup.
281
+ process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
282
+ else:
283
+ # Create the _-separated flattened DeviceMesh ProcessGroup.
284
+ process_group = device_mesh._flatten().get_group()
276
285
 
277
286
  # Collect chunk metadata for uneven shards (update if missing)
278
287
  if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):
@@ -167,13 +167,10 @@ def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
167
167
  submesh_dim_name
168
168
  for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
169
169
  for submesh_dim_name in (child_mesh.mesh_dim_names or [])
170
- if root_mesh == device_mesh
170
+ # Add flattened or other unaccounted for children of the root mesh.
171
+ if root_mesh == device_mesh and submesh_dim_name not in mesh_dim_names
171
172
  ]
172
- # Combine without duplicate dimensions.
173
- for dim_name in submesh_dim_names:
174
- if dim_name not in mesh_dim_names:
175
- mesh_dim_names.append(dim_name)
176
- return mesh_dim_names
173
+ return mesh_dim_names + submesh_dim_names
177
174
 
178
175
 
179
176
  def contains_submesh(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-fsdp
3
- Version: 0.2.0.dev101858
3
+ Version: 0.2.0.dev102301
4
4
  Summary: **Megatron-FSDP** is an NVIDIA-developed PyTorch extension that provides a high-performance implementation of Fully Sharded Data Parallelism (FSDP)
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>