PyPI - megatron-core - Versions diffs - 0.15.0rc4__tar.gz → 0.15.0rc5__tar.gz - Mend

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megatron-core might be problematic. Click here for more details.

Files changed (352) hide show

{megatron_core-0.15.0rc4/megatron_core.egg-info → megatron_core-0.15.0rc5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.15.0rc4
+Version: 0.15.0rc5
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/indexed_dataset.py RENAMED Viewed

@@ -12,6 +12,7 @@ import shutil
 import struct
 import time
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from enum import Enum
 from functools import lru_cache
 from itertools import accumulate
@@ -172,9 +173,9 @@ class _IndexWriter(object):
     def write(
         self,
-        sequence_lengths: List[int],
-        sequence_modes: Optional[List[int]],
-        document_indices: List[int],
+        sequence_lengths: Iterable[Union[int, numpy.integer]],
+        sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
+        document_indices: Iterable[Union[int, numpy.integer]],
     ) -> None:
         """Write the index (.idx) file
@@ -208,7 +209,9 @@ class _IndexWriter(object):
         if sequence_modes is not None:
             self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
-    def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
+    def _sequence_pointers(
+        self, sequence_lengths: Iterable[Union[int, numpy.integer]]
+    ) -> List[int]:
         """Build the sequence pointers per the sequence lengths and dtype size
         Args:
@@ -217,11 +220,11 @@ class _IndexWriter(object):
         Returns:
             List[int]: The pointer to the beginning of each sequence
         """
-        itemsize = DType.size(self.dtype)
-        curr_ptr = 0
+        itemsize = numpy.int64(DType.size(self.dtype))
+        curr_ptr = numpy.int64(0)
         list_ptr = []
         for length in sequence_lengths:
-            list_ptr.append(curr_ptr)
+            list_ptr.append(curr_ptr.item())
             curr_ptr += length * itemsize
         return list_ptr

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py RENAMED Viewed

@@ -158,7 +158,7 @@ class FullyShardedDataParallel(_BaseDataParallel):
                 dp_cp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=True
                 )
-                inter_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
+                outer_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
                 hybrid_fsdp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=False
                 )
@@ -166,17 +166,17 @@ class FullyShardedDataParallel(_BaseDataParallel):
                 dp_cp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=False
                 )
-                inter_fsdp_group = None
+                outer_fsdp_group = None
                 hybrid_fsdp_group = None
         else:
             tp_group = getattr(pg_collection, 'tp', None)
             if enable_hsdp:
                 dp_cp_group = pg_collection.intra_dp_cp
-                inter_fsdp_group = pg_collection.inter_dist_opt
+                outer_fsdp_group = pg_collection.inter_dist_opt
                 hybrid_fsdp_group = pg_collection.dp_cp
             else:
                 dp_cp_group = pg_collection.dp_cp
-                inter_fsdp_group = None
+                outer_fsdp_group = None
                 hybrid_fsdp_group = None
         if tp_group is None:
@@ -184,17 +184,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
             tp_group = single_rank_group
         if enable_hsdp:
-            mesh = _get_hsdp_tp_mesh(inter_fsdp_group, dp_cp_group, tp_group)
+            mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
             dist_index = FSDPDistributedIndex(
-                use_hybrid_fsdp=True,
                 hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
                 device_mesh=DeviceMesh.from_group(
-                    [inter_fsdp_group, dp_cp_group, tp_group],
+                    [outer_fsdp_group, dp_cp_group, tp_group],
                     device_type="cuda",
                     mesh=mesh.tolist(),
-                    mesh_dim_names=["inter_fsdp_dp", "dp_cp", "tp"],
+                    mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"],
                 ),
-                dp_inter_dim="inter_fsdp_dp",
+                dp_outer_dim="outer_fsdp_dp",  # Use Hybrid FSDP!
                 dp_shard_dim="dp_cp",
                 tp_dim="tp",
                 hybrid_fsdp_group=hybrid_fsdp_group,
@@ -222,20 +221,20 @@ class FullyShardedDataParallel(_BaseDataParallel):
         self.module.synchronize_param_gather()
-def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
+def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
     assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
     world_size = dist.get_world_size()
     mesh = einops.rearrange(
         torch.arange(world_size),
-        "(inter_fsdp_dp fsdp tp) -> inter_fsdp_dp fsdp tp",
-        inter_fsdp_dp=inter_fsdp_dp_group.size(),
+        "(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp",
+        outer_fsdp_dp=outer_fsdp_dp_group.size(),
         tp=tp_group.size(),
     )
     mesh_fsdp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp tp) fsdp',
+        'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
@@ -247,7 +246,7 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
     mesh_tp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp fsdp) tp',
+        'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
@@ -257,18 +256,18 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
         f"do not match the ranks in the TP group {tp_group_ranks}."
     )
-    mesh_inter_fsdp_dp_ranks = einops.rearrange(
+    mesh_outer_fsdp_dp_ranks = einops.rearrange(
         mesh,
-        'inter_fsdp_dp fsdp tp -> (fsdp tp) inter_fsdp_dp',
+        'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp',
         tp=tp_group.size(),
         fsdp=dp_cp_group.size(),
     )
-    inter_fsdp_dp_group_ranks = dist.get_process_group_ranks(inter_fsdp_dp_group)
+    outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group)
     assert _check_mesh_ranks_and_group_ranks_are_consistent(
-        mesh_inter_fsdp_dp_ranks, inter_fsdp_dp_group_ranks
+        mesh_outer_fsdp_dp_ranks, outer_fsdp_dp_group_ranks
     ), (
-        f"[Megatron-FSDP] Inter FSDP Data Parallel ranks in the mesh {mesh_inter_fsdp_dp_ranks} "
-        f"do not match the ranks in the Inter FSDP DP group {inter_fsdp_dp_group_ranks}."
+        f"[Megatron-FSDP] Outer FSDP Data Parallel ranks in the mesh {mesh_outer_fsdp_dp_ranks} "
+        f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}."
     )
     return mesh

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py RENAMED Viewed

@@ -13,6 +13,7 @@
 # limitations under the License.
 from .distributed_data_parallel_config import DistributedDataParallelConfig
+from .fully_shard import fully_shard, fully_shard_model, fully_shard_optimizer
 from .megatron_fsdp import MegatronFSDP
 from .package_info import (
     __contact_emails__,
@@ -29,16 +30,13 @@ from .package_info import (
 )
 from .utils import FSDPDistributedIndex
-try:
-    from .fully_shard import fully_shard
-except ImportError as e:
-    print(f"Failed to import fully_shard: {e}")
 __all__ = [
     "DistributedDataParallelConfig",
     "MegatronFSDP",
     "FSDPDistributedIndex",
     "fully_shard",
+    "fully_shard_model",
+    "fully_shard_optimizer",
     "__contact_emails__",
     "__contact_names__",
     "__description__",

{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py RENAMED Viewed

@@ -117,13 +117,12 @@ class DistributedDataParallelConfig:
       This option will cause additional memory overhead, however, it is necessary for
       to register user buffer (nccl_ub=True) for the Megatron FSDP.
       This option will be automatically set to True when nccl_ub=True.
-   """
+    """
     outer_dp_sharding_strategy: str = 'no_shard'
     """
     Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
-    Valid values are 'no_shard', 'optim', 'optim_grads', 'optim_grads_params'.
-    This option is only effective when Hybrid FSDP is enabled.
+    Valid values are 'no_shard', 'optim'. This option is only effective when Hybrid FSDP is enabled.
     """
     disable_symmetric_registration: bool = False

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc5__tar.gz

Potentially problematic release.

megatron-core 0.15.0rc4__tar.gz → 0.15.0rc5__tar.gz