megatron-core 0.14.0rc6__tar.gz → 0.15.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {megatron_core-0.14.0rc6/megatron_core.egg-info → megatron_core-0.15.0rc0}/PKG-INFO +3 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/dict_utils.py +13 -5
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/mapping.py +11 -11
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/optimizer.py +6 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/async_utils.py +52 -14
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/base.py +1 -5
- megatron_core-0.15.0rc0/megatron/core/dist_checkpointing/strategies/checkpointable.py +196 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/torch.py +38 -15
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/zarr.py +6 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/validation.py +13 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/__init__.py +1 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/distributed_data_parallel_config.py +20 -6
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/finalize_model_grads.py +27 -14
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/__init__.py +3 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +317 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/__init__.py +13 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +22 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +141 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +387 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +1107 -0
- {megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp → megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp}/param_and_grad_buffer.py +1649 -522
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +458 -0
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +908 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/param_and_grad_buffer.py +5 -7
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +8 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/extensions/kitchen.py +4 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/extensions/transformer_engine.py +72 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/extensions/transformer_engine_spec_provider.py +5 -0
- megatron_core-0.15.0rc0/megatron/core/inference/data_parallel_inference_coordinator.py +322 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/engines/dynamic_engine.py +323 -6
- megatron_core-0.15.0rc0/megatron/core/inference/headers.py +17 -0
- megatron_core-0.15.0rc0/megatron/core/inference/inference_client.py +190 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/inference_request.py +11 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/sampling_params.py +11 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +19 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/model_parallel_config.py +2 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/backends.py +9 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +23 -21
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/language_module/language_module.py +19 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/gpt_layer_specs.py +13 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/moe_module_specs.py +7 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/huggingface/clip_model.py +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/huggingface/qwen_model.py +1 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/multimodal/context_parallel.py +25 -13
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/multimodal/llava_model.py +5 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/vision/multimodal_projector.py +35 -30
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/vision/radio.py +26 -0
- megatron_core-0.15.0rc0/megatron/core/nccl_allocator.py +249 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/__init__.py +3 -22
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/clip_grads.py +15 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/distrib_optimizer.py +556 -248
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/optimizer.py +15 -10
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/optimizer_config.py +6 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/package_info.py +2 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +1 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/layers.py +8 -8
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/random.py +5 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/attention.py +30 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/cuda_graphs.py +16 -5
- megatron_core-0.15.0rc0/megatron/core/transformer/fsdp_dtensor_checkpoint.py +195 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/mlp.py +20 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/experts.py +25 -31
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/moe_layer.py +28 -1
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/shared_experts.py +33 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/multi_latent_attention.py +28 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/transformer_config.py +55 -5
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/transformer_layer.py +12 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/utils.py +0 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/utils.py +4 -41
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0/megatron_core.egg-info}/PKG-INFO +3 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron_core.egg-info/SOURCES.txt +16 -3
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron_core.egg-info/requires.txt +2 -2
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/pyproject.toml +3 -3
- megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp/__init__.py +0 -3
- megatron_core-0.14.0rc6/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -835
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/LICENSE +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/README.md +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/activations.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/enums.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/contexts/dynamic_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/setup.cfg +0 -0
- {megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/setup.py +0 -0
{megatron_core-0.14.0rc6/megatron_core.egg-info → megatron_core-0.15.0rc0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc6
+Version: 0.15.0rc0
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -31,7 +31,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy<2.0.0
-Requires-Dist: packaging
+Requires-Dist: packaging>=24.2
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
 Requires-Dist: sentencepiece; extra == "mlm"
@@ -43,7 +43,7 @@ Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
 Requires-Dist: transformers~=4.53; extra == "dev"
-Requires-Dist: multi-storage-client
+Requires-Dist: multi-storage-client<0.26,~=0.25; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
 Requires-Dist: mamba-ssm~=2.2; extra == "dev"
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/dict_utils.py
RENAMED
@@ -103,11 +103,19 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
     else:
         only_left = []
         only_right = []
+        mismatch_debug_data = [prefix, type(x1), type(x2)]
         if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor):
-
-
-
-
+            try:
+                if x1.device != x2.device:
+                    _is_mismatch = not torch.all(x1.cpu() == x2.cpu())
+                else:
+                    _is_mismatch = not torch.all(x1 == x2)
+                mismatch_debug_data.extend(
+                    [(x1 != x2).sum(), (x1 != x2).shape, (x1 != x2).nonzero().tolist()]
+                )
+            except (RuntimeError, TypeError, ValueError):
+                _is_mismatch = True
+                mismatch_debug_data.extend([x1.shape, x2.shape])
         # TODO: change with concrete type that has both replica_id and data attrs
         elif hasattr(x1, "replica_id") and hasattr(x2, "replica_id"):
             assert type(x1) == type(x2)
@@ -122,7 +130,7 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
             _is_mismatch = True

     if _is_mismatch:
-        mismatch.append((
+        mismatch.append(tuple(mismatch_debug_data))

     return only_left, only_right, mismatch

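The change above makes `diff()` return richer mismatch records (key prefix, operand types and, when the comparison succeeds, the count, shape and indices of differing elements) and makes it tolerant of tensors living on different devices. A minimal usage sketch, not part of the package, with hypothetical state dicts `sd_a` and `sd_b`:

    import torch
    from megatron.core.dist_checkpointing.dict_utils import diff

    # Two (possibly nested) state dicts to compare, e.g. loaded from different checkpoints.
    sd_a = {'decoder': {'weight': torch.zeros(4, 4)}}
    sd_b = {'decoder': {'weight': torch.ones(4, 4)}}

    only_a, only_b, mismatch = diff(sd_a, sd_b)
    for record in mismatch:
        # Each record now starts with (key prefix, type(x1), type(x2)); for tensor pairs it
        # also carries the element-wise debug information added in this release.
        print('mismatch at', record[0], 'debug:', record[3:])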
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/mapping.py
RENAMED
@@ -135,23 +135,23 @@ class ShardedTensor(ShardedBase):
                 f"equal to global shape dimensions for {self}"
             )

-
-
-
-
-
-
-            # the shape of p2 on GPU0 is zero.
-            if sh != 0 and off % sh != 0:
-                raise CheckpointingException(
-                    f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
-                )
+        if self.axis_fragmentations is not None:
+            for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
+                if sh != 0 and off % sh != 0:
+                    raise CheckpointingException(
+                        f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
+                    )

         if has_flattened_range and self.flattened_range.step is not None:
             raise CheckpointingException(
                 f"`step` argument in the flattened range of a ShardedTensor is not supported."
             )

+    @property
+    def has_regular_grid(self):
+        """Alias for having a regular sharding grid."""
+        return self.axis_fragmentations is not None
+
     def global_slice(self) -> Tuple[Union[int, slice], ...]:
         """
         Returns a tuple of int and slice objects representing a slice of the
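The new `has_regular_grid` property marks tensors whose shards form a regular grid (offsets divisible by the local shape); irregular layouts are handled by the new uneven-sharding path elsewhere in this release. A rough sketch of querying it, assuming the `ShardedTensor.from_rank_offsets` factory behaves as in earlier releases:

    import torch
    from megatron.core.dist_checkpointing import ShardedTensor

    local = torch.zeros(2, 4)
    # Shard axis 0 across 4 fragments; this rank holds fragment index 1.
    sh_ten = ShardedTensor.from_rank_offsets('decoder.weight', local, (0, 1, 4))

    if sh_ten.has_regular_grid:
        # Regular grid: the legacy TorchShardedTensor-based save/load path applies.
        print(sh_ten.local_shape, sh_ten.global_offset)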
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/optimizer.py
RENAMED
@@ -25,6 +25,12 @@ from .mapping import (
 )
 from .utils import extract_sharded_tensors_and_factories

+KEEP_VARS_HINT = (
+    " Make sure state dict contains original torch.nn.Parameters (not pure torch.Tensors)"
+    " by passing `keep_vars=True` to `.state_dict()`. If any transformation of the original"
+    " parameter is needed, use a ShardedTensorFactory."
+)
+

 def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]:
     """Generate mapping from optimizer param to optimizer state id."""
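The new `KEEP_VARS_HINT` text is presumably attached to errors raised when optimizer state cannot be mapped back to model parameters. The remedy it points to is standard PyTorch behaviour, illustrated by this small sketch (not part of the package):

    import torch

    model = torch.nn.Linear(8, 8)

    # Default: values are detached tensors, parameter identity is lost.
    plain = model.state_dict()
    # keep_vars=True: values are the original nn.Parameter objects, so the
    # dist-checkpointing optimizer helpers can map them to optimizer state ids.
    with_params = model.state_dict(keep_vars=True)

    assert with_params['weight'] is model.weight   # same Parameter object
    assert plain['weight'] is not model.weight     # detached copy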
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/async_utils.py
RENAMED
@@ -79,9 +79,24 @@ class AsyncRequest(NamedTuple):

         This logic is equivalent to what should happen in case of the async call.
         """
+        # preload tensors.
+        async_fn_args = list(self.async_fn_args)
+        if self.preload_fn:
+            assert len(async_fn_args) == 3, "Expected 3 args to be passed to async function"
+            # The async_fn is passed as a partial functool with pre-determined args
+            # In the async_fn_args we pass the remaining positional args required by the async_fn
+            # async_fn_args[1] refers to the write_buckets
+            # To ensure we stage the write_buckets to CPU memory for sync CP,
+            # we replace it with preload_fn callable that returns the CPU staged tensors
+            async_fn_args[1] = self.preload_fn()
+        # persist the state
         if self.async_fn is not None:
-            self.async_fn(*self.
+            self.async_fn(*async_fn_args, **self.async_fn_kwargs)
+
+        # This utility implements a sync cp save. Hence the barrier.
         torch.distributed.barrier()
+
+        # Finalize the CP state
         for finalize_fn in self.finalize_fns:
             finalize_fn()

@@ -150,7 +165,7 @@ class AsyncCaller(ABC):
         return ten[0] == 0

     @abstractmethod
-    def close(self):
+    def close(self, abort=False):
         """Terminate the async caller at exit of an application or some termination conditions"""
         logger.info(f"AsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller")

@@ -237,15 +252,23 @@ class TemporalAsyncCaller(AsyncCaller):
                 is_done = True
         return is_done

-    def close(self):
+    def close(self, abort=False):
         """For TemporalAsyncCaller, this method is called explictly in `is_current_async_calls_done`

        This method make sure the TemporalAsyncCaller terminated
        with all its assigned async request completed
+
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
        """
        if self.process:
            logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process")
-
+            if abort:
+                logger.warning(f"Temporal worker aborted in rank {torch.distributed.get_rank()}")
+                self.process.kill()
+            else:
+                self.process.join()
            self.process = None
            logger.debug(
                "TemporalAsyncCaller: Async process join finished "
@@ -388,18 +411,25 @@ class PersistentAsyncCaller(AsyncCaller):

         return is_done

-    def close(self):
+    def close(self, abort=False):
         """Wait on the left async requests and terminate the PersistentAsyncCaller

         Signals the PersistentAsyncCaller by sending a 'DONE' message to make it terminated
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
         """
         logger.info(
             f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller"
         )
         if self.process:
-
-
-
+            if abort:
+                logger.warning(f"Persistent worker aborted in rank {torch.distributed.get_rank()}")
+                self.process.kill()
+            else:
+                self.queue.put('DONE')
+                self.queue.join()
+                self.process.join()
             self.process = None

     def __del__(self):
@@ -528,6 +558,9 @@ class AsyncCallsQueue:
             blocking (bool, optional): if True, will wait until all active requests
                 are done. Otherwise, finalizes only the async request that already
                 finished. Defaults to False.
+
+            no_dist (bool, Optional): if True, training ranks simply check its
+                asynchronous checkpoint writer without synchronization.
         Returns:
             List[int]: list of indices (as returned by `schedule_async_request`)
                 of async calls that have been successfully finalized.
@@ -545,8 +578,8 @@ class AsyncCallsQueue:
                 finalize_fn()
             ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device())
             torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX)
-            assert ten.item() == call_idx,
-
+            assert ten.item() == call_idx, "Unmatched async calls. "
+            "That probably means not all ranks are participating in async finalization"
             call_idx_finalized.append(call_idx)
         return call_idx_finalized

@@ -554,8 +587,13 @@ class AsyncCallsQueue:
         """Get the number of active async calls."""
         return len(self.async_calls)

-    def close(self):
-        """Finalize all calls upon closing.
-
+    def close(self, abort=False):
+        """Finalize all calls upon closing.
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
+        """
+        if not abort:
+            self.maybe_finalize_async_calls(blocking=True)
         if self.persistent and self.persistent_caller:
-            self.persistent_caller.close()
+            self.persistent_caller.close(abort=abort)
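The `abort` path added above lets callers tear down the checkpoint worker without waiting for in-flight writes. A rough usage sketch of the queue API as it appears in this diff (the request construction is elided; the surrounding names are illustrative, not prescribed by the library):

    from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue

    queue = AsyncCallsQueue()
    # async_request would normally come from an AsyncSaveShardedStrategy.async_save(...) call:
    # queue.schedule_async_request(async_request)

    # Periodically, e.g. once per training iteration, finalize whatever already finished:
    finished = queue.maybe_finalize_async_calls(blocking=False)

    # Normal shutdown: wait for outstanding saves, then stop the worker.
    queue.close()
    # Error-handling paths: kill the worker without finalizing pending saves.
    # queue.close(abort=True)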
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/base.py
RENAMED
@@ -221,8 +221,4 @@ class AsyncSaveShardedStrategy(SaveShardedStrategy):
     def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
         """Each async strategy can be trivially used as a sync strategy."""
         async_request = self.async_save(sharded_state_dict, checkpoint_dir)
-
-        # We keep this verbose call for now
-        global async_calls
-        async_calls.schedule_async_request(async_request)
-        async_calls.maybe_finalize_async_calls(blocking=True)
+        async_request.execute_sync()
megatron_core-0.15.0rc0/megatron/core/dist_checkpointing/strategies/checkpointable.py
ADDED
@@ -0,0 +1,196 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+from itertools import chain
+
+import torch
+from torch.distributed.checkpoint.metadata import (
+    ChunkStorageMetadata,
+    MetadataIndex,
+    TensorProperties,
+)
+from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
+
+from ..mapping import ShardedTensor
+
+
+class CheckpointableShardedTensor(torch.Tensor):
+    """ShardedTensor extension compatible with PyTorch DCP checkpointing library.
+
+    Implements the torch.distributed._checkpointable._Checkpointable protocol.
+    """
+
+    def __new__(cls, data: torch.Tensor, sh_ten: ShardedTensor):
+        return torch.Tensor._make_wrapper_subclass(cls, torch.Size(sh_ten.global_shape))
+
+    def __init__(self, data: torch.Tensor, sh_ten: ShardedTensor):
+        self._data = data
+        self._sh_ten = sh_ten
+
+    def __create_write_items__(
+        self, fqn: str, sh_ten: 'CheckpointableShardedTensor', index: int = None
+    ) -> list[WriteItem]:
+        """Simple translation from ShardedTensor offsets into DCP offsets.
+
+        Args:
+            fqn (str): tensor FQN.
+            sh_ten (CheckpointableShardedTensor): same as `self`
+            index (int): specifies index within the LocalShardsContainer.
+                This is an optimization hint used in DCP.
+
+        Returns:
+            List[WriteItem]: list of DCP WriteItem metadata objects.
+        """
+        offsets = torch.Size(sh_ten._sh_ten.global_offset)
+        global_shape = torch.Size(sh_ten._sh_ten.global_shape)
+        chunk_size = torch.Size(sh_ten._sh_ten.local_shape)
+        assert chunk_size == sh_ten._sh_ten.data.size()
+
+        return [
+            WriteItem(
+                index=MetadataIndex(fqn, offsets, index),
+                type=WriteItemType.SHARD,
+                tensor_data=TensorWriteData(
+                    chunk=ChunkStorageMetadata(offsets=offsets, sizes=chunk_size),
+                    properties=TensorProperties.create_from_tensor(sh_ten._sh_ten.data),
+                    size=global_shape,
+                ),
+            )
+        ]
+
+    def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
+        """Simple translation from ShardedTensor offsets into DCP offsets.
+
+        Returns:
+            List[ChunkStorageMetadata]: list of DCP ChunkStorageMetadata metadata objects.
+        """
+        offsets = torch.Size(self._sh_ten.global_offset)
+        chunk_size = torch.Size(self._sh_ten.local_shape)
+        assert chunk_size == self._sh_ten.data.size()
+
+        return [ChunkStorageMetadata(offsets=offsets, sizes=chunk_size)]
+
+    def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
+        """Trivial implementation which simply yields the underlying tensor.
+
+        Args:
+            index (MetadataIndex): unused
+
+        Returns:
+            Tensor: the underlying data tensor
+        """
+        return self._sh_ten.data
+
+    @classmethod
+    def from_sh_ten(cls, sh_ten: ShardedTensor) -> 'CheckpointableShardedTensor':
+        """Constructor which turns a ShardedTensor into CheckpointableShardedTensor
+
+        Args:
+            sh_ten (ShardedTensor): a sharded tensor to wrap
+
+        Returns:
+            CheckpointableShardedTensor: wrapped ShardedTensor
+        """
+        assert isinstance(sh_ten, ShardedTensor)
+        return cls(sh_ten.data, sh_ten)
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args, kwargs=None):
+        """Placeholder implementation."""
+        raise NotImplementedError(
+            f"{cls.__name__}.__torch_dispatch__ not implemented."
+            f" {cls.__name__} shouldn't be used with Tensor operations."
+        )
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self._sh_ten.__repr__()})'
+
+
+class LocalShardsContainer(torch.Tensor):
+    """DCP compatible container for local shards.
+
+    PyTorch DCP requires a single tensor per rank for a given global tensor FQN.
+    This class acts as a container allowing multiple checkpointable shards per rank.
+
+    Implements the torch.distributed._checkpointable._Checkpointable protocol.
+    """
+
+    @staticmethod
+    def __new__(cls, local_shards: list[torch.Tensor]) -> "LocalShardsContainer":
+        assert len(local_shards) > 0
+        # This assumes local shard already has correct size info
+        return torch.Tensor._make_wrapper_subclass(cls, local_shards[0].size())
+
+    def __init__(self, local_shards: list[torch.Tensor]):
+        for local_shard in local_shards:
+            # this is needed only for __get_tensor_shard__
+            assert isinstance(local_shard, CheckpointableShardedTensor)
+        self._local_shards = local_shards
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        """Placeholder implementation."""
+        raise NotImplementedError(
+            f"{cls.__name__}.__torch_dispatch__ not implemented."
+            f" {cls.__name__} shouldn't be used with Tensor operations."
+        )
+
+    def __create_write_items__(
+        self, fqn: str, local_shards_cont: 'LocalShardsContainer'
+    ) -> list[object]:
+        """Delegates creating write items to local shards.
+
+        Args:
+            fqn (str): tensor FQN.
+            local_shards_cont (LocalShardsContainer): same as `self`
+
+        Returns:
+            List[WriteItem]: list of DCP WriteItem metadata objects.
+        """
+        return list(
+            chain.from_iterable(
+                shard.__create_write_items__(fqn, shard, index=index)
+                for index, shard in enumerate(local_shards_cont._local_shards)
+            )
+        )
+
+    def __create_chunk_list__(self) -> list[ChunkStorageMetadata]:
+        """Delegates creating chunk items to local shards.
+
+        Returns:
+            List[ChunkStorageMetadata]: list of DCP ChunkStorageMetadata metadata objects.
+        """
+        return list(
+            chain.from_iterable(shard.__create_chunk_list__() for shard in self._local_shards)
+        )
+
+    def __get_tensor_shard__(self, index: MetadataIndex) -> torch.Tensor:
+        """Performs shard matching lookup based on index hint or offset.
+
+        Args:
+            index (MetadataIndex): metadata specifying the offset of the queried shard.
+                Optionally provides an index hint which speeds up the lookup.
+
+        Returns:
+            Tensor: the matching shard data tensor
+        """
+        if index.offset is None:
+            raise ValueError(
+                f"Cannot lookup {index.fqn} for a LocalShardsContainer without an offset"
+            )
+
+        shards = self._local_shards
+        # index hint direct lookup
+        if index.index is not None:
+            if (
+                len(shards) > index.index
+                and torch.Size(shards[index.index]._sh_ten.global_offset) == index.offset
+            ):
+                return shards[index.index].__get_tensor_shard__(index)
+
+        # slow linear search
+        for shard in shards:
+            if torch.Size(shard._sh_ten.global_offset) == index.offset:
+                return shard.__get_tensor_shard__(index)
+        raise ValueError(f"Could not find shard at '{index.offset}' for FQN: '{index.fqn}'")
+
+    def __repr__(self):
+        return f'{self.__class__.__name__}({self._local_shards.__repr__()})'
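For orientation, a small sketch of how these wrappers compose. It constructs them directly for illustration only; in the library they are built inside `mcore_to_pyt_state_dict` in `strategies/torch.py` when a tensor has no regular sharding grid, and the `from_rank_offsets` factory used here is assumed to behave as in earlier releases:

    import torch
    from megatron.core.dist_checkpointing import ShardedTensor
    from megatron.core.dist_checkpointing.strategies.checkpointable import (
        CheckpointableShardedTensor,
        LocalShardsContainer,
    )

    # Any ShardedTensor can be wrapped; irregular-grid shards are the intended use case.
    sh_ten = ShardedTensor.from_rank_offsets('mlp.weight', torch.zeros(2, 4), (0, 1, 4))

    wrapped = CheckpointableShardedTensor.from_sh_ten(sh_ten)
    container = LocalShardsContainer([wrapped])

    # Both objects speak the DCP _Checkpointable protocol:
    print(wrapped.__create_chunk_list__())                          # one ChunkStorageMetadata
    print(container.__create_write_items__('mlp.weight', container))  # delegated WriteItems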
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/torch.py
RENAMED
@@ -57,6 +57,7 @@ from .base import (
     register_default_strategy,
 )
 from .cached_metadata_filesystem_reader import CachedMetadataFileSystemReader
+from .checkpointable import CheckpointableShardedTensor, LocalShardsContainer
 from .filesystem_async import FileSystemWriterAsync
 from .resharding import (
     TensorReformulationMetadata,
@@ -240,14 +241,18 @@ def sharded_tensor_to_torch_sharded_tensor(
            placement = f"rank:{rank}/cuda"
            for sh_ten in local_global_offsets[offset]:
                if has_flattened_range:
-                    assert offset == sh_ten.local_chunk_offset_in_global()
+                    assert offset == sh_ten.local_chunk_offset_in_global(), (
+                        offset,
+                        sh_ten.local_chunk_offset_in_global(),
+                    )
                    # This is not an actual offset, but an offset of the whole shard
                    # This is needed for a PyT Dist internal integrity check
-
+                    _shard_offset = sh_ten.local_chunk_offset_in_global() + (0,)
                    size = (1,) * len(offsets_shape) + global_shape[-1:]
                else:
                    size = sh_ten.data.shape
-
+                    _shard_offset = offset
+                shard_metadata.append(ShardMetadata(_shard_offset, size, placement))

        else:
            # pylint: disable=line-too-long
@@ -312,7 +317,7 @@ def mcore_to_pyt_state_dict(
     rank = torch.distributed.get_rank()
     pyt_state_dict = {}

-    def
+    def _mcore_to_dcp_compatible_tensor(sh_tens: List[ShardedTensor]) -> TorchShardedTensor:
         """Build a PyT ShardedTensor from given shards.

         During loading:
@@ -335,11 +340,24 @@ def mcore_to_pyt_state_dict(
         if sh_ten.allow_shape_mismatch and is_loading:
             sh_ten.data.zero_()

-
-
-
-
-
+        if not sh_tens[0].has_regular_grid:
+            if not is_torch_min_version("2.6a0"):
+                raise CheckpointingException(
+                    f"Uneven sharding not supported for PyTorch version {get_torch_version()}"
+                )
+            assert sh_tens[0].flattened_range is None
+            if len(sh_tens) > 1:
+                return LocalShardsContainer(
+                    [CheckpointableShardedTensor.from_sh_ten(sh_ten) for sh_ten in sh_tens]
+                )
+            else:
+                return CheckpointableShardedTensor.from_sh_ten(sh_tens[0])
+        else:
+            torch_sh_ten = sharded_tensor_to_torch_sharded_tensor(
+                sh_tens, rank, load_legacy_1d_flatten_tensors
+            )
+            torch_sh_ten.key = sh_tens[0].key
+            return torch_sh_ten

     def _mcore_to_torch_sharded_object(sh_objs: List[ShardedObject]) -> io.BytesIO:
         """Build io.BytesIO from given sharded objects data."""
@@ -351,7 +369,7 @@ def mcore_to_pyt_state_dict(
     for k, v in state_dict.items():
         if isinstance(v[0], ShardedTensor):
             v = cast(List[ShardedTensor], v)
-            pyt_state_dict[k] =
+            pyt_state_dict[k] = _mcore_to_dcp_compatible_tensor(v)
         else:
             v = cast(List[ShardedObject], v)
             pyt_state_dict[k] = _mcore_to_torch_sharded_object(v)
@@ -359,12 +377,20 @@ def mcore_to_pyt_state_dict(
     return pyt_state_dict


-def _unwrap_pyt_sharded_tensor(
+def _unwrap_pyt_sharded_tensor(
+    sh_ten: Union[TorchShardedTensor, CheckpointableShardedTensor, LocalShardsContainer, Any]
+) -> Union[List[torch.Tensor], Any]:
     """Unwrap tensor from PyT ShardedTensor instance.

     If `prepend_axis_num` was non-zero (which is specific to MCore ShardedTensor)
     then the tensor has additional singleton dimensions which should be squeezed.
     """
+    if isinstance(sh_ten, CheckpointableShardedTensor):
+        return [sh_ten._sh_ten.data]
+    if isinstance(sh_ten, LocalShardsContainer):
+        return [local_shard._sh_ten.data for local_shard in sh_ten._local_shards]
+    if not isinstance(sh_ten, TorchShardedTensor):
+        return sh_ten
     mcore_sh_ten = sh_ten.mcore_sh_ten
     ret_tensors = []
     for sh in sh_ten.local_shards():
@@ -930,10 +956,7 @@ class TorchDistLoadShardedStrategy(LoadShardedStrategy):
                Dict[str, Union[TorchShardedTensor, List[io.BytesIO]]], pyt_state_dict
            )
            # Unwrap ShardedTensors and return to original state dict
-            mcore_state_dict = {
-                k: v if not isinstance(v, TorchShardedTensor) else _unwrap_pyt_sharded_tensor(v)
-                for k, v in pyt_state_dict.items()
-            }
+            mcore_state_dict = {k: _unwrap_pyt_sharded_tensor(v) for k, v in pyt_state_dict.items()}
            mcore_state_dict = _replace_sharded_keys_with_state_dict_keys(
                mcore_state_dict, flat_mapping, rename_mapping  # type: ignore[arg-type]
            )
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/strategies/zarr.py
RENAMED
@@ -175,6 +175,11 @@ def _create_zarr_array(sharded_tensor: ShardedTensor, checkpoint_dir: Path):
             compressor=None,
             fill_value=None,
             write_empty_chunks=True,
+            synchronizer=(
+                zarr.ProcessSynchronizer(str(checkpoint_dir / f'{sharded_tensor.key}.sync'))
+                if sharded_tensor.flattened_range is not None
+                else None
+            ),
         )
         logger.debug(f"Created a new Zarr array at {checkpoint_dir / sharded_tensor.key}")
     except zarr.errors.ContainsArrayError as e:
@@ -328,7 +333,7 @@ def load_zarr_based_sharded_metadata(

     sharded_state_dict = {}
     for subdir in checkpoint_dir.iterdir():
-        if not subdir.is_dir() or not (subdir / ".zarray").exists():
+        if not subdir.is_dir() or not (subdir / ".zarray").exists() or subdir.suffix == ".sync":
             continue
         key = subdir.name
         arr_shape, arr_dtype = get_shape_dtype_fn(str(subdir))
|
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/dist_checkpointing/validation.py
RENAMED
|
@@ -450,6 +450,7 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]):
|
|
|
450
450
|
local_shape = some_rank_shard.local_shape
|
|
451
451
|
dtype = some_rank_shard.dtype
|
|
452
452
|
has_flattened_range = some_rank_shard.flattened_range is not None
|
|
453
|
+
has_regular_sharding_grid = some_rank_shard.has_regular_grid
|
|
453
454
|
for rank, sharding in rank_sharding:
|
|
454
455
|
assert sharding.dtype == dtype, (sharding.dtype, dtype, some_rank_shard)
|
|
455
456
|
assert sharding.global_shape == global_shape, (
|
|
@@ -457,17 +458,26 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]):
|
|
|
457
458
|
global_shape,
|
|
458
459
|
some_rank_shard,
|
|
459
460
|
)
|
|
460
|
-
assert sharding.
|
|
461
|
-
|
|
462
|
-
local_shape,
|
|
461
|
+
assert sharding.has_regular_grid == has_regular_sharding_grid, (
|
|
462
|
+
has_regular_sharding_grid,
|
|
463
463
|
some_rank_shard,
|
|
464
464
|
)
|
|
465
|
+
if has_regular_sharding_grid:
|
|
466
|
+
assert sharding.local_shape == local_shape, (
|
|
467
|
+
sharding.local_shape,
|
|
468
|
+
local_shape,
|
|
469
|
+
some_rank_shard,
|
|
470
|
+
)
|
|
465
471
|
assert (sharding.flattened_range is not None) == has_flattened_range, (
|
|
466
472
|
(sharding.flattened_range is not None),
|
|
467
473
|
has_flattened_range,
|
|
468
474
|
some_rank_shard,
|
|
469
475
|
)
|
|
470
476
|
|
|
477
|
+
if not has_regular_sharding_grid:
|
|
478
|
+
# In case of uneven sharding we defer the validation to DCP
|
|
479
|
+
return
|
|
480
|
+
|
|
471
481
|
shard_access_cnt = _compute_shards_access(rank_sharding)
|
|
472
482
|
if has_flattened_range:
|
|
473
483
|
map_reduce(
|
|
{megatron_core-0.14.0rc6 → megatron_core-0.15.0rc0}/megatron/core/distributed/__init__.py
RENAMED
@@ -8,5 +8,6 @@ except ImportError:
 from .distributed_data_parallel import DistributedDataParallel
 from .distributed_data_parallel_config import DistributedDataParallelConfig
 from .finalize_model_grads import finalize_model_grads
+from .fsdp.mcore_fsdp_adapter import FullyShardedDataParallel
 from .torch_fully_sharded_data_parallel import TorchFullyShardedDataParallel
 from .torch_fully_sharded_data_parallel_config import TorchFullyShardedDataParallelConfig