PyPI - megatron-core - Versions diffs - 0.11.0__tar.gz → 0.12.0rc3__tar.gz - Mend

megatron-core 0.11.0__tar.gz → 0.12.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megatron-core might be problematic. Click here for more details.

Files changed (290) hide show

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/LICENSE RENAMED Viewed

@@ -247,8 +247,9 @@ LICENSE FOR
 Facebook, Inc. and its affiliates,
 Meta Platforms, Inc. and its affiliates,
 Microsoft Corporation,
-OpenGVLab/InternVL, and
-Triton language and compiler.
+OpenGVLab/InternVL,
+Triton language and compiler,
+and DeepSeek.
 MIT License

{megatron_core-0.11.0/megatron_core.egg-info → megatron_core-0.12.0rc3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.11.0
+Version: 0.12.0rc3
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Home-page: https://github.com/NVIDIA/Megatron-LM/megatron/core
 Download-URL: https://github.com/NVIDIA/Megatron-LM/releases
@@ -257,8 +257,9 @@ License: The following applies to all files unless otherwise noted:
         Facebook, Inc. and its affiliates,
         Meta Platforms, Inc. and its affiliates,
         Microsoft Corporation,
-        OpenGVLab/InternVL, and
-        Triton language and compiler.
+        OpenGVLab/InternVL,
+        Triton language and compiler,
+        and DeepSeek.
         MIT License
@@ -308,7 +309,6 @@ Requires-Dist: einops
 Requires-Dist: flask-restful
 Requires-Dist: nltk
 Requires-Dist: pytest
-Requires-Dist: pytest_asyncio
 Requires-Dist: pytest-cov
 Requires-Dist: pytest_mock
 Requires-Dist: pytest-random-order
@@ -319,13 +319,13 @@ Requires-Dist: zarr
 Requires-Dist: wandb
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72
 Requires-Dist: torch
-Requires-Dist: nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin"
-Requires-Dist: nvidia-resiliency-ext; platform_machine == "x86_64"
+Requires-Dist: nvidia-modelopt[torch]>=0.23.2; sys_platform != "darwin"
 Requires-Dist: torch
 Requires-Dist: packaging
 Dynamic: author
 Dynamic: download-url
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: requires-dist

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 import megatron.core.tensor_parallel
 import megatron.core.utils
 from megatron.core import parallel_state

megatron_core-0.12.0rc3/megatron/core/config.py ADDED Viewed

@@ -0,0 +1,3 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ENABLE_EXPERIMENTAL = False

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/exchange_utils.py RENAMED Viewed

@@ -325,7 +325,10 @@ def exchange_loaded_tensors_gather_rounds(
                         # this during state dict load.
                         # TODO: remove it once the bug is fixed
                         if is_float8tensor(local_ten):
-                            local_ten = local_ten.from_float8()
+                            try:
+                                local_ten = local_ten.from_float8()
+                            except Exception as e:
+                                local_ten = local_ten.dequantize()
                             all_loaded_tensors[shard_id] = local_ten
                     round_tensors.append(local_ten)
@@ -483,7 +486,10 @@ def exchange_loaded_tensors_broadcast(
         # this during state dict load.
         # TODO: remove it once the bug is fixed
         if is_float8tensor(local_ten):
-            local_ten = local_ten.from_float8()
+            try:
+                local_ten = local_ten.from_float8()
+            except Exception as e:
+                local_ten = local_ten.dequantize()
             all_loaded_tensors[shard_id] = local_ten
         global_src_rank = (

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/mapping.py RENAMED Viewed

@@ -136,7 +136,13 @@ class ShardedTensor(ShardedBase):
             )
         for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
-            if off % sh != 0:
+            # NOTE: In custom FSDP, we have a case where a new parameter shard is created locally.
+            # For example, consider parameters [p0, p1, p2] sharded across GPU0 and GPU1.
+            # GPU0 receives p0 and a portion of p1, while GPU1 receives the
+            # remaining portion of p1 and p2.
+            # As a result, there is no parameter shard of p2 on GPU0, and
+            # the shape of p2 on GPU0 is zero.
+            if sh != 0 and off % sh != 0:
                 raise CheckpointingException(
                     f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.'
                 )

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/serialization.py RENAMED Viewed

@@ -351,9 +351,10 @@ def save(
             )
         if next(checkpoint_dir.iterdir(), None) is not None:
-            raise CheckpointingException(
-                f'Checkpoint destination directory ({checkpoint_dir}) is not empty'
-            )
+            # Don't throw exception here since this could cause a cascade of failures
+            # without human intervention in cases where multiple jobs are queued up.
+            if torch.distributed.get_rank() == 0:
+                logger.warning("Overwriting old incomplete / corrupted checkpoint...")
     if common_strategy is not None:
         raise NotImplementedError('The only supported common strategy is torch')

{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/state_dict_utils.py RENAMED Viewed

@@ -2,12 +2,13 @@
 """ Utilities for transforming state_dict."""
-from typing import Callable
+from typing import Callable, Union
 from .dict_utils import dict_list_map_inplace, extract_matching_values
 from .mapping import (
     CommonStateDict,
     ShardedStateDict,
+    ShardedTensor,
     ShardedTensorFactory,
     StateDict,
     apply_factories,
@@ -39,6 +40,7 @@ def save_preprocess(
     apply_factories(sharded_state_dict)
     _, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
     sharded_part, common_state_dict = extract_sharded_base(sharded_state_dict)
+    sharded_part = filter_out_empty_flatten_tensor(sharded_part)
     if validate_access_integrity:
         preprocessed_common_state_dict = common_state_dict
         if preprocess_common_before_consistancy_check:
@@ -69,6 +71,7 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
     # Create a copy of sharded_state_dict as the passed in state dict may have
     # references that prevent tensors from being deallocated
     sharded_state_dict, _ = extract_matching_values(sharded_state_dict, lambda x: True)
+    sharded_state_dict = filter_out_empty_flatten_tensor(sharded_state_dict)
     sh_ten_factories, _ = extract_matching_values(
         sharded_state_dict,
@@ -83,3 +86,27 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
     nonpersistent_state_dict, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
     dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict)
     return sharded_state_dict, nonpersistent_state_dict, sh_ten_factories
+def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]):
+    """
+    Filter out ShardedTensors with empty flatten_range.
+    These tensors can cause the PyTorch check in failure.
+    Args:
+        sharded_state_dict: state dict possibly containing ShardedTensor objects
+    """
+    # Filter out ShardedTensors with empty flatten_range.
+    # These tensors can cause the PyTorch check in
+    # `TorchShardedTensor._init_from_local_shards_and_global_metadata` to fail.
+    # This situation may occur in custom Fully Sharded Data Parallel (FSDP) cases.
+    sharded_state_dict, _ = extract_matching_values(
+        sharded_state_dict,
+        lambda v: not (
+            isinstance(v, ShardedTensor)
+            and v.flattened_range
+            and v.flattened_range.start == v.flattened_range.stop
+        ),
+    )
+    return sharded_state_dict

megatron-core 0.11.0__tar.gz → 0.12.0rc3__tar.gz

Potentially problematic release.

megatron-core 0.11.0__tar.gz → 0.12.0rc3__tar.gz