megatron-core 0.14.0rc1__tar.gz → 0.14.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic. Click here for more details.
- {megatron_core-0.14.0rc1/megatron_core.egg-info → megatron_core-0.14.0rc3}/PKG-INFO +11 -8
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +10 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/enums.py +10 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fp8_utils.py +125 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/contexts/__init__.py +1 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/contexts/dynamic_context.py +200 -65
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/contexts/static_context.py +1 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/engines/dynamic_engine.py +97 -21
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +6 -10
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +2 -6
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +2 -9
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +59 -49
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/utils.py +16 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/model_parallel_config.py +0 -5
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/T5/t5_model.py +2 -7
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/T5/t5_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/bert/bert_layer_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/language_model_embedding.py +3 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +11 -5
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/language_module/language_module.py +57 -17
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/gpt_layer_specs.py +4 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/gpt_model.py +19 -15
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/moe_module_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mamba/mamba_model.py +12 -16
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/submodules/audio.py +1 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/multimodal/llava_model.py +19 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/decoder_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/encoder_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/vision/clip_vit_model.py +9 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/vision/multimodal_projector.py +10 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/vision/radio.py +7 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/__init__.py +181 -48
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/distrib_optimizer.py +54 -6
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/optimizer.py +27 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/package_info.py +1 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/packed_seq_params.py +2 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/parallel_state.py +42 -451
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/pipeline_parallel/p2p_communication.py +25 -68
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/pipeline_parallel/schedules.py +12 -73
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/pipeline_parallel/utils.py +57 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/rerun_state_machine.py +123 -86
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/random.py +4 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/attention.py +2 -7
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/cuda_graphs.py +239 -87
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/enums.py +8 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/heterogeneous/linear_replacements.py +4 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/experts.py +1 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/moe_layer.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/moe_utils.py +6 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/router.py +23 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/multi_latent_attention.py +9 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/multi_token_prediction.py +10 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/transformer_block.py +22 -11
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/transformer_config.py +31 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/transformer_layer.py +0 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3/megatron_core.egg-info}/PKG-INFO +11 -8
- megatron_core-0.14.0rc3/megatron_core.egg-info/requires.txt +33 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/pyproject.toml +25 -12
- megatron_core-0.14.0rc1/megatron_core.egg-info/requires.txt +0 -30
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/LICENSE +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/README.md +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron/core/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron_core.egg-info/SOURCES.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/setup.cfg +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc3}/setup.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc1
+Version: 0.14.0rc3
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -31,6 +31,7 @@ Description-Content-Type: text/markdown
|
|
|
31
31
|
License-File: LICENSE
|
|
32
32
|
Requires-Dist: torch
|
|
33
33
|
Requires-Dist: numpy<2.0.0
|
|
34
|
+
Requires-Dist: packaging~=25.0
|
|
34
35
|
Provides-Extra: mlm
|
|
35
36
|
Requires-Dist: flask-restful; extra == "mlm"
|
|
36
37
|
Requires-Dist: sentencepiece; extra == "mlm"
|
|
@@ -38,14 +39,16 @@ Requires-Dist: tiktoken; extra == "mlm"
|
|
|
38
39
|
Requires-Dist: wandb; extra == "mlm"
|
|
39
40
|
Provides-Extra: dev
|
|
40
41
|
Requires-Dist: tqdm; extra == "dev"
|
|
41
|
-
Requires-Dist: einops; extra == "dev"
|
|
42
|
-
Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "dev"
|
|
43
|
-
Requires-Dist: nvtx; extra == "dev"
|
|
44
|
-
Requires-Dist: transformers; extra == "dev"
|
|
45
|
-
Requires-Dist: multi-storage-client; extra == "dev"
|
|
42
|
+
Requires-Dist: einops~=0.8; extra == "dev"
|
|
43
|
+
Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
|
|
44
|
+
Requires-Dist: nvtx~=0.2; extra == "dev"
|
|
45
|
+
Requires-Dist: transformers~=4.53; extra == "dev"
|
|
46
|
+
Requires-Dist: multi-storage-client~=0.20.3; extra == "dev"
|
|
47
|
+
Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
|
|
46
48
|
Requires-Dist: setuptools<80.0.0; extra == "dev"
|
|
47
|
-
Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
|
|
48
|
-
Requires-Dist: megatron-energon[av_decode]
|
|
49
|
+
Requires-Dist: nvidia-modelopt[torch]<0.32.0,>=0.31.0a0; sys_platform != "darwin" and extra == "dev"
|
|
50
|
+
Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
|
|
51
|
+
Requires-Dist: flashinfer-python; extra == "dev"
|
|
49
52
|
Provides-Extra: lts
|
|
50
53
|
Requires-Dist: tqdm; extra == "lts"
|
|
51
54
|
Requires-Dist: einops; extra == "lts"
|
|
@@ -217,6 +217,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
|
|
|
217
217
|
|
|
218
218
|
self.module.apply(unmap_weight_tensor)
|
|
219
219
|
|
|
220
|
+
for param in self.module.parameters():
|
|
221
|
+
if not hasattr(param, 'grad_added_to_main_grad'):
|
|
222
|
+
# This is to ensure that the param.grad_added_to_main_grad is set to False
|
|
223
|
+
# when the parameter is created.
|
|
224
|
+
param.grad_added_to_main_grad = False
|
|
225
|
+
if not hasattr(param, '__fsdp_param__'):
|
|
226
|
+
# This is to ensure that the param.__fsdp_param__ is set to True
|
|
227
|
+
# when the parameter is created.
|
|
228
|
+
param.__fsdp_param__ = True
|
|
229
|
+
|
|
220
230
|
def _init_fsdp_param_and_grad_buffer(self):
|
|
221
231
|
if self.config.calculate_per_token_loss:
|
|
222
232
|
# We don't need to scale the gradients in this case.
|
|
@@ -7,9 +7,16 @@ class ModelType(enum.Enum):
|
|
|
7
7
|
"""Model type."""
|
|
8
8
|
|
|
9
9
|
encoder_or_decoder = 1
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
retro_encoder = 2
|
|
11
|
+
retro_decoder = 3
|
|
12
|
+
|
|
13
|
+
@property
|
|
14
|
+
def encoder_and_decoder(self):
|
|
15
|
+
"""Deprecated property - use encoder_or_decoder instead."""
|
|
16
|
+
raise ValueError(
|
|
17
|
+
"ModelType.encoder_and_decoder is deprecated. Please use ModelType.encoder_or_decoder "
|
|
18
|
+
"instead."
|
|
19
|
+
)
|
|
13
20
|
|
|
14
21
|
|
|
15
22
|
class Fp8Recipe(str, enum.Enum):
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
"""Utility functions related to FP8 that are used throughout Megatron core"""
|
|
4
4
|
|
|
5
|
+
import weakref
|
|
5
6
|
from contextlib import nullcontext
|
|
7
|
+
from functools import wraps
|
|
6
8
|
from typing import List, Optional
|
|
7
9
|
|
|
8
10
|
import torch
|
|
@@ -53,6 +55,29 @@ except (ImportError, ModuleNotFoundError):
|
|
|
53
55
|
# MXFP8Tensor not found
|
|
54
56
|
HAVE_TE_MXFP8TENSOR = False
|
|
55
57
|
|
|
58
|
+
if HAVE_TE:
|
|
59
|
+
from megatron.core.extensions.transformer_engine import (
|
|
60
|
+
TEColumnParallelLinear,
|
|
61
|
+
TELayerNormColumnParallelLinear,
|
|
62
|
+
TELinear,
|
|
63
|
+
TERowParallelLinear,
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
TE_LINEAR_TYPES = (
|
|
67
|
+
TELinear,
|
|
68
|
+
TEColumnParallelLinear,
|
|
69
|
+
TERowParallelLinear,
|
|
70
|
+
TELayerNormColumnParallelLinear,
|
|
71
|
+
)
|
|
72
|
+
else:
|
|
73
|
+
TE_LINEAR_TYPES = ()
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
from megatron.core.extensions.transformer_engine import Fp8Padding, Fp8Unpadding
|
|
77
|
+
except ImportError:
|
|
78
|
+
Fp8Padding = None
|
|
79
|
+
Fp8Unpadding = None
|
|
80
|
+
|
|
56
81
|
|
|
57
82
|
def is_float8tensor(tensor: torch.Tensor) -> bool:
|
|
58
83
|
"""Check if a tensor is a Transformer Engine Float8Tensor.
|
|
@@ -346,8 +371,12 @@ else:
|
|
|
346
371
|
def _modify_underlying_storage_impl(*args, **kwargs):
|
|
347
372
|
raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")
|
|
348
373
|
|
|
349
|
-
def _quantize_param_shard_impl(*args, **kwargs):
|
|
350
|
-
|
|
374
|
+
def _quantize_param_shard_impl(model_params, *args, **kwargs):
|
|
375
|
+
if len(model_params) == 0:
|
|
376
|
+
return
|
|
377
|
+
else:
|
|
378
|
+
# If TE is not installed, there shouldn't be any fp8 params.
|
|
379
|
+
raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")
|
|
351
380
|
|
|
352
381
|
def _correct_amax_history_if_needed_impl(*args, **kwargs):
|
|
353
382
|
# If TE is not installed, we are definitely not using fp8 for training, so no correction
|
|
@@ -507,3 +536,97 @@ else:
|
|
|
507
536
|
def get_fp8_context(config: TransformerConfig, layer_no: int = -1, is_init: bool = False):
|
|
508
537
|
"""Returns dummy fp8 context manager since TE is not available."""
|
|
509
538
|
return nullcontext()
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
if HAVE_TE:
|
|
542
|
+
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
|
|
543
|
+
|
|
544
|
+
# Modules that have been wrapped for inference for fp8
|
|
545
|
+
_fp8_inference_wrapped_modules = weakref.WeakSet()
|
|
546
|
+
|
|
547
|
+
def _wrap_te_linear_for_padding(module: torch.nn.Module):
|
|
548
|
+
"""Wrap a TE linear module to automatically pad sequences for FP8 inference.
|
|
549
|
+
|
|
550
|
+
Modifies the module's forward method to:
|
|
551
|
+
1. Pad input sequences to FP8 alignment requirements
|
|
552
|
+
2. Run the original forward pass
|
|
553
|
+
3. Unpad outputs to original sequence length
|
|
554
|
+
|
|
555
|
+
Args:
|
|
556
|
+
module: A Transformer Engine linear layer (TELinear, TEColumnParallelLinear, etc.)
|
|
557
|
+
"""
|
|
558
|
+
if module in _fp8_inference_wrapped_modules:
|
|
559
|
+
return
|
|
560
|
+
_pad_func = Fp8Padding(1)
|
|
561
|
+
_unpad_func = Fp8Unpadding(1)
|
|
562
|
+
|
|
563
|
+
original_forward = module.forward
|
|
564
|
+
|
|
565
|
+
@wraps(original_forward)
|
|
566
|
+
def padded_forward(input_tensor, *args, **kwargs):
|
|
567
|
+
# Only do padding for fp8 if we are in fp8 context
|
|
568
|
+
if not FP8GlobalStateManager.is_fp8_enabled():
|
|
569
|
+
return original_forward(input_tensor, *args, **kwargs)
|
|
570
|
+
|
|
571
|
+
seq_len, batch_size, hidden_size = input_tensor.shape
|
|
572
|
+
# Reshape to (S, B*H) to pad sequence dimension
|
|
573
|
+
input_2d = input_tensor.reshape(seq_len, -1)
|
|
574
|
+
# Pad the sequence dimension
|
|
575
|
+
padded_input_2d, _ = _pad_func(input_2d, [seq_len])
|
|
576
|
+
padded_seq_len = padded_input_2d.shape[0]
|
|
577
|
+
|
|
578
|
+
# Reshape back to (padded_S, B, H)
|
|
579
|
+
padded_input_3d = padded_input_2d.view(padded_seq_len, batch_size, hidden_size)
|
|
580
|
+
output = original_forward(padded_input_3d, *args, **kwargs)
|
|
581
|
+
|
|
582
|
+
# Handle output
|
|
583
|
+
if isinstance(output, tuple):
|
|
584
|
+
output_tensor = output[0]
|
|
585
|
+
other_outputs = output[1:]
|
|
586
|
+
else:
|
|
587
|
+
output_tensor = output
|
|
588
|
+
other_outputs = ()
|
|
589
|
+
|
|
590
|
+
# Unpad output - reshape to 2D, unpad, reshape back
|
|
591
|
+
_, _, output_hidden_size = output_tensor.shape
|
|
592
|
+
output_2d = output_tensor.reshape(padded_seq_len, -1)
|
|
593
|
+
unpadded_output_2d = _unpad_func(output_2d, [seq_len])
|
|
594
|
+
unpadded_output = unpadded_output_2d.reshape(seq_len, batch_size, output_hidden_size)
|
|
595
|
+
|
|
596
|
+
if other_outputs:
|
|
597
|
+
return (unpadded_output,) + other_outputs
|
|
598
|
+
else:
|
|
599
|
+
return unpadded_output
|
|
600
|
+
|
|
601
|
+
module.forward = padded_forward
|
|
602
|
+
_fp8_inference_wrapped_modules.add(module)
|
|
603
|
+
|
|
604
|
+
def prepare_model_for_fp8_inference(model):
|
|
605
|
+
"""Prepare a model for FP8 inference by wrapping TE linear layers with padding support.
|
|
606
|
+
|
|
607
|
+
FP8 TE Gemms have specific shape requirements. This function wraps all Transformer
|
|
608
|
+
Engine linear layers in the model to automatically pad/unpad sequences during inference.
|
|
609
|
+
|
|
610
|
+
Args:
|
|
611
|
+
model (model (GPTModel): Model containing TE linear layers.
|
|
612
|
+
|
|
613
|
+
Returns:
|
|
614
|
+
GPTModel: The same model with wrapped linear layers (modified in-place).
|
|
615
|
+
|
|
616
|
+
"""
|
|
617
|
+
assert Fp8Padding and Fp8Unpadding, "TE version does not have FP8 padding functions"
|
|
618
|
+
# Find and wrap all TE linear layers
|
|
619
|
+
for module in model.modules():
|
|
620
|
+
if isinstance(module, TE_LINEAR_TYPES):
|
|
621
|
+
_wrap_te_linear_for_padding(module)
|
|
622
|
+
|
|
623
|
+
return model
|
|
624
|
+
|
|
625
|
+
else:
|
|
626
|
+
|
|
627
|
+
def prepare_model_for_fp8_inference(model):
|
|
628
|
+
"""If trys using prepare_model_for_fp8_inference without TE we error"""
|
|
629
|
+
raise RuntimeError(
|
|
630
|
+
"prepare_model_for_fp8_inference requires Transformer Engine to be installed. "
|
|
631
|
+
"Please install transformer-engine to use FP8 inference."
|
|
632
|
+
)
|