megatron-core 0.14.0rc4__tar.gz → 0.14.0rc5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {megatron_core-0.14.0rc4/megatron_core.egg-info → megatron_core-0.14.0rc5}/PKG-INFO +10 -4
- megatron_core-0.14.0rc5/megatron/core/activations.py +23 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/param_and_grad_buffer.py +32 -12
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/extensions/transformer_engine.py +13 -8
- megatron_core-0.14.0rc5/megatron/core/fusions/fused_weighted_squared_relu.py +110 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/package_info.py +1 -1
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/layers.py +12 -14
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/mlp.py +27 -6
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/experts.py +161 -48
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/multi_token_prediction.py +4 -1
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/transformer_block.py +15 -1
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/transformer_config.py +3 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5/megatron_core.egg-info}/PKG-INFO +10 -4
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron_core.egg-info/SOURCES.txt +2 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron_core.egg-info/requires.txt +9 -3
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/pyproject.toml +25 -4
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/LICENSE +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/README.md +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/enums.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/contexts/dynamic_context.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/utils.py +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/setup.cfg +0 -0
- {megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/setup.py +0 -0
{megatron_core-0.14.0rc4/megatron_core.egg-info → megatron_core-0.14.0rc5}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc4
+Version: 0.14.0rc5
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

@@ -31,7 +31,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy<2.0.0
-Requires-Dist: packaging
+Requires-Dist: packaging
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
 Requires-Dist: sentencepiece; extra == "mlm"

@@ -43,12 +43,18 @@ Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
 Requires-Dist: transformers~=4.53; extra == "dev"
-Requires-Dist: multi-storage-client~=0.20
+Requires-Dist: multi-storage-client~=0.20; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
-Requires-Dist:
+Requires-Dist: mamba-ssm~=2.2; extra == "dev"
+Requires-Dist: causal-conv1d~=1.5; extra == "dev"
+Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
+Requires-Dist: transformer-engine[pytorch]<2.7.0,>=2.5.0a0; extra == "dev"
+Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
+Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
 Requires-Dist: flashinfer-python; extra == "dev"
+Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
 Requires-Dist: einops; extra == "lts"
megatron_core-0.14.0rc5/megatron/core/activations.py
ADDED

@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import torch
+import torch.nn.functional as F
+
+from megatron.core.jit import jit_fuser
+
+
+@jit_fuser
+def squared_relu(x: torch.Tensor) -> torch.Tensor:
+    """Squared ReLU activation"""
+    return torch.pow(F.relu(x), 2)
+
+
+@jit_fuser
+def quick_gelu(x: torch.Tensor) -> torch.Tensor:
+    """Quick GELU activation"""
+    return x * torch.sigmoid(1.702 * x)
+
+
+@jit_fuser
+def fast_gelu(x: torch.Tensor) -> torch.Tensor:
+    """Fast GELU activation"""
+    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))

{megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/distributed/param_and_grad_buffer.py
RENAMED

@@ -157,6 +157,14 @@ class _ParamAndGradBucketGroup:
         self.param_gather_dispatched = False
         self.grad_reduce_handle = None

+        # Each time a local shard is created from bucket.param_data or bucket.grad_data, it
+        # introduces some CPU overheads. We use these two lists to cache the created local
+        # shards to avoid unnecessary CPU operations. This does not increase GPU memory usage
+        # because it only saves a slice view, which shares the same memory with bucket.param_data
+        # or bucket.grad_data.
+        self.cached_param_buffer_shard_list = [None] * len(self.buckets)
+        self.cached_grad_buffer_shard_list = [None] * len(self.buckets)
+
     def reset(self):
         """
         Reset metadata in bucket group in preparation for the next iteration of training.

@@ -229,10 +237,14 @@ class _ParamAndGradBucketGroup:
         with _coalescing_manager(
             self.intra_distributed_optimizer_instance_group, async_ops=async_op
         ) as cm:
-            for bucket in self.buckets:
-
-
-
+            for idx, bucket in enumerate(self.buckets):
+                if self.cached_param_buffer_shard_list[idx] is None:
+                    self.cached_param_buffer_shard_list[idx] = shard_buffer(
+                        bucket.param_data, self.intra_distributed_optimizer_instance_size
+                    )
+                local_data_view = self.cached_param_buffer_shard_list[idx][
+                    self.intra_distributed_optimizer_instance_rank
+                ]
                 dist_all_gather_func(
                     bucket.param_data,
                     local_data_view,

@@ -352,11 +364,15 @@ class _ParamAndGradBucketGroup:

         # Coalesce communication kernels across buckets in the bucket group.
         with stream_context, _coalescing_manager(communication_group, async_ops=async_op) as cm:
-            for bucket in self.buckets:
+            for idx, bucket in enumerate(self.buckets):
                 if self.ddp_config.use_distributed_optimizer:
-
-
-
+                    if self.cached_grad_buffer_shard_list[idx] is None:
+                        self.cached_grad_buffer_shard_list[idx] = shard_buffer(
+                            bucket.grad_data, self.intra_distributed_optimizer_instance_size
+                        )
+                    local_data_view = self.cached_grad_buffer_shard_list[idx][
+                        self.intra_distributed_optimizer_instance_rank
+                    ]
                     dist_reduce_scatter_func(
                         local_data_view,
                         bucket.grad_data,

@@ -382,10 +398,14 @@ class _ParamAndGradBucketGroup:
                 self.inter_distributed_optimizer_instance_group, async_ops=async_op
             ) as cm,
         ):
-            for bucket in self.buckets:
-
-
-
+            for idx, bucket in enumerate(self.buckets):
+                if self.cached_grad_buffer_shard_list[idx] is None:
+                    self.cached_grad_buffer_shard_list[idx] = shard_buffer(
+                        bucket.grad_data, self.intra_distributed_optimizer_instance_size
+                    )
+                local_data_view = self.cached_grad_buffer_shard_list[idx][
+                    self.intra_distributed_optimizer_instance_rank
+                ]

                 torch.distributed.all_reduce(
                     local_data_view,
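This change caches the per-bucket shard views so that `shard_buffer` is not re-run on every iteration; because the shards are views into `param_data`/`grad_data`, caching them adds no GPU memory. A minimal standalone sketch of the idea, where `shard_buffer` below is a simplified stand-in rather than Megatron's actual helper:

```python
import torch

def shard_buffer(buf: torch.Tensor, num_shards: int):
    # Simplified stand-in: split a flat buffer into equal, contiguous views.
    assert buf.numel() % num_shards == 0
    return list(torch.chunk(buf, num_shards))

param_data = torch.zeros(1024)   # stands in for bucket.param_data
cached_shards = None             # stands in for cached_param_buffer_shard_list[idx]

for _ in range(3):               # e.g. one communication call per iteration
    if cached_shards is None:    # slice the buffer once, then reuse the views
        cached_shards = shard_buffer(param_data, num_shards=4)
    local_view = cached_shards[1]  # this rank's shard

    # The cached view aliases the buffer, so no extra memory is held:
    local_view.fill_(1.0)
    assert param_data[256:512].eq(1.0).all()
    assert local_view.data_ptr() == param_data[256:512].data_ptr()
```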
{megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/extensions/transformer_engine.py
RENAMED

@@ -1192,6 +1192,7 @@ if HAVE_TE and is_te_min_version("1.9.0.dev0"):
             """
             prefix should be module_name to make keys identical to sequetial ones.
             """
+            singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
             sharded_state_dict = {}
             full_state_dict = self.state_dict(prefix="", keep_vars=True)
             num_global_experts = get_expert_model_parallel_world_size() * self.num_gemms

@@ -1199,23 +1200,27 @@ if HAVE_TE and is_te_min_version("1.9.0.dev0"):
             ep_axis = len(sharded_offsets)
             extra_states = self._split_extra_state(full_state_dict["_extra_state"])
             for gemm_idx in range(self.num_gemms):
+                global_expert_idx = local_expert_indices_offset + gemm_idx
                 state_dict = {
                     f"{gemm_idx}.weight": full_state_dict[f"weight{gemm_idx}"],
                     f"{gemm_idx}._extra_state": extra_states[gemm_idx],
                 }
                 if self.use_bias:
                     state_dict[f"{gemm_idx}.bias"] = full_state_dict[f"bias{gemm_idx}"]
-
-
-
-
-
+                if singleton_local_shards:
+                    expert_prefix = f"{global_expert_idx}.{prefix}"
+                    new_sharded_offsets = sharded_offsets
+                else:
+                    expert_prefix = prefix
+                    new_sharded_offsets = (
                         *sharded_offsets,
-                        (ep_axis,
-                        )
+                        (ep_axis, global_expert_idx, num_global_experts),
+                    )
+                sub_sd = make_sharded_tensors_for_checkpoint(
+                    state_dict, '', tp_axis_map, new_sharded_offsets
                 )
                 # Remove expert layers indexing from sharded keys
-                replace_prefix_for_sharding(sub_sd, f"{gemm_idx}.",
+                replace_prefix_for_sharding(sub_sd, f"{gemm_idx}.", expert_prefix)
                 sharded_state_dict.update(
                     {
                         f"{prefix}weight{gemm_idx}": sub_sd[f"{gemm_idx}.weight"],
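
The new `singleton_local_shards` flag only changes how each local expert is addressed in the checkpoint: either the global expert index is folded into the key prefix, or it is appended as an extra sharding axis. A toy sketch of that bookkeeping with hypothetical values (the real code builds `ShardedTensor` metadata rather than printing tuples):

```python
# Toy illustration of the two key/offset layouts used above. All concrete
# values here are hypothetical placeholders.
sharded_offsets = ((0, 3, 8),)      # e.g. (axis, this_rank_offset, total_shards)
ep_axis = len(sharded_offsets)      # expert-parallel axis appended after existing ones
num_global_experts = 16
local_expert_indices_offset = 4
prefix = "mlp.experts.linear_fc1."

for gemm_idx in range(2):           # two local experts on this rank
    global_expert_idx = local_expert_indices_offset + gemm_idx
    for singleton_local_shards in (True, False):
        if singleton_local_shards:
            # Expert index becomes part of the checkpoint key; offsets unchanged.
            expert_prefix = f"{global_expert_idx}.{prefix}"
            new_sharded_offsets = sharded_offsets
        else:
            # Expert index becomes one more sharding axis on the same key.
            expert_prefix = prefix
            new_sharded_offsets = (
                *sharded_offsets,
                (ep_axis, global_expert_idx, num_global_experts),
            )
        print(singleton_local_shards, expert_prefix, new_sharded_offsets)
```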
megatron_core-0.14.0rc5/megatron/core/fusions/fused_weighted_squared_relu.py
ADDED

@@ -0,0 +1,110 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+import torch.nn.functional as F
+
+from megatron.core.activations import squared_relu
+from megatron.core.jit import jit_fuser
+from megatron.core.utils import nvtx_decorator
+
+###################### WEIGHTED SQUARED ReLU FUSION ######################
+
+
+@jit_fuser
+def weighted_squared_relu(x: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
+    """Element-wise weight applied after Squared-ReLU.
+
+    Args:
+        x (torch.Tensor): Input tensor.
+        weights (torch.Tensor): Weight tensor that will be broadcast-multiplied with the
+            activation result. Typically of shape ``(B, 1)`` so it can be broadcast across
+            the hidden dimension.
+
+    Returns:
+        torch.Tensor: ``squared_relu(x) * weights`` with original ``dtype`` preserved.
+    """
+    out_dtype = x.dtype
+    res = torch.pow(F.relu(x), 2) * weights
+    return res.to(out_dtype)
+
+
+@jit_fuser
+def _squared_relu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+    """Gradient of Squared-ReLU.
+
+    The derivative of ``(ReLU(x))^2`` w.r.t ``x`` is ``2 * ReLU(x)``.
+    """
+    return g * 2 * F.relu(x)
+
+
+@jit_fuser
+def weighted_squared_relu_back(g: torch.Tensor, x: torch.Tensor, weights: torch.Tensor):
+    """Backward for weighted Squared-ReLU.
+
+    Returns gradients w.r.t ``x`` and ``weights``.
+    """
+    input_dtype = x.dtype
+    w_dtype = weights.dtype
+
+    # Gradient w.r.t. the input.
+    input_grad = _squared_relu_back(g * weights, x)
+
+    # Gradient w.r.t. the weights.
+    weights_grad = squared_relu(x) * g.to(w_dtype)
+    # Sum across the hidden dimension so each token has a single scalar weight.
+    weights_grad = torch.sum(weights_grad, dim=-1, keepdim=True)
+
+    return input_grad.to(input_dtype), weights_grad.to(w_dtype)
+
+
+class WeightedSquaredReLUFunction(torch.autograd.Function):
+    """Autograd wrapper around the weighted Squared-ReLU fused kernels."""
+
+    @staticmethod
+    @nvtx_decorator()
+    def forward(ctx, input: torch.Tensor, weights: torch.Tensor):
+        """forward method for `WeightedSquaredReLUFunction`
+
+        Args:
+            ctx : context object to store intermediate tensors.
+            input (torch.Tensor): input tensor.
+            weights (torch.Tensor): weight tensor.
+            fp8_input_store (bool): a bool flag to indicate if storing input in fp8.
+        """
+        ctx.save_for_backward(input, weights)
+        return weighted_squared_relu(input, weights)
+
+    @staticmethod
+    @nvtx_decorator()
+    def backward(ctx, grad_output: torch.Tensor):
+        """backward method for `WeightedSquaredReLUFunction`
+
+        Args:
+            ctx : context object to store intermediate tensors.
+            grad_output (torch.Tensor): gradient of the output of the forward function.
+        """
+        input, weights = ctx.saved_tensors
+        inp_grad, w_grad = weighted_squared_relu_back(grad_output, input, weights)
+        return inp_grad, w_grad
+
+
+def weighted_squared_relu_impl(input: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
+    """Token-wise weighted Squared-ReLU fusion with optional FP8 storage.
+
+    Args:
+        input (torch.Tensor): Input tensor of shape ``(B, *, hidden_size)`` where ``*`` can be
+            the sequence dimension.
+        weights (torch.Tensor): Per-token weights broadcastable to the output of
+            ``squared_relu``.
+
+    Returns:
+        torch.Tensor: Output tensor with the same shape as ``input`` except that the hidden
+            dimension remains unchanged.
+    """
+    ori_shape = input.shape
+    assert len(ori_shape) in [2, 3]
+    input = input.view(-1, ori_shape[-1])
+
+    output = WeightedSquaredReLUFunction.apply(input, weights)
+
+    return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1)
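The hand-written backward in this file can be sanity-checked against autograd without the fused build; a minimal sketch assuming plain eager PyTorch (no `jit_fuser` or NVTX):

```python
# Check the hand-written weighted Squared-ReLU gradients against autograd.
import torch
import torch.nn.functional as F

def weighted_squared_relu_ref(x, w):
    return torch.pow(F.relu(x), 2) * w              # reference forward

x = torch.randn(8, 16, dtype=torch.float64, requires_grad=True)
w = torch.rand(8, 1, dtype=torch.float64, requires_grad=True)   # one weight per token
g = torch.randn(8, 16, dtype=torch.float64)                     # upstream gradient

out = weighted_squared_relu_ref(x, w)
out.backward(g)

# Hand-written gradients, mirroring weighted_squared_relu_back():
x_grad = (g * w) * 2 * F.relu(x)                                 # d/dx relu(x)^2 = 2*relu(x)
w_grad = (torch.pow(F.relu(x), 2) * g).sum(-1, keepdim=True)     # reduce over hidden dim

assert torch.allclose(x_grad, x.grad)
assert torch.allclose(w_grad, w.grad)
```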
{megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/tensor_parallel/layers.py
RENAMED

@@ -923,8 +923,6 @@ class ColumnParallelLinear(torch.nn.Module):
                 "`allreduce_dgrad` and `sequence_parallel` cannot be enabled at the same time."
             )

-        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-
         # Hook adding a default empty _extra_state for state dict
         self._register_load_state_dict_pre_hook(
             lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(

@@ -932,6 +930,12 @@ class ColumnParallelLinear(torch.nn.Module):
             )
         )

+    def _forward_impl(self, *args, **kwargs):
+        if self.weight is not None and not self.weight.requires_grad:
+            return linear_with_frozen_weight(*args, **kwargs)
+        else:
+            return linear_with_grad_accumulation_and_async_allreduce(*args, **kwargs)
+
     def forward(
         self,
         input_: torch.Tensor,

@@ -989,11 +993,6 @@ class ColumnParallelLinear(torch.nn.Module):
             self.embedding_activation_buffer.append(input_parallel)

         # Matrix multiply.
-        if not weight.requires_grad:
-            self._forward_impl = linear_with_frozen_weight
-        else:
-            self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-
         allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad

         if self.config._cpu_offloading_context is not None:

@@ -1203,8 +1202,6 @@ class RowParallelLinear(torch.nn.Module):
         else:
             self.register_parameter("bias", None)

-        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-
         # Hook adding a default empty _extra_state for state dict
         self._register_load_state_dict_pre_hook(
             lambda state_dict, prefix, *args, **kwargs: state_dict.setdefault(

@@ -1212,6 +1209,12 @@ class RowParallelLinear(torch.nn.Module):
             )
         )

+    def _forward_impl(self, *args, **kwargs):
+        if self.weight is not None and not self.weight.requires_grad:
+            return linear_with_frozen_weight(*args, **kwargs)
+        else:
+            return linear_with_grad_accumulation_and_async_allreduce(*args, **kwargs)
+
     def forward(self, input_):
         """Forward of RowParallelLinear

@@ -1230,11 +1233,6 @@ class RowParallelLinear(torch.nn.Module):
             assert not self.sequence_parallel
             input_parallel = scatter_to_tensor_model_parallel_region(input_, group=self.tp_group)
         # Matrix multiply.
-        if not self.weight.requires_grad:
-            self._forward_impl = linear_with_frozen_weight
-        else:
-            self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
-
         allreduce_dgrad = False

         if self.config._cpu_offloading_context is not None:
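The refactor turns `_forward_impl` from an attribute that was reassigned inside `forward` into a method that picks the kernel per call based on whether the weight is frozen. A minimal sketch of the dispatch pattern; the two `linear_with_*` functions below are stubs, not the real Megatron kernels:

```python
import torch

# Stub implementations standing in for Megatron's real kernels.
def linear_with_frozen_weight(x, weight):
    return x @ weight.t()

def linear_with_grad_accumulation_and_async_allreduce(x, weight):
    return x @ weight.t()

class TinyParallelLinear(torch.nn.Module):
    def __init__(self, in_f, out_f):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(out_f, in_f))

    def _forward_impl(self, x, weight):
        # Decide per call instead of caching a function on self: if the weight
        # is frozen, take the frozen-weight path, else the grad-accumulation one.
        if weight is not None and not weight.requires_grad:
            return linear_with_frozen_weight(x, weight)
        return linear_with_grad_accumulation_and_async_allreduce(x, weight)

    def forward(self, x):
        return self._forward_impl(x, self.weight)

layer = TinyParallelLinear(4, 8)
layer.weight.requires_grad_(False)     # freezing takes effect without mutating module state
print(layer(torch.randn(2, 4)).shape)  # torch.Size([2, 8])
```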
{megatron_core-0.14.0rc4 → megatron_core-0.14.0rc5}/megatron/core/transformer/mlp.py
RENAMED

@@ -198,12 +198,15 @@ class MLP(MegatronModule):
         self, prefix: str = "", sharded_offsets: tuple = (), metadata: Optional[dict] = None
     ) -> ShardedStateDict:
         sharded_state_dict = {}
+        singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
         for name, module in self._modules.items():
             sub_sd = module.sharded_state_dict(f"{prefix}{name}.", sharded_offsets, metadata)
             if self.config.gated_linear_unit and name == "linear_fc1":
                 for k, v in sub_sd.items():
                     if k in (f"{prefix}{name}.weight", f"{prefix}{name}.bias"):
-                        sub_sd[k] = apply_swiglu_sharded_factory(
+                        sub_sd[k] = apply_swiglu_sharded_factory(
+                            v, sharded_offsets, singleton_local_shards
+                        )
             sharded_state_dict.update(sub_sd)
         return sharded_state_dict

@@ -213,7 +216,9 @@


 # pylint: disable=missing-function-docstring
-def apply_swiglu_sharded_factory(
+def apply_swiglu_sharded_factory(
+    original_sh_ten, sharded_offsets, singleton_local_shards: bool = False
+):
     # We must split the tensor into 2 parts, each sharded separately.
     # This requires a ShardedTensorFactory which `chunk`s during saving
     # and `cat`s during loading

@@ -235,13 +240,25 @@ def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets):
     def sh_ten_build_fn(
         key: str, t: torch.Tensor, replica_id: ReplicaId, flattened_range: Optional[slice]
     ):
-
-
+        if singleton_local_shards:
+            offset_w = (swiglu_shard_axis + prepend_axis_num, rank_offset, axis_frag)
+            offset_v = (swiglu_shard_axis + prepend_axis_num, rank_offset, axis_frag)
+            w_key = f'{key}_w'
+            v_key = f'{key}_v'
+        else:
+            offset_w = (swiglu_shard_axis + prepend_axis_num, rank_offset, axis_frag * 2)
+            offset_v = (
+                swiglu_shard_axis + prepend_axis_num,
+                rank_offset + axis_frag,
+                axis_frag * 2,
+            )
+            w_key = key
+            v_key = key
         if flattened_range is None:
             tensor_w, tensor_v = torch.chunk(t, 2, dim=swiglu_shard_axis)
             return [
                 ShardedTensor.from_rank_offsets(
-
+                    w_key,
                     tensor_w,
                     *sharded_offsets,
                     offset_w,

@@ -249,7 +266,7 @@ def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets):
                     prepend_axis_num=prepend_axis_num,
                 ),
                 ShardedTensor.from_rank_offsets(
-
+                    v_key,
                     tensor_v,
                     *sharded_offsets,
                     offset_v,

@@ -258,6 +275,10 @@ def apply_swiglu_sharded_factory(original_sh_ten, sharded_offsets):
                 ),
             ]
         else:
+            if singleton_local_shards:
+                raise NotImplementedError(
+                    'singleton_local_shards not implemented for SwiGLU MLP flattened tensors'
+                )
             # Here we need to map a slice `t` (`flattened_range` specifies slice start and stop)
             # of the *original* flattened tensor into slices `w` and `v` of chunked
             # and flattened tensor.
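For a gated MLP the fused `linear_fc1` weight stores the gate and up projections stacked together; the factory above `chunk`s it in two at save time and `cat`s the halves back at load time, with `singleton_local_shards` deciding whether the halves keep one key with doubled offsets or get separate `_w`/`_v` keys. A minimal sketch of that round trip with plain tensors standing in for `ShardedTensor` objects:

```python
# Sketch of the split/merge performed by the SwiGLU sharded factory, using
# plain tensors in place of ShardedTensor metadata.
import torch

ffn_hidden, hidden = 6, 4
fc1_weight = torch.randn(2 * ffn_hidden, hidden)   # gate (w) and up (v) stacked on dim 0

# Saving: chunk into the two halves, each checkpointed as its own shard.
tensor_w, tensor_v = torch.chunk(fc1_weight, 2, dim=0)

# With singleton_local_shards=True the halves would live under distinct keys, e.g.:
saved = {"linear_fc1.weight_w": tensor_w, "linear_fc1.weight_v": tensor_v}

# Loading: concatenate the halves back into the fused fc1 weight.
restored = torch.cat([saved["linear_fc1.weight_w"], saved["linear_fc1.weight_v"]], dim=0)
assert torch.equal(restored, fc1_weight)
```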