megatron-core 0.14.0rc3__tar.gz → 0.14.0rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {megatron_core-0.14.0rc3/megatron_core.egg-info → megatron_core-0.14.0rc4}/PKG-INFO +1 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/torch.py +2 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/validation.py +21 -15
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/transformer_engine.py +5 -27
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/transformer_engine_spec_provider.py +5 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +81 -16
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/dynamic_context.py +44 -28
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +23 -2
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/model_parallel_config.py +8 -3
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/rope_utils.py +20 -32
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +11 -6
- megatron_core-0.14.0rc4/megatron/core/models/common/model_chunk_schedule_plan.py +502 -0
- megatron_core-0.14.0rc4/megatron/core/models/gpt/fine_grained_callables.py +474 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/gpt/gpt_layer_specs.py +2 -2
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/gpt/gpt_model.py +62 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/optimizer.py +11 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/optimizer_config.py +1 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/package_info.py +1 -1
- megatron_core-0.14.0rc4/megatron/core/pipeline_parallel/combined_1f1b.py +331 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/pipeline_parallel/schedules.py +169 -101
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/pipeline_parallel/utils.py +91 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mamba_block.py +4 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mamba_layer.py +1 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/layers.py +23 -12
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/attention.py +1 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/mlp.py +20 -2
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/experts.py +22 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/multi_latent_attention.py +81 -9
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/transformer_config.py +60 -7
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/transformer_layer.py +11 -10
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/utils.py +17 -11
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/utils.py +27 -3
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4/megatron_core.egg-info}/PKG-INFO +1 -1
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron_core.egg-info/SOURCES.txt +2 -0
- megatron_core-0.14.0rc3/megatron/core/models/gpt/fine_grained_callables.py +0 -195
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/LICENSE +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/README.md +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/enums.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/pyproject.toml +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/setup.cfg +0 -0
- {megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/setup.py +0 -0
- {megatron_core-0.14.0rc3/megatron_core.egg-info → megatron_core-0.14.0rc4}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc3
+Version: 0.14.0rc4
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/strategies/torch.py
RENAMED
@@ -374,7 +374,8 @@ def _unwrap_pyt_sharded_tensor(sh_ten: TorchShardedTensor) -> List[torch.Tensor]
             ten = ten.view(-1)
         else:
             for _ in range(mcore_sh_ten.prepend_axis_num):
-                ten = ten.squeeze(0)
+                assert ten.size(0) == 1
+                ten = ten[0]  # NOTE: ten.squeeze(0) uses more memory for FP8 tensors
         ret_tensors.append(ten)
     return ret_tensors
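
The change above replaces `squeeze(0)` with an assert plus integer indexing when stripping prepended singleton axes. Below is a minimal plain-PyTorch sketch of that unwrap pattern; it uses an ordinary float tensor, not the FP8 case the NOTE in the diff refers to, and `drop_prepended_axes` is an illustrative helper, not a megatron API.

```python
import torch

def drop_prepended_axes(ten: torch.Tensor, prepend_axis_num: int) -> torch.Tensor:
    # Strip `prepend_axis_num` leading singleton axes by integer indexing
    # rather than squeeze(); the result is a view of the same storage.
    for _ in range(prepend_axis_num):
        assert ten.size(0) == 1  # prepended axes are expected to be singleton
        ten = ten[0]             # integer indexing returns a view, no copy
    return ten

x = torch.randn(1, 1, 4, 8)  # two prepended singleton axes
y = drop_prepended_axes(x, 2)
assert y.shape == (4, 8) and y.data_ptr() == x.data_ptr()
```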
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/dist_checkpointing/validation.py
RENAMED
@@ -375,28 +375,34 @@ def maybe_report_missing_and_unexpected_keys(
 def _validate_common_state_dict(common_state_dict: CommonStateDict) -> None:
     """Validate consistancy across ranks for the common state dict
 
-    We save the common state dict only on rank 0. We validate to make sure that the common dict is
+    We save the common state dict only on rank 0. We validate to make sure that the common dict is consistent across ranks before saving.
 
     Args:
         common_state_dict: The common state dict present in all ransk
     """
+    if not torch.distributed.is_initialized():
+        return
 
-    #
+    # Broadcast the common state dict from rank 0 to all other ranks
+    # Each rank will do a comparison with its local rank vs the broadcasted state dict from rank 0
     rank = torch.distributed.get_rank()
-
-
-
-
-
-
-
-
-
-
-
-    if
+
+    object_list = [common_state_dict] if rank == 0 else [None]
+    torch.distributed.broadcast_object_list(object_list, src=0)
+    rank0_state_dict = object_list[0]
+
+    # Skip comparing rank 0 with itself
+    if rank > 0:
+        current_rank_state_dict = common_state_dict
+        only_in_rank0, only_in_current_rank, mismatch = diff(
+            rank0_state_dict, current_rank_state_dict
+        )
+        if only_in_rank0 or only_in_current_rank or mismatch:
            logger.warning(
-                f"
+                f"Rank {rank} common state dict differs from rank 0 common state dict. "
+                f"Keys only on rank 0: {only_in_rank0}, "
+                f"Keys only on {rank}: {only_in_current_rank}, "
+                f"Mismatched keys: {mismatch}"
            )
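
The validation hunk above broadcasts rank 0's common state dict and has every other rank diff its local copy against it. A minimal, self-contained sketch of the same pattern follows; `simple_diff` is a stand-in for megatron's `dist_checkpointing.dict_utils.diff`, and the whole function is only an illustration of the flow, not the library code itself.

```python
import torch

def simple_diff(a: dict, b: dict):
    # Stand-in for megatron's nested-dict diff: keys missing on either side, plus mismatches.
    only_in_a = [k for k in a if k not in b]
    only_in_b = [k for k in b if k not in a]
    mismatch = [k for k in a if k in b and a[k] != b[k]]
    return only_in_a, only_in_b, mismatch

def validate_common_state_dict(common_state_dict: dict) -> None:
    if not torch.distributed.is_initialized():
        return
    rank = torch.distributed.get_rank()
    # Rank 0 broadcasts its copy; every rank receives it via broadcast_object_list.
    object_list = [common_state_dict] if rank == 0 else [None]
    torch.distributed.broadcast_object_list(object_list, src=0)
    rank0_state_dict = object_list[0]
    if rank > 0:  # rank 0 need not compare with itself
        only_in_rank0, only_in_current, mismatch = simple_diff(rank0_state_dict, common_state_dict)
        if only_in_rank0 or only_in_current or mismatch:
            print(f"rank {rank} differs from rank 0: {only_in_rank0} {only_in_current} {mismatch}")
```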
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/transformer_engine.py
RENAMED
@@ -889,25 +889,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
             if packed_seq_params is not None
             else {}
         )
-
-        # after init
-        if self.config.apply_rope_fusion and is_te_min_version("0.13.0", check_equality=False):
-            self.qkv_format = "bshd"
-
-        qkv_format = packed_seq_kwargs.get("qkv_format", self.qkv_format)
-
-        # WAR for peak memory usage.
-        # See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388
-        if self.config.apply_rope_fusion and qkv_format == "bshd":
-            query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)]
-            # In PyTorch, the following two tensors are in fact the same:
-            # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1)
-            # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1)
-            # Stride for a dimension that is 1 has no meaning, so tensors created two different ways
-            # can have same shape but different strides.
-            # We unify them to the first one to pass the stride check in TE
-            if value.shape == key.shape and value.shape[0] == 1 and value.stride() != key.stride():
-                value = value.as_strided(value.shape, key.stride())
+        qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format)
 
         attention_bias_kwargs = {}
         if attention_bias is not None:
@@ -942,10 +924,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
             query, key, value, attention_mask, **attention_bias_kwargs, **packed_seq_kwargs
         )
 
-
-            return core_attn_out.transpose(0, 1)
-        else:
-            return core_attn_out
+        return core_attn_out
 
 
 if HAVE_TE and is_te_min_version("1.9.0.dev0"):
@@ -1633,10 +1612,8 @@ try:
         else:
             if interleaved:
                 raise ValueError("Only TE >= 2.3.0 supports interleaved fused RoPE.")
-
-
-            else:
-                raise ValueError("Only TE >= 1.4.0.dev0 supports fused RoPE.")
+
+            return apply_rotary_pos_emb(t, freqs, tensor_format="sbhd", fused=True)
 
     def fused_apply_rotary_pos_emb_thd(
         t: torch.Tensor,
@@ -1659,6 +1636,7 @@ try:
                 cp_rank=cp_rank,
             )
         else:
+            assert cp_size == 1, "Only TE >= 1.12 supports RoPE fusion for THD format with CP."
             return apply_rotary_pos_emb(
                 t, freqs, tensor_format="thd", fused=True, cu_seqlens=cu_seqlens
            )
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/extensions/transformer_engine_spec_provider.py
RENAMED
@@ -8,6 +8,7 @@ from megatron.core.extensions.transformer_engine import (
     TEColumnParallelLinear,
     TEDotProductAttention,
     TELayerNormColumnParallelLinear,
+    TELinear,
     TENorm,
     TERowParallelGroupedLinear,
     TERowParallelLinear,
@@ -23,6 +24,10 @@ from megatron.core.utils import get_te_version, is_te_min_version
 class TESpecProvider(BackendSpecProvider):
     """A protocol for providing the submodules used in Spec building."""
 
+    def linear(self) -> type:
+        """Which linear module TE backend uses"""
+        return TELinear
+
     def column_parallel_linear(self) -> type:
         """Which column parallel linear module TE backend uses"""
         return TEColumnParallelLinear
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/fusions/fused_mla_yarn_rope_apply.py
RENAMED
@@ -28,16 +28,25 @@ if not HAVE_TRITON:
 
 
 @triton.jit
-def _get_thd_token_idx(cu_seqlens, pid_m, seq_num):
+def _get_thd_token_idx(cu_seqlens, pid_m, seq_num, cp_rank, cp_size):
     token_idx = -1
+    this_seq_len = 0
     seq_idx = 0
-    last_cum_seqlen = tl.load(cu_seqlens)
+    last_cum_seqlen = tl.load(cu_seqlens) // cp_size
     while seq_idx < seq_num:
-        cur_cum_seqlen = tl.load(cu_seqlens + seq_idx + 1)
+        cur_cum_seqlen = tl.load(cu_seqlens + seq_idx + 1) // cp_size
         if token_idx == -1 and cur_cum_seqlen > pid_m:
             token_idx = pid_m - last_cum_seqlen
+            this_seq_len = cur_cum_seqlen - last_cum_seqlen
         last_cum_seqlen = cur_cum_seqlen
         seq_idx += 1
+    if cp_size > 1:
+        if token_idx < this_seq_len // 2:
+            token_idx = token_idx + cp_rank * this_seq_len // 2
+        else:
+            token_idx = (token_idx - this_seq_len // 2) + (
+                2 * cp_size - cp_rank - 1
+            ) * this_seq_len // 2
     return token_idx
 
 
@@ -68,6 +77,8 @@ def rotary_fwd_q_kernel(
     cu_seqlens_q,
     stride_x_seq,
     stride_x_nheads,
+    cp_rank,
+    cp_size,
     BLOCK_H: tl.constexpr,
 ):
     """
@@ -89,7 +100,7 @@ def rotary_fwd_q_kernel(
     if cu_seqlens_q is None:
         token_idx = pid_m // batch_size
     else:
-        token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num)
+        token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num, cp_rank, cp_size)
 
     cos_left = tl.load(COS + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
     sin_left = tl.load(SIN + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
@@ -146,6 +157,8 @@ def rotary_bwd_q_kernel(
     cu_seqlens_q,
     stride_x_seq,
     stride_x_nheads,
+    cp_rank,
+    cp_size,
     BLOCK_H: tl.constexpr,
 ):
     """
@@ -165,7 +178,7 @@ def rotary_bwd_q_kernel(
     if cu_seqlens_q is None:
         token_idx = pid_m // batch_size
     else:
-        token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num)
+        token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num, cp_rank, cp_size)
 
     cos_left = tl.load(COS + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
     sin_left = tl.load(SIN + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
@@ -200,7 +213,18 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(
+    def forward(
+        ctx,
+        q,
+        cos,
+        sin,
+        qk_head_dim,
+        emb_dim,
+        cu_seqlens_q,
+        cp_rank,
+        cp_size,
+        rotary_interleaved=False,
+    ):
         """
         Forward function for ApplyMLARotaryEmbQ.
 
@@ -243,12 +267,16 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
             cu_seqlens_q,
             q.stride(0),
             q.stride(1),
+            cp_rank,
+            cp_size,
         )
         ctx.save_for_backward(cos, sin)
         ctx.qk_head_dim = qk_head_dim
         ctx.emb_dim = emb_dim
         ctx.cu_seqlens_q = cu_seqlens_q
         ctx.rotary_interleaved = rotary_interleaved
+        ctx.cp_rank = cp_rank
+        ctx.cp_size = cp_size
         if cu_seqlens_q is None:
             q = q.view(max_seqlen, batch_size, nheads, headdim)
         return q
@@ -268,7 +296,7 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
         seq_num = None
         if ctx.cu_seqlens_q is None:
             max_seqlen, batch_size, nheads, headdim = grad.shape
-            grad = grad.view(-1, nheads, headdim)
+            grad = grad.contiguous().view(-1, nheads, headdim)
             total_seqlen = grad.shape[0]
         else:
             seq_num = len(ctx.cu_seqlens_q) - 1
@@ -288,10 +316,12 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
             ctx.cu_seqlens_q,
             grad.stride(0),
             grad.stride(1),
+            ctx.cp_rank,
+            ctx.cp_size,
         )
         if ctx.cu_seqlens_q is None:
             grad = grad.view(max_seqlen, batch_size, nheads, headdim)
-        return grad, None, None, None, None, None, None
+        return grad, None, None, None, None, None, None, None, None
 
 
 @experimental_fn(introduced_with_version="0.13.0")
@@ -302,6 +332,8 @@ def fused_apply_mla_rope_for_q(
     qk_head_dim: int,
     emb_dim: int,
     cu_seqlens_q: Optional[torch.Tensor] = None,
+    cp_rank: int = 0,
+    cp_size: int = 1,
     rotary_interleaved: bool = False,
 ):
     """
@@ -327,7 +359,7 @@ def fused_apply_mla_rope_for_q(
         t: inplace modified input tensor
     """
     return ApplyMLARotaryEmbQ.apply(
-        t, cos, sin, qk_head_dim, emb_dim, cu_seqlens_q, rotary_interleaved
+        t, cos, sin, qk_head_dim, emb_dim, cu_seqlens_q, cp_rank, cp_size, rotary_interleaved
     )
 
 
@@ -366,6 +398,8 @@ def rotary_fwd_kv_kernel(
     stride_k_nheads,
     stride_v_seq,
     stride_v_nheads,
+    cp_rank,
+    cp_size,
     BLOCK_H: tl.constexpr,
 ):
     """
@@ -394,7 +428,7 @@ def rotary_fwd_kv_kernel(
     if cu_seqlens_kv is None:
         token_idx = pid_m // batch_size
     else:
-        token_idx = _get_thd_token_idx(cu_seqlens_kv, pid_m, seq_num)
+        token_idx = _get_thd_token_idx(cu_seqlens_kv, pid_m, seq_num, cp_rank, cp_size)
 
     cos_left = tl.load(COS + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
     sin_left = tl.load(SIN + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
@@ -472,6 +506,8 @@ def rotary_bwd_kv_kernel(
     stride_dkv_seq,
     stride_dkv_nheads,
     stride_demb_seq,
+    cp_rank,
+    cp_size,
     BLOCK_H: tl.constexpr,
 ):
     """
@@ -496,7 +532,7 @@ def rotary_bwd_kv_kernel(
     if cu_seqlens_kv is None:
         token_idx = pid_m // batch_size
     else:
-        token_idx = _get_thd_token_idx(cu_seqlens_kv, pid_m, seq_num)
+        token_idx = _get_thd_token_idx(cu_seqlens_kv, pid_m, seq_num, cp_rank, cp_size)
 
     dKV_ptr = dKV + pid_m * stride_dkv_seq + pid_head * BLOCK_H * stride_dkv_nheads
     dkv_off = tl.arange(0, BLOCK_H)[:, None] * stride_dkv_nheads
@@ -550,7 +586,18 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
 
     @staticmethod
     def forward(
-        ctx,
+        ctx,
+        kv,
+        k_pos_emb,
+        cos,
+        sin,
+        emb_dim,
+        k_dim,
+        v_dim,
+        cu_seqlens_kv,
+        cp_rank,
+        cp_size,
+        rotary_interleaved=False,
     ):
         """
         Forward function for ApplyMLARotaryEmbKV.
@@ -609,6 +656,8 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
             o_key.stride(1),
             o_value.stride(0),
             o_value.stride(1),
+            cp_rank,
+            cp_size,
         )
         ctx.save_for_backward(cos, sin)
         ctx.rotary_interleaved = rotary_interleaved
@@ -616,6 +665,8 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
         ctx.k_dim = k_dim
         ctx.v_dim = v_dim
         ctx.cu_seqlens_kv = cu_seqlens_kv
+        ctx.cp_rank = cp_rank
+        ctx.cp_size = cp_size
         if cu_seqlens_kv is None:
             o_key = o_key.view(max_seqlen, -1, nheads, emb_dim + k_dim)
             o_value = o_value.view(max_seqlen, -1, nheads, v_dim)
@@ -638,8 +689,8 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
         if ctx.cu_seqlens_kv is None:
             # sbhd
             max_seqlen, batch_size, nheads, _ = dk.shape
-            dk = dk.view(-1, nheads, ctx.emb_dim + ctx.k_dim)
-            dv = dv.view(-1, nheads, ctx.v_dim)
+            dk = dk.contiguous().view(-1, nheads, ctx.emb_dim + ctx.k_dim)
+            dv = dv.contiguous().view(-1, nheads, ctx.v_dim)
             total_seqlen = dk.shape[0]
         else:
             # thd
@@ -673,11 +724,13 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
             d_kv.stride(0),
             d_kv.stride(1),
             d_emb.stride(0),
+            ctx.cp_rank,
+            ctx.cp_size,
         )
         if ctx.cu_seqlens_kv is None:
             d_kv = d_kv.view(max_seqlen, batch_size, nheads, ctx.k_dim + ctx.v_dim)
             d_emb = d_emb.view(max_seqlen, batch_size, 1, ctx.emb_dim)
-        return d_kv, d_emb, None, None, None, None, None, None, None
+        return d_kv, d_emb, None, None, None, None, None, None, None, None, None
 
 
 @experimental_fn(introduced_with_version="0.13.0")
@@ -690,6 +743,8 @@ def fused_apply_mla_rope_for_kv(
     k_dim: int,
     v_dim: int,
     cu_seqlens_kv: Optional[torch.Tensor] = None,
+    cp_rank: int = 0,
+    cp_size: int = 1,
     rotary_interleaved: bool = False,
 ):
     """
@@ -715,5 +770,15 @@ def fused_apply_mla_rope_for_kv(
         value: [seq_len, batch_size, head_num, v_dim] or [total_seq_len, head_num, v_dim]
     """
     return ApplyMLARotaryEmbKV.apply(
-        kv,
+        kv,
+        k_pos_emb,
+        cos,
+        sin,
+        emb_dim,
+        k_dim,
+        v_dim,
+        cu_seqlens_kv,
+        cp_rank,
+        cp_size,
+        rotary_interleaved,
     )
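
The new `cp_rank`/`cp_size` handling in `_get_thd_token_idx` above translates a token index local to one context-parallel rank back to its global position, assuming the load-balanced CP split where each rank holds chunk `cp_rank` and the mirrored chunk `2*cp_size - cp_rank - 1` of every sequence. A plain-Python sketch of just that index mapping (not the Triton kernel):

```python
def local_to_global_token_idx(token_idx: int, local_seq_len: int, cp_rank: int, cp_size: int) -> int:
    # local_seq_len is the per-rank share of one sequence (full_seq_len // cp_size).
    if cp_size == 1:
        return token_idx
    half = local_seq_len // 2
    if token_idx < half:
        # First local half comes from chunk `cp_rank`.
        return token_idx + cp_rank * half
    # Second local half comes from the mirrored chunk `2*cp_size - cp_rank - 1`.
    return (token_idx - half) + (2 * cp_size - cp_rank - 1) * half

# Example: a 16-token sequence with cp_size=2. Rank 0 holds global positions 0-3 and
# 12-15; rank 1 holds 4-7 and 8-11.
assert [local_to_global_token_idx(i, 8, cp_rank=0, cp_size=2) for i in range(8)] == \
       [0, 1, 2, 3, 12, 13, 14, 15]
assert [local_to_global_token_idx(i, 8, cp_rank=1, cp_size=2) for i in range(8)] == \
       [4, 5, 6, 7, 8, 9, 10, 11]
```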
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/contexts/dynamic_context.py
RENAMED
@@ -155,7 +155,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tp_size = tensor_model_parallel_size
         hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
         num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
-
         # Chunk size tokens, bytes.
         dtype_size_bytes = params_dtype.itemsize
         self.chunk_size_tokens = chunk_size_tokens
@@ -177,23 +176,24 @@ class DynamicInferenceContext(BaseInferenceContext):
         def bytes_to_max_requests_and_tokens(n_bytes):
             n_tokens = n_bytes / self.chunk_size_bytes * self.chunk_size_tokens
             n_requests = n_tokens / max_sequence_length
-            return int(n_requests),
+            return self.round_up_requests(int(n_requests), tp_size=tp_size), self.round_up_tokens(
+                int(n_tokens), tp_size=tp_size
+            )
 
         self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes)
-
         if buffer_overflow_factor is not None:
             self.max_requests = self.round_up_requests(
-                int(self.max_requests * buffer_overflow_factor)
+                int(self.max_requests * buffer_overflow_factor), tp_size=tp_size
             )
             self.max_tokens = self.round_up_tokens(
-                int(self.max_tokens * buffer_overflow_factor / 50.0)
+                int(self.max_tokens * buffer_overflow_factor / 50.0), tp_size=tp_size
            )
 
         if max_requests_override is not None:
-            self.max_requests = self.round_up_requests(max_requests_override)
+            self.max_requests = self.round_up_requests(max_requests_override, tp_size=tp_size)
 
         if max_tokens_override is not None:
-            self.max_tokens = self.round_up_tokens(max_tokens_override)
+            self.max_tokens = self.round_up_tokens(max_tokens_override, tp_size=tp_size)
 
         self.max_requests = min(self.max_requests, self.max_tokens)  # e.g., decode only.
 
@@ -277,7 +277,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.cuda_graph_step_size = cuda_graph_rounder * int(
             math.ceil(int(self.cuda_graph_step_size) / cuda_graph_rounder)
         )
-
+        # Make sure divisble by TP size
+        self.cuda_graph_step_size = math.ceil(self.cuda_graph_step_size / tp_size) * tp_size
         # Cuda graph request counts.
         if num_cuda_graphs == 1:
             self.cuda_graph_request_counts = [self.max_requests]
@@ -355,26 +356,46 @@ class DynamicInferenceContext(BaseInferenceContext):
     REQUEST_ROUNDER = 4
 
     @classmethod
-    def round_up_tokens(cls, value):
-        """Round up to nearest multiple of `TOKEN_ROUNDER` (above)."""
+    def round_up_tokens(cls, value, tp_size=None):
+        """Round up to nearest multiple of `TOKEN_ROUNDER` (above) that is also divisible by tensor model parallel size."""
         if not HAVE_PACKAGING:
             raise ImportError(
                 "`packaging` is required for this functionality, please install it with `pip install packaging`"
             )
         if PkgVersion(mcore_version) < PkgVersion("0.13"):
             return cls.round_up(value)
-
+
+        # Make sure divisible by TP size
+        if tp_size is None:
+            # Check if parallel state is initialized before trying to get TP size
+            if parallel_state.is_initialized():
+                tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            else:
+                tp_size = 1
+        token_rounder = math.ceil(cls.TOKEN_ROUNDER / tp_size) * tp_size
+
+        return token_rounder * int(math.ceil(int(value) / token_rounder))
 
     @classmethod
-    def round_up_requests(cls, value):
-        """Round up to nearest multiple of `REQUEST_ROUNDER` (above)."""
+    def round_up_requests(cls, value, tp_size=None):
+        """Round up to nearest multiple of `REQUEST_ROUNDER` (above) that is also divisible by tensor model parallel size."""
         if not HAVE_PACKAGING:
             raise ImportError(
                 "`packaging` is required for this functionality, please install it with `pip install packaging`"
            )
         if PkgVersion(mcore_version) < PkgVersion("0.13"):
             return cls.round_up(value)
-
+
+        # Make sure divisible by TP size
+        if tp_size is None:
+            # Check if parallel state is initialized before trying to get TP size
+            if parallel_state.is_initialized():
+                tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            else:
+                tp_size = 1
+        request_rounder = math.ceil(cls.REQUEST_ROUNDER / tp_size) * tp_size
+
+        return request_rounder * int(math.ceil(int(value) / request_rounder))
 
     @classmethod
     def round_up(cls, value):
@@ -1043,21 +1064,16 @@ class DynamicInferenceContext(BaseInferenceContext):
         # We determine how many requests we can resume and resume them
         # Assign released chunks to paused requests.
         # todo: @shanmugamr, un-pause requests using FIFO, rather than LIFO.
-
-
-
-
-            if active_request_count < self.gtd_request_count:
-                resume_request_count = min(
-                    self.paused_request_count, self.gtd_request_count - active_request_count
-                )
-            else:
-                # If there are more active requests than gtd requests and not enough
-                # chunks available, no requests can be resumed
-                resume_request_count = 0
+        num_non_gtd_chunks = max(0, self.chunk_allocator.chunk_count_avail - self.gtd_chunk_count)
+        if num_non_gtd_chunks:
+            # if we have non-gtd chunks, use them. Do not dip into the gtd-chunk pool
+            resume_request_count = min(num_non_gtd_chunks, self.paused_request_count)
         else:
-            #
-
+            # only dip into the gtd-chunk pool if we have run out of non-gtd-chunks and the active
+            # request count has fallen below a certain threshold.
+            resume_request_count = min(
+                max(self.gtd_request_count - active_request_count, 0), self.paused_request_count
+            )
 
         self.paused_request_count -= resume_request_count
         active_request_count += resume_request_count
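
The `round_up_requests` / `round_up_tokens` changes above implement one rounding rule: lift the class rounder to a multiple of the tensor-parallel size, then round the value up to that effective rounder. A small standalone sketch of that rule (illustrative helper, not the megatron classmethods):

```python
import math

def round_up_to_tp_multiple(value: int, rounder: int, tp_size: int) -> int:
    # Effective rounder is the smallest multiple of tp_size that is >= rounder.
    effective_rounder = math.ceil(rounder / tp_size) * tp_size
    return effective_rounder * int(math.ceil(int(value) / effective_rounder))

# With REQUEST_ROUNDER = 4 (see the class constant above) and tp_size = 8, the effective
# rounder becomes 8, so a request count of 10 rounds up to 16 — a multiple of both.
assert round_up_to_tp_multiple(10, rounder=4, tp_size=8) == 16
```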
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/inference/text_generation_controllers/text_generation_controller.py
RENAMED
@@ -26,6 +26,8 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_w
 from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.inference.utils import get_attention_mask
 from megatron.core.transformer.cuda_graphs import create_cudagraphs
+from megatron.core.transformer.moe.moe_layer import BaseMoELayer
+from megatron.core.transformer.utils import set_model_to_sequence_parallel
 from megatron.core.utils import get_model_config
 
 try:
@@ -429,9 +431,11 @@ class TextGenerationController:
         # Get flat tokens, position ids.
         input_ids, position_ids = context.current_input_and_position_ids()
 
+        model_config = get_model_config(self.inference_wrapped_model.model)
+
         # If using symmetric kernels and we are using using nccl
         # for prefill turn off symmetric kernels
-        symmetric_ar_type =
+        symmetric_ar_type = model_config.symmetric_ar_type
         nccl_all_reduce_for_prefill = (
             self.inference_wrapped_model.inference_wrapper_config.nccl_all_reduce_for_prefill
         )
@@ -588,7 +592,9 @@ class TextGenerationController:
         )
 
         # Check whether CUDA graphs are enabled
-        enable_cuda_graph =
+        enable_cuda_graph = (
+            model_config.enable_cuda_graph and model_config.cuda_graph_scope != "full_iteration"
+        )
 
         # Pad batch tokens if necessary
         batch_size = len(active_requests)
@@ -681,6 +687,21 @@ class TextGenerationController:
             not self.inference_wrapped_model.inference_context.is_decode_only()
         ), f"Generation must start in prefill mode"
 
+        # Sequence parallelism is required for MoE layers when using expert parallelism (EP)
+        # becausethe expert routing mechanism relies on sequence parallelism's communication
+        # infrastructure to distribute tokens across expert ranks. However, sequence parallelism
+        # is not currently supported for non-MoE layers during inference,so we selectively
+        # disable it for all other layer types. This is safe because MoE layers perform an
+        # all-gather operation on sequences before passing data to subsequent layers, ensuring
+        # that each rank has the complete sequence data needed for the next non-MoE layer.
+        tp_size = model_config.tensor_model_parallel_size
+        ep_size = model_config.expert_model_parallel_size
+        model_is_tp_ep = tp_size > 1 and ep_size > 1
+        if model_is_tp_ep:
+            set_model_to_sequence_parallel(
+                self.inference_wrapped_model.model.module, False, exclude_modules=[BaseMoELayer]
+            )
+
         # If using symmetric kernels and we are using using nccl
         # for prefill turn off symmetric kernels
         symmetric_ar_type = model_config.symmetric_ar_type
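
The last hunk above disables sequence parallelism everywhere except MoE layers before generation starts. A hedged sketch of that selective-toggle pattern follows; it assumes a per-module `sequence_parallel` attribute and is not megatron's actual `set_model_to_sequence_parallel` implementation, only an illustration of the idea.

```python
import torch.nn as nn

def set_sequence_parallel_flag(model: nn.Module, value: bool, exclude_types=()) -> None:
    # Walk the module tree and flip the flag, skipping excluded module types
    # (e.g. MoE layers, which still need SP communication for expert token dispatch).
    for module in model.modules():
        if exclude_types and isinstance(module, tuple(exclude_types)):
            continue
        if hasattr(module, "sequence_parallel"):
            module.sequence_parallel = value
```

Called with `value=False` and the MoE layer class in `exclude_types`, this mirrors the inference-time behaviour the hunk adds for TP+EP models.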
{megatron_core-0.14.0rc3 → megatron_core-0.14.0rc4}/megatron/core/model_parallel_config.py
RENAMED
@@ -237,6 +237,14 @@ class ModelParallelConfig:
     Set the bootstrapping backend out of 'nccl', 'mpi', and 'gloo'
     """
 
+    overlap_moe_expert_parallel_comm: bool = False
+    """Overlap EP A2A communications with independent computations of different micro-batches
+    in 1f1b phase of pipelining or non-pipelining schedule.
+    """
+
+    delay_wgrad_compute: bool = False
+    """Delay the weight gradient computation to improve batch-level communication overlapping"""
+
     ###################
     # Pipeline Parallel
     ###################
@@ -307,9 +315,6 @@ class ModelParallelConfig:
     rank 1 | 0 1 2 0 1 2 3 4 3 4
     """
 
-    delay_wgrad_compute: bool = False
-    """If true, delay the wgrad compute for better overlapping in combined 1F1B."""
-
     ###################
     # CPU Offloading
     ###################
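
The hunks above add `overlap_moe_expert_parallel_comm` and move `delay_wgrad_compute` into the main dataclass body. A minimal usage sketch, assuming the remaining `ModelParallelConfig` fields keep their defaults and that no additional validation is triggered by this combination:

```python
from megatron.core.model_parallel_config import ModelParallelConfig

# Enable the two overlap-related flags shown in the diff; everything else stays default.
config = ModelParallelConfig(
    tensor_model_parallel_size=2,
    overlap_moe_expert_parallel_comm=True,  # overlap EP all-to-all with other micro-batch compute
    delay_wgrad_compute=True,               # postpone weight-grad computation for better overlap
)
```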