megatron-core 0.14.0rc1__tar.gz → 0.14.0rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.14.0rc1/megatron_core.egg-info → megatron_core-0.14.0rc2}/PKG-INFO +2 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/enums.py +10 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fp8_utils.py +6 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_context.py +52 -6
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/static_context.py +1 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/dynamic_engine.py +18 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +2 -10
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +2 -6
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +2 -9
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +57 -13
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/utils.py +16 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/model_parallel_config.py +0 -5
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/T5/t5_model.py +2 -7
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/T5/t5_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_layer_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/language_model_embedding.py +3 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +2 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/language_module/language_module.py +57 -17
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/gpt_layer_specs.py +4 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/gpt_model.py +19 -15
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/moe_module_specs.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/mamba_model.py +12 -16
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/audio.py +1 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/llava_model.py +19 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/decoder_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/encoder_spec.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/vision/clip_vit_model.py +9 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/vision/multimodal_projector.py +10 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/vision/radio.py +7 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/__init__.py +38 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/distrib_optimizer.py +54 -6
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/optimizer.py +27 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/package_info.py +1 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/parallel_state.py +42 -451
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/p2p_communication.py +25 -68
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/schedules.py +12 -73
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/utils.py +57 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/rerun_state_machine.py +123 -86
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/cuda_graphs.py +62 -45
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/enums.py +8 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/heterogeneous/linear_replacements.py +4 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/experts.py +1 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/moe_layer.py +2 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/moe_utils.py +6 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/router.py +23 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/multi_latent_attention.py +9 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/multi_token_prediction.py +10 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_block.py +22 -11
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_config.py +31 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_layer.py +0 -4
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2/megatron_core.egg-info}/PKG-INFO +2 -2
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron_core.egg-info/requires.txt +1 -1
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/pyproject.toml +13 -3
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/LICENSE +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/README.md +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/utils.py +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron_core.egg-info/SOURCES.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/setup.cfg +0 -0
- {megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/setup.py +0 -0
{megatron_core-0.14.0rc1/megatron_core.egg-info → megatron_core-0.14.0rc2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc1
+Version: 0.14.0rc2
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -44,7 +44,7 @@ Requires-Dist: nvtx; extra == "dev"
 Requires-Dist: transformers; extra == "dev"
 Requires-Dist: multi-storage-client; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
-Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+Requires-Dist: nvidia-modelopt[torch]~=0.31.0; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: megatron-energon[av_decode]<7; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/enums.py
@@ -7,9 +7,16 @@ class ModelType(enum.Enum):
     """Model type."""
 
     encoder_or_decoder = 1
-
-
-
+    retro_encoder = 2
+    retro_decoder = 3
+
+    @property
+    def encoder_and_decoder(self):
+        """Deprecated property - use encoder_or_decoder instead."""
+        raise ValueError(
+            "ModelType.encoder_and_decoder is deprecated. Please use ModelType.encoder_or_decoder "
+            "instead."
+        )
 
 
 class Fp8Recipe(str, enum.Enum):

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/fp8_utils.py
@@ -346,8 +346,12 @@ else:
     def _modify_underlying_storage_impl(*args, **kwargs):
         raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")
 
-    def _quantize_param_shard_impl(*args, **kwargs):
-
+    def _quantize_param_shard_impl(model_params, *args, **kwargs):
+        if len(model_params) == 0:
+            return
+        else:
+            # If TE is not installed, there shouldn't be any fp8 params.
+            raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")
 
     def _correct_amax_history_if_needed_impl(*args, **kwargs):
         # If TE is not installed, we are definitely not using fp8 for training, so no correction

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_context.py
@@ -2,9 +2,11 @@
 
 import math
 import warnings
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
+import torch.nn.functional as F
+from packaging.version import Version as PkgVersion
 from torch import Tensor
 
 from megatron.core import parallel_state
@@ -123,8 +125,10 @@ class DynamicInferenceContext(BaseInferenceContext):
         max_requests_override: Optional[int] = None,
         max_tokens_override: Optional[int] = None,
         tensor_model_parallel_size: Optional[int] = None,
+        materialize_only_last_token_logits: bool = True,
     ):
-
+
+        super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
@@ -762,7 +766,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.total_request_count += 1
         self.active_token_count += context_length
 
-    def
+    def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         """
         Swaps all the relevent booking tensors with src idxs to dst idxs
         """
@@ -866,7 +870,12 @@ class DynamicInferenceContext(BaseInferenceContext):
         kv_chunks_asigned = self.request_to_kv_chunk_ids[finished_idxs]
         non_zero_values_in_kv_memory = kv_chunks_asigned[kv_chunks_asigned != -1]
         self.chunk_allocator.release_memory_chunks(non_zero_values_in_kv_memory)
-
+
+        # Reset the KV chunks for finished requests.
+        # Note: do not use fill_() (or add_() and similar inplace ops) here.
+        # The combinition of indexing with a tensor (like finished_idxs) and fill_()/add_() creates a clone
+        # and updates it instead of the original tensor.
+        self.request_to_kv_chunk_ids[finished_idxs] = -1
 
         if active_request_count > 0:
             finished_idxs_on_left = (
@@ -881,12 +890,15 @@ class DynamicInferenceContext(BaseInferenceContext):
                 + self.paused_request_count
             )
 
-            self.
+            self._move_book_keeping_tensors(
                 src_idxs=active_idxs_on_right,
                 dst_idxs=finished_idxs_on_left,
                 next_tokens=next_tokens,
             )
 
+            # Reset chunk ids for recently moved requests.
+            self.request_to_kv_chunk_ids[active_idxs_on_right] = -1
+
         # 5. We identify requests that require a new chunk and add them to the paused requests (i.e move them left) :-
         # a) Put requests that have filled their current chunk and require a new one in a pause state temporarily
         # b) Move the paused requests to the left, and active requets to the right
@@ -931,7 +943,7 @@ class DynamicInferenceContext(BaseInferenceContext):
             )
             dst_idxs = torch.cat((active_request_ids_on_left, paused_requests_idxs_on_right))
             src_idxs = torch.cat((paused_requests_idxs_on_right, active_request_ids_on_left))
-            self.
+            self._move_book_keeping_tensors(
                 src_idxs=src_idxs, dst_idxs=dst_idxs, next_tokens=next_tokens
             )
 
@@ -974,6 +986,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         if self.paused_request_count > 0:
             self.paused_tokens = next_tokens[: self.paused_request_count]
 
+        # add_ and fill_ calls seems to work as intended with sliced indexing (i.e. x[3:5].add(...) or x[3:5].fill_)
+        # but when another tensor is used for indexing, it does not work as expected (i.e. x[y] if x and y are torch tensors)
         self.request_kv_length_offsets[self.paused_request_count : self.total_request_count].add_(
             self.request_query_lengths[self.paused_request_count : self.total_request_count]
         )
@@ -1027,3 +1041,35 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_local_position_within_kv_chunk[: self.active_token_count] = (
             self.request_last_kv_chunk_offset[self.paused_request_count : self.total_request_count]
         )
+
+    def calculate_log_probs(self, logits: torch.Tensor) -> List[List[float]]:
+        """Calculate log probs for all active requests and return them.
+
+        TODO: @wdykas support top-n log probs.
+
+        Args:
+            logits: Raw model output logits with shape [1, sequence_length, vocab_size].
+
+        Returns:
+            List of lists where each inner list contains log probs for a request in the
+            same order as the active requests (from paused_request_count to total_request_count).
+        """
+        # Calculate log_probs (sequence_length x vocab_size)
+        log_probs = F.log_softmax(logits, dim=-1).to(torch.float32).squeeze()
+
+        # Extract the log probs for only the selected tokens
+        # (sequence_length x vocab_size) -> (sequence_length)
+        active_token_ids = self.token_to_input_ids[: self.active_token_count]
+        sequence_indices = torch.arange(self.active_token_count, device=log_probs.device)
+        selected_log_probs = log_probs[sequence_indices, active_token_ids]
+
+        # Split the log probs across request boundaries
+        active_query_lengths = self.request_query_lengths[
+            self.paused_request_count : self.total_request_count
+        ]
+        selected_log_probs_list = selected_log_probs.cpu().split(
+            active_query_lengths.tolist(), dim=0
+        )
+
+        # Convert each log prob tensor into a list
+        return [lp.tolist() for lp in selected_log_probs_list]
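
The calculate_log_probs method added above follows a common pattern: take log_softmax over the vocabulary, pick out the log prob of each observed token, then split the flat result back into per-request lists. A minimal standalone sketch of that pattern; the tensors and request lengths below are made up for illustration and are not taken from the package:

    import torch
    import torch.nn.functional as F

    # Fake logits for 5 tokens over a vocabulary of 10 (stands in for the model output).
    logits = torch.randn(1, 5, 10)
    token_ids = torch.tensor([3, 7, 1, 0, 9])  # tokens actually observed at each position
    query_lengths = [2, 3]                     # two requests: 2 tokens and 3 tokens

    log_probs = F.log_softmax(logits, dim=-1).squeeze(0)   # [5, 10]
    positions = torch.arange(token_ids.numel())
    selected = log_probs[positions, token_ids]             # [5], one log prob per token

    # Split the flat per-token log probs back into per-request lists.
    per_request = [chunk.tolist() for chunk in selected.split(query_lengths)]
    print(per_request)  # e.g. [[-2.1, -1.7], [-2.9, -0.8, -3.0]]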

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/static_context.py
@@ -17,7 +17,7 @@ class StaticInferenceContext(BaseInferenceContext):
     """
 
     def __init__(self, max_batch_size: int, max_sequence_length: int):
-        super().__init__(materialize_only_last_token_logits=
+        super().__init__(materialize_only_last_token_logits=True)
         self.max_sequence_length = max_sequence_length
         self.max_batch_size = max_batch_size
         self.sequence_len_offset = 0

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/dynamic_engine.py
@@ -2,6 +2,7 @@
 
 import asyncio
 from collections import deque
+from itertools import repeat
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
@@ -182,6 +183,7 @@ class DynamicInferenceEngine(AbstractEngine):
         finished_request_ids: torch.Tensor,
         step_time: float,
         sample: torch.Tensor,
+        log_probs: torch.Tensor,
     ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest]]:
         """
         Handles post-processing for requests after a step.
@@ -191,6 +193,7 @@ class DynamicInferenceEngine(AbstractEngine):
             finished_request_ids (torch.Tensor): A list of finished request ids
             step_time (float): The latency of the last step
             sample: (torch.Tensor): The newly generated tokens for each request
+            log_probs: (List): Log probs for each request
 
         Returns:
             A list of active requests and completed requests as `DynamicInferenceRequest` objects
@@ -200,13 +203,25 @@ class DynamicInferenceEngine(AbstractEngine):
         finished_request_ids = set(finished_request_ids.tolist())
         self.finished_request_count += len(finished_request_ids)
 
-
+        log_probs_iter = log_probs if log_probs else repeat(None)
+
+        for request_id, token, request_log_probs in zip(
+            request_ids.tolist(), sample.tolist(), log_probs_iter
+        ):
             request: DynamicInferenceRequest = self.requests[request_id]
             request.generated_tokens.append(token)
             if request.tpot is None:
                 request.tpot = []
             request.tpot.append(step_time)
 
+            if request_log_probs is not None:
+                # If prompt log probs is None we are in prefill
+                if request.prompt_log_probs is None:
+                    request.prompt_log_probs = request_log_probs
+                    request.generated_log_probs = []
+                else:
+                    request.generated_log_probs.extend(request_log_probs)
+
             if request_id in finished_request_ids:
                 request.generated_length = len(request.generated_tokens)
                 request.status = Status.COMPLETED
@@ -266,11 +281,11 @@ class DynamicInferenceEngine(AbstractEngine):
         step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3
 
         if result is not None:
-            request_ids, finished_request_ids, sample = result
+            request_ids, finished_request_ids, sample, log_probs = result
 
             # TODO: Move this to a background thread?
             (active_requests, finished_requests) = self.post_process_requests(
-                request_ids, finished_request_ids, step_time, sample
+                request_ids, finished_request_ids, step_time, sample, log_probs
             )
 
             # TODO: Move this to a background thread?
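
The post_process_requests change above uses itertools.repeat(None) so the same zip loop works whether or not log probs were computed for the step. A minimal sketch of that idiom with made-up request ids and tokens, not values from the package:

    from itertools import repeat

    request_ids = [11, 12, 13]
    samples = [101, 205, 87]
    log_probs = None  # e.g. log probs were not requested for this step

    # repeat(None) is an endless iterator, so zip still pairs every request with a placeholder.
    log_probs_iter = log_probs if log_probs else repeat(None)
    for rid, token, lp in zip(request_ids, samples, log_probs_iter):
        print(rid, token, lp)  # lp is None when log probs were not requested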

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py
@@ -7,7 +7,7 @@ from typing import Any, Dict, Iterable, Optional, Union
 
 import torch
 
-from megatron.core import parallel_state
+from megatron.core import parallel_state
 from megatron.core.inference.communication_utils import (
     is_pipeline_first_stage,
     is_pipeline_last_stage,
@@ -152,13 +152,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
         tokens = inference_input["tokens"]
         position_ids = inference_input["position_ids"]
         attention_mask = inference_input["attention_mask"]
-        runtime_gather_output = inference_input.get("runtime_gather_output")
         return self.model(
             tokens,
             position_ids,
             attention_mask,
             inference_context=self.inference_context,
-            runtime_gather_output=
+            runtime_gather_output=True,  # Inference should always gather the logits
         )
 
     def _get_batch_size_and_seq_len(
@@ -201,7 +200,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         """
         tokens = inference_input["tokens"]
         logits = self._forward(inference_input)
-        logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits, self.tp_group)
         self.inference_context.increment_sequence_len_offset(tokens.size(1))
 
         return logits
@@ -243,7 +241,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         logits = None
         if is_pipeline_last_stage(self.pp_group):
             logits = output_tensor
-            logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits, self.tp_group)
 
             # Explicitly cast logits to expected dtype
             logits = logits.to(self.inference_wrapper_config.params_dtype)
@@ -269,7 +266,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         tokens = inference_input["tokens"]
         position_ids = inference_input["position_ids"]
         attention_mask = inference_input["attention_mask"]
-        runtime_gather_output = inference_input.get("runtime_gather_output")
         materialize_only_last_token_logits = (
             self.inference_context.materialize_only_last_token_logits
         )
@@ -317,7 +313,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
                     "position_ids": position_ids2use,
                     "attention_mask": attention_mask,
                     "inference_context": self.inference_context,
-                    "runtime_gather_output": runtime_gather_output,
                 }
             )
 
@@ -327,9 +322,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
             self.inference_context.batch_size_offset += current_micro_batch_size
 
             if is_pipeline_last_stage(self.pp_group):
-                output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region(
-                    output_tensor, self.tp_group
-                )
                 assert logits is not None
                 logits[start:end, ...] = output_tensor
 

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py
@@ -10,6 +10,7 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_w
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
+from megatron.core.inference.utils import get_attention_mask
 from megatron.core.models.gpt import GPTModel
 from megatron.core.transformer.enums import AttnBackend
 from megatron.core.utils import get_model_config
@@ -74,12 +75,7 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
         attention_backend = config.attention_backend
 
         if attention_backend == AttnBackend.local:
-            attention_mask =
-                torch.ones((1, seq_length, seq_length), device=prompts_tokens.device)
-            ).view(1, 1, seq_length, seq_length)
-
-            # Convert to boolean
-            attention_mask = attention_mask < 0.5
+            attention_mask = get_attention_mask(seq_length)
         elif (
             attention_backend == AttnBackend.flash
             or attention_backend == AttnBackend.fused

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
 
 import torch
 
-from megatron.core import parallel_state
 from megatron.core.inference.communication_utils import (
     is_pipeline_first_stage,
     is_pipeline_last_stage,
@@ -48,16 +47,10 @@ class VLMInferenceWrapper(GPTInferenceWrapper):
         # has part of the LM decoder. In this case, the current stage should only receive
         # vision embeddings.
         if pp_rank > 0:
-            self._recv_only_vision_embeds =
-                parallel_state.is_inside_encoder(pp_rank - 1)
-                and (not parallel_state.is_inside_decoder(pp_rank - 1))
-                and parallel_state.is_inside_decoder()
-            )
+            self._recv_only_vision_embeds = False  # TODO: Implement new logic for vision embeddings
 
         # Checks if the current stage only has a vision encoder
-        self._encoder_only =
-            parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
-        )
+        self._encoder_only = False  # TODO: Implement new logic for encoder-only stages
 
     def prep_inference_input(
         self,

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py
@@ -7,6 +7,7 @@ from megatron.core.inference.inference_request import InferenceRequest
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
+from megatron.core.inference.utils import get_attention_mask
 
 
 class EncoderDecoderTextGenerationController(TextGenerationController):
@@ -18,13 +19,18 @@ class EncoderDecoderTextGenerationController(TextGenerationController):
     """
 
     def prep_inference_input(
-        self,
+        self,
+        prompts_tokens: torch.Tensor,
+        active_requests: OrderedDict[str, InferenceRequest],
+        use_attention_mask: bool = False,
     ) -> Dict[str, Any]:
         """Preparing input data for inference, using respective wrapper's prep_inference_input method # pylint: disable=line-too-long
 
         Args:
             prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
             active_requests (OrderedDict[str, InferenceRequest]): The input active requests
+            use_attention_mask (bool): Whether to use an attention mask. Should be set to True only
+                when exclusively doing prefill (no decode) with variable prompt lengths.
 
         Returns:
             A dict of the inference input for the current batch.
@@ -33,6 +39,13 @@ class EncoderDecoderTextGenerationController(TextGenerationController):
             map(lambda request: request.encoder_prompt, active_requests.values())
         )
 
-
+        inference_input = self.inference_wrapped_model.prep_inference_input(
             prompts_tokens, encoder_prompts, tokenizer=self.tokenizer
         )
+
+        if use_attention_mask and (
+            attention_mask := inference_input.get("attention_mask", None) is None
+        ):
+            inference_input["attention_mask"] = get_attention_mask(prompts_tokens.size(1))
+
+        return inference_input

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/text_generation_controller.py
@@ -24,10 +24,13 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_w
     AbstractModelInferenceWrapper,
 )
 from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.utils import get_attention_mask
 from megatron.core.transformer.cuda_graphs import create_cudagraphs
 from megatron.core.utils import get_model_config
 
 try:
+    import transformer_engine as te  # pylint: disable=unused-import
+
     from megatron.core.extensions.transformer_engine import Fp8Padding, Fp8Unpadding
 
     HAVE_TE = True
@@ -429,6 +432,11 @@ class TextGenerationController:
 
         context = self.inference_wrapped_model.inference_context
 
+        if sampling_params.return_log_probs:
+            assert (
+                context.materialize_only_last_token_logits is False
+            ), "Materialize only last token logits must be false for returning log probs"
+
         # No tokens?
         if context.active_token_count == 0:
             return None
@@ -478,7 +486,13 @@ class TextGenerationController:
             pp_group=self.pp_group,
         )
 
-
+        # Last token logits.
+        if context.materialize_only_last_token_logits:
+            # When materialize_only_last_token_logits is true, last_token_logits is
+            # already called in the forward pass of GPT.
+            last_token_logits = logits.squeeze(0)
+        else:
+            last_token_logits = context.last_token_logits(logits)
 
         # Sample.
         # Use padded vocab size because tokenizer vocab size might not include padding
@@ -505,11 +519,15 @@ class TextGenerationController:
         )
         finished_request_ids = context.request_ids[finished_idxs]
 
+        log_probs = None
+        if sampling_params.return_log_probs:
+            log_probs = context.calculate_log_probs(logits)
+
         # Update requests.
         # New sample gets updated in update_requests, so we pass in a clone
         context.update_requests(active_request_mask, new_sample.clone())
 
-        return current_request_ids, finished_request_ids, new_sample
+        return current_request_ids, finished_request_ids, new_sample, log_probs
 
     def _update_top_n_logprobs_dict(
         self,
@@ -581,13 +599,12 @@ class TextGenerationController:
 
         model_config = get_model_config(self.inference_wrapped_model.model)
 
-        #
-
-
-
-
-
-        sampling_params.add_attributes({"echo": True})
+        # We only need an attention mask if we are exclusively doing prefill over
+        # prompts of variable length
+        use_attention_mask = (
+            sampling_params.num_tokens_to_generate == 0
+            and min_prompt_length_in_batch != max_prompt_length_in_batch
+        )
 
         # Check whether CUDA graphs are enabled
         enable_cuda_graph = model_config.enable_cuda_graph
@@ -689,7 +706,9 @@ class TextGenerationController:
         self.inference_wrapped_model.prep_model_for_inference()
 
         inference_input: Dict[str, Any] = self.prep_inference_input(
-            prompts_tokens=padded_batch_prompt_tokens,
+            prompts_tokens=padded_batch_prompt_tokens,
+            active_requests=active_requests,
+            use_attention_mask=use_attention_mask,
         )
 
         assert (
@@ -706,7 +725,13 @@ class TextGenerationController:
             self.inference_wrapped_model.model.module.set_symmetric_ar(None)
 
         context_start_position = 0
-
+
+        # If we are exclusively doing prefill then we can process all prompt tokens
+        # together even if the prompt lengths are different
+        if sampling_params.num_tokens_to_generate == 0:
+            context_end_position = max_prompt_length_in_batch
+        else:
+            context_end_position = min_prompt_length_in_batch
 
         # The initial iteration of this loop runs the prefill phase up to the shortest
         # prompt length in the batch. Then every subsequent iterations runs a decode step.
@@ -734,6 +759,13 @@ class TextGenerationController:
                 and "attention_mask" in inference_input_for_context_window
             ):
                 inference_input_for_context_window["attention_mask"] = None
+            elif use_attention_mask:
+                assert (
+                    attention_mask := inference_input_for_context_window.get(
+                        "attention_mask", None
+                    )
+                    is not None
+                )
 
             # Only materialize prompt log probs if the user requests log probs
             materialize_only_last_token_logits = (
@@ -985,18 +1017,30 @@ class TextGenerationController:
         return active_requests
 
     def prep_inference_input(
-        self,
+        self,
+        prompts_tokens: torch.Tensor,
+        active_requests: OrderedDict[str, InferenceRequest],
+        use_attention_mask: bool = False,
     ) -> Dict[str, Any]:
         """Preparing input data for inference, using respective wrapper's prep_inference_input method # pylint: disable=line-too-long
 
         Args:
             prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
            active_requests (OrderedDict[str, InferenceRequest]): The input active requests
+            use_attention_mask (bool): Whether to use an attention mask. Should be set to True only
+                when exclusively doing prefill (no decode) with variable prompt lengths.
 
         Returns:
             A dict of the inference input for the current batch.
         """
-
+        inference_input = self.inference_wrapped_model.prep_inference_input(prompts_tokens)
+
+        if use_attention_mask and (
+            attention_mask := inference_input.get("attention_mask", None) is None
+        ):
+            inference_input["attention_mask"] = get_attention_mask(prompts_tokens.size(1))
+
+        return inference_input
 
     def stream_tokens(
         self,

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py
@@ -7,13 +7,17 @@ from megatron.core.inference.inference_request import InferenceRequest, VLMInfer
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
+from megatron.core.inference.utils import get_attention_mask
 
 
 class VLMTextGenerationController(TextGenerationController):
     """The text generation controller for VLMs"""
 
     def prep_inference_input(
-        self,
+        self,
+        prompts_tokens: torch.Tensor,
+        active_requests: OrderedDict[str, InferenceRequest],
+        use_attention_mask: bool = False,
     ):
         """Preparing input data for inference, using respective wrapper's prep_inference_input method # pylint: disable=line-too-long
 
@@ -22,6 +26,8 @@ class VLMTextGenerationController(TextGenerationController):
         Args:
             prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
             active_requests (OrderedDict[str, InferenceRequest]): The input active requests
+            use_attention_mask (bool): Whether to use an attention mask. Should be set to True only
+                when exclusively doing prefill (no decode) with variable prompt lengths.
         """
         assert len(active_requests) == 1, f"VLM inference currently only supports batch size 1"
 
@@ -31,10 +37,17 @@ class VLMTextGenerationController(TextGenerationController):
             request, VLMInferenceRequest
         ), f"Found inference request of type {type(request)}, expected VLMInferenceRequest"
 
-
+        inference_input = self.inference_wrapped_model.prep_inference_input(
             prompts_tokens,
             request.num_img_embeddings_per_tile,
             request.imgs,
             request.num_tiles,
             request.decoder_seq_length,
         )
+
+        if use_attention_mask and (
+            attention_mask := inference_input.get("attention_mask", None) is None
+        ):
+            inference_input["attention_mask"] = get_attention_mask(prompts_tokens.size(1))
+
+        return inference_input

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/inference/utils.py
@@ -1,4 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+
 class Counter:
     """A simple counter class
 
@@ -16,3 +20,15 @@ class Counter:
     def reset(self) -> None:
         """Reset counter"""
         self.counter = 0
+
+
+def get_attention_mask(seq_length: int) -> torch.Tensor:
+    """Constructs an attention mask given the input sequence length."""
+    attention_mask = torch.tril(
+        torch.ones((1, seq_length, seq_length), device=torch.cuda.current_device())
+    ).view(1, 1, seq_length, seq_length)
+
+    # Convert to boolean
+    attention_mask = attention_mask < 0.5
+
+    return attention_mask
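
For reference, the boolean convention used by the new get_attention_mask helper above: positions that may be attended to come out False and masked (future) positions come out True. A minimal CPU-only sketch of the same construction; the CUDA device placement from the package version is dropped here so it runs anywhere:

    import torch

    def causal_mask(seq_length: int) -> torch.Tensor:
        # Same tril-then-threshold construction as get_attention_mask, but on CPU.
        mask = torch.tril(torch.ones((1, seq_length, seq_length))).view(1, 1, seq_length, seq_length)
        return mask < 0.5  # True marks positions that must NOT be attended to

    print(causal_mask(4)[0, 0].int())
    # tensor([[0, 1, 1, 1],
    #         [0, 0, 1, 1],
    #         [0, 0, 0, 1],
    #         [0, 0, 0, 0]])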

{megatron_core-0.14.0rc1 → megatron_core-0.14.0rc2}/megatron/core/model_parallel_config.py
@@ -286,11 +286,6 @@ class ModelParallelConfig:
     Defaults to 0, which means all micro-batches are deferred.
     """
 
-    pipeline_model_parallel_split_rank: Optional[int] = None
-    """If int, rank where encoder and decoder should be split in cases where the model has both an
-    encoder and decoder (e.g., T5). Ignored if None.
-    """
-
     overlap_p2p_comm_warmup_flush: bool = False
     """If true, overlap communication and computation in warm up and flush phase.
     Only valid when overlap_p2p_comm is True and batch_p2p_comm is False.