megatron-core 0.14.0rc0__tar.gz → 0.14.0rc2__tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
- {megatron_core-0.14.0rc0/megatron_core.egg-info → megatron_core-0.14.0rc2}/PKG-INFO +2 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/indexed_dataset.py +5 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/enums.py +10 -3
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/extensions/transformer_engine.py +10 -9
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fp8_utils.py +6 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_context.py +52 -6
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/static_context.py +1 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/dynamic_engine.py +78 -34
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/inference_request.py +1 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +2 -10
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +2 -6
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +2 -9
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +112 -15
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +15 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/utils.py +16 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/model_parallel_config.py +0 -5
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/T5/t5_model.py +2 -7
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/T5/t5_spec.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_layer_specs.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/language_model_embedding.py +3 -3
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +2 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/language_module/language_module.py +57 -17
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/gpt_layer_specs.py +4 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/gpt_model.py +19 -15
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/moe_module_specs.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/mamba_model.py +12 -16
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/audio.py +1 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/llava_model.py +19 -4
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/decoder_spec.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/encoder_spec.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/vision/clip_vit_model.py +9 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/vision/multimodal_projector.py +10 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/vision/radio.py +7 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/__init__.py +38 -4
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/distrib_optimizer.py +54 -6
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/optimizer.py +27 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/package_info.py +1 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/parallel_state.py +42 -451
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/p2p_communication.py +25 -68
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/schedules.py +12 -73
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/utils.py +57 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/rerun_state_machine.py +123 -86
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/layers.py +9 -7
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/attention.py +2 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/cuda_graphs.py +67 -46
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/enums.py +8 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/heterogeneous/linear_replacements.py +4 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/experts.py +1 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/moe_layer.py +2 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/moe_utils.py +6 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/router.py +23 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/multi_latent_attention.py +9 -3
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/multi_token_prediction.py +10 -3
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_block.py +22 -11
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_config.py +31 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/transformer_layer.py +0 -4
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/utils.py +3 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2/megatron_core.egg-info}/PKG-INFO +2 -2
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron_core.egg-info/requires.txt +1 -1
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/pyproject.toml +13 -3
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/LICENSE +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/README.md +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/timers.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron_core.egg-info/SOURCES.txt +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/setup.cfg +0 -0
- {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/setup.py +0 -0
{megatron_core-0.14.0rc0/megatron_core.egg-info → megatron_core-0.14.0rc2}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc0
+Version: 0.14.0rc2
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -44,7 +44,7 @@ Requires-Dist: nvtx; extra == "dev"
 Requires-Dist: transformers; extra == "dev"
 Requires-Dist: multi-storage-client; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
-Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+Requires-Dist: nvidia-modelopt[torch]~=0.31.0; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: megatron-energon[av_decode]<7; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
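Note on the dependency change above: `~=0.31.0` is a compatible-release pin, so the dev extra now accepts any 0.31.x release of nvidia-modelopt but not the next minor release. A quick way to check the specifier locally, using the `packaging` library (the version numbers below are illustrative only):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=0.31.0")   # the new pin on nvidia-modelopt[torch]
print("0.31.2" in spec)           # True  - patch releases still match
print("0.32.0" in spec)           # False - next minor release is excluded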
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/datasets/indexed_dataset.py RENAMED
@@ -5,6 +5,7 @@

 # Essentially re-written in entirety

+import gc
 import logging
 import os
 import shutil
@@ -906,6 +907,10 @@ class IndexedDatasetBuilder(object):
         assert index.sequence_modes is not None, "sequence_modes cannot not be None"
         self.sequence_modes.extend(index.sequence_modes)

+        # Free up memory to make space for new indices
+        del index
+        gc.collect()
+
         # Concatenate data
         with self._open(get_bin_path(path_prefix), "rb") as f:
             shutil.copyfileobj(f, self.data_file)
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/distributed/distributed_data_parallel_config.py RENAMED
@@ -113,6 +113,15 @@ class DistributedDataParallelConfig:
     """

     def __post_init__(self):
+        import os
+
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
+
+        if self.nccl_ub:
+            if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
+                raise ValueError(
+                    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True is currently not supported "
+                    "with nccl_ub due to compatibility issue with torch.cuda.MemPool API."
+                )
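The new `__post_init__` guard above rejects the combination of `nccl_ub` with the expandable-segments allocator. A minimal standalone sketch of the same environment check (the helper name and the example value are mine, not part of the package):

import os

def has_expandable_segments() -> bool:
    # Mirrors the check added to DistributedDataParallelConfig.__post_init__:
    # PYTORCH_CUDA_ALLOC_CONF is a comma-separated list of allocator options.
    conf = os.getenv("PYTORCH_CUDA_ALLOC_CONF", "")
    return "expandable_segments:True" in conf.split(",")

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:64"
print(has_expandable_segments())  # True -> the config would raise ValueError when nccl_ub is set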
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/enums.py RENAMED
@@ -7,9 +7,16 @@ class ModelType(enum.Enum):
     """Model type."""

     encoder_or_decoder = 1
-    encoder_and_decoder = 2
-    retro_encoder = 3
-    retro_decoder = 4
+    retro_encoder = 2
+    retro_decoder = 3
+
+    @property
+    def encoder_and_decoder(self):
+        """Deprecated property - use encoder_or_decoder instead."""
+        raise ValueError(
+            "ModelType.encoder_and_decoder is deprecated. Please use ModelType.encoder_or_decoder "
+            "instead."
+        )


 class Fp8Recipe(str, enum.Enum):
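The replacement of the `encoder_and_decoder` member with a property is a small deprecation trick: the name is still present on the class, but reaching it through any member raises. A self-contained sketch of the pattern (only the enum itself, no Megatron imports):

import enum

class ModelType(enum.Enum):
    encoder_or_decoder = 1
    retro_encoder = 2
    retro_decoder = 3

    @property
    def encoder_and_decoder(self):
        # Properties are not turned into enum members, so this acts as a tripwire.
        raise ValueError("ModelType.encoder_and_decoder is deprecated; use encoder_or_decoder.")

try:
    ModelType.encoder_or_decoder.encoder_and_decoder
except ValueError as err:
    print(err)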
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/extensions/transformer_engine.py RENAMED
@@ -39,6 +39,8 @@ from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 from megatron.core.utils import (
+    get_pg_rank,
+    get_pg_size,
     get_te_version,
     get_tensor_model_parallel_group_if_none,
     is_te_min_version,
@@ -228,8 +230,7 @@ class TELinear(te.pytorch.Linear):
             assert tp_group is None, "duplicated linear should not have tp_group set"
             tp_size = 1
         else:
-
-            tp_size = tp_group.size()
+            tp_size = get_pg_size(tp_group)

         self.expert_parallel = self.config.expert_model_parallel_size > 1
         if is_expert:
@@ -374,8 +375,8 @@ class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
         self.is_first_microbatch = True
         self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
         extra_kwargs = _get_extra_te_kwargs(config)
-        self.tp_size = tp_group
-        self.tp_rank = tp_group
+        self.tp_size = get_pg_size(tp_group)
+        self.tp_rank = get_pg_rank(tp_group)

         if self.config.delay_wgrad_compute:
             if is_te_min_version("2.3.0"):
@@ -542,8 +543,8 @@ class TEColumnParallelLinear(TELinear):
         if gather_output:
             raise ValueError("Transformer Engine linear layers do not support gather_output = True")
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        world_size = tp_group
-        rank = tp_group
+        world_size = get_pg_size(tp_group)
+        rank = get_pg_rank(tp_group)

         super().__init__(
             input_size=input_size,
@@ -657,8 +658,8 @@ class TERowParallelLinear(TELinear):
             tp_group=tp_group,
         )
         if config.use_cpu_initialization:
-            world_size = tp_group
-            rank = tp_group
+            world_size = get_pg_size(tp_group)
+            rank = get_pg_rank(tp_group)
             input_size_per_partition = divide(input_size, world_size)
             self.master_weight = _initialize_affine_weight_cpu(
                 self.weight,
@@ -1003,7 +1004,7 @@ if HAVE_TE and is_te_min_version("1.9.0.dev0"):
             # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
             # So we disable comms by making TE agnostic of model parallel.
             tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-            tp_size = tp_group
+            tp_size = get_pg_size(tp_group)

             self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/fp8_utils.py RENAMED
@@ -346,8 +346,12 @@ else:
     def _modify_underlying_storage_impl(*args, **kwargs):
         raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")

-    def _quantize_param_shard_impl(*args, **kwargs):
-        raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")
+    def _quantize_param_shard_impl(model_params, *args, **kwargs):
+        if len(model_params) == 0:
+            return
+        else:
+            # If TE is not installed, there shouldn't be any fp8 params.
+            raise RuntimeError("Invalid Transformer Engine version for FP8 distributed optimizer")

     def _correct_amax_history_if_needed_impl(*args, **kwargs):
         # If TE is not installed, we are definitely not using fp8 for training, so no correction
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/dynamic_context.py RENAMED
@@ -2,9 +2,11 @@

 import math
 import warnings
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple

 import torch
+import torch.nn.functional as F
+from packaging.version import Version as PkgVersion
 from torch import Tensor

 from megatron.core import parallel_state
@@ -123,8 +125,10 @@ class DynamicInferenceContext(BaseInferenceContext):
         max_requests_override: Optional[int] = None,
         max_tokens_override: Optional[int] = None,
         tensor_model_parallel_size: Optional[int] = None,
+        materialize_only_last_token_logits: bool = True,
     ):
-
+
+        super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
@@ -762,7 +766,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.total_request_count += 1
         self.active_token_count += context_length

-    def
+    def _move_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
        """
        Swaps all the relevent booking tensors with src idxs to dst idxs
        """
@@ -866,7 +870,12 @@ class DynamicInferenceContext(BaseInferenceContext):
            kv_chunks_asigned = self.request_to_kv_chunk_ids[finished_idxs]
            non_zero_values_in_kv_memory = kv_chunks_asigned[kv_chunks_asigned != -1]
            self.chunk_allocator.release_memory_chunks(non_zero_values_in_kv_memory)
-
+
+           # Reset the KV chunks for finished requests.
+           # Note: do not use fill_() (or add_() and similar inplace ops) here.
+           # The combinition of indexing with a tensor (like finished_idxs) and fill_()/add_() creates a clone
+           # and updates it instead of the original tensor.
+           self.request_to_kv_chunk_ids[finished_idxs] = -1

            if active_request_count > 0:
                finished_idxs_on_left = (
@@ -881,12 +890,15 @@
                    + self.paused_request_count
                )

-               self.
+               self._move_book_keeping_tensors(
                    src_idxs=active_idxs_on_right,
                    dst_idxs=finished_idxs_on_left,
                    next_tokens=next_tokens,
                )

+               # Reset chunk ids for recently moved requests.
+               self.request_to_kv_chunk_ids[active_idxs_on_right] = -1
+
        # 5. We identify requests that require a new chunk and add them to the paused requests (i.e move them left) :-
        # a) Put requests that have filled their current chunk and require a new one in a pause state temporarily
        # b) Move the paused requests to the left, and active requets to the right
@@ -931,7 +943,7 @@
            )
            dst_idxs = torch.cat((active_request_ids_on_left, paused_requests_idxs_on_right))
            src_idxs = torch.cat((paused_requests_idxs_on_right, active_request_ids_on_left))
-           self.
+           self._move_book_keeping_tensors(
                src_idxs=src_idxs, dst_idxs=dst_idxs, next_tokens=next_tokens
            )

@@ -974,6 +986,8 @@
        if self.paused_request_count > 0:
            self.paused_tokens = next_tokens[: self.paused_request_count]

+       # add_ and fill_ calls seems to work as intended with sliced indexing (i.e. x[3:5].add(...) or x[3:5].fill_)
+       # but when another tensor is used for indexing, it does not work as expected (i.e. x[y] if x and y are torch tensors)
        self.request_kv_length_offsets[self.paused_request_count : self.total_request_count].add_(
            self.request_query_lengths[self.paused_request_count : self.total_request_count]
        )
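The comments added above describe a real PyTorch pitfall: advanced indexing with a tensor returns a copy, so chained in-place ops such as fill_() silently modify that copy, while plain indexed assignment (and slice indexing, which returns a view) updates the original. A tiny repro, independent of the package:

import torch

x = torch.zeros(6, dtype=torch.long)
idx = torch.tensor([1, 3])

x[idx].fill_(-1)      # fills a temporary copy; x is unchanged
print(x.tolist())     # [0, 0, 0, 0, 0, 0]

x[idx] = -1           # indexed assignment goes through __setitem__ and sticks
print(x.tolist())     # [0, -1, 0, -1, 0, 0]

x[2:4].add_(7)        # slices are views, so in-place ops on them are visible
print(x.tolist())     # [0, -1, 7, 6, 0, 0]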
@@ -1027,3 +1041,35 @@
         self.token_to_local_position_within_kv_chunk[: self.active_token_count] = (
             self.request_last_kv_chunk_offset[self.paused_request_count : self.total_request_count]
         )
+
+    def calculate_log_probs(self, logits: torch.Tensor) -> List[List[float]]:
+        """Calculate log probs for all active requests and return them.
+
+        TODO: @wdykas support top-n log probs.
+
+        Args:
+            logits: Raw model output logits with shape [1, sequence_length, vocab_size].
+
+        Returns:
+            List of lists where each inner list contains log probs for a request in the
+            same order as the active requests (from paused_request_count to total_request_count).
+        """
+        # Calculate log_probs (sequence_length x vocab_size)
+        log_probs = F.log_softmax(logits, dim=-1).to(torch.float32).squeeze()
+
+        # Extract the log probs for only the selected tokens
+        # (sequence_length x vocab_size) -> (sequence_length)
+        active_token_ids = self.token_to_input_ids[: self.active_token_count]
+        sequence_indices = torch.arange(self.active_token_count, device=log_probs.device)
+        selected_log_probs = log_probs[sequence_indices, active_token_ids]
+
+        # Split the log probs across request boundaries
+        active_query_lengths = self.request_query_lengths[
+            self.paused_request_count : self.total_request_count
+        ]
+        selected_log_probs_list = selected_log_probs.cpu().split(
+            active_query_lengths.tolist(), dim=0
+        )
+
+        # Convert each log prob tensor into a list
+        return [lp.tolist() for lp in selected_log_probs_list]
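For reference, the core of the new calculate_log_probs is a log-softmax, a gather of the chosen token at each packed position, and a per-request split. A toy version with made-up shapes (two requests of 3 and 4 packed tokens over an 11-word vocabulary):

import torch
import torch.nn.functional as F

logits = torch.randn(1, 7, 11)           # [1, packed_tokens, vocab]
token_ids = torch.randint(0, 11, (7,))   # token actually taken at each position
query_lengths = [3, 4]                   # packed tokens per request

log_probs = F.log_softmax(logits, dim=-1).to(torch.float32).squeeze(0)  # [7, 11]
rows = torch.arange(token_ids.numel())
selected = log_probs[rows, token_ids]    # one log prob per packed token

per_request = [t.tolist() for t in selected.split(query_lengths, dim=0)]
print([len(r) for r in per_request])     # [3, 4]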
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/contexts/static_context.py RENAMED
@@ -17,7 +17,7 @@ class StaticInferenceContext(BaseInferenceContext):
     """

     def __init__(self, max_batch_size: int, max_sequence_length: int):
-        super().__init__(materialize_only_last_token_logits=
+        super().__init__(materialize_only_last_token_logits=True)
         self.max_sequence_length = max_sequence_length
         self.max_batch_size = max_batch_size
         self.sequence_len_offset = 0
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/engines/dynamic_engine.py RENAMED
@@ -1,8 +1,8 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

 import asyncio
-import time
 from collections import deque
+from itertools import repeat
 from typing import Dict, List, Optional, Tuple, Union

 import torch
@@ -70,6 +70,8 @@ class DynamicInferenceEngine(AbstractEngine):
         self.request_counter = Counter()
         self.requests: Dict[int, DynamicInferenceRequest] = {}
         self.request_completion_futures: Dict[int, asyncio.Future] = {}
+        self.step_start_event = torch.cuda.Event(enable_timing=True)
+        self.step_end_event = torch.cuda.Event(enable_timing=True)

         # Initialize the asyncio loop if it has not already been initialized.
         # TODO: Start the engine loop here.
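The two torch.cuda.Event(enable_timing=True) objects added above replace the wall-clock timing via time.time() that is removed elsewhere in this file: event timing measures the GPU work itself, including kernels still in flight when the Python call returns. A minimal standalone sketch of the same pattern (requires a CUDA device; the matmul is just a stand-in for one engine step):

import torch

def timed_gpu_work(n: int = 1024) -> float:
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    a = torch.randn(n, n, device="cuda")
    start.record()
    a @ a                                 # stand-in for one engine step
    end.record()
    end.synchronize()                     # wait until the recorded work has finished
    return start.elapsed_time(end) / 1e3  # elapsed_time() returns milliseconds

if torch.cuda.is_available():
    print(f"step time: {timed_gpu_work():.4f} s")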
@@ -176,26 +178,49 @@
         return self.request_completion_futures[request_id]

     def post_process_requests(
-        self,
-
+        self,
+        request_ids: torch.Tensor,
+        finished_request_ids: torch.Tensor,
+        step_time: float,
+        sample: torch.Tensor,
+        log_probs: torch.Tensor,
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest]]:
         """
         Handles post-processing for requests after a step.

         Args:
             request_ids (torch.Tensor): A list of request_ids
             finished_request_ids (torch.Tensor): A list of finished request ids
+            step_time (float): The latency of the last step
             sample: (torch.Tensor): The newly generated tokens for each request
+            log_probs: (List): Log probs for each request

         Returns:
-            A list of completed requests as `DynamicInferenceRequest` objects
+            A list of active requests and completed requests as `DynamicInferenceRequest` objects
         """
+        active_requests: List[DynamicInferenceRequest] = []
         finished_requests: List[DynamicInferenceRequest] = []
         finished_request_ids = set(finished_request_ids.tolist())
         self.finished_request_count += len(finished_request_ids)

-
+        log_probs_iter = log_probs if log_probs else repeat(None)
+
+        for request_id, token, request_log_probs in zip(
+            request_ids.tolist(), sample.tolist(), log_probs_iter
+        ):
             request: DynamicInferenceRequest = self.requests[request_id]
             request.generated_tokens.append(token)
+            if request.tpot is None:
+                request.tpot = []
+            request.tpot.append(step_time)
+
+            if request_log_probs is not None:
+                # If prompt log probs is None we are in prefill
+                if request.prompt_log_probs is None:
+                    request.prompt_log_probs = request_log_probs
+                    request.generated_log_probs = []
+                else:
+                    request.generated_log_probs.extend(request_log_probs)

             if request_id in finished_request_ids:
                 request.generated_length = len(request.generated_tokens)
@@ -207,50 +232,67 @@
                     finished_request.generated_tokens
                 )
                 self.request_completion_futures[request_id].set_result(finished_request)
-
-
+            else:
+                active_requests.append(request)
+
+        return active_requests, finished_requests
+
+    def schedule_waiting_requests(self):
+        """Tries to schedule any requests in the waiting pool."""
+        for waiting_request_id in self.waiting_request_ids.copy():
+            waiting_request: DynamicInferenceRequest = self.requests[waiting_request_id]
+            try:
+                self.context.add_request(
+                    waiting_request_id,
+                    waiting_request.prompt_tokens,
+                    waiting_request.sampling_params.num_tokens_to_generate,
+                )
+                self.waiting_request_ids.popleft()
+            except Exception as e:
+                break

     async def async_step(
         self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
-    ) -> Tuple[List[DynamicInferenceRequest], float]:
-        """
-
-
-        Uses `asyncio` for continuous generation which allows this
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
+        """
+        Wrapper for controller.generate_output_tokens_dynamic_batch(), to
+        match vLLM API. Uses `asyncio` for continuous generation which allows this
         method to sleep and wake up when new requests are available.
+
+        Args:
+            sampling_params (SamplingParams): The sampling parameters.
+            verbose (bool): Whether to run in verbose mode.
+
+        Returns:
+            A tuple comprised of:
+                1. Requests that ran in the last step and are still active.
+                2. Requests that ran in the last step and have now finished.
+                3. The step time in seconds.
         """

         # Generate tokens.
-        t = time.time()
         is_decode_only = self.context.is_decode_only()
+        self.step_start_event.record()
         result = self.controller.generate_output_tokens_dynamic_batch(
             sampling_params, self.termination_id
         )
-
-
-
+        self.step_end_event.record()
+        self.step_end_event.synchronize()
+        step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3

         if result is not None:
-            request_ids, finished_request_ids, sample = result
+            request_ids, finished_request_ids, sample, log_probs = result

             # TODO: Move this to a background thread?
-            finished_requests.
-
+            (active_requests, finished_requests) = self.post_process_requests(
+                request_ids, finished_request_ids, step_time, sample, log_probs
             )

-            # Schedule waiting requests
             # TODO: Move this to a background thread?
-
-
-
-
-                    waiting_request_id,
-                    waiting_request.prompt_tokens,
-                    waiting_request.sampling_params.num_tokens_to_generate,
-                )
-                self.waiting_request_ids.popleft()
-            except Exception as e:
-                break
+            self.schedule_waiting_requests()
+        else:
+            active_requests: List[DynamicInferenceRequest] = []
+            finished_requests: List[DynamicInferenceRequest] = []

         # Print context state.
         if verbose:
@@ -278,9 +320,11 @@
                 )
             )

-        return finished_requests, step_time
+        return active_requests, finished_requests, step_time

-    def step(
+    def step(
+        self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """Synchronous wrapper for `self.async_step`."""
         return self._loop.run_until_complete(
             self.async_step(sampling_params=sampling_params, verbose=verbose)
@@ -297,7 +341,7 @@

         finished_requests_list = []
         while self.has_unfinished_requests():
-            finished_requests, step_time = self.step(sampling_params)
+            active_requests, finished_requests, step_time = self.step(sampling_params)
             finished_requests_list.extend(finished_requests)

         return finished_requests_list
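One small idiom from the post_process_requests change above: when log probs were not computed for a step, itertools.repeat(None) keeps the three-way zip valid without a second code path. Illustration with dummy values:

from itertools import repeat

request_ids = [7, 8, 9]
samples = [101, 102, 103]
log_probs = None                  # e.g. log probs were not requested this step

log_probs_iter = log_probs if log_probs else repeat(None)
for rid, tok, lp in zip(request_ids, samples, log_probs_iter):
    print(rid, tok, lp)           # lp is None for every request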
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/inference_request.py RENAMED
@@ -46,6 +46,7 @@ class InferenceRequest:
     prompt_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_length: Optional[int] = None
+    tpot: Optional[List[int]] = None

     def __post_init__(self):
         if self.sampling_params is None and self.inference_parameters is not None:
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py RENAMED
@@ -7,7 +7,7 @@ from typing import Any, Dict, Iterable, Optional, Union

 import torch

-from megatron.core import parallel_state
+from megatron.core import parallel_state
 from megatron.core.inference.communication_utils import (
     is_pipeline_first_stage,
     is_pipeline_last_stage,
@@ -152,13 +152,12 @@ class AbstractModelInferenceWrapper(abc.ABC):
         tokens = inference_input["tokens"]
         position_ids = inference_input["position_ids"]
         attention_mask = inference_input["attention_mask"]
-        runtime_gather_output = inference_input.get("runtime_gather_output")
         return self.model(
             tokens,
             position_ids,
             attention_mask,
             inference_context=self.inference_context,
-            runtime_gather_output=runtime_gather_output,
+            runtime_gather_output=True,  # Inference should always gather the logits
         )

     def _get_batch_size_and_seq_len(
@@ -201,7 +200,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         """
         tokens = inference_input["tokens"]
         logits = self._forward(inference_input)
-        logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits, self.tp_group)
         self.inference_context.increment_sequence_len_offset(tokens.size(1))

         return logits
@@ -243,7 +241,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         logits = None
         if is_pipeline_last_stage(self.pp_group):
             logits = output_tensor
-            logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits, self.tp_group)

         # Explicitly cast logits to expected dtype
         logits = logits.to(self.inference_wrapper_config.params_dtype)
@@ -269,7 +266,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
         tokens = inference_input["tokens"]
         position_ids = inference_input["position_ids"]
         attention_mask = inference_input["attention_mask"]
-        runtime_gather_output = inference_input.get("runtime_gather_output")
         materialize_only_last_token_logits = (
             self.inference_context.materialize_only_last_token_logits
         )
@@ -317,7 +313,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
                 "position_ids": position_ids2use,
                 "attention_mask": attention_mask,
                 "inference_context": self.inference_context,
-                "runtime_gather_output": runtime_gather_output,
             }
         )

@@ -327,9 +322,6 @@ class AbstractModelInferenceWrapper(abc.ABC):
             self.inference_context.batch_size_offset += current_micro_batch_size

             if is_pipeline_last_stage(self.pp_group):
-                output_tensor = tensor_parallel.gather_from_tensor_model_parallel_region(
-                    output_tensor, self.tp_group
-                )
                 assert logits is not None
                 logits[start:end, ...] = output_tensor
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py RENAMED
@@ -10,6 +10,7 @@ from megatron.core.inference.model_inference_wrappers.abstract_model_inference_w
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
+from megatron.core.inference.utils import get_attention_mask
 from megatron.core.models.gpt import GPTModel
 from megatron.core.transformer.enums import AttnBackend
 from megatron.core.utils import get_model_config
@@ -74,12 +75,7 @@ class GPTInferenceWrapper(AbstractModelInferenceWrapper):
         attention_backend = config.attention_backend

         if attention_backend == AttnBackend.local:
-            attention_mask = torch.tril(
-                torch.ones((1, seq_length, seq_length), device=prompts_tokens.device)
-            ).view(1, 1, seq_length, seq_length)
-
-            # Convert to boolean
-            attention_mask = attention_mask < 0.5
+            attention_mask = get_attention_mask(seq_length)
         elif (
             attention_backend == AttnBackend.flash
             or attention_backend == AttnBackend.fused
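The inline mask construction removed above is replaced by get_attention_mask from megatron/core/inference/utils.py (its body is part of the +16 in utils.py, which is not shown in this excerpt). The removed code built a boolean causal mask roughly like the standalone sketch below, where True marks positions to mask out:

import torch

def causal_attention_mask(seq_length: int) -> torch.Tensor:
    mask = torch.tril(torch.ones((1, seq_length, seq_length)))
    mask = mask.view(1, 1, seq_length, seq_length)
    return mask < 0.5   # True above the diagonal -> those positions are masked

print(causal_attention_mask(4)[0, 0].int())
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.int32)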
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py RENAMED
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional

 import torch

-from megatron.core import parallel_state
 from megatron.core.inference.communication_utils import (
     is_pipeline_first_stage,
     is_pipeline_last_stage,
@@ -48,16 +47,10 @@ class VLMInferenceWrapper(GPTInferenceWrapper):
         # has part of the LM decoder. In this case, the current stage should only receive
         # vision embeddings.
         if pp_rank > 0:
-            self._recv_only_vision_embeds = (
-                parallel_state.is_inside_encoder(pp_rank - 1)
-                and (not parallel_state.is_inside_decoder(pp_rank - 1))
-                and parallel_state.is_inside_decoder()
-            )
+            self._recv_only_vision_embeds = False  # TODO: Implement new logic for vision embeddings

         # Checks if the current stage only has a vision encoder
-        self._encoder_only = (
-            parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
-        )
+        self._encoder_only = False  # TODO: Implement new logic for encoder-only stages

     def prep_inference_input(
         self,
{megatron_core-0.14.0rc0 → megatron_core-0.14.0rc2}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py RENAMED
@@ -7,6 +7,7 @@ from megatron.core.inference.inference_request import InferenceRequest
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
+from megatron.core.inference.utils import get_attention_mask


 class EncoderDecoderTextGenerationController(TextGenerationController):
@@ -18,13 +19,18 @@ class EncoderDecoderTextGenerationController(TextGenerationController):
     """

     def prep_inference_input(
-        self,
+        self,
+        prompts_tokens: torch.Tensor,
+        active_requests: OrderedDict[str, InferenceRequest],
+        use_attention_mask: bool = False,
     ) -> Dict[str, Any]:
         """Preparing input data for inference, using respective wrapper's prep_inference_input method # pylint: disable=line-too-long

         Args:
             prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length]
             active_requests (OrderedDict[str, InferenceRequest]): The input active requests
+            use_attention_mask (bool): Whether to use an attention mask. Should be set to True only
+                when exclusively doing prefill (no decode) with variable prompt lengths.

         Returns:
             A dict of the inference input for the current batch.
@@ -33,6 +39,13 @@
             map(lambda request: request.encoder_prompt, active_requests.values())
         )

-
+        inference_input = self.inference_wrapped_model.prep_inference_input(
             prompts_tokens, encoder_prompts, tokenizer=self.tokenizer
         )
+
+        if use_attention_mask and (
+            attention_mask := inference_input.get("attention_mask", None) is None
+        ):
+            inference_input["attention_mask"] = get_attention_mask(prompts_tokens.size(1))
+
+        return inference_input