megatron-core 0.14.0rc0__tar.gz → 0.14.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.

Files changed (307)
  1. {megatron_core-0.14.0rc0/megatron_core.egg-info → megatron_core-0.14.0rc1}/PKG-INFO +1 -1
  2. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/indexed_dataset.py +5 -0
  3. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
  4. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/transformer_engine.py +10 -9
  5. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/dynamic_engine.py +61 -32
  6. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/inference_request.py +1 -0
  7. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +55 -2
  8. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/package_info.py +1 -1
  9. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/layers.py +9 -7
  10. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/attention.py +2 -1
  11. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/cuda_graphs.py +5 -1
  12. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/utils.py +3 -0
  13. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1/megatron_core.egg-info}/PKG-INFO +1 -1
  14. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/LICENSE +0 -0
  15. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/MANIFEST.in +0 -0
  16. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/README.md +0 -0
  17. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/README.md +0 -0
  18. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/__init__.py +0 -0
  19. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/config.py +0 -0
  20. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/config_logger.py +0 -0
  21. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/__init__.py +0 -0
  22. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/bert_dataset.py +0 -0
  23. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_dataset.py +0 -0
  24. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  25. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  26. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/gpt_dataset.py +0 -0
  27. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/helpers.cpp +0 -0
  28. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/helpers.py +0 -0
  29. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/masked_dataset.py +0 -0
  30. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/megatron_dataset.py +0 -0
  31. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  32. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/multimodal_dataset.py +0 -0
  33. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/object_storage_utils.py +0 -0
  34. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/__init__.py +0 -0
  35. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/__init__.py +0 -0
  36. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  37. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/config.py +0 -0
  38. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  39. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  40. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/__init__.py +0 -0
  41. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/build.py +0 -0
  42. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/dataset.py +0 -0
  43. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/utils.py +0 -0
  44. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/external_libs.py +0 -0
  45. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/__init__.py +0 -0
  46. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/build.py +0 -0
  47. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/factory.py +0 -0
  48. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/index.py +0 -0
  49. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  50. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  51. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  52. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/utils.py +0 -0
  53. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/validate.py +0 -0
  54. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/__init__.py +0 -0
  55. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  56. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  57. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/query.py +0 -0
  58. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  59. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/utils.py +0 -0
  60. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/utils.py +0 -0
  61. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/t5_dataset.py +0 -0
  62. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils.py +0 -0
  63. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils_object_storage.py +0 -0
  64. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils_s3.py +0 -0
  65. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/__init__.py +0 -0
  66. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/core.py +0 -0
  67. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  68. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  69. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/mapping.py +0 -0
  70. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  71. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/serialization.py +0 -0
  72. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  73. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  74. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  75. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  76. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  77. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  78. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  79. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  80. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  81. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  82. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  83. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  84. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  85. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  86. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  87. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/utils.py +0 -0
  88. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/validation.py +0 -0
  89. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/__init__.py +0 -0
  90. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  91. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
  92. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
  93. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/data_parallel_base.py +0 -0
  94. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  95. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/finalize_model_grads.py +0 -0
  96. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  97. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  98. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  99. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/energy_monitor.py +0 -0
  100. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/enums.py +0 -0
  101. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/__init__.py +0 -0
  102. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/data_type.py +0 -0
  103. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/export_config.py +0 -0
  104. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/model_type.py +0 -0
  105. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/__init__.py +0 -0
  106. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  107. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  108. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  109. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  110. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  111. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  112. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  113. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  114. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  115. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  116. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  117. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  118. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/__init__.py +0 -0
  119. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/kitchen.py +0 -0
  120. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  121. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fp8_utils.py +0 -0
  122. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/__init__.py +0 -0
  123. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  124. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  125. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  126. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  127. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  128. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_indices_converter.py +0 -0
  129. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_layer_norm.py +0 -0
  130. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  131. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  132. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_softmax.py +0 -0
  133. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/hyper_comm_grid.py +0 -0
  134. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/__init__.py +0 -0
  135. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/async_stream.py +0 -0
  136. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/common_inference_params.py +0 -0
  137. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/communication_utils.py +0 -0
  138. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/__init__.py +0 -0
  139. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/base_context.py +0 -0
  140. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  141. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/dynamic_context.py +0 -0
  142. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/static_context.py +0 -0
  143. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/__init__.py +0 -0
  144. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/abstract_engine.py +0 -0
  145. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/mcore_engine.py +0 -0
  146. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/static_engine.py +0 -0
  147. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  148. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  149. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  150. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  151. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  152. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  153. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  154. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  155. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/sampling_params.py +0 -0
  156. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/scheduler.py +0 -0
  157. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  158. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  159. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  160. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  161. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/utils.py +0 -0
  162. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference_params.py +0 -0
  163. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/jit.py +0 -0
  164. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/model_parallel_config.py +0 -0
  165. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/__init__.py +0 -0
  166. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/t5_model.py +0 -0
  167. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/t5_spec.py +0 -0
  168. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/__init__.py +0 -0
  169. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/backends.py +0 -0
  170. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/__init__.py +0 -0
  171. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  172. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_lm_head.py +0 -0
  173. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_model.py +0 -0
  174. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/pooler.py +0 -0
  175. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/__init__.py +0 -0
  176. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/__init__.py +0 -0
  177. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  178. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  179. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  180. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  181. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  182. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/language_module/__init__.py +0 -0
  183. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/language_module/language_module.py +0 -0
  184. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/vision_module/__init__.py +0 -0
  185. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  186. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/__init__.py +0 -0
  187. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  188. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  189. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/gpt_model.py +0 -0
  190. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  191. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  192. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/__init__.py +0 -0
  193. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/clip_model.py +0 -0
  194. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/module.py +0 -0
  195. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/qwen_model.py +0 -0
  196. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/__init__.py +0 -0
  197. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  198. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/mamba_model.py +0 -0
  199. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/__init__.py +0 -0
  200. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/config/__init__.py +0 -0
  201. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/config/base_configs.py +0 -0
  202. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/model/__init__.py +0 -0
  203. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/model/base.py +0 -0
  204. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/audio.py +0 -0
  205. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/base.py +0 -0
  206. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/vision.py +0 -0
  207. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/__init__.py +0 -0
  208. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/context_parallel.py +0 -0
  209. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/llava_model.py +0 -0
  210. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/llava_spec.py +0 -0
  211. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/__init__.py +0 -0
  212. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/base_attention.py +0 -0
  213. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/config.py +0 -0
  214. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/decoder_attention.py +0 -0
  215. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/decoder_spec.py +0 -0
  216. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/encoder_attention.py +0 -0
  217. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/encoder_spec.py +0 -0
  218. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/model.py +0 -0
  219. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/utils.py +0 -0
  220. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/__init__.py +0 -0
  221. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/clip_vit_model.py +0 -0
  222. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/multimodal_projector.py +0 -0
  223. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/radio.py +0 -0
  224. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  225. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/msc_utils.py +0 -0
  226. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/num_microbatches_calculator.py +0 -0
  227. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/__init__.py +0 -0
  228. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/clip_grads.py +0 -0
  229. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  230. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  231. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  232. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/grad_scaler.py +0 -0
  233. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/optimizer.py +0 -0
  234. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/optimizer_config.py +0 -0
  235. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer_param_scheduler.py +0 -0
  236. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/packed_seq_params.py +0 -0
  237. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/parallel_state.py +0 -0
  238. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/__init__.py +0 -0
  239. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  240. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/schedules.py +0 -0
  241. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/utils.py +0 -0
  242. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/__init__.py +0 -0
  243. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/__init__.py +0 -0
  244. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  245. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  246. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  247. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/layers.py +0 -0
  248. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  249. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  250. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/process_groups_config.py +0 -0
  251. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/__init__.py +0 -0
  252. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/quant_config.py +0 -0
  253. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/utils.py +0 -0
  254. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/requirements.txt +0 -0
  255. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/rerun_state_machine.py +0 -0
  256. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/__init__.py +0 -0
  257. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_block.py +0 -0
  258. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  259. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  260. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_layer.py +0 -0
  261. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_mixer.py +0 -0
  262. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mlp_layer.py +0 -0
  263. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/triton_cache_manager.py +0 -0
  264. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/__init__.py +0 -0
  265. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  266. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/data.py +0 -0
  267. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/mappings.py +0 -0
  268. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/random.py +0 -0
  269. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/utils.py +0 -0
  270. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/timers.py +0 -0
  271. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/__init__.py +0 -0
  272. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  273. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  274. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/dot_product_attention.py +0 -0
  275. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/enums.py +0 -0
  276. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  277. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  278. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/identity_op.py +0 -0
  279. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/mlp.py +0 -0
  280. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/module.py +0 -0
  281. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/__init__.py +0 -0
  282. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/experts.py +0 -0
  283. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  284. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  285. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/moe_layer.py +0 -0
  286. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/moe_utils.py +0 -0
  287. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/router.py +0 -0
  288. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/shared_experts.py +0 -0
  289. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  290. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  291. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/multi_latent_attention.py +0 -0
  292. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/multi_token_prediction.py +0 -0
  293. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  294. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/spec_utils.py +0 -0
  295. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/torch_layer_norm.py +0 -0
  296. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/torch_norm.py +0 -0
  297. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_block.py +0 -0
  298. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_config.py +0 -0
  299. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_layer.py +0 -0
  300. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/utils.py +0 -0
  301. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/SOURCES.txt +0 -0
  302. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/dependency_links.txt +0 -0
  303. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/requires.txt +0 -0
  304. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/top_level.txt +0 -0
  305. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/pyproject.toml +0 -0
  306. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/setup.cfg +0 -0
  307. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc0
+Version: 0.14.0rc1
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
megatron/core/datasets/indexed_dataset.py
@@ -5,6 +5,7 @@
 
 # Essentially re-written in entirety
 
+import gc
 import logging
 import os
 import shutil
@@ -906,6 +907,10 @@ class IndexedDatasetBuilder(object):
         assert index.sequence_modes is not None, "sequence_modes cannot not be None"
         self.sequence_modes.extend(index.sequence_modes)
 
+        # Free up memory to make space for new indices
+        del index
+        gc.collect()
+
         # Concatenate data
         with self._open(get_bin_path(path_prefix), "rb") as f:
             shutil.copyfileobj(f, self.data_file)
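The added lines free a large, short-lived index object before the next one is loaded, rather than waiting for Python's garbage collector. A minimal, self-contained sketch of the same pattern (the IndexShard class below is a hypothetical stand-in for a loaded index):

    import gc

    class IndexShard:
        """Stand-in for a loaded index: holds a large block of metadata."""

        def __init__(self, num_sequences):
            self.sequence_lengths = list(range(num_sequences))

    def merge_shards(shard_sizes):
        merged = []
        for size in shard_sizes:
            shard = IndexShard(size)               # transient, potentially huge
            merged.extend(shard.sequence_lengths)  # keep only what the builder needs
            del shard                              # drop the last reference now...
            gc.collect()                           # ...so memory is reclaimed before the next load
        return merged

    print(len(merge_shards([1000, 2000, 3000])))   # 6000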
megatron/core/distributed/distributed_data_parallel_config.py
@@ -113,6 +113,15 @@ class DistributedDataParallelConfig:
     """
 
     def __post_init__(self):
+        import os
+
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
+
+        if self.nccl_ub:
+            if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
+                raise ValueError(
+                    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True is currently not supported "
+                    "with nccl_ub due to compatibility issue with torch.cuda.MemPool API."
+                )
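The guard works because PYTORCH_CUDA_ALLOC_CONF is a comma-separated list of key:value pairs, so membership in the split list is an exact match on one option. A standalone sketch of the same check:

    import os

    def expandable_segments_enabled():
        # PYTORCH_CUDA_ALLOC_CONF looks like:
        #   "expandable_segments:True,max_split_size_mb:128"
        conf = os.getenv('PYTORCH_CUDA_ALLOC_CONF', '')
        return 'expandable_segments:True' in conf.split(',')

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'
    assert expandable_segments_enabled()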
megatron/core/extensions/transformer_engine.py
@@ -39,6 +39,8 @@ from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 from megatron.core.utils import (
+    get_pg_rank,
+    get_pg_size,
     get_te_version,
     get_tensor_model_parallel_group_if_none,
     is_te_min_version,
@@ -228,8 +230,7 @@ class TELinear(te.pytorch.Linear):
             assert tp_group is None, "duplicated linear should not have tp_group set"
             tp_size = 1
         else:
-            assert tp_group is not None, "Parallel linear should always have tp_group set"
-            tp_size = tp_group.size()
+            tp_size = get_pg_size(tp_group)
 
         self.expert_parallel = self.config.expert_model_parallel_size > 1
         if is_expert:
@@ -374,8 +375,8 @@ class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
         self.is_first_microbatch = True
         self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
         extra_kwargs = _get_extra_te_kwargs(config)
-        self.tp_size = tp_group.size()
-        self.tp_rank = tp_group.rank()
+        self.tp_size = get_pg_size(tp_group)
+        self.tp_rank = get_pg_rank(tp_group)
 
         if self.config.delay_wgrad_compute:
             if is_te_min_version("2.3.0"):
@@ -542,8 +543,8 @@ class TEColumnParallelLinear(TELinear):
         if gather_output:
             raise ValueError("Transformer Engine linear layers do not support gather_output = True")
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        world_size = tp_group.size()
-        rank = tp_group.rank()
+        world_size = get_pg_size(tp_group)
+        rank = get_pg_rank(tp_group)
 
         super().__init__(
             input_size=input_size,
@@ -657,8 +658,8 @@ class TERowParallelLinear(TELinear):
             tp_group=tp_group,
         )
         if config.use_cpu_initialization:
-            world_size = tp_group.size()
-            rank = tp_group.rank()
+            world_size = get_pg_size(tp_group)
+            rank = get_pg_rank(tp_group)
             input_size_per_partition = divide(input_size, world_size)
             self.master_weight = _initialize_affine_weight_cpu(
                 self.weight,
@@ -1003,7 +1004,7 @@ if HAVE_TE and is_te_min_version("1.9.0.dev0"):
             # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
             # So we disable comms by making TE agnostic of model parallel.
             tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-            tp_size = tp_group.size()
+            tp_size = get_pg_size(tp_group)
 
             self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
 
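Every call site in this file swaps direct tp_group.size() / tp_group.rank() calls for get_pg_size / get_pg_rank from megatron.core.utils, and the removed assertion means tp_group may now legitimately be None. A sketch of what such helpers plausibly look like, assuming (not confirmed by this diff) that a missing group is treated as a single-rank world:

    def get_pg_size(group=None):
        # Sketch: world size of a process group, treating None as "no parallelism".
        return 1 if group is None else group.size()

    def get_pg_rank(group=None):
        # Sketch: rank within a process group, treating None as rank 0.
        return 0 if group is None else group.rank()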
megatron/core/inference/engines/dynamic_engine.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
 import asyncio
-import time
 from collections import deque
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -70,6 +69,8 @@ class DynamicInferenceEngine(AbstractEngine):
         self.request_counter = Counter()
         self.requests: Dict[int, DynamicInferenceRequest] = {}
         self.request_completion_futures: Dict[int, asyncio.Future] = {}
+        self.step_start_event = torch.cuda.Event(enable_timing=True)
+        self.step_end_event = torch.cuda.Event(enable_timing=True)
 
         # Initialize the asyncio loop if it has not already been initialized.
         # TODO: Start the engine loop here.
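The engine now times each step with CUDA events instead of time.time() (removed above); events are recorded on the GPU stream, so the measurement covers kernel execution rather than host-side wall time. A minimal sketch of the pattern (requires a CUDA device):

    import torch

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()                    # enqueue the start marker on the current stream
    a = torch.randn(1024, 1024, device='cuda')
    out = a @ a                       # the work being timed
    end.record()                      # enqueue the end marker
    end.synchronize()                 # wait until the end marker has executed

    step_time = start.elapsed_time(end) / 1e3   # elapsed_time returns milliseconds
    print(f"step took {step_time:.6f} s")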
@@ -176,19 +177,25 @@ class DynamicInferenceEngine(AbstractEngine):
         return self.request_completion_futures[request_id]
 
     def post_process_requests(
-        self, request_ids: torch.Tensor, finished_request_ids: torch.Tensor, sample: torch.Tensor
-    ) -> List[DynamicInferenceRequest]:
+        self,
+        request_ids: torch.Tensor,
+        finished_request_ids: torch.Tensor,
+        step_time: float,
+        sample: torch.Tensor,
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest]]:
         """
         Handles post-processing for requests after a step.
 
         Args:
             request_ids (torch.Tensor): A list of request_ids
             finished_request_ids (torch.Tensor): A list of finished request ids
+            step_time (float): The latency of the last step
             sample: (torch.Tensor): The newly generated tokens for each request
 
         Returns:
-            A list of completed requests as `DynamicInferenceRequest` objects
+            A list of active requests and completed requests as `DynamicInferenceRequest` objects
         """
+        active_requests: List[DynamicInferenceRequest] = []
         finished_requests: List[DynamicInferenceRequest] = []
         finished_request_ids = set(finished_request_ids.tolist())
         self.finished_request_count += len(finished_request_ids)
@@ -196,6 +203,9 @@ class DynamicInferenceEngine(AbstractEngine):
         for request_id, token in zip(request_ids.tolist(), sample.tolist()):
             request: DynamicInferenceRequest = self.requests[request_id]
             request.generated_tokens.append(token)
+            if request.tpot is None:
+                request.tpot = []
+            request.tpot.append(step_time)
 
             if request_id in finished_request_ids:
                 request.generated_length = len(request.generated_tokens)
@@ -207,50 +217,67 @@ class DynamicInferenceEngine(AbstractEngine):
                     finished_request.generated_tokens
                 )
                 self.request_completion_futures[request_id].set_result(finished_request)
-
-        return finished_requests
+            else:
+                active_requests.append(request)
+
+        return active_requests, finished_requests
+
+    def schedule_waiting_requests(self):
+        """Tries to schedule any requests in the waiting pool."""
+        for waiting_request_id in self.waiting_request_ids.copy():
+            waiting_request: DynamicInferenceRequest = self.requests[waiting_request_id]
+            try:
+                self.context.add_request(
+                    waiting_request_id,
+                    waiting_request.prompt_tokens,
+                    waiting_request.sampling_params.num_tokens_to_generate,
+                )
+                self.waiting_request_ids.popleft()
+            except Exception as e:
+                break
 
     async def async_step(
         self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
-    ) -> Tuple[List[DynamicInferenceRequest], float]:
-        """Wrapper for controller.generate_output_tokens_dynamic_batch(), to
-        match vLLM API.
-
-        Uses `asyncio` for continuous generation which allows this
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
+        """
+        Wrapper for controller.generate_output_tokens_dynamic_batch(), to
+        match vLLM API. Uses `asyncio` for continuous generation which allows this
         method to sleep and wake up when new requests are available.
+
+        Args:
+            sampling_params (SamplingParams): The sampling parameters.
+            verbose (bool): Whether to run in verbose mode.
+
+        Returns:
+            A tuple comprised of:
+                1. Requests that ran in the last step and are still active.
+                2. Requests that ran in the last step and have now finished.
+                3. The step time in seconds.
         """
 
         # Generate tokens.
-        t = time.time()
         is_decode_only = self.context.is_decode_only()
+        self.step_start_event.record()
         result = self.controller.generate_output_tokens_dynamic_batch(
             sampling_params, self.termination_id
         )
-        step_time = time.time() - t
-
-        finished_requests: List[DynamicInferenceRequest] = []
+        self.step_end_event.record()
+        self.step_end_event.synchronize()
+        step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3
 
         if result is not None:
             request_ids, finished_request_ids, sample = result
 
             # TODO: Move this to a background thread?
-            finished_requests.extend(
-                self.post_process_requests(request_ids, finished_request_ids, sample)
+            (active_requests, finished_requests) = self.post_process_requests(
                request_ids, finished_request_ids, step_time, sample
             )
 
-            # Schedule waiting requests
             # TODO: Move this to a background thread?
-            for waiting_request_id in self.waiting_request_ids.copy():
-                waiting_request: DynamicInferenceRequest = self.requests[waiting_request_id]
-                try:
-                    self.context.add_request(
-                        waiting_request_id,
-                        waiting_request.prompt_tokens,
-                        waiting_request.sampling_params.num_tokens_to_generate,
-                    )
-                    self.waiting_request_ids.popleft()
-                except Exception as e:
-                    break
+            self.schedule_waiting_requests()
+        else:
+            active_requests: List[DynamicInferenceRequest] = []
+            finished_requests: List[DynamicInferenceRequest] = []
 
         # Print context state.
         if verbose:
@@ -278,9 +305,11 @@ class DynamicInferenceEngine(AbstractEngine):
                 )
             )
 
-        return finished_requests, step_time
+        return active_requests, finished_requests, step_time
 
-    def step(self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False):
+    def step(
+        self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """Synchronous wrapper for `self.async_step`."""
         return self._loop.run_until_complete(
             self.async_step(sampling_params=sampling_params, verbose=verbose)
@@ -297,7 +326,7 @@ class DynamicInferenceEngine(AbstractEngine):
 
         finished_requests_list = []
        while self.has_unfinished_requests():
-            finished_requests, step_time = self.step(sampling_params)
+            active_requests, finished_requests, step_time = self.step(sampling_params)
             finished_requests_list.extend(finished_requests)
 
         return finished_requests_list
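Callers of step() and async_step() now unpack a 3-tuple; the loop above keeps only finished requests, but the active list lets a driver track in-flight work. A hypothetical caller-side sketch, assuming an already-constructed DynamicInferenceEngine named engine:

    finished = []
    while engine.has_unfinished_requests():
        active_requests, finished_requests, step_time = engine.step(sampling_params)
        finished.extend(finished_requests)
        print(f"{len(active_requests)} active, {len(finished_requests)} finished, "
              f"last step {step_time:.3f}s")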
megatron/core/inference/inference_request.py
@@ -46,6 +46,7 @@ class InferenceRequest:
     prompt_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_length: Optional[int] = None
+    tpot: Optional[List[int]] = None
 
     def __post_init__(self):
         if self.sampling_params is None and self.inference_parameters is not None:
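The new tpot field (time per output token) holds one latency measurement per generated token; the values recorded elsewhere in this release are float step times in seconds, despite the List[int] annotation. With a populated request, aggregate metrics follow directly, for example:

    # `request` is a finished InferenceRequest whose tpot list has been filled in.
    if request.tpot:
        mean_tpot = sum(request.tpot) / len(request.tpot)   # average decode latency per token
        decode_throughput = 1.0 / mean_tpot                 # tokens per second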
megatron/core/inference/text_generation_controllers/text_generation_controller.py
@@ -34,6 +34,8 @@ try:
 
 except ImportError:
     HAVE_TE = False
+    Fp8Padding = None
+    Fp8Unpadding = None
 
 
 class TextGenerationController:
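Setting the names to None in the except ImportError branch keeps later references from raising NameError when Transformer Engine is absent; callers can then test the sentinel before use. The general optional-import pattern, sketched with a placeholder module name:

    try:
        from some_optional_dependency import Fp8Padding, Fp8Unpadding  # placeholder import path
        HAVE_TE = True
    except ImportError:
        HAVE_TE = False
        Fp8Padding = None    # sentinels so the module-level names always exist
        Fp8Unpadding = None

    if HAVE_TE and Fp8Padding is not None:
        pass  # safe to use the optional ops here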
@@ -312,6 +314,7 @@ class TextGenerationController:
         current_context_end_position: int,
         is_generation_done_tensor: torch.Tensor,
         generated_sequence_lengths: torch.Tensor,
+        termination_id: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Checks which prompts have reached an end condition
 
@@ -337,10 +340,12 @@ class TextGenerationController:
             Tuple[torch.Tensor, torch.Tensor]: Returns the boolean
                 is_generation_done_tensor and the generated_sequence_lengths after updating it
         """
+        if termination_id is None:
+            termination_id = self.tokenizer.eod
         latest_samples = updated_prompts_tokens[:, current_context_end_position]
         # Make sure we are checking eod criterion only for prompts that have started generating
         # (i.e) We only look at the generated tokenns and not the input tokens.
-        reached_eod = (latest_samples == self.tokenizer.eod) & generation_started
+        reached_eod = (latest_samples == termination_id) & generation_started
         is_generation_done_tensor = is_generation_done_tensor | reached_eod
         # We increment generated sequence lengths when that prompt has not hit the
         # EOD and generation has started
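The end-of-generation test is pure tensor masking: a prompt is done once its newest token equals termination_id and generation has actually started for it. A small self-contained example of the same masking logic with made-up values:

    import torch

    termination_id = 2
    latest_samples = torch.tensor([2, 5, 2, 7])                   # newest token per prompt
    generation_started = torch.tensor([True, True, False, True])  # prompt 2 is still in prefill

    reached_eod = (latest_samples == termination_id) & generation_started
    is_generation_done = torch.zeros(4, dtype=torch.bool) | reached_eod
    print(is_generation_done)   # tensor([ True, False, False, False])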
@@ -543,7 +548,7 @@ class TextGenerationController:
         active_requests: OrderedDict[str, InferenceRequest],
         active_streams: Optional[OrderedDict[str, AsyncStream]] = None,
     ) -> OrderedDict[str, InferenceRequest]:
-        """Utility to generate the all the output tokens and probabilities for the prompts .
+        """Utility to generate all the output tokens and probabilities for the prompts.
 
         This utility generates the output tokens for a static batch. It runs the forward steps till
         all prompts complete generation, updates the status of these requests to completed, adds
@@ -654,6 +659,10 @@ class TextGenerationController:
         # to nearest power of 2
         vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size
 
+        # Check whether early termination is enabled
+        no_early_termination = getattr(sampling_params, "no_early_termination", False)
+        termination_id = -1 if no_early_termination else self.tokenizer.eod
+
         streaming_enabled = active_streams is not None and len(active_streams) > 0
         if streaming_enabled:
             # Start a separate thread for streaming tokens to avoid blocking the
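Using -1 as the termination id effectively disables early stopping, since no vocabulary token id is negative, so generation only ends at the length limits. Because the flag is read with getattr, sampling-parameter objects without the attribute keep the normal EOD behavior. A sketch of opting in, assuming the attribute can simply be set on the params object:

    # Hypothetical: disable early termination for a fixed-length benchmark run.
    sampling_params.no_early_termination = True   # termination_id becomes -1; no token matches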
@@ -671,6 +680,11 @@ class TextGenerationController:
             streaming_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
             stream_tokens = functools.partial(self.stream_tokens, sampling_params)
 
+        for request in active_requests.values():
+            # Initialize to a list to store a latency measurement for each generated token.
+            request.tpot = []
+        timing_events = []
+
         with torch.inference_mode():
             self.inference_wrapped_model.prep_model_for_inference()
 
@@ -694,7 +708,18 @@ class TextGenerationController:
             context_start_position = 0
             context_end_position = min_prompt_length_in_batch
 
+            # The initial iteration of this loop runs the prefill phase up to the shortest
+            # prompt length in the batch. Then every subsequent iterations runs a decode step.
+            # At least one new token will be generated in each iteration. The generated token
+            # will be ignored for requests which have prompt length > the current generated
+            # sequence length. Similarly, the generated token is ignored for requests which
+            # have maximum total sequence length < the current generated sequence length.
             while True:
+                # Add a timing event at the start of each iteration. The token generation
+                # time will be the elapsed time between consective timing events.
+                timing_events.append(torch.cuda.Event(enable_timing=True))
+                timing_events[-1].record()
+
                 # Pick the context window that we need to pass through the network.
                 inference_input_for_context_window: Dict[str, Any] = (
                     self.inference_wrapped_model.get_batch_for_context_window(
@@ -817,6 +842,7 @@ class TextGenerationController:
                         current_context_end_position=context_end_position,
                         is_generation_done_tensor=is_generation_done_tensor,
                         generated_sequence_lengths=generated_sequence_lengths,
+                        termination_id=termination_id,
                     )
                 )
 
@@ -852,6 +878,10 @@ class TextGenerationController:
                 if context_end_position >= max_sequence_length:
                     break
 
+            # Add a final timing event to compute the latency of every loop iteration
+            timing_events.append(torch.cuda.Event(enable_timing=True))
+            timing_events[-1].record()
+
             # Close all streams
             if streaming_enabled:
                 streaming_executor.shutdown()
@@ -870,6 +900,15 @@ class TextGenerationController:
            generated_sequence_lengths > sampling_params.num_tokens_to_generate
        ] = sampling_params.num_tokens_to_generate
 
+        timing_events[-1].synchronize()
+        tpot = torch.tensor(
+            [
+                timing_events[i].elapsed_time(timing_events[i + 1]) / 1e3
+                for i in range(len(timing_events) - 1)
+            ],
+            dtype=torch.float32,
+        )
+
         for idx, request in enumerate(active_requests.values()):
             input_prompt_length = int(prompt_lengths_in_batch[idx])
             # Shorter prompts might have generated more than required tokens. So we trim them down
@@ -885,6 +924,20 @@ class TextGenerationController:
             request.generated_length = required_sequence_length
             request.generated_tokens = required_result_tokens
 
+            # Record the decode latencies for only the generated tokens
+            request_tpot = tpot.clone()
+            # Sum up the latencies of the first prompt tokens if the
+            # request prompt length > minimum prompt length
+            spill_length = input_prompt_length - min_prompt_length_in_batch
+            if spill_length > 0:
+                spill_latency = request_tpot[:spill_length].sum()
+                request_tpot = torch.cat((spill_latency.unsqueeze(0), request_tpot[spill_length:]))
+
+            # Remove the extraneous latencies if the
+            # request sequence length < maximum sequence length
+            request_tpot = request_tpot[:required_sequence_length]
+            request.tpot = request_tpot.tolist()
+
             if output_log_probs is not None:
                 request.prompt_log_probs = output_log_probs[idx, : input_prompt_length - 1].tolist()
                 request.generated_log_probs = output_log_probs[
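A worked example of the trimming above, with made-up latencies: assume the batch's minimum prompt length is 4, this request's prompt is 6 tokens, and it generated 4 tokens. Its first two "decode" latencies actually covered prompt tokens, so they are folded into the first real token's latency, and any latencies past its own generated length are dropped:

    import torch

    tpot = torch.tensor([0.30, 0.02, 0.02, 0.02, 0.02, 0.02])  # prefill + 5 decode steps
    min_prompt_length_in_batch = 4
    input_prompt_length = 6          # this request's prompt is 2 tokens longer
    required_sequence_length = 4     # tokens this request actually generated

    request_tpot = tpot.clone()
    spill_length = input_prompt_length - min_prompt_length_in_batch   # 2
    if spill_length > 0:
        spill_latency = request_tpot[:spill_length].sum()             # 0.30 + 0.02
        request_tpot = torch.cat((spill_latency.unsqueeze(0), request_tpot[spill_length:]))

    request_tpot = request_tpot[:required_sequence_length]
    print(request_tpot)   # tensor([0.3200, 0.0200, 0.0200, 0.0200])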
megatron/core/package_info.py
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 14
 PATCH = 0
-PRE_RELEASE = 'rc0'
+PRE_RELEASE = 'rc1'
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
megatron/core/tensor_parallel/layers.py
@@ -20,6 +20,8 @@ from megatron.core.parallel_state import (
 )
 from megatron.core.utils import (
     divide,
+    get_pg_rank,
+    get_pg_size,
     get_tensor_model_parallel_group_if_none,
     is_torch_min_version,
     make_tp_sharded_tensor_for_checkpoint,
@@ -219,7 +221,7 @@ class VocabParallelEmbedding(torch.nn.Module):
 
         (self.vocab_start_index, self.vocab_end_index) = (
             VocabUtility.vocab_range_from_global_vocab_size(
-                self.num_embeddings, self.tp_group.rank(), self.tp_group.size()
+                self.num_embeddings, get_pg_rank(self.tp_group), get_pg_size(self.tp_group)
             )
         )
         self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
@@ -241,8 +243,8 @@ class VocabParallelEmbedding(torch.nn.Module):
                 0,
                 init_method,
                 params_dtype=config.params_dtype,
-                rank=self.tp_group.rank(),
-                world_size=self.tp_group.size(),
+                rank=get_pg_rank(self.tp_group),
+                world_size=get_pg_size(self.tp_group),
             )
         else:
             self.weight = Parameter(
@@ -808,8 +810,8 @@ class ColumnParallelLinear(torch.nn.Module):
         self.tp_group = get_tensor_model_parallel_group_if_none(
             self.tp_group, is_expert=self.is_expert
         )
-        world_size = self.tp_group.size()
-        rank = self.tp_group.rank()
+        world_size = get_pg_size(self.tp_group)
+        rank = get_pg_rank(self.tp_group)
         self.explicit_expert_comm = self.is_expert and (world_size > 1 or self.expert_parallel)
         self.output_size_per_partition = divide(output_size, world_size)
 
@@ -1120,8 +1122,8 @@ class RowParallelLinear(torch.nn.Module):
             self.tp_group, is_expert=self.is_expert
         )
 
-        world_size = self.tp_group.size()
-        rank = self.tp_group.rank()
+        world_size = get_pg_size(self.tp_group)
+        rank = get_pg_rank(self.tp_group)
         self.explicit_expert_comm = self.is_expert and (world_size > 1 or self.expert_parallel)
 
         self.input_size_per_partition = divide(input_size, world_size)
megatron/core/transformer/attention.py
@@ -28,6 +28,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.utils import (
     deprecate_inference_params,
     divide,
+    get_pg_size,
     is_fa_min_version,
     nvtx_range_pop,
     nvtx_range_push,
@@ -135,7 +136,7 @@ class Attention(MegatronModule, ABC):
         self.model_comm_pgs = model_comm_pgs
 
         # Per attention head and per partition values
-        world_size = self.model_comm_pgs.tp.size()
+        world_size = get_pg_size(self.model_comm_pgs.tp)
         self.hidden_size_per_attention_head = divide(
             self.query_projection_size, self.config.num_attention_heads
         )
megatron/core/transformer/cuda_graphs.py
@@ -977,9 +977,13 @@ class CudaGraphManager(torch.nn.Module):
             runner = self.get_cudagraph_runner(megatron_module)
             runner.eval()
             out = runner.record_graph_capture(args, kwargs)
-        elif self.training and torch.is_grad_enabled():
+        elif self.training:
             # Training mode
             runner = self.get_cudagraph_runner(megatron_module)
+            # check if a layer is frozen during training.
+            if not torch.is_grad_enabled():
+                # If the layer is frozen, we need to set the runner to eval mode.
+                runner.eval()
             out = runner.record_graph_capture(args, kwargs)
         else:
             # No cudagraphs were found in training mode with grad disabled, so fallback to
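The branch change means a module that is in .training mode but running under torch.no_grad() (e.g. a frozen layer during fine-tuning) now still gets a CUDA-graph runner, just switched to eval mode, instead of falling through to the no-cudagraph path. The situation it handles, sketched:

    import torch

    layer = torch.nn.Linear(8, 8)
    layer.train()                      # module is in training mode...
    for p in layer.parameters():
        p.requires_grad_(False)        # ...but its weights are frozen

    with torch.no_grad():              # frozen layers often run under no_grad
        assert layer.training and not torch.is_grad_enabled()
        # Previously: `self.training and torch.is_grad_enabled()` was False -> no graph.
        # Now: the runner is fetched and put in eval mode for capture.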
megatron/core/utils.py
@@ -401,6 +401,9 @@ def deprecate_inference_params(inference_context, inference_params):
 def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_initialized=True):
     """Issue a deprecation warning if tp_group is None and return the default tp group."""
     # TODO(zijiey): remove this function later.
+    if not torch.distributed.is_initialized():
+        return None
+
     if tp_group is None:
         if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
             warnings.warn(
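With this early return, single-process callers (unit tests, pure inference scripts) get tp_group=None instead of an error from an uninitialized torch.distributed; combined with the get_pg_size/get_pg_rank helpers sketched earlier, a None group then behaves like a world of size 1. Roughly:

    import torch

    if not torch.distributed.is_initialized():
        tp_group = None   # what get_tensor_model_parallel_group_if_none now returns
        # get_pg_size(tp_group) -> 1 and get_pg_rank(tp_group) -> 0 (assumed semantics),
        # so tensor-parallel layers degrade gracefully to a single partition.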
megatron_core.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc0
+Version: 0.14.0rc1
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>