megatron-core 0.16.0rc0.dev127802__tar.gz → 0.16.0rc0.dev128858__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/async_stream.py +2 -8
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_context.py +32 -188
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/fused_kv_append_kernel.py +2 -2
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -7
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/dynamic_engine.py +13 -27
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/static_engine.py +7 -3
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_client.py +1 -3
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +2 -4
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/utils.py +0 -28
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_model.py +3 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_model.py +1 -30
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_block.py +25 -16
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +2 -29
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_layer.py +5 -5
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_mixer.py +57 -301
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/utils.py +1 -143
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/SOURCES.txt +0 -1
- megatron_core-0.16.0rc0.dev127802/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -106
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/unified_memory.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/pyproject.toml +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.py +0 -0
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev127802
+Version: 0.16.0rc0.dev128858
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
```
megatron/core/inference/async_stream.py:

```diff
@@ -9,7 +9,6 @@ import asyncio
 from typing import Any, AsyncGenerator, Callable, Optional, Type, Union
 
 from megatron.core.inference.inference_request import InferenceRequest
-from megatron.core.utils import get_asyncio_loop
 
 STOP_ITERATION = Exception()
 
```
```diff
@@ -21,17 +20,12 @@ class AsyncStream:
     Adopted from https://github.com/vllm-project/vllm/blob/eb881ed006ca458b052905e33f0d16dbb428063a/vllm/v1/engine/async_stream.py # pylint: disable=line-too-long
     """
 
-    def __init__(
-        self,
-        request_id: int,
-        cancel: Callable[[str], None],
-        loop: Optional[asyncio.AbstractEventLoop] = None,
-    ) -> None:
+    def __init__(self, request_id: int, cancel: Callable[[str], None]) -> None:
         self._request_id = request_id
         self._cancel = cancel
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
-        self._loop = …
+        self._loop = asyncio.get_running_loop()
 
     def put(self, item: Union[InferenceRequest, Exception]) -> None:
         """Adds a new value to the stream"""
```
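The `AsyncStream` change above removes the optional `loop` argument and instead captures the running event loop at construction time. A minimal sketch (not code from the package; `StreamSketch` is a hypothetical stand-in) of why `asyncio.get_running_loop()` requires the object to be created from inside a running loop:

```python
import asyncio


class StreamSketch:
    """Hypothetical stand-in illustrating the loop-capture pattern above."""

    def __init__(self, request_id: int) -> None:
        self.request_id = request_id
        # get_running_loop() raises RuntimeError when no loop is running,
        # so instances must be created inside a coroutine or loop callback.
        self._loop = asyncio.get_running_loop()


async def main() -> None:
    stream = StreamSketch(request_id=0)  # fine: a loop is running here
    print(stream._loop.is_running())     # True


asyncio.run(main())
# Constructing StreamSketch(...) at module level, outside any running loop,
# would raise RuntimeError instead of silently deferring the loop lookup.
```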
megatron/core/inference/contexts/dynamic_context.py:

```diff
@@ -23,14 +23,9 @@ from megatron.core.inference.unified_memory import (
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
-from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-    Symbols,
-    get_layer_maps_from_layer_type_list,
-)
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
 
-from .attention_context.mamba_metadata import MambaMetadata
 from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
```
```diff
@@ -232,17 +227,8 @@ class DynamicInferenceContext(BaseInferenceContext):
             where the cuda graph batch sizes range from 1 to `max_requests` (as
             computed below). Due to rounding, the actual number of cuda graphs may
             not equal this argument.
-        materialize_only_last_token_logits (…
-            …
-            if returning log probs.
-        layer_type_list (Optional[List[str]]): A list of strings that indicates
-            the layer type (Mamba / Attention / MLP) for each layer.
-            See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list
-            of symbols. This must be provided for hybrid models.
-        mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request.
-            This must be provided for hybrid models.
-        mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request.
-            This must be provided for hybrid models.
+        materialize_only_last_token_logits (bool): If True, only the last token logits
+            are materialized in the context.
         use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode
             engine steps.
         unified_memory_level (Optional[int]): Set unified memory usage within the
```
```diff
@@ -273,10 +259,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         kv_lora_rank: Optional[int] = None,
         qk_pos_emb_head_dim: Optional[int] = None,
         num_cuda_graphs: Optional[int] = None,
-        materialize_only_last_token_logits: …
-        layer_type_list: Optional[List[str]] = None,
-        mamba_conv_states_shape: Optional[Tuple[int]] = None,
-        mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+        materialize_only_last_token_logits: bool = True,
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 0,
```
```diff
@@ -300,41 +283,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tp_size = tensor_model_parallel_size
         hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
         num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
-
-        # Mamba states.
-        self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list
-        if self.is_hybrid_model:
-            assert (
-                mamba_conv_states_shape is not None
-            ), "`mamba_conv_states_shape` must be specified for hybrid models"
-            assert (
-                mamba_ssm_states_shape is not None
-            ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-            assert (
-                not use_cuda_graphs_for_non_decode_steps
-            ), "Non-decode CUDA graphs not yet supported for hybrid models"
-
-            # For hybrid models, the layer map converts the global layer index to the
-            # corresponding attention layer index or Mamba layer index depending on the
-            # layer type.
-            attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list(
-                layer_type_list
-            )
-            self.num_attention_layers = len(attention_layer_map)
-            self.num_mamba_layers = len(mamba_layer_map)
-            self.layer_map = attention_layer_map | mamba_layer_map
-        else:
-            # The layer map is the identity function for pure Transformer models.
-            self.num_attention_layers = num_layers
-            self.num_mamba_layers = 0
-            (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None)
-            self.layer_map = {i: i for i in range(self.num_attention_layers)}
-
-        if self.num_attention_layers == 0:
-            raise NotImplementedError(
-                f"Using `DynamicInferenceContext` with no attention is not supported."
-            )
-
         # Block size tokens, bytes.
         dtype_size_bytes = params_dtype.itemsize
         self.block_size_tokens = block_size_tokens
```
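The block removed above maintained a layer map translating a global layer index into a per-type (attention or Mamba) index for hybrid models. As a rough, hedged illustration of that mapping idea only — the helper name and the 'M'/'*' symbols below are assumptions, not the package's `get_layer_maps_from_layer_type_list`:

```python
from typing import Dict, List, Tuple


def build_layer_maps(layer_type_list: List[str]) -> Tuple[Dict[int, int], Dict[int, int]]:
    """Map global layer index -> per-type layer index.

    Assumes 'M' marks Mamba layers and '*' marks attention layers
    (hypothetical symbols used here purely for illustration).
    """
    attention_map: Dict[int, int] = {}
    mamba_map: Dict[int, int] = {}
    for global_idx, layer_type in enumerate(layer_type_list):
        if layer_type == "M":
            mamba_map[global_idx] = len(mamba_map)
        elif layer_type == "*":
            attention_map[global_idx] = len(attention_map)
    return attention_map, mamba_map


# Example: a 5-layer hybrid stack M * M M *
attn_map, mamba_map = build_layer_maps(["M", "*", "M", "M", "*"])
# attn_map  == {1: 0, 4: 1}
# mamba_map == {0: 0, 2: 1, 3: 2}
```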
```diff
@@ -349,38 +297,24 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.block_size_bytes = (
             dtype_size_bytes
             * 2  # key, value
-            * …
+            * num_layers
             * self.block_size_tokens
             * num_attention_heads_per_partition
             * hidden_size_per_attention_head
         )
-        assert self.block_size_bytes > 0
 
         # Adjust buffer to be a multiple of block size.
         buffer_size_bytes = int(buffer_size_gb * 1024**3)
         buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes
         buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem
 
-
-        if self.is_hybrid_model:
-            mamba_states_memory_per_request += math.prod(mamba_conv_states_shape)
-            mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape)
-            mamba_states_memory_per_request *= self.num_mamba_layers
-            mamba_states_memory_per_request *= dtype_size_bytes
-
-        # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size.
+        # Compute max_requets, max_tokens from buffer size and overflow factor.
         def bytes_to_max_requests_and_tokens(n_bytes):
-            …
-            …
-            …
+            n_tokens = n_bytes / self.block_size_bytes * self.block_size_tokens
+            n_requests = n_tokens / max_sequence_length
+            return self.round_up_requests(int(n_requests), tp_size=tp_size), self.round_up_tokens(
+                int(n_tokens), tp_size=tp_size
             )
-            # TODO(ksanthanam): Leave room for an extra request in the event of padding
-            # for non-decode CUDA graphs
-            n_requests = n_bytes / cost_per_request_bytes
-            n_tokens = n_requests * max_sequence_length
-            n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size)
-            n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size)
-            return n_requests, n_tokens
 
         self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes)
         if buffer_overflow_factor is not None:
```
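The new sizing path above derives capacity purely from the KV block size. A worked example of the arithmetic with made-up model dimensions (and without the `round_up_requests` / `round_up_tokens` helpers), to make the numbers concrete:

```python
# Hypothetical dimensions, chosen only to make the arithmetic concrete.
dtype_size_bytes = 2          # bf16
num_layers = 32
block_size_tokens = 256
heads_per_partition = 8       # attention heads on this TP rank
head_dim = 128
max_sequence_length = 4096
buffer_size_gb = 20.0

block_size_bytes = (
    dtype_size_bytes
    * 2                        # key and value
    * num_layers
    * block_size_tokens
    * heads_per_partition
    * head_dim
)                              # = 2 * 2 * 32 * 256 * 8 * 128 = 33,554,432 bytes per block

buffer_size_bytes = int(buffer_size_gb * 1024**3)
buffer_size_bytes -= buffer_size_bytes % block_size_bytes   # trim to whole blocks (640 here)

# Same shape as bytes_to_max_requests_and_tokens, minus the rounding helpers.
n_tokens = buffer_size_bytes / block_size_bytes * block_size_tokens
n_requests = n_tokens / max_sequence_length
print(int(n_tokens), int(n_requests))   # 163840 tokens, 40 requests
```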
```diff
@@ -405,6 +339,7 @@ class DynamicInferenceContext(BaseInferenceContext):
 
         # Initialize context state.
         self.params_dtype = params_dtype
+        self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
 
         # Unified memory.
```
```diff
@@ -455,11 +390,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids)
         self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids)
 
-        # Calculate the total number of …
-        …
-        block_count_total = (
-            max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes
-        )
+        # Calculate the total number of blocks available in the buffer
+        block_count_total = buffer_size_bytes // self.block_size_bytes
 
         # Memory buffer.
         ctx_manager = (
```
```diff
@@ -470,12 +402,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         with ctx_manager:
             if cache_mla_latent:
                 self.memory_buffer = torch.full(
-                    (
-                        self.num_attention_layers,
-                        block_count_total,
-                        self.block_size_tokens,
-                        kv_reduced_dim,
-                    ),
+                    (self.num_layers, block_count_total, self.block_size_tokens, kv_reduced_dim),
                     -1,
                     dtype=self.params_dtype,
                     device=torch.cuda.current_device(),
```
@@ -484,7 +411,7 @@ class DynamicInferenceContext(BaseInferenceContext):
                 self.memory_buffer = torch.full(
                     (
                         2,  # key and value
-                        self.
+                        self.num_layers,
                         block_count_total,
                         self.block_size_tokens,
                         num_attention_heads_per_partition,
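With the hybrid-model bookkeeping gone, the KV cache is one tensor indexed as (2, layer, block, token-in-block, head, head-dim), where index 0/1 on the first axis selects keys/values. A small CPU-only sketch of that layout (the sizes here are made up for illustration; the real buffer is allocated on the current CUDA device):

    import torch

    num_layers, block_count_total, block_size_tokens = 4, 16, 8
    num_heads, head_dim = 2, 16

    memory_buffer = torch.full(
        (2, num_layers, block_count_total, block_size_tokens, num_heads, head_dim),
        -1.0,
        dtype=torch.float16,
    )
    # Keys of layer 3 (1-based) live at [0, 2]; values at [1, 2].
    print(memory_buffer[0, 2].shape)  # torch.Size([16, 8, 2, 16])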
@@ -589,34 +516,14 @@ class DynamicInferenceContext(BaseInferenceContext):
             block_count_total=block_count_total, gtd_block_count=self.gtd_block_count
         )

-        # Optional state tensors for hybrid models
-        if self.is_hybrid_model:
-            self.mamba_metadata = MambaMetadata(max_requests=self.max_requests)
-
-            with ctx_manager:
-                self.mamba_conv_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-                self.mamba_ssm_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-
-        else:
-            self.mamba_metadata = None
-
         # Store the dummy block idx reference for convenience
         self.dummy_block_idx = self.block_allocator.dummy_block_idx

         # Deal with chunked prefill
         self.chunked_prefill_request_id = -1

-        # Reset attention
+        # Reset attention state.
         self.reset_attention_state()
-        self.reset_mamba_state()

         if use_flashinfer_fused_rope is True:
             assert HAVE_FLASHINFER, "flashinfer is not installed"
@@ -721,8 +628,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         """Test if all active requests are in decode phase.

         For a request in prefill phase active_tokens = query length
-        Once the request moves to decode phase active tokens is 1 for that request.
-        So if all active requests are in decode phase, they will be equal to active token count.
+        Once the request moves to decode phase active tokens is 1 for that request. So if all active requests are in decode phase, they will be equal to active token count.
         """
         total_active_requests = self.total_request_count - self.paused_request_count
         return total_active_requests == self.active_token_count
@@ -758,7 +664,11 @@ class DynamicInferenceContext(BaseInferenceContext):

     def get_active_request_count(self):
         """Returns the current number of active requests."""
-
+        active_sequence_lengths = self.get_active_sequence_lengths()
+        max_sequence_lengths = self.get_max_sequence_lengths()
+        active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
+        active_request_count = (active_requests_mask == 1).sum().item()
+        return active_request_count

     def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) -> None:
         """Append to KV cache.
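The new get_active_request_count derives the count from sequence lengths instead of cached bookkeeping: a request stays active while its current length is below its allowed maximum. A standalone sketch of the same computation on made-up lengths:

    import torch

    active_sequence_lengths = torch.tensor([10, 4096, 57, 4096])
    max_sequence_lengths = torch.tensor([4096, 4096, 4096, 4096])

    # Requests that have not yet reached their maximum length are still active.
    active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
    active_request_count = (active_requests_mask == 1).sum().item()
    print(active_request_count)  # 2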
@@ -768,12 +678,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             key (Tensor): Key tensor.
             value (Tensor): Value tensor.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
-
         if triton_append_key_value_cache is not None and not self.cache_mla_latent:
             # currently does not support MLA latent cache
             return triton_append_key_value_cache(
-                layer_number=
+                layer_number=layer_number,
                 key=key,
                 value=value,
                 memory_buffer=self.memory_buffer,
@@ -798,14 +706,14 @@ class DynamicInferenceContext(BaseInferenceContext):
         if self.cache_mla_latent:
             # We pass the kv_concat as the key in cache_mla_latent
             kv_concat = key
-            self.memory_buffer[
+            self.memory_buffer[layer_number - 1, block_idx, local_kv_seq_idx] = kv_concat[
                 : self.padded_active_token_count
             ]
         else:
-            self.memory_buffer[0,
+            self.memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key[
                 : self.padded_active_token_count
             ]
-            self.memory_buffer[1,
+            self.memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value[
                 : self.padded_active_token_count
             ]

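The append path now indexes the unified buffer directly with the 1-based layer_number shifted to a 0-based axis, scattering each token's key/value into its (block, local offset) slot. A toy sketch of that scatter (names and sizes are illustrative, not the class internals):

    import torch

    num_layers, blocks, block_tokens, heads, head_dim = 2, 4, 4, 1, 8
    memory_buffer = torch.zeros(2, num_layers, blocks, block_tokens, heads, head_dim)

    layer_number = 2                              # 1-based layer index
    block_idx = torch.tensor([0, 0, 1])           # destination block per token
    local_kv_seq_idx = torch.tensor([2, 3, 0])    # offset of each token inside its block
    key = torch.randn(3, heads, head_dim)
    value = torch.randn(3, heads, head_dim)

    memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key
    memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value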
@@ -819,30 +727,19 @@ class DynamicInferenceContext(BaseInferenceContext):
             (Tuple[Tensor, Tensor]) The key and value pointer tensors that point
             to blocks within the block-level memory buffer.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
         if self.cache_mla_latent:
             return (
-                self.memory_buffer[
+                self.memory_buffer[layer_number - 1],
                 None,
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )
         else:
             return (
-                self.memory_buffer[0,
-                self.memory_buffer[1,
+                self.memory_buffer[0, layer_number - 1],
+                self.memory_buffer[1, layer_number - 1],
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )

-    def mamba_states_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]:
-        """Returns the Mamba state tensors for the given layer."""
-        assert self.is_hybrid_model, "Only hybrid models have Mamba state tensors"
-
-        mamba_layer_number = self.layer_map[layer_number - 1]
-        conv_state = self.mamba_conv_states[mamba_layer_number]
-        ssm_state = self.mamba_ssm_states[mamba_layer_number]
-
-        return (conv_state, ssm_state)
-
     def apply_fused_qk_rotary_emb(
         self, query: Tensor, key: Tensor, cos_sin_emb: Tensor, config: TransformerConfig
     ) -> Tuple[Tensor, Tensor]:
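Reading the cache follows the same convention: the per-layer key and value views are plain slices of the shared buffer at layer_number - 1, with no layer_map indirection left. A minimal sketch of that lookup:

    import torch

    num_layers, blocks, block_tokens, heads, head_dim = 3, 4, 8, 2, 16
    memory_buffer = torch.zeros(2, num_layers, blocks, block_tokens, heads, head_dim)

    def key_value_cache(layer_number):
        # layer_number is 1-based in the API, hence the -1.
        return memory_buffer[0, layer_number - 1], memory_buffer[1, layer_number - 1]

    key_cache, value_cache = key_value_cache(layer_number=1)
    print(key_cache.shape, value_cache.shape)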
@@ -957,16 +854,6 @@ class DynamicInferenceContext(BaseInferenceContext):
             attn_metadata.reset()
         self.active_attn_metadata = None

-        if self.is_hybrid_model:
-            self.mamba_metadata.reset_cudagraph_mapping()
-
-    def reset_mamba_state(self) -> None:
-        """Reset state used within Mamba layers."""
-        if self.is_hybrid_model:
-            self.mamba_conv_states.fill_(0)
-            self.mamba_ssm_states.fill_(0)
-            self.mamba_metadata.reset()
-
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""
         has_cuda_graphs = self.cuda_graph_token_counts is not None
@@ -1090,17 +977,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         )
         # All attention metadata calculations are now handled by MHAMetadata.update()

-        # Create Mamba state block table if it's a hybrid model
-        if self.is_hybrid_model:
-            active_mamba_indices = self.mamba_metadata.request_to_mamba_state_idx[
-                self.paused_request_count : self.total_request_count
-            ]
-
-            if self.is_decode_only() or self.using_cuda_graph_this_step():
-                self.mamba_metadata.update_cudagraph_mapping(
-                    active_mamba_indices, self.total_request_count - self.paused_request_count
-                )
-
     def reset(self) -> None:
         """Reset entire context.

@@ -1142,13 +1018,15 @@ class DynamicInferenceContext(BaseInferenceContext):

         # Reset available block count.
         self.reset_attention_state()
-        self.reset_mamba_state()
         self.block_allocator.reset()
         self.request_to_kv_block_ids.fill_(-1)

         # Reset chunked prefill state
         self.chunked_prefill_request_id = -1

+        # Reset chunked prefill state
+        self.chunked_prefill_request_id = -1
+
     def current_input_and_position_ids(
         self, *, num_warmup_tokens: Optional[int] = None
     ) -> Tuple[Tensor, Tensor]:
@@ -1320,18 +1198,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_local_position_within_kv_block[
             self.active_token_count : self.active_token_count + chunk_length
         ] = (token_offset_range % self.block_size_tokens)
-
-        if self.is_hybrid_model and not is_chunked_prefill:
-            # Allocate a slot for Mamba states
-            mamba_idx = self.mamba_metadata.allocate_slot()
-            if mamba_idx is None:
-                raise ContextOverflowError(req.request_id, "No Mamba slots available")
-
-            # Initialize the allocated Mamba state
-            self.mamba_conv_states[:, mamba_idx] = 0.0
-            self.mamba_ssm_states[:, mamba_idx] = 0.0
-            self.mamba_metadata.request_to_mamba_state_idx[self.total_request_count] = mamba_idx
-
         self.active_token_count += chunk_length
         self.total_request_count += 0 if req.finished_chunk_token_count > 0 else 1

@@ -1350,11 +1216,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.request_last_kv_block_id[dst_idxs] = self.request_last_kv_block_id[src_idxs]
         self.request_last_kv_block_offset[dst_idxs] = self.request_last_kv_block_offset[src_idxs]

-        if self.is_hybrid_model:
-            self.mamba_metadata.request_to_mamba_state_idx[dst_idxs] = (
-                self.mamba_metadata.request_to_mamba_state_idx[src_idxs]
-            )
-
     def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         """
         Swaps all the relevent booking tensors with src idxs to dst idxs
@@ -1369,9 +1230,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tensor_swap(self.request_last_kv_block_id, src_idxs, dst_idxs)
         tensor_swap(self.request_last_kv_block_offset, src_idxs, dst_idxs)

-        if self.is_hybrid_model:
-            tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)
-
     # TODO: see if we can compile this function
     def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
         """Update context state after calling engine.step().
@@ -1443,17 +1301,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
             self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)

-            if self.is_hybrid_model:
-                self.mamba_metadata.free_slots(finished_idxs)
-
             # Reset request/token counts.
             self.request_to_kv_block_ids.fill_(-1)
             self.total_request_count = 0
             self.active_token_count = 0
-
-            # Reset Mamba state.
-            self.reset_mamba_state()
-
             return

         # 3. Concatenate the paused tokens to the active tokens if present.
@@ -1481,10 +1332,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         # and updates it instead of the original tensor.
         self.request_to_kv_block_ids[finished_idxs] = -1

-        if self.is_hybrid_model:
-            # Get the Mamba state indices for finished requests and free them
-            self.mamba_metadata.free_slots(finished_idxs)
-
         if active_request_count > 0:
             finished_idxs_on_left = (
                 torch.nonzero(active_requests_mask[:active_request_count] == 0, as_tuple=True)[
@@ -1504,10 +1351,8 @@ class DynamicInferenceContext(BaseInferenceContext):
                 next_tokens=next_tokens,
             )

-            # Reset
+            # Reset block ids for recently moved requests.
             self.request_to_kv_block_ids[active_idxs_on_right] = -1
-            if self.is_hybrid_model:
-                self.mamba_metadata.request_to_mamba_state_idx[active_idxs_on_right] = -1

         # 5. We identify requests that require a new block and add them to the paused requests (i.e move them left) :-
         # a) Put requests that have filled their current block and require a new one in a pause state temporarily
@@ -1605,7 +1450,6 @@ class DynamicInferenceContext(BaseInferenceContext):

         # 7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
         self.total_request_count = active_request_count + self.paused_request_count
-
         # All these active requests are in decode phase, so they need only 1 token per request
         self.active_token_count = active_request_count
         # Always the first section of token input ids are only used.
@@ -119,8 +119,8 @@ def triton_append_key_value_cache(

     _, num_heads, h_dim = key.shape

-    key_cache = memory_buffer[0, layer_number]
-    value_cache = memory_buffer[1, layer_number]
+    key_cache = memory_buffer[0, layer_number - 1]
+    value_cache = memory_buffer[1, layer_number - 1]

     key_to_cache = key[:n_tokens]
     value_to_cache = value[:n_tokens]
@@ -1,8 +1,6 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

-import faulthandler
 import logging
-import signal
 from collections import deque
 from itertools import cycle
 from multiprocessing import Event
@@ -25,11 +23,6 @@ try:
 except:
     HAVE_MSGPACK = False

-# Register faulthandler to emit stack traces upon process kill.
-faulthandler.enable()
-faulthandler.register(signal.SIGTERM, all_threads=False, chain=True)
-faulthandler.register(signal.SIGINT, all_threads=False, chain=True)
-

 class DataParallelInferenceCoordinator:
     """
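For reference, the module-level registration deleted above follows the standard faulthandler pattern: once registered, the interpreter prints Python tracebacks when the process receives the given signals. A minimal sketch of that pattern (not the coordinator's code; faulthandler.register is unavailable on Windows):

    import faulthandler
    import signal

    faulthandler.enable()  # dump tracebacks on fatal errors (SIGSEGV, SIGABRT, ...)
    faulthandler.register(signal.SIGTERM, all_threads=False, chain=True)
    faulthandler.register(signal.SIGINT, all_threads=False, chain=True)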
@@ -33,8 +33,8 @@ from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.inference.utils import Counter
-from megatron.core.utils import get_asyncio_loop
+from megatron.core.inference.utils import Counter
+from megatron.core.utils import get_asyncio_loop

 try:
     from tqdm import tqdm
@@ -293,11 +293,7 @@ class DynamicInferenceEngine(AbstractEngine):
         self.capture_stats = capture_stats

     async def start_listening_to_data_parallel_coordinator(
-        self,
-        inference_coordinator_port: int,
-        launch_inference_coordinator: bool = True,
-        *,
-        loop: Optional[asyncio.AbstractEventLoop] = None,
+        self, inference_coordinator_port: int, launch_inference_coordinator: bool = True
     ):
         """Initializes ZMQ communication to connect the engine with an inference coordinator.

@@ -411,14 +407,12 @@ class DynamicInferenceEngine(AbstractEngine):
         torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group())

         if launch_inference_coordinator and torch.distributed.get_rank() == 0:
-
+            coordinator_ready_event.wait()
             logging.info("Inference co-ordinator is ready to receive requests!")

         # Finally run the engine infinite loop
-
-        self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop))
+        self.engine_loop_task = asyncio.create_task(self.run_engine_with_coordinator())

-    @trace_async_exceptions
     async def _notify_cond_for_new_request(self):
         """Helper function to notify condition variable when a new request is added."""
         async with self._cond:
@@ -472,7 +466,7 @@ class DynamicInferenceEngine(AbstractEngine):
         self.waiting_request_ids.append(request_id)

         # Create a new asyncio Future to notify the user when the request has completed.
-        self.request_completion_futures[request_id] =
+        self.request_completion_futures[request_id] = asyncio.Future()
         return self.request_completion_futures[request_id]

     def add_request(
@@ -647,7 +641,7 @@ class DynamicInferenceEngine(AbstractEngine):
             if request_can_be_added and request_tokens_can_be_added and kv_cache_available:
                 self.context.add_request(req)
                 self._loop.call_soon_threadsafe(
-
+                    asyncio.create_task, self._notify_cond_for_new_request()
                 )
                 req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
                 req.add_event_add()
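This and the following scheduling hunks hand the notification coroutine to the event loop with loop.call_soon_threadsafe(asyncio.create_task, coro), which is the thread-safe way to spawn a task on a loop that may be running in another thread. A small self-contained sketch of that pattern:

    import asyncio

    async def notify():
        print("new request added")

    async def main():
        loop = asyncio.get_running_loop()
        # Queue task creation on the loop itself; safe to call from other threads.
        loop.call_soon_threadsafe(asyncio.create_task, notify())
        await asyncio.sleep(0.01)  # give the spawned task a chance to run

    asyncio.run(main())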
@@ -708,7 +702,7 @@ class DynamicInferenceEngine(AbstractEngine):

         # is_continuing_chunked_prefill is True if we are scheduling next
         # chunk of a existing chunked prefill request
-        is_continuing_chunked_prefill = self.context.chunked_prefill_request_id
+        is_continuing_chunked_prefill = self.context.chunked_prefill_request_id > 0

         # Use remaining prompt tokens for scheduling decisions
         remaining_len = len(req.remaining_prompt_tokens)
@@ -726,7 +720,7 @@ class DynamicInferenceEngine(AbstractEngine):
             self.context.chunked_prefill_request_id = -1
             self.context.add_request(req)
             self._loop.call_soon_threadsafe(
-
+                asyncio.create_task, self._notify_cond_for_new_request()
             )
             req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
             req.add_event_add()
@@ -738,7 +732,7 @@ class DynamicInferenceEngine(AbstractEngine):
             chunk_length = self.context.max_tokens - self.context.active_token_count
             self.context.add_request(req, chunk_length=chunk_length)
             self._loop.call_soon_threadsafe(
-
+                asyncio.create_task, self._notify_cond_for_new_request()
             )
             self.context.chunked_prefill_request_id = req.request_id
             req.remaining_prompt_tokens = req.remaining_prompt_tokens[chunk_length:]
@@ -945,7 +939,7 @@ class DynamicInferenceEngine(AbstractEngine):
             result = self.step_modern()
             finished_requests_list.extend(result["finished_requests"])

-        # Ensure requests are returned in the same order they were passed in
+        # Ensure requests are returned in the same order they were passed in.
         finished_requests_list.sort(key=lambda x: x.request_id)

         return finished_requests_list
@@ -1045,12 +1039,8 @@ class DynamicInferenceEngine(AbstractEngine):
         self.zmq_context.term()
         parallel_state.destroy_model_parallel()

-
-    async def run_engine(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
-    ):
+    async def run_engine(self, *, verbose: Optional[bool] = False):
         """Continually steps the engine asynchronously."""
-        self._loop = get_asyncio_loop(loop)
         try:
             while True:
                 # Wait until there are active requests before proceeding.
@@ -1064,12 +1054,8 @@ class DynamicInferenceEngine(AbstractEngine):
         except asyncio.CancelledError:
             pass

-
-    async def run_engine_with_coordinator(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
-    ):
+    async def run_engine_with_coordinator(self, *, verbose: Optional[bool] = False):
         """Continually steps the engine asynchronously."""
-        self._loop = get_asyncio_loop(loop)
         try:
             while True:
                 self.schedule_requests()
@@ -17,7 +17,6 @@ from megatron.core.inference.scheduler import Scheduler
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.utils import get_asyncio_loop

 try:
     from tqdm import tqdm
@@ -218,6 +217,11 @@ class StaticInferenceEngine(AbstractEngine):
             generated tokens, texts and log probs if required
         """
         assert hasattr(self, 'dynamic_engine'), "Dynamic engine not initialized"
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:  # 'RuntimeError: There is no current event loop...'
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)

         if common_inference_params:
             sampling_params = common_inference_params
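The added try/except mirrors a common pattern for code that must work both inside and outside a running event loop: reuse the current loop when one exists, otherwise create and register a new one. A standalone sketch of that fallback (the helper name here is hypothetical):

    import asyncio

    def get_or_create_event_loop():
        try:
            return asyncio.get_running_loop()
        except RuntimeError:  # no event loop is running in this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    loop = get_or_create_event_loop()
    print(loop.run_until_complete(asyncio.sleep(0, result="ok")))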
@@ -381,8 +385,8 @@ class StaticInferenceEngine(AbstractEngine):
         torch.cuda.set_device(cuda_device)
         self.run_engine()

-    async def run_engine_async(self
+    async def run_engine_async(self):
         """Runs the engine asynchronously using asyncio"""
-        loop =
+        loop = asyncio.get_running_loop()

         await loop.run_in_executor(None, self._wrapped_run_engine, torch.cuda.current_device())