megatron-core 0.16.0rc0.dev122519__tar.gz → 0.16.0rc0.dev123313__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (363)
  1. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/PKG-INFO +14 -7
  2. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/megatron_tokenizer.py +9 -0
  3. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fp8_utils.py +49 -0
  4. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/async_stream.py +8 -2
  5. megatron_core-0.16.0rc0.dev123313/megatron/core/inference/contexts/attention_context/mamba_metadata.py +106 -0
  6. megatron_core-0.16.0rc0.dev123313/megatron/core/inference/contexts/attention_context/metadata_base.py +72 -0
  7. megatron_core-0.16.0rc0.dev123313/megatron/core/inference/contexts/attention_context/mha_metadata.py +220 -0
  8. megatron_core-0.16.0rc0.dev123313/megatron/core/inference/contexts/dynamic_block_allocator.py +118 -0
  9. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/contexts/dynamic_context.py +442 -284
  10. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/contexts/fused_kv_append_kernel.py +2 -2
  11. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/data_parallel_inference_coordinator.py +7 -0
  12. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/engines/dynamic_engine.py +125 -21
  13. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/engines/static_engine.py +4 -8
  14. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/inference_client.py +3 -1
  15. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/sampling_params.py +1 -0
  16. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +7 -7
  17. megatron_core-0.16.0rc0.dev123313/megatron/core/inference/unified_memory.py +127 -0
  18. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/utils.py +28 -0
  19. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/gpt_model.py +2 -5
  20. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mamba/mamba_model.py +30 -1
  21. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/package_info.py +1 -1
  22. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/safe_globals.py +2 -0
  23. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mamba_block.py +16 -25
  24. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +29 -2
  25. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mamba_layer.py +5 -5
  26. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mamba_mixer.py +301 -57
  27. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/attention.py +14 -3
  28. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/cuda_graphs.py +5 -1
  29. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/dot_product_attention.py +2 -0
  30. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/router.py +2 -0
  31. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/pipeline_parallel_layer_layout.py +5 -2
  32. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/utils.py +143 -1
  33. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron_core.egg-info/PKG-INFO +14 -7
  34. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron_core.egg-info/SOURCES.txt +3 -0
  35. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron_core.egg-info/requires.txt +13 -6
  36. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/pyproject.toml +13 -6
  37. megatron_core-0.16.0rc0.dev122519/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -92
  38. megatron_core-0.16.0rc0.dev122519/megatron/core/inference/unified_memory.py +0 -89
  39. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/MANIFEST.in +0 -0
  40. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/README.md +0 -0
  41. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/README.md +0 -0
  42. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/__init__.py +0 -0
  43. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/activations.py +0 -0
  44. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/config.py +0 -0
  45. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/config_logger.py +0 -0
  46. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/__init__.py +0 -0
  47. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/bert_dataset.py +0 -0
  48. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/blended_dataset.py +0 -0
  49. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  50. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  51. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/gpt_dataset.py +0 -0
  52. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/helpers.cpp +0 -0
  53. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/helpers.py +0 -0
  54. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/indexed_dataset.py +0 -0
  55. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/masked_dataset.py +0 -0
  56. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/megatron_dataset.py +0 -0
  57. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/multimodal_dataset.py +0 -0
  58. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/object_storage_utils.py +0 -0
  59. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/__init__.py +0 -0
  60. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/config/__init__.py +0 -0
  61. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  62. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/config/config.py +0 -0
  63. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  64. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  65. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/db/__init__.py +0 -0
  66. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/db/build.py +0 -0
  67. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/db/dataset.py +0 -0
  68. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/db/utils.py +0 -0
  69. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/external_libs.py +0 -0
  70. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/__init__.py +0 -0
  71. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/build.py +0 -0
  72. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/factory.py +0 -0
  73. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/index.py +0 -0
  74. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  75. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  76. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  77. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/utils.py +0 -0
  78. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/index/validate.py +0 -0
  79. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/__init__.py +0 -0
  80. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  81. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  82. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/query.py +0 -0
  83. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  84. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/query/utils.py +0 -0
  85. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/retro/utils.py +0 -0
  86. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/t5_dataset.py +0 -0
  87. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/utils.py +0 -0
  88. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/datasets/utils_s3.py +0 -0
  89. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/__init__.py +0 -0
  90. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/core.py +0 -0
  91. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  92. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  93. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/mapping.py +0 -0
  94. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  95. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/serialization.py +0 -0
  96. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  97. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  98. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  99. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  100. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  101. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  102. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  103. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  104. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  105. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  106. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  107. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  108. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  109. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  110. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  111. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  112. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/utils.py +0 -0
  113. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/dist_checkpointing/validation.py +0 -0
  114. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/__init__.py +0 -0
  115. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/data_parallel_base.py +0 -0
  116. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  117. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  118. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/finalize_model_grads.py +0 -0
  119. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/__init__.py +0 -0
  120. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  121. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  122. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  123. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  124. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  125. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  126. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  127. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  128. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  129. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  130. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  131. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  132. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  133. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  134. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/energy_monitor.py +0 -0
  135. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/enums.py +0 -0
  136. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/__init__.py +0 -0
  137. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/data_type.py +0 -0
  138. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/export_config.py +0 -0
  139. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/model_type.py +0 -0
  140. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/__init__.py +0 -0
  141. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  142. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  143. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  144. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  145. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  146. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  147. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  148. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  149. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  150. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  151. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  152. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  153. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/extensions/__init__.py +0 -0
  154. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/extensions/kitchen.py +0 -0
  155. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/extensions/transformer_engine.py +0 -0
  156. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  157. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fp4_utils.py +0 -0
  158. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/full_cuda_graph.py +0 -0
  159. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/__init__.py +0 -0
  160. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  161. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  162. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  163. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  164. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  165. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_indices_converter.py +0 -0
  166. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_layer_norm.py +0 -0
  167. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  168. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  169. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_softmax.py +0 -0
  170. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  171. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/hyper_comm_grid.py +0 -0
  172. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/__init__.py +0 -0
  173. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/common_inference_params.py +0 -0
  174. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/communication_utils.py +0 -0
  175. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/contexts/__init__.py +0 -0
  176. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/contexts/base_context.py +0 -0
  177. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/contexts/static_context.py +0 -0
  178. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/engines/__init__.py +0 -0
  179. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/engines/abstract_engine.py +0 -0
  180. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/engines/mcore_engine.py +0 -0
  181. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/headers.py +0 -0
  182. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/inference_request.py +0 -0
  183. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  184. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  185. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  186. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  187. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  188. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  189. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  190. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  191. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/scheduler.py +0 -0
  192. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  193. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  194. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  195. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  196. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  197. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  198. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  199. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  200. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  201. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  202. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/inference_params.py +0 -0
  203. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/jit.py +0 -0
  204. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/model_parallel_config.py +0 -0
  205. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/T5/__init__.py +0 -0
  206. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/T5/t5_model.py +0 -0
  207. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/T5/t5_spec.py +0 -0
  208. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/__init__.py +0 -0
  209. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/backends.py +0 -0
  210. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/bert/__init__.py +0 -0
  211. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  212. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/bert/bert_lm_head.py +0 -0
  213. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/bert/bert_model.py +0 -0
  214. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/bert/pooler.py +0 -0
  215. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/__init__.py +0 -0
  216. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/__init__.py +0 -0
  217. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  218. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  219. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  220. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  221. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  222. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/language_module/__init__.py +0 -0
  223. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/language_module/language_module.py +0 -0
  224. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  225. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/vision_module/__init__.py +0 -0
  226. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  227. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/__init__.py +0 -0
  228. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  229. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  230. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  231. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  232. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/huggingface/__init__.py +0 -0
  233. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/huggingface/clip_model.py +0 -0
  234. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/huggingface/module.py +0 -0
  235. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/huggingface/qwen_model.py +0 -0
  236. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mamba/__init__.py +0 -0
  237. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  238. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/__init__.py +0 -0
  239. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/config/__init__.py +0 -0
  240. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/config/base_configs.py +0 -0
  241. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/model/__init__.py +0 -0
  242. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/model/base.py +0 -0
  243. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/submodules/audio.py +0 -0
  244. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/submodules/base.py +0 -0
  245. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/mimo/submodules/vision.py +0 -0
  246. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/multimodal/__init__.py +0 -0
  247. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/multimodal/context_parallel.py +0 -0
  248. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/multimodal/llava_model.py +0 -0
  249. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/multimodal/llava_spec.py +0 -0
  250. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/__init__.py +0 -0
  251. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/base_attention.py +0 -0
  252. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/config.py +0 -0
  253. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/decoder_attention.py +0 -0
  254. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/decoder_spec.py +0 -0
  255. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/encoder_attention.py +0 -0
  256. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/encoder_spec.py +0 -0
  257. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/model.py +0 -0
  258. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/retro/utils.py +0 -0
  259. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/vision/__init__.py +0 -0
  260. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/vision/clip_vit_model.py +0 -0
  261. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/vision/multimodal_projector.py +0 -0
  262. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/vision/radio.py +0 -0
  263. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  264. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/msc_utils.py +0 -0
  265. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/nccl_allocator.py +0 -0
  266. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/num_microbatches_calculator.py +0 -0
  267. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/__init__.py +0 -0
  268. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/clip_grads.py +0 -0
  269. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  270. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  271. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  272. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/grad_scaler.py +0 -0
  273. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/optimizer.py +0 -0
  274. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer/optimizer_config.py +0 -0
  275. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/optimizer_param_scheduler.py +0 -0
  276. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/packed_seq_params.py +0 -0
  277. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/parallel_state.py +0 -0
  278. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/__init__.py +0 -0
  279. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  280. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  281. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  282. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/schedules.py +0 -0
  283. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/pipeline_parallel/utils.py +0 -0
  284. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/__init__.py +0 -0
  285. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/__init__.py +0 -0
  286. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  287. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  288. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  289. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/layers.py +0 -0
  290. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  291. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  292. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/process_groups_config.py +0 -0
  293. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/quantization/__init__.py +0 -0
  294. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/quantization/quant_config.py +0 -0
  295. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/quantization/utils.py +0 -0
  296. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/requirements.txt +0 -0
  297. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/rerun_state_machine.py +0 -0
  298. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/__init__.py +0 -0
  299. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  300. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/mlp_layer.py +0 -0
  301. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/ssm/triton_cache_manager.py +0 -0
  302. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/__init__.py +0 -0
  303. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  304. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/data.py +0 -0
  305. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/layers.py +0 -0
  306. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/mappings.py +0 -0
  307. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/random.py +0 -0
  308. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tensor_parallel/utils.py +0 -0
  309. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/timers.py +0 -0
  310. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/__init__.py +0 -0
  311. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/__init__.py +0 -0
  314. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  315. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  318. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  323. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  324. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  325. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  326. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  327. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  328. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  329. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  330. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  331. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  332. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/__init__.py +0 -0
  333. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  334. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  335. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/enums.py +0 -0
  336. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  337. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  338. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  339. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/identity_op.py +0 -0
  340. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/mlp.py +0 -0
  341. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/module.py +0 -0
  342. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/__init__.py +0 -0
  343. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/experts.py +0 -0
  344. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  345. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  346. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/moe_layer.py +0 -0
  347. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/moe_utils.py +0 -0
  348. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/shared_experts.py +0 -0
  349. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  350. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  351. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/multi_latent_attention.py +0 -0
  352. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/multi_token_prediction.py +0 -0
  353. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/spec_utils.py +0 -0
  354. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/torch_layer_norm.py +0 -0
  355. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/torch_norm.py +0 -0
  356. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/transformer_block.py +0 -0
  357. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/transformer_config.py +0 -0
  358. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/transformer_layer.py +0 -0
  359. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron/core/transformer/utils.py +0 -0
  360. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron_core.egg-info/dependency_links.txt +0 -0
  361. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/megatron_core.egg-info/top_level.txt +0 -0
  362. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/setup.cfg +0 -0
  363. {megatron_core-0.16.0rc0.dev122519 → megatron_core-0.16.0rc0.dev123313}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev122519
3
+ Version: 0.16.0rc0.dev123313
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -41,7 +41,7 @@ Requires-Dist: transformers; extra == "mlm"
41
41
  Provides-Extra: dev
42
42
  Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
43
43
  Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
44
- Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
44
+ Requires-Dist: nvidia-resiliency-ext; extra == "dev"
45
45
  Requires-Dist: tqdm; extra == "dev"
46
46
  Requires-Dist: einops~=0.8; extra == "dev"
47
47
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
@@ -59,13 +59,20 @@ Requires-Dist: wget; extra == "dev"
59
59
  Requires-Dist: onnxscript; extra == "dev"
60
60
  Provides-Extra: lts
61
61
  Requires-Dist: tqdm; extra == "lts"
62
- Requires-Dist: einops; extra == "lts"
63
- Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
64
- Requires-Dist: nvtx; extra == "lts"
65
- Requires-Dist: transformers; extra == "lts"
66
- Requires-Dist: zarr; extra == "lts"
62
+ Requires-Dist: einops~=0.8; extra == "lts"
63
+ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "lts"
64
+ Requires-Dist: nvtx~=0.2; extra == "lts"
65
+ Requires-Dist: multi-storage-client~=0.27; extra == "lts"
66
+ Requires-Dist: opentelemetry-api~=1.33.1; extra == "lts"
67
67
  Requires-Dist: setuptools<80.0.0; extra == "lts"
68
+ Requires-Dist: mamba-ssm~=2.2; extra == "lts"
69
+ Requires-Dist: causal-conv1d~=1.5; extra == "lts"
70
+ Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
71
+ Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
72
+ Requires-Dist: av<16.0.0; extra == "lts"
73
+ Requires-Dist: flashinfer-python; extra == "lts"
68
74
  Requires-Dist: wget; extra == "lts"
75
+ Requires-Dist: onnxscript; extra == "lts"
69
76
 
70
77
  <div align="center">
71
78
 
@@ -1,11 +1,14 @@
1
1
  # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
2
  import json
3
+ import logging
3
4
  from abc import ABC, abstractmethod
4
5
  from collections import OrderedDict
5
6
  from typing import Any
6
7
 
7
8
  import numpy
8
9
 
10
+ logger = logging.getLogger(__name__)
11
+
9
12
 
10
13
  class MegatronLegacyTokenizer(ABC):
11
14
  """Abstract class for tokenizer
@@ -20,6 +23,12 @@ class MegatronLegacyTokenizer(ABC):
20
23
  """
21
24
 
22
25
  def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
26
+ # Deprecation warning
27
+ logger.warning(
28
+ "You’re using the legacy tokenizer system, which is deprecated "
29
+ "and will be removed in a future release. Please migrate to the new tokenizer system "
30
+ "(`megatron.core.tokenizers.MegatronTokenizer`)."
31
+ )
23
32
  self.unique_identifiers = OrderedDict()
24
33
  self.unique_identifiers["class"] = type(self).__name__
25
34
  self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
@@ -10,6 +10,12 @@ from typing import List, Optional
10
10
  import torch
11
11
 
12
12
  from megatron.core.enums import Fp8Recipe
13
+ from megatron.core.tensor_parallel import (
14
+ ColumnParallelLinear,
15
+ RowParallelLinear,
16
+ gather_from_sequence_parallel_region,
17
+ reduce_scatter_to_sequence_parallel_region,
18
+ )
13
19
  from megatron.core.transformer.transformer_config import TransformerConfig
14
20
  from megatron.core.utils import get_te_version, is_te_min_version
15
21
 
@@ -112,6 +118,27 @@ def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int:
112
118
  return 16
113
119
 
114
120
 
121
+ def is_column_parallel_linear(module):
122
+ """Returns whether the given module is a ColumnParallelLinear layer."""
123
+ if HAVE_TE and (
124
+ isinstance(module, TEColumnParallelLinear)
125
+ or isinstance(module, TELayerNormColumnParallelLinear)
126
+ ):
127
+ return True
128
+ elif isinstance(module, ColumnParallelLinear):
129
+ return True
130
+ return False
131
+
132
+
133
+ def is_row_parallel_linear(module):
134
+ """Returns whether the given module is a RowParallelLinear layer."""
135
+ if HAVE_TE and isinstance(module, TERowParallelLinear):
136
+ return True
137
+ elif isinstance(module, RowParallelLinear):
138
+ return True
139
+ return False
140
+
141
+
115
142
  """
116
143
  The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into
117
144
  several functions. It provides different implementations for each function based on different
@@ -587,6 +614,18 @@ if HAVE_TE:
587
614
  if not FP8GlobalStateManager.is_fp8_enabled():
588
615
  return original_forward(input_tensor, *args, **kwargs)
589
616
 
617
+ # With sequence parallelism we need to all-gather before padding
618
+ # and reduce-scatter after unpadding
619
+ if is_sequence_parallel := getattr(module, "sequence_parallel", False):
620
+ if is_column_parallel_linear(module):
621
+ input_tensor = gather_from_sequence_parallel_region(
622
+ input_tensor, group=module.tp_group
623
+ )
624
+
625
+ # Disable sequence parallelism on the module because we are handling the
626
+ # all-gather and reduce-scatter externally
627
+ module.sequence_parallel = False
628
+
590
629
  seq_len, batch_size, hidden_size = input_tensor.shape
591
630
  # Reshape to (S, B*H) to pad sequence dimension
592
631
  input_2d = input_tensor.reshape(seq_len, -1)
@@ -612,6 +651,16 @@ if HAVE_TE:
612
651
  unpadded_output_2d = _unpad_func(output_2d, [seq_len])
613
652
  unpadded_output = unpadded_output_2d.reshape(seq_len, batch_size, output_hidden_size)
614
653
 
654
+ if is_sequence_parallel:
655
+ # Reduce-scatter after unpadding
656
+ if is_row_parallel_linear(module):
657
+ unpadded_output = reduce_scatter_to_sequence_parallel_region(
658
+ unpadded_output, group=module.tp_group
659
+ )
660
+
661
+ # Reset sequence parallelism flag on the module
662
+ module.sequence_parallel = True
663
+
615
664
  if other_outputs:
616
665
  return (unpadded_output,) + other_outputs
617
666
  else:
@@ -9,6 +9,7 @@ import asyncio
9
9
  from typing import Any, AsyncGenerator, Callable, Optional, Type, Union
10
10
 
11
11
  from megatron.core.inference.inference_request import InferenceRequest
12
+ from megatron.core.utils import get_asyncio_loop
12
13
 
13
14
  STOP_ITERATION = Exception()
14
15
 
@@ -20,12 +21,17 @@ class AsyncStream:
20
21
  Adopted from https://github.com/vllm-project/vllm/blob/eb881ed006ca458b052905e33f0d16dbb428063a/vllm/v1/engine/async_stream.py # pylint: disable=line-too-long
21
22
  """
22
23
 
23
- def __init__(self, request_id: int, cancel: Callable[[str], None]) -> None:
24
+ def __init__(
25
+ self,
26
+ request_id: int,
27
+ cancel: Callable[[str], None],
28
+ loop: Optional[asyncio.AbstractEventLoop] = None,
29
+ ) -> None:
24
30
  self._request_id = request_id
25
31
  self._cancel = cancel
26
32
  self._queue: asyncio.Queue = asyncio.Queue()
27
33
  self._finished = False
28
- self._loop = asyncio.get_running_loop()
34
+ self._loop = get_asyncio_loop(loop)
29
35
 
30
36
  def put(self, item: Union[InferenceRequest, Exception]) -> None:
31
37
  """Adds a new value to the stream"""
@@ -0,0 +1,106 @@
1
+ # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+
3
+ import torch
4
+
5
+
6
+ class MambaMetadata:
7
+ """Manages the metadata tensors required for Mamba layers during inference."""
8
+
9
+ def __init__(self, max_requests: int):
10
+ """
11
+ Initializes the Mamba slot allocator.
12
+
13
+ Args:
14
+ max_requests (int): The maximum number of concurrent requests.
15
+ """
16
+ self.max_requests = max_requests
17
+
18
+ # Metadata for mapping requests to slots in the static Mamba state buffer
19
+ self.request_to_mamba_state_idx = torch.full(
20
+ (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
21
+ )
22
+
23
+ # Separate mapping used only for CUDA graph compatibility
24
+ self.request_to_mamba_state_idx_cudagraph_only = torch.full(
25
+ (self.max_requests,), -1, dtype=torch.int32, device=torch.cuda.current_device()
26
+ )
27
+
28
+ # Allocator for Mamba state slots
29
+ self.mamba_state_free_slots = torch.arange(
30
+ self.max_requests, dtype=torch.int32, device=torch.cuda.current_device()
31
+ )
32
+ self.mamba_state_free_slot_count = self.max_requests
33
+
34
+ def reset(self) -> None:
35
+ """
36
+ Resets all Mamba states and frees all allocated slots.
37
+ """
38
+ self.request_to_mamba_state_idx.fill_(-1)
39
+ self.request_to_mamba_state_idx_cudagraph_only.fill_(-1)
40
+
41
+ # Re-initialize the free slot pool
42
+ self.mamba_state_free_slots = torch.arange(
43
+ self.max_requests, dtype=torch.int32, device=torch.cuda.current_device()
44
+ )
45
+ self.mamba_state_free_slot_count = self.max_requests
46
+
47
+ def reset_cudagraph_mapping(self) -> None:
48
+ """
49
+ Resets only the CUDA graph mapping tensor.
50
+ """
51
+ self.request_to_mamba_state_idx_cudagraph_only.fill_(-1)
52
+
53
+ def update_cudagraph_mapping(
54
+ self, active_mamba_indices: torch.Tensor, num_active_requests: int
55
+ ) -> None:
56
+ """
57
+ Updates the dedicated CUDA graph mapping tensor with the indices
58
+ of currently active requests.
59
+
60
+ Args:
61
+ active_mamba_indices (Tensor): Tensor containing the Mamba slot indices
62
+ for active requests.
63
+ num_active_requests (int): The number of active requests.
64
+ """
65
+ self.request_to_mamba_state_idx_cudagraph_only[0:num_active_requests] = active_mamba_indices
66
+
67
+ def allocate_slot(self) -> int:
68
+ """
69
+ Allocates a new slot for a request in the Mamba state buffers.
70
+
71
+ Returns:
72
+ int: The index of the allocated slot.
73
+ Returns None if no slots are available.
74
+ """
75
+ if self.mamba_state_free_slot_count == 0:
76
+ return None
77
+
78
+ # Get a free slot
79
+ self.mamba_state_free_slot_count -= 1
80
+ mamba_idx = self.mamba_state_free_slots[self.mamba_state_free_slot_count]
81
+
82
+ return mamba_idx
83
+
84
+ def free_slots(self, request_indices: torch.Tensor) -> None:
85
+ """
86
+ Frees the Mamba state slots associated with the given request indices.
87
+
88
+ Args:
89
+ request_indices (Tensor): A 1D tensor of request indices to free.
90
+ """
91
+ # Get the Mamba state indices for finished requests
92
+ mamba_indices_to_free = self.request_to_mamba_state_idx[request_indices]
93
+
94
+ # Filter out any invalid indices (e.g., -1)
95
+ mamba_indices_to_free = mamba_indices_to_free[mamba_indices_to_free != -1]
96
+ num_to_free = len(mamba_indices_to_free)
97
+
98
+ if num_to_free > 0:
99
+ # Add the freed indices back to the free slot pool
100
+ start_idx = self.mamba_state_free_slot_count
101
+ end_idx = start_idx + num_to_free
102
+ self.mamba_state_free_slots[start_idx:end_idx] = mamba_indices_to_free
103
+ self.mamba_state_free_slot_count = end_idx
104
+
105
+ # Invalidate the Mamba state index for the finished requests
106
+ self.request_to_mamba_state_idx[request_indices] = -1
@@ -0,0 +1,72 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+
3
+
4
+ class MetadataBase:
5
+ """
6
+ Base class for attention metadata.
7
+ High-performance attention kernels often require input metadata in specific
8
+ formats—such as cumulative query lengths, cumulative key/value lengths,
9
+ and similar structures. Moreover, when using CUDA Graphs, these metadata
10
+ buffers must be statically allocated. This class serves as a unified container
11
+ that manages all such metadata in one place.
12
+ """
13
+
14
+ def __init__(self):
15
+ """
16
+ Initialize the metadata.
17
+ """
18
+ self.state_data = {}
19
+
20
+ def update(self, *args, **kwargs):
21
+ """
22
+ Construct the metadata from request states.
23
+ """
24
+ pass
25
+
26
+ def reset(self):
27
+ """
28
+ Reset the metadata.
29
+ """
30
+ pass
31
+
32
+ def tensor_copy_and_pad(
33
+ self,
34
+ tensor_buf,
35
+ unpadded_tensor,
36
+ real_batch_size,
37
+ padded_batch_size,
38
+ is_cumulative_tensor=False,
39
+ pad_value=0,
40
+ ):
41
+ """
42
+ Copy the unpadded tensor to the tensor_buf,
43
+ pad the tensor_buf with zero or the last value of the tensor,
44
+ depending on whether the tensor is cumulative.
45
+ Args:
46
+ tensor_buf: The destination tensor, at least padded_batch_size long.
47
+ unpadded_tensor: The tensor to copy, at least real_batch_size long.
48
+ real_batch_size: The real batch size.
49
+ padded_batch_size: Padded boundary of the tensor.
50
+ is_cumulative_tensor: Whether the tensor is cumulative.
51
+ If True, we pad the tensor_buf with the last value of the unpadded_tensor.
52
+ pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
53
+ """
54
+ assert real_batch_size <= padded_batch_size
55
+ assert tensor_buf.shape[0] >= padded_batch_size
56
+ assert unpadded_tensor.shape[0] >= real_batch_size
57
+ if is_cumulative_tensor:
58
+ if real_batch_size == 0:
59
+ value = pad_value
60
+ else:
61
+ value = unpadded_tensor[real_batch_size - 1]
62
+ else:
63
+ value = pad_value
64
+ tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
65
+ tensor_buf[real_batch_size:padded_batch_size] = value
66
+ return tensor_buf
67
+
68
+ def __str__(self):
69
+ """
70
+ Return a string representation of the metadata.
71
+ """
72
+ return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])
@@ -0,0 +1,220 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+ from .metadata_base import MetadataBase
8
+
9
+
10
class MHAMetadata(MetadataBase):
    """
    Metadata for MHA layer using flash-attention.

    Holds statically allocated GPU buffers (query/KV sequence lengths, their
    cumulative sums, and the KV block table) sized for the maximum batch, so
    the same storage can be reused across steps and under CUDA graphs.
    """

    def __init__(
        self, block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
    ):
        """
        Args:
            block_count_total: Total number of KV cache blocks.
            max_kv_block_count: Max KV blocks per request (block-table width).
            max_requests: Max concurrent requests (batch dimension).
            block_size_tokens: Tokens per KV block (kept for interface parity;
                not used directly here).
            max_seqlen: Max supported sequence length.
        """
        super().__init__()
        device = torch.cuda.current_device()
        self.device = device
        self.max_blocks = block_count_total
        self.max_kv_blocks = max_kv_block_count
        self.max_bs = max_requests
        self.max_seqlen = max_seqlen
        # Per-request query lengths and cumulative variants (varlen attention inputs).
        self._query_lengths_buf = torch.zeros(self.max_bs, dtype=torch.int32, device=device)
        self._cu_query_seq_lengths_buf = torch.zeros(
            self.max_bs + 1, dtype=torch.int32, device=device
        )
        self._cu_kv_seq_lengths_buf = torch.zeros(self.max_bs + 1, dtype=torch.int32, device=device)
        self._kv_seq_lengths_buf = torch.zeros(self.max_bs, dtype=torch.int32, device=device)
        # Maps each request slot to its KV block ids; padded rows hold -1.
        self._block_table_buf = torch.zeros(
            (self.max_bs, self.max_kv_blocks), dtype=torch.int32, device=device
        )
        self._max_seqlen_q = 0
        self._max_seqlen_k = 0
        self.state_data = {}

    def update(
        self,
        request_query_lengths: torch.Tensor,
        request_kv_length_offsets: torch.Tensor,
        request_to_kv_block_ids: torch.Tensor,
        padded_active_token_count: int,
        real_batch_size: int,
        padded_active_request_count: Optional[int] = None,
        decode_only: bool = False,
    ):
        """
        Populate the metadata buffers from the current request states.

        Args:
            request_query_lengths: (real_batch_size,) query tokens per request.
            request_kv_length_offsets: (real_batch_size,) prior KV tokens per request.
            request_to_kv_block_ids: (real_batch_size, max_kv_blocks) block table.
            padded_active_token_count: Active token count after padding.
            real_batch_size: Actual number of active requests.
            padded_active_request_count: Request count after padding; defaults
                to real_batch_size (i.e. no request padding).
            decode_only: True when every request contributes a single token.
        """
        if padded_active_request_count is None:
            padded_active_request_count = real_batch_size

        assert real_batch_size <= padded_active_request_count <= self.max_bs
        assert request_query_lengths.shape[0] == real_batch_size
        assert request_kv_length_offsets.shape[0] == real_batch_size
        assert request_to_kv_block_ids.shape[0] == real_batch_size

        self.tensor_copy_and_pad(
            self._query_lengths_buf,
            request_query_lengths,
            real_batch_size,
            padded_active_request_count,
        )
        self._cu_query_seq_lengths_buf[0] = 0
        self.tensor_copy_and_pad(
            self._cu_query_seq_lengths_buf[1:],
            torch.cumsum(request_query_lengths, dim=0),
            real_batch_size,
            padded_active_request_count,
            is_cumulative_tensor=True,
        )
        self.tensor_copy_and_pad(
            self._kv_seq_lengths_buf,
            request_kv_length_offsets + request_query_lengths,
            real_batch_size,
            padded_active_request_count,
        )
        # Pad unused block-table rows with -1 (invalid block id). A plain
        # scalar broadcasts over the slice; no per-call GPU tensor needed.
        self.tensor_copy_and_pad(
            self._block_table_buf,
            request_to_kv_block_ids,
            real_batch_size,
            padded_active_request_count,
            pad_value=-1,
        )
        self._cu_kv_seq_lengths_buf[0] = 0
        # Only the first real_batch_size entries of the KV-length buffer are
        # meaningful; restricting the cumsum to them avoids scanning padding.
        self.tensor_copy_and_pad(
            self._cu_kv_seq_lengths_buf[1:],
            torch.cumsum(self._kv_seq_lengths_buf[:real_batch_size], dim=0),
            real_batch_size,
            padded_active_request_count,
            is_cumulative_tensor=True,
        )

        if decode_only:
            self._max_seqlen_q = 1
        else:
            self._max_seqlen_q = max(2, padded_active_token_count)
        self._max_seqlen_k = self.max_seqlen

        self.state_data = {
            "query_lengths": self._query_lengths_buf[:padded_active_request_count],
            "cu_query_seq_lengths": self._cu_query_seq_lengths_buf[
                : padded_active_request_count + 1
            ],
            "cu_kv_seq_lengths": self._cu_kv_seq_lengths_buf[: padded_active_request_count + 1],
            "kv_seq_lengths": self._kv_seq_lengths_buf[:padded_active_request_count],
            "block_table": self._block_table_buf[0:padded_active_request_count, :],
            "max_seqlen_q": self._max_seqlen_q,
            "max_seqlen_k": self._max_seqlen_k,
        }

    def reset(self):
        """
        Zero all buffers and scalar state for the next batch.
        """
        self._query_lengths_buf.fill_(0)
        self._cu_query_seq_lengths_buf.fill_(0)
        self._cu_kv_seq_lengths_buf.fill_(0)
        self._kv_seq_lengths_buf.fill_(0)
        self._block_table_buf.fill_(0)
        self._max_seqlen_q = 0
        self._max_seqlen_k = 0
133
+
134
+
135
class GraphedMHAMetadata(MHAMetadata):
    """
    MHA flash-attention metadata variant used when running under CUDA graphs.

    Delegates entirely to ``MHAMetadata``: the base class already uses
    statically allocated buffers and padded sizes, which is what graph
    capture requires.
    """

    def __init__(
        self, block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
    ):
        super().__init__(
            block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
        )

    def update(
        self,
        request_query_lengths: torch.Tensor,
        request_kv_length_offsets: torch.Tensor,
        request_to_kv_block_ids: torch.Tensor,
        padded_active_token_count: int,
        real_batch_size: int,
        padded_active_request_count: Optional[int] = None,
        decode_only: bool = False,
    ):
        """
        Populate the metadata buffers; see ``MHAMetadata.update``.

        Args:
            request_query_lengths: (real_batch_size,) query tokens per request.
            request_kv_length_offsets: (real_batch_size,) prior KV tokens per request.
            request_to_kv_block_ids: (real_batch_size, max_kv_blocks) block table.
            padded_active_token_count: Active token count after padding.
            real_batch_size: Actual number of active requests.
            padded_active_request_count: Request count after padding.
            decode_only: True when every request contributes a single token.
        """
        super().update(
            request_query_lengths=request_query_lengths,
            request_kv_length_offsets=request_kv_length_offsets,
            request_to_kv_block_ids=request_to_kv_block_ids,
            padded_active_token_count=padded_active_token_count,
            real_batch_size=real_batch_size,
            padded_active_request_count=padded_active_request_count,
            decode_only=decode_only,
        )

    def reset(self):
        """Reset all metadata buffers; see ``MHAMetadata.reset``."""
        super().reset()
179
+
180
+
181
class NonGraphedMHAMetadata(MHAMetadata):
    """
    MHA flash-attention metadata variant for eager (non-CUDA-graph) execution.

    After the base update, the padded ``max_seqlen_q``/``max_seqlen_k``
    defaults are replaced with the exact maxima of the current batch, which
    is only possible when no graph capture constrains the values.
    """

    def update(
        self,
        request_query_lengths: torch.Tensor,
        request_kv_length_offsets: torch.Tensor,
        request_to_kv_block_ids: torch.Tensor,
        padded_active_token_count: int,
        real_batch_size: int,
        padded_active_request_count: Optional[int] = None,
        decode_only: bool = False,
    ):
        """
        Populate the metadata buffers, then tighten the max sequence lengths.

        Args:
            request_query_lengths: (real_batch_size,) query tokens per request.
            request_kv_length_offsets: (real_batch_size,) prior KV tokens per request.
            request_to_kv_block_ids: (real_batch_size, max_kv_blocks) block table.
            padded_active_token_count: Active token count after padding.
            real_batch_size: Actual number of active requests.
            padded_active_request_count: Request count after padding.
            decode_only: True when every request contributes a single token.
        """
        super().update(
            request_query_lengths,
            request_kv_length_offsets,
            request_to_kv_block_ids,
            padded_active_token_count,
            real_batch_size,
            padded_active_request_count,
            decode_only,
        )
        query_lengths = self.state_data["query_lengths"]
        if len(query_lengths) == 0:
            # Empty batch: fall back to a minimal, valid sequence length.
            self.state_data["max_seqlen_q"] = 1
            self.state_data["max_seqlen_k"] = 1
        else:
            # Exact maxima of the current batch (host sync via .item()).
            self.state_data["max_seqlen_q"] = query_lengths.max().item()
            self.state_data["max_seqlen_k"] = self.state_data["kv_seq_lengths"].max().item()
220
+ self.state_data["max_seqlen_k"] = 1
@@ -0,0 +1,118 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from torch import Tensor
7
+
8
+
9
class BlockAllocator:
    """Allocator that manages blocks of memory for the KV cache.

    This allocator is responsible for:
    - Initializing a pool of block IDs
    - Allocating blocks from the pool
    - Releasing blocks back to the pool

    Args:
        context (DynamicInferenceContext): Dynamic inference context.
        active_count (int): Total number of active blocks available in the buffer.
            The full buffer size is 2*active_count, to accommodate an equal-size
            space for paused requests that live on the CPU.
    """

    def __init__(self, context: "DynamicInferenceContext", active_count: int):

        self.context = context

        # Reserve one slot for the dummy block, but keep at least one usable block.
        active_count = max(1, active_count - 1)
        self.total_count = 2 * active_count + 1  # +1 for dummy_block_idx
        self.total_avail = self.total_count - 1  # dummy block is never handed out
        self.active_count = active_count
        self.paused_count = self.total_count - self.active_count - 1  # -1 for dummy_block_idx
        self.dummy_block_idx = self.total_count - 1

        # Pool of block ids, used as a stack: the live top sits at index
        # `total_avail` and grows/shrinks as blocks are released/allocated.
        self.block_bag = torch.arange(
            self.total_count, dtype=torch.int32, device=torch.cuda.current_device()
        )

    def __str__(self):
        usable = self.total_count - 1
        return f"total avail {self.total_avail} / {usable}; active {self.active_count}"

    def get_active_used(self):
        """Compute number of active blocks used."""
        counts = self.context.request_kv_block_counts
        lo = self.context.paused_request_count
        hi = self.context.total_request_count
        return counts[lo:hi].sum().item()

    def get_paused_used(self):
        """Compute number of paused blocks used."""
        counts = self.context.request_kv_block_counts
        return counts[: self.context.paused_request_count].sum().item()

    def get_active_avail(self):
        """Compute number of active blocks available."""
        return self.active_count - self.get_active_used()

    def get_paused_avail(self):
        """Compute number of paused blocks available."""
        return self.paused_count - self.get_paused_used()

    def is_memory_available(self, num_blocks: int) -> bool:
        """Check if memory blocks are available.

        Args:
            num_blocks (int): Number of blocks to check.

        Return:
            (bool) Is memory available?
        """
        return num_blocks <= self.get_active_avail()

    def allocate_memory_blocks(self, num_blocks: int) -> Optional[Tensor]:
        """Allocate memory blocks if available, else return None.

        Args:
            num_blocks (int): Number of blocks to allocate.

        Return:
            (Optional[Tensor]) Allocated block IDs.
        """
        if not self.is_memory_available(num_blocks):
            return None
        # Pop `num_blocks` ids off the top of the stack.
        self.total_avail -= num_blocks
        top = self.total_avail
        block_ids = self.block_bag[top : top + num_blocks]
        assert block_ids.numel() == num_blocks
        # NOTE(review): this is a *view* into block_bag; presumably callers
        # copy the ids before later allocate/release calls mutate the pool —
        # confirm at call sites.
        return block_ids

    def release_memory_blocks(self, blocks: Tensor) -> None:
        """Release memory blocks.

        Args:
            blocks (Tensor): Block IDs to release.

        Return:
            None
        """
        # Push the ids back onto the top of the stack.
        count = blocks.size(dim=0)
        top = self.total_avail
        self.block_bag[top : top + count] = blocks
        self.total_avail += count

    def reset(self) -> None:
        """Reset the allocator to initial state.

        This resets the available block count to the entire memory pool
        (except for the dummy block).
        """
        self.total_avail = self.total_count - 1