PyPI - megatron-core - Versions diffs - 0.16.0rc0.dev131152__tar.gz → 0.16.0rc0.dev131564__tar.gz - Mend

megatron-core 0.16.0rc0.dev131152__tar.gz → 0.16.0rc0.dev131564__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megatron-core might be problematic. Click here for more details.

Files changed (360) hide show

{megatron_core-0.16.0rc0.dev131152 → megatron_core-0.16.0rc0.dev131564}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev131152
+Version: 0.16.0rc0.dev131564
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

megatron_core-0.16.0rc0.dev131564/megatron/core/inference/contexts/attention_context/metadata_base.py ADDED Viewed

@@ -0,0 +1,72 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+class MetadataBase:
+    """
+    Base class for attention metadata.
+    High-performance attention kernels often require input metadata in specific
+    formats—such as cumulative query lengths, cumulative key/value lengths,
+    and similar structures. Moreover, when using CUDA Graphs, these metadata
+    buffers must be statically allocated. This class serves as a unified container
+    that manages all such metadata in one place.
+    """
+    def __init__(self):
+        """
+        Initialize the metadata.
+        """
+        self.state_data = {}
+    def update(self, *args, **kwargs):
+        """
+        Construct the metadata from request states.
+        """
+        pass
+    def reset(self):
+        """
+        Reset the metadata.
+        """
+        pass
+    def tensor_copy_and_pad(
+        self,
+        tensor_buf,
+        unpadded_tensor,
+        real_batch_size,
+        padded_batch_size,
+        is_cumulative_tensor=False,
+        pad_value=0,
+    ):
+        """
+        Copy the unpadded tensor to the tensor_buf,
+        pad the tensor_buf with zero or the last value of the tensor,
+        depending on whether the tensor is cumulative.
+        Args:
+            tensor_buf: The destination tensor, at least padded_batch_size long.
+            unpadded_tensor: The tensor to copy, at least real_batch_size long.
+            real_batch_size: The real batch size.
+            padded_batch_size: Padded boundary of the tensor.
+            is_cumulative_tensor: Whether the tensor is cumulative.
+                If True, we pad the tensor_buf with the last value of the unpadded_tensor.
+            pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
+        """
+        assert real_batch_size <= padded_batch_size
+        assert tensor_buf.shape[0] >= padded_batch_size
+        assert unpadded_tensor.shape[0] >= real_batch_size
+        if is_cumulative_tensor:
+            if real_batch_size == 0:
+                value = pad_value
+            else:
+                value = unpadded_tensor[real_batch_size - 1]
+        else:
+            value = pad_value
+        tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
+        tensor_buf[real_batch_size:padded_batch_size] = value
+        return tensor_buf
+    def __str__(self):
+        """
+        Return a string representation of the metadata.
+        """
+        return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])

megatron_core-0.16.0rc0.dev131564/megatron/core/inference/contexts/attention_context/mha_metadata.py ADDED Viewed

@@ -0,0 +1,220 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+from typing import Optional
+import torch
+from .metadata_base import MetadataBase
+class MHAMetadata(MetadataBase):
+    """
+    Metadata for MHA layer using flash-attention.
+    """
+    def __init__(
+        self, block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
+    ):
+        super().__init__()
+        device = torch.cuda.current_device()
+        self.device = device
+        self.max_blocks = block_count_total
+        self.max_kv_blocks = max_kv_block_count
+        self.max_bs = max_requests
+        self.max_seqlen = max_seqlen
+        self._query_lengths_buf = torch.zeros(self.max_bs, dtype=torch.int32, device=device)
+        self._cu_query_seq_lengths_buf = torch.zeros(
+            self.max_bs + 1, dtype=torch.int32, device=device
+        )
+        self._cu_kv_seq_lengths_buf = torch.zeros(self.max_bs + 1, dtype=torch.int32, device=device)
+        self._kv_seq_lengths_buf = torch.zeros(self.max_bs, dtype=torch.int32, device=device)
+        self._block_table_buf = torch.zeros(
+            (self.max_bs, self.max_kv_blocks), dtype=torch.int32, device=device
+        )
+        self._max_seqlen_q = 0
+        self._max_seqlen_k = 0
+        self.state_data = {}
+    def update(
+        self,
+        request_query_lengths: torch.Tensor,
+        request_kv_length_offsets: torch.Tensor,
+        request_to_kv_block_ids: torch.Tensor,
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
+    ):
+        """
+        Args:
+            request_query_lengths: (>real_batch_size,)
+            request_kv_length_offsets: (>real_batch_size,)
+            request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
+        """
+        if padded_active_request_count is None:
+            padded_active_request_count = real_batch_size
+        assert real_batch_size <= padded_active_request_count <= self.max_bs
+        assert request_query_lengths.shape[0] == real_batch_size
+        assert request_kv_length_offsets.shape[0] == real_batch_size
+        assert request_to_kv_block_ids.shape[0] == real_batch_size
+        self.tensor_copy_and_pad(
+            self._query_lengths_buf,
+            request_query_lengths,
+            real_batch_size,
+            padded_active_request_count,
+        )
+        self._cu_query_seq_lengths_buf[0] = 0
+        self.tensor_copy_and_pad(
+            self._cu_query_seq_lengths_buf[1:],
+            torch.cumsum(request_query_lengths, dim=0),
+            real_batch_size,
+            padded_active_request_count,
+            is_cumulative_tensor=True,
+        )
+        self.tensor_copy_and_pad(
+            self._kv_seq_lengths_buf,
+            request_kv_length_offsets + request_query_lengths,
+            real_batch_size,
+            padded_active_request_count,
+        )
+        self.tensor_copy_and_pad(
+            self._block_table_buf,
+            request_to_kv_block_ids,
+            real_batch_size,
+            padded_active_request_count,
+            pad_value=torch.tensor(self.max_kv_blocks, dtype=torch.int32, device=self.device).fill_(
+                -1
+            ),
+        )
+        self._cu_kv_seq_lengths_buf[0] = 0
+        self.tensor_copy_and_pad(
+            self._cu_kv_seq_lengths_buf[1:],
+            torch.cumsum(self._kv_seq_lengths_buf, dim=0),
+            real_batch_size,
+            padded_active_request_count,
+            is_cumulative_tensor=True,
+        )
+        if decode_only:
+            self._max_seqlen_q = 1
+        else:
+            self._max_seqlen_q = max(2, padded_active_token_count)
+        self._max_seqlen_k = self.max_seqlen
+        self.state_data = {
+            "query_lengths": self._query_lengths_buf[:padded_active_request_count],
+            "cu_query_seq_lengths": self._cu_query_seq_lengths_buf[
+                : padded_active_request_count + 1
+            ],
+            "cu_kv_seq_lengths": self._cu_kv_seq_lengths_buf[: padded_active_request_count + 1],
+            "kv_seq_lengths": self._kv_seq_lengths_buf[:padded_active_request_count],
+            "block_table": self._block_table_buf[0:padded_active_request_count, :],
+            "max_seqlen_q": self._max_seqlen_q,
+            "max_seqlen_k": self._max_seqlen_k,
+        }
+    def reset(self):
+        """
+        Reset the metadata for the next batch.
+        """
+        self._query_lengths_buf.fill_(0)
+        self._cu_query_seq_lengths_buf.fill_(0)
+        self._cu_kv_seq_lengths_buf.fill_(0)
+        self._kv_seq_lengths_buf.fill_(0)
+        self._block_table_buf.fill_(0)
+        self._max_seqlen_q = 0
+        self._max_seqlen_k = 0
+class GraphedMHAMetadata(MHAMetadata):
+    """
+    Metadata for MHA layer using flash-attention with CUDA graphs.
+    """
+    def __init__(
+        self, block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
+    ):
+        super().__init__(
+            block_count_total, max_kv_block_count, max_requests, block_size_tokens, max_seqlen
+        )
+    def update(
+        self,
+        request_query_lengths: torch.Tensor,
+        request_kv_length_offsets: torch.Tensor,
+        request_to_kv_block_ids: torch.Tensor,
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
+    ):
+        """
+        Args:
+            request_query_lengths: (>real_batch_size,)
+            request_kv_length_offsets: (>real_batch_size,)
+            request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
+        """
+        super().update(
+            request_query_lengths,
+            request_kv_length_offsets,
+            request_to_kv_block_ids,
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
+        )
+    def reset(self):
+        super().reset()
+class NonGraphedMHAMetadata(MHAMetadata):
+    """
+    Metadata for MHA layer using flash-attention without CUDA graphs.
+    """
+    def update(
+        self,
+        request_query_lengths: torch.Tensor,
+        request_kv_length_offsets: torch.Tensor,
+        request_to_kv_block_ids: torch.Tensor,
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
+    ):
+        """
+        Args:
+            request_query_lengths: (>real_batch_size,)
+            request_kv_length_offsets: (>real_batch_size,)
+            request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
+        """
+        super().update(
+            request_query_lengths,
+            request_kv_length_offsets,
+            request_to_kv_block_ids,
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
+        )
+        if len(self.state_data["query_lengths"]) > 0:
+            self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()
+            self.state_data["max_seqlen_k"] = torch.max(self.state_data["kv_seq_lengths"]).item()
+        else:
+            self.state_data["max_seqlen_q"] = 1
+            self.state_data["max_seqlen_k"] = 1

{megatron_core-0.16.0rc0.dev131152 → megatron_core-0.16.0rc0.dev131564}/megatron/core/inference/contexts/dynamic_context.py RENAMED Viewed

@@ -4,7 +4,7 @@ import math
 import warnings
 from contextlib import nullcontext
 from enum import Enum
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 import torch
 import torch.nn.functional as F
@@ -26,6 +26,7 @@ from megatron.core.package_info import __version__ as mcore_version
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
+from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
@@ -48,6 +49,17 @@ try:
 except ImportError:
     HAVE_FLASHINFER = False
+try:
+    import wandb  # pylint: disable=unused-import
+    HAVE_WANDB = True
+except ImportError:
+    HAVE_WANDB = False
+    wandb = None
+if TYPE_CHECKING:
+    import wandb as WandbModule
 class ContextOverflowError(Exception):
     """Base exception for when a new request does not fit.
@@ -225,6 +237,7 @@ class DynamicInferenceContext(BaseInferenceContext):
             levels will be included to control other tensors within the context.
         use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
         If None, defaults to using flash-infer if available.
+        metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
     """
     def __init__(
@@ -250,6 +263,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 0,
+        metrics_writer: Optional['WandbModule'] = None,
     ):
         super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
@@ -259,6 +273,8 @@ class DynamicInferenceContext(BaseInferenceContext):
                 block_size_tokens == 64
             ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
+        self.metrics_writer = metrics_writer
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
@@ -454,30 +470,26 @@ class DynamicInferenceContext(BaseInferenceContext):
             num_cuda_graphs is not None
         )
-        # `*_cudagraph_only` tensors are for use with cuda graphs to maintain
-        # consistent input shapes, which is required to use cuda graphs.
-        # During these steps, the `*_cudagraph_only`
-        # tensors are used, otherwise their same-name but un-suffixed
-        # corresponding tensors are used.
+        # Attention metadata initialization (tensors are now handled by MHAMetadata classes)
-        self.query_seq_lengths_cudagraph_only = torch.full(
-            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
-        )
-        self.cu_query_seq_lengths_cudagraph_only = torch.full(
-            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
-        )
-        self.kv_seq_lengths_cudagraph_only = torch.full(
-            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
-        )
-        self.cu_kv_seq_lengths_cudagraph_only = torch.full(
-            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        self.graph_attn_metadata = {}
+        self.non_graph_attn_metadata = {}
+        self.active_attn_metadata = None
+        self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata(
+            block_count_total=block_count_total,
+            max_kv_block_count=self.max_kv_block_count,
+            max_requests=self.max_requests,
+            block_size_tokens=self.block_size_tokens,
+            max_seqlen=self.max_sequence_length,
         )
-        self.request_to_kv_block_ids_cudagraph_only = torch.full(
-            (self.max_requests, self.max_kv_block_count),
-            0,
-            dtype=torch.int,
-            device=torch.cuda.current_device(),
+        self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata(
+            block_count_total=block_count_total,
+            max_kv_block_count=self.max_kv_block_count,
+            max_requests=self.max_requests,
+            block_size_tokens=self.block_size_tokens,
+            max_seqlen=self.max_sequence_length,
         )
         # Guaranteed active requests.
@@ -627,11 +639,18 @@ class DynamicInferenceContext(BaseInferenceContext):
     def cu_query_lengths(self) -> Tuple[Tensor, int]:
         """Cumulative query sequence lengths."""
-        return self.cu_query_seq_lengths, self.max_seqlen_q
+        return (
+            self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"],
+            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"],
+        )
-    def cu_kv_lengths(self) -> Tensor:
+    def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]:
         """Cumulative key/value sequence lengths."""
-        return (self.cu_kv_seq_lengths, self.kv_seq_lengths, self.max_seqlen_k)
+        return (
+            self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"],
+            self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"],
+            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_k"],
+        )
     def get_active_sequence_lengths(self) -> Tensor:
         """Total sequence length (query + key) for active requests."""
@@ -709,12 +728,16 @@ class DynamicInferenceContext(BaseInferenceContext):
             to blocks within the block-level memory buffer.
         """
         if self.cache_mla_latent:
-            return (self.memory_buffer[layer_number - 1], None, self.block_table)
+            return (
+                self.memory_buffer[layer_number - 1],
+                None,
+                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
+            )
         else:
             return (
                 self.memory_buffer[0, layer_number - 1],
                 self.memory_buffer[1, layer_number - 1],
-                self.block_table,
+                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )
     def apply_fused_qk_rotary_emb(
@@ -824,17 +847,12 @@ class DynamicInferenceContext(BaseInferenceContext):
     def reset_attention_state(self) -> None:
         """Reset state used within attention, after each step."""
-        self.max_seqlen_q = None
-        self.max_seqlen_k = None
-        self.cu_query_seq_lengths = None
-        self.cu_query_seq_lengths_cudagraph_only.fill_(0)
-        self.query_seq_lengths_cudagraph_only.fill_(0)
-        self.cu_kv_seq_lengths = None
-        self.cu_kv_seq_lengths_cudagraph_only.fill_(0)
-        self.kv_seq_lengths = None
-        self.kv_seq_lengths_cudagraph_only.fill_(0)
-        self.request_to_kv_block_ids_cudagraph_only.fill_(0)
-        self.block_table = None
+        # Attention metadata reset is now handled by MHAMetadata.reset()
+        for attn_metadata in self.non_graph_attn_metadata.values():
+            attn_metadata.reset()
+        for attn_metadata in self.graph_attn_metadata.values():
+            attn_metadata.reset()
+        self.active_attn_metadata = None
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""
@@ -934,89 +952,30 @@ class DynamicInferenceContext(BaseInferenceContext):
             self.active_token_count : self.padded_active_token_count
         ] = 0
-        # Update cu_query_seq_lengths, max_seqlen_q.
-        query_lengths = self.request_query_lengths[
-            self.paused_request_count : self.total_request_count
-        ]
-        if self.is_decode_only() or self.using_cuda_graph_this_step():
-            self.query_seq_lengths_cudagraph_only[
-                0 : self.total_request_count - self.paused_request_count
-            ] = query_lengths
-            if self.is_decode_only():
-                self.cu_query_seq_lengths = None  # ensure no accidental use
-                self.max_seqlen_q = 1
-            else:
-                self.cu_query_seq_lengths_cudagraph_only[
-                    1 : self.padded_active_request_count + 1
-                ] = torch.cumsum(
-                    self.query_seq_lengths_cudagraph_only[: self.padded_active_request_count], dim=0
-                )
-                # The following will be passed to the FA kernel.
-                self.cu_query_seq_lengths = self.cu_query_seq_lengths_cudagraph_only[
-                    : (self.padded_active_request_count + 1)
-                ]
-                self.max_seqlen_q = self.padded_active_token_count
-        else:
-            cu_query_lengths = torch.cumsum(query_lengths, dim=0)
-            self.cu_query_seq_lengths = torch.full(
-                (self.total_request_count - self.paused_request_count + 1,),
-                0,
-                dtype=torch.int32,
-                device=torch.cuda.current_device(),
-            )
-            self.cu_query_seq_lengths[1:] = cu_query_lengths
-            self.max_seqlen_q = query_lengths.max().item()
-        kv_seq_lengths = self.request_kv_length_offsets + self.request_query_lengths
-        self.kv_seq_lengths = kv_seq_lengths[self.paused_request_count : self.total_request_count]
-        if self.is_decode_only() or self.using_cuda_graph_this_step():
-            # Re-assign `kv_seq_lengths` to be a view of the first
-            # `active_cuda_graph_request_count` tokens of `kv_seq_lengths_decode_only`,
-            # such that `kv_seq_lengths` has a static memory address and is therefore
-            # cuda graph compatible. This allows `kv_seq_lengths` to transition between,
-            # cuda graph sizes, which makes multi-batch-size cuda graphs possible.
-            self.kv_seq_lengths_cudagraph_only[
-                0 : self.total_request_count - self.paused_request_count
-            ] = self.kv_seq_lengths
-            self.kv_seq_lengths = self.kv_seq_lengths_cudagraph_only[
-                : self.padded_active_request_count
-            ]
-            self.max_seqlen_k = self.max_sequence_length
-            if self.is_decode_only():
-                self.cu_kv_seq_lengths = None  # ensure no accidental use
-            else:
-                cu_kv_lengths = torch.cumsum(self.kv_seq_lengths, dim=0)
-                # The following will be passed to the FA kernel.
-                self.cu_kv_seq_lengths_cudagraph_only[1 : cu_kv_lengths.size(0) + 1] = cu_kv_lengths
-                self.cu_kv_seq_lengths = self.cu_kv_seq_lengths_cudagraph_only[
-                    : (self.padded_active_request_count + 1)
-                ]
-        else:
-            self.cu_kv_seq_lengths = torch.full(
-                (self.total_request_count - self.paused_request_count + 1,),
-                0,
-                dtype=torch.int32,
-                device=torch.cuda.current_device(),
-            )
-            self.cu_kv_seq_lengths[1:] = torch.cumsum(self.kv_seq_lengths, dim=0)
-            self.max_seqlen_k = self.kv_seq_lengths.max().item()
+        real_req_batch_size = (
+            self.total_request_count - self.paused_request_count
+        )  # how many requests are indeed active
+        self.active_attn_metadata = (
+            self.graph_attn_metadata
+            if self.using_cuda_graph_this_step()
+            else self.non_graph_attn_metadata
+        )
-        # Update KV block IDs, block table.
-        request_to_kv_block_ids = self.request_to_kv_block_ids[
-            self.paused_request_count : self.total_request_count
-        ]
-        if self.is_decode_only() or self.using_cuda_graph_this_step():
-            self.request_to_kv_block_ids_cudagraph_only[
-                0 : self.total_request_count - self.paused_request_count
-            ] = request_to_kv_block_ids
-            self.block_table = self.request_to_kv_block_ids_cudagraph_only[
-                : self.padded_active_request_count
-            ]
-        else:
-            self.block_table = self.request_to_kv_block_ids[
-                self.paused_request_count : self.total_request_count
-            ]
+        # Update cu_query_seq_lengths, max_seqlen_q.
+        active_slice = slice(self.paused_request_count, self.total_request_count)
+        query_lengths_view = self.request_query_lengths[active_slice]
+        request_kv_length_offsets_view = self.request_kv_length_offsets[active_slice]
+        request_to_kv_block_ids_view = self.request_to_kv_block_ids[active_slice]
+        self.active_attn_metadata["mha_metadata"].update(
+            request_query_lengths=query_lengths_view,
+            request_kv_length_offsets=request_kv_length_offsets_view,
+            request_to_kv_block_ids=request_to_kv_block_ids_view,
+            padded_active_token_count=self.padded_active_token_count,
+            real_batch_size=real_req_batch_size,
+            padded_active_request_count=self.padded_active_request_count,
+            decode_only=self.is_decode_only(),
+        )
+        # All attention metadata calculations are now handled by MHAMetadata.update()
     def reset(self) -> None:
         """Reset entire context.
@@ -1625,3 +1584,67 @@ class DynamicInferenceContext(BaseInferenceContext):
         # Convert each log prob tensor into a list
         return [lp.tolist() for lp in selected_log_probs_list]
+    def get_kvcache_utilization_stats(self) -> dict:
+        """Compute KV cache buffer utilization stats for the current step.
+        Returns a dictionary with counts and percentages for both allocated block
+        usage (overall buffer occupancy) and active usage (blocks referenced by
+        currently active requests this step).
+        Return:
+            {
+            'total_blocks': int,
+            'allocated_blocks': int,
+            'active_unique_blocks': int,
+            'allocated_utilization': float,
+            'active_utilization': float,
+            'active_request_count': int,
+            'paused_request_count': int,
+            'gtd_block_count': int,
+            }
+        """
+        # Total usable blocks exclude the reserved dummy block.
+        total_blocks = max(self.block_allocator.block_count_total - 1, 1)
+        block_count_avail = int(self.block_allocator.block_count_avail)
+        # Overall allocated blocks in the buffer right now.
+        allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail
+        allocated_blocks = int(max(0, allocated_blocks))
+        # Active unique blocks referenced by current active requests only.
+        active_start = self.paused_request_count
+        active_end = self.total_request_count
+        if active_end > active_start:
+            active_rows = self.request_to_kv_block_ids[active_start:active_end]
+            # Filter valid block ids (>= 0) and count unique ids.
+            valid_ids = active_rows[active_rows >= 0]
+            if valid_ids.numel() > 0:
+                unique_ids = torch.unique(valid_ids)
+                active_unique_blocks = int(unique_ids.numel())
+            else:
+                active_unique_blocks = 0
+        else:
+            active_unique_blocks = 0
+        allocated_utilization = float(allocated_blocks) / float(total_blocks)
+        active_utilization = float(active_unique_blocks) / float(total_blocks)
+        # Diagnostic helpers
+        num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count))
+        total_request_count = int(self.total_request_count)
+        return {
+            'total_blocks': int(total_blocks),
+            'allocated_blocks': int(allocated_blocks),
+            'active_unique_blocks': int(active_unique_blocks),
+            'allocated_utilization': allocated_utilization,
+            'active_utilization': active_utilization,
+            'active_request_count': int(self.get_active_request_count()),
+            'paused_request_count': int(self.paused_request_count),
+            'gtd_block_count': int(self.gtd_block_count),
+            'block_count_avail': int(block_count_avail),
+            'num_non_gtd_blocks': int(num_non_gtd_blocks),
+            'active_token_count': int(self.active_token_count),
+            'total_request_count': int(total_request_count),
+            'max_requests': int(self.max_requests),
+        }

megatron-core 0.16.0rc0.dev131152__tar.gz → 0.16.0rc0.dev131564__tar.gz

Potentially problematic release.

megatron-core 0.16.0rc0.dev131152__tar.gz → 0.16.0rc0.dev131564__tar.gz