megatron-core 0.16.0rc0.dev111286__tar.gz → 0.16.0rc0.dev111655__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/dynamic_context.py +133 -69
- megatron_core-0.16.0rc0.dev111655/megatron/core/inference/unified_memory.py +127 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/SOURCES.txt +0 -2
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -72
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -210
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/unified_memory.py +0 -89
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/pyproject.toml +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/setup.py +0 -0
--- megatron_core-0.16.0rc0.dev111286/PKG-INFO
+++ megatron_core-0.16.0rc0.dev111655/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev111655
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
--- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/dynamic_context.py
+++ megatron_core-0.16.0rc0.dev111655/megatron/core/inference/contexts/dynamic_context.py
@@ -16,14 +16,16 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
-from megatron.core.inference.unified_memory import create_unified_mempool
+from megatron.core.inference.unified_memory import (
+    UnifiedMemoryUnsupportedError,
+    create_unified_mempool,
+)
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
 
-from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
 
@@ -323,16 +325,20 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.params_dtype = params_dtype
         self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
+
+        # Unified memory.
         self.unified_memory_level = unified_memory_level
         if unified_memory_level > 0:
-
-                warnings.warn(
-                    "Unified memory requested but not available; defaulting to GPU memory."
-                )
-                self.unified_memory_level = 0
-            else:
+            try:
                 self.unified_memory_mempool = create_unified_mempool()
+            except UnifiedMemoryUnsupportedError:
+                if torch.distributed.get_rank() == 0:
+                    warnings.warn(
+                        "Unified memory requested but not available; defaulting to GPU memory."
+                    )
+                self.unified_memory_level = 0
 
+        # Request and token counts.
         self.total_request_count = 0
         self.active_token_count = 0
         self.paused_request_count = 0
@@ -448,26 +454,30 @@ class DynamicInferenceContext(BaseInferenceContext):
             num_cuda_graphs is not None
         )
 
-        #
+        # `*_cudagraph_only` tensors are for use with cuda graphs to maintain
+        # consistent input shapes, which is required to use cuda graphs.
+        # During these steps, the `*_cudagraph_only`
+        # tensors are used, otherwise their same-name but un-suffixed
+        # corresponding tensors are used.
 
-        self.
-
-
-
-
-
-
-
-
-
+        self.query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
         )
 
-        self.
-
-
-
-
-            max_seqlen=self.max_sequence_length,
+        self.request_to_kv_block_ids_cudagraph_only = torch.full(
+            (self.max_requests, self.max_kv_block_count),
+            0,
+            dtype=torch.int,
+            device=torch.cuda.current_device(),
         )
 
         # Guaranteed active requests.
@@ -617,18 +627,11 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def cu_query_lengths(self) -> Tuple[Tensor, int]:
         """Cumulative query sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"],
-        )
+        return self.cu_query_seq_lengths, self.max_seqlen_q
 
-    def cu_kv_lengths(self) ->
+    def cu_kv_lengths(self) -> Tensor:
         """Cumulative key/value sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_k"],
-        )
+        return (self.cu_kv_seq_lengths, self.kv_seq_lengths, self.max_seqlen_k)
 
     def get_active_sequence_lengths(self) -> Tensor:
         """Total sequence length (query + key) for active requests."""
@@ -706,16 +709,12 @@ class DynamicInferenceContext(BaseInferenceContext):
         to blocks within the block-level memory buffer.
         """
         if self.cache_mla_latent:
-            return (
-                self.memory_buffer[layer_number - 1],
-                None,
-                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
-            )
+            return (self.memory_buffer[layer_number - 1], None, self.block_table)
         else:
             return (
                 self.memory_buffer[0, layer_number - 1],
                 self.memory_buffer[1, layer_number - 1],
-                self.
+                self.block_table,
             )
 
     def apply_fused_qk_rotary_emb(
@@ -825,12 +824,17 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def reset_attention_state(self) -> None:
         """Reset state used within attention, after each step."""
-
-
-
-
-
-        self.
+        self.max_seqlen_q = None
+        self.max_seqlen_k = None
+        self.cu_query_seq_lengths = None
+        self.cu_query_seq_lengths_cudagraph_only.fill_(0)
+        self.query_seq_lengths_cudagraph_only.fill_(0)
+        self.cu_kv_seq_lengths = None
+        self.cu_kv_seq_lengths_cudagraph_only.fill_(0)
+        self.kv_seq_lengths = None
+        self.kv_seq_lengths_cudagraph_only.fill_(0)
+        self.request_to_kv_block_ids_cudagraph_only.fill_(0)
+        self.block_table = None
 
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""

@@ -930,29 +934,89 @@ class DynamicInferenceContext(BaseInferenceContext):
             self.active_token_count : self.padded_active_token_count
         ] = 0
 
-        real_req_batch_size = (
-            self.total_request_count - self.paused_request_count
-        )  # how many requests are indeed active
-        self.active_attn_metadata = (
-            self.graph_attn_metadata
-            if self.using_cuda_graph_this_step()
-            else self.non_graph_attn_metadata
-        )
-
         # Update cu_query_seq_lengths, max_seqlen_q.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        query_lengths = self.request_query_lengths[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.query_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = query_lengths
+            if self.is_decode_only():
+                self.cu_query_seq_lengths = None  # ensure no accidental use
+                self.max_seqlen_q = 1
+            else:
+                self.cu_query_seq_lengths_cudagraph_only[
+                    1 : self.padded_active_request_count + 1
+                ] = torch.cumsum(
+                    self.query_seq_lengths_cudagraph_only[: self.padded_active_request_count], dim=0
+                )
+
+                # The following will be passed to the FA kernel.
+                self.cu_query_seq_lengths = self.cu_query_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+                self.max_seqlen_q = self.padded_active_token_count
+        else:
+            cu_query_lengths = torch.cumsum(query_lengths, dim=0)
+            self.cu_query_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_query_seq_lengths[1:] = cu_query_lengths
+            self.max_seqlen_q = query_lengths.max().item()
+
+        kv_seq_lengths = self.request_kv_length_offsets + self.request_query_lengths
+        self.kv_seq_lengths = kv_seq_lengths[self.paused_request_count : self.total_request_count]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            # Re-assign `kv_seq_lengths` to be a view of the first
+            # `padded_active_request_count` entries of `kv_seq_lengths_cudagraph_only`,
+            # such that `kv_seq_lengths` has a static memory address and is therefore
+            # cuda graph compatible. This allows `kv_seq_lengths` to transition between
+            # cuda graph sizes, which makes multi-batch-size cuda graphs possible.
+            self.kv_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = self.kv_seq_lengths
+            self.kv_seq_lengths = self.kv_seq_lengths_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+            self.max_seqlen_k = self.max_sequence_length
+            if self.is_decode_only():
+                self.cu_kv_seq_lengths = None  # ensure no accidental use
+            else:
+                cu_kv_lengths = torch.cumsum(self.kv_seq_lengths, dim=0)
+                # The following will be passed to the FA kernel.
+                self.cu_kv_seq_lengths_cudagraph_only[1 : cu_kv_lengths.size(0) + 1] = cu_kv_lengths
+                self.cu_kv_seq_lengths = self.cu_kv_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+        else:
+            self.cu_kv_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_kv_seq_lengths[1:] = torch.cumsum(self.kv_seq_lengths, dim=0)
+            self.max_seqlen_k = self.kv_seq_lengths.max().item()
+
+        # Update KV block IDs, block table.
+        request_to_kv_block_ids = self.request_to_kv_block_ids[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.request_to_kv_block_ids_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = request_to_kv_block_ids
+            self.block_table = self.request_to_kv_block_ids_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+        else:
+            self.block_table = self.request_to_kv_block_ids[
+                self.paused_request_count : self.total_request_count
+            ]
 
     def reset(self) -> None:
         """Reset entire context.
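
The hunk above drops the separate attention-metadata container and instead writes the query/KV cumulative lengths and the block table directly into preallocated `*_cudagraph_only` buffers, returning fixed-address views whenever CUDA graphs (or decode-only mode) are in use. A minimal standalone sketch of that buffer-reuse pattern follows; the names (`MAX_REQUESTS`, `build_cu_seqlens`) and sizes are illustrative assumptions, not part of the megatron-core API.

    import torch

    # Static buffer: its storage never moves, so a CUDA-graph-captured kernel can
    # keep reading from the same address while the contents change step to step.
    MAX_REQUESTS = 8
    _cu_seqlens_buf = torch.zeros(MAX_REQUESTS + 1, dtype=torch.int32)

    def build_cu_seqlens(query_lengths: torch.Tensor, padded_count: int) -> torch.Tensor:
        """Write the prefix sums of per-request query lengths into the static
        buffer and return a view sized to the padded request count."""
        n = query_lengths.numel()
        _cu_seqlens_buf[1 : n + 1] = torch.cumsum(query_lengths, dim=0)
        # Padding slots repeat the last prefix sum, so padded requests add no tokens.
        _cu_seqlens_buf[n + 1 : padded_count + 1] = _cu_seqlens_buf[n]
        return _cu_seqlens_buf[: padded_count + 1]

    lengths = torch.tensor([3, 1, 4], dtype=torch.int32)
    print(build_cu_seqlens(lengths, padded_count=4))  # tensor([0, 3, 4, 8, 8], dtype=torch.int32)

The eager (non-graph) branch in the diff allocates a fresh `cu_query_seq_lengths` tensor instead, since a stable address is only needed when the step is captured or replayed.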

@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import os
+import warnings
+from enum import Enum, auto
+from pathlib import Path
+
+from torch.cuda.memory import CUDAPluggableAllocator
+from torch.utils.cpp_extension import CUDA_HOME, load_inline
+
+from megatron.core.utils import is_torch_min_version
+
+try:
+    if is_torch_min_version("2.8.0"):
+        from torch.cuda.memory import MemPool
+    else:
+        from torch.cuda import MemPool
+    _has_mem_pool = True
+except ImportError:
+    _has_mem_pool = False
+
+
+class CompilationState(Enum):
+    """Enum to distinguish between unified memory (UVM) compilation states."""
+
+    UNATTEMPTED = auto()  # Compilation has not been attempted.
+    FAILURE = auto()  # Compilation attempted, but failed.
+    SUCCESS = auto()  # Compilation attempted, and succeeded.
+
+
+# Compilation vars.
+_compilation_state = CompilationState.UNATTEMPTED
+_alloc = None  # must remain global until process exit.
+_mod = None  # must remain global until process exit.
+
+
+class UnifiedMemoryUnsupportedError(Exception):
+    """Unified memory is not supported on this system."""
+
+    pass
+
+
+def compile_allocator():
+    """Attempt to compile the UVM allocator."""
+
+    global _compilation_state, _alloc, _mod
+
+    if _compilation_state != CompilationState.UNATTEMPTED:
+        return
+
+    _mempool_c_src = r"""
+    #include <cuda_runtime_api.h>
+    #include <cstddef>
+
+    #define EXPORT extern "C"
+
+    EXPORT void* managed_malloc(size_t size, int device, void* stream) {
+        (void)stream;
+        int cur = -1;
+        cudaGetDevice(&cur);
+        if (device != cur && device >= 0) cudaSetDevice(device);
+
+        // cudaMallocManaged allows for more memory to be allocated than the device memory size.
+        // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
+        void* ptr = nullptr;
+        cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
+        if (err != cudaSuccess) return nullptr;
+
+        if (device >= 0) {
+            // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
+            // This is a hint that tries to prevent data from being migrated away from the device.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+            // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
+            // Even if the memory has to be migrated away from the device, it still does not page fault.
+            // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
+            // but there is no harm in adding this flag as well for future-proofing.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+        }
+        return ptr;
+    }
+
+    EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
+        // Memory allocated with cudaMallocManaged should be released with cudaFree.
+        (void)size; (void)device; (void)stream;
+        if (ptr) cudaFree(ptr);
+    }
+    """
+
+    # Build the .so once; subsequent calls reuse the cached compilation state.
+    if _has_mem_pool:
+        _extra_ldflags = ["-lcudart"]
+        if CUDA_HOME:
+            _cuda_lib = os.path.join(CUDA_HOME, "lib64")
+            if os.path.isdir(_cuda_lib):
+                _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
+        try:
+            _mod = load_inline(
+                name="managed_alloc_runtime",
+                cpp_sources=[_mempool_c_src],
+                functions=[],
+                with_cuda=True,
+                extra_ldflags=_extra_ldflags,
+                verbose=False,
+            )
+            _so_path = Path(_mod.__file__).as_posix()
+            _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
+            _compilation_state = CompilationState.SUCCESS
+        except (RuntimeError, ImportError, OSError):
+            warnings.warn("Failed to create unified memory mempool.")
+            _compilation_state = CompilationState.FAILURE
+
+
+def create_unified_mempool() -> MemPool:
+    """Create a unified memory mempool using CUDA managed memory.
+
+    Returns:
+        (MemPool) Unified memory mempool.
+    """
+
+    # Attempt to compile allocator.
+    compile_allocator()
+
+    # Return mempool.
+    if _compilation_state != CompilationState.SUCCESS:
+        raise UnifiedMemoryUnsupportedError()
+    else:
+        return MemPool(allocator=_alloc)
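
The new `@@ -0,0 +1,127 @@` hunk adds a pluggable allocator backed by cudaMallocManaged, so tensors can be placed in unified (UVM) memory that may exceed device capacity. A possible way to consume it is sketched below; it assumes a PyTorch build that exposes the MemPool API and the `torch.cuda.use_mem_pool` context manager, and the tensor shown is purely illustrative.

    import torch

    from megatron.core.inference.unified_memory import (
        UnifiedMemoryUnsupportedError,
        create_unified_mempool,
    )

    try:
        # Compiles the managed-memory allocator on first use and wraps it in a MemPool.
        pool = create_unified_mempool()
    except UnifiedMemoryUnsupportedError:
        pool = None  # e.g. MemPool unavailable or the inline extension failed to build

    if pool is not None:
        # Allocations made inside this context are served by cudaMallocManaged and can
        # be migrated between host and device by the CUDA driver on demand.
        with torch.cuda.use_mem_pool(pool):
            kv_cache = torch.empty(1 << 20, dtype=torch.bfloat16, device="cuda")

Because `_alloc` and `_mod` are module-level globals, the compiled allocator outlives any pools created from it, which is why the module keeps them alive until process exit.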

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev111655
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

@@ -169,8 +169,6 @@ megatron/core/inference/contexts/dynamic_block_allocator.py
 megatron/core/inference/contexts/dynamic_context.py
 megatron/core/inference/contexts/fused_kv_append_kernel.py
 megatron/core/inference/contexts/static_context.py
-megatron/core/inference/contexts/attention_context/metadata_base.py
-megatron/core/inference/contexts/attention_context/mha_metadata.py
 megatron/core/inference/engines/__init__.py
 megatron/core/inference/engines/abstract_engine.py
 megatron/core/inference/engines/dynamic_engine.py

@@ -1,72 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
-
-class MetadataBase:
-    """
-    Base class for attention metadata.
-    High-performance attention kernels often require input metadata in specific
-    formats—such as cumulative query lengths, cumulative key/value lengths,
-    and similar structures. Moreover, when using CUDA Graphs, these metadata
-    buffers must be statically allocated. This class serves as a unified container
-    that manages all such metadata in one place.
-    """
-
-    def __init__(self):
-        """
-        Initialize the metadata.
-        """
-        self.state_data = {}
-
-    def update(self, *args, **kwargs):
-        """
-        Construct the metadata from request states.
-        """
-        pass
-
-    def reset(self):
-        """
-        Reset the metadata.
-        """
-        pass
-
-    def tensor_copy_and_pad(
-        self,
-        tensor_buf,
-        unpadded_tensor,
-        real_batch_size,
-        padded_batch_size,
-        is_cumulative_tensor=False,
-        pad_value=0,
-    ):
-        """
-        Copy the unpadded tensor to the tensor_buf,
-        pad the tensor_buf with zero or the last value of the tensor,
-        depending on whether the tensor is cumulative.
-        Args:
-            tensor_buf: The destination tensor, at least padded_batch_size long.
-            unpadded_tensor: The tensor to copy, at least real_batch_size long.
-            real_batch_size: The real batch size.
-            padded_batch_size: Padded boundary of the tensor.
-            is_cumulative_tensor: Whether the tensor is cumulative.
-                If True, we pad the tensor_buf with the last value of the unpadded_tensor.
-            pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
-        """
-        assert real_batch_size <= padded_batch_size
-        assert tensor_buf.shape[0] >= padded_batch_size
-        assert unpadded_tensor.shape[0] >= real_batch_size
-        if is_cumulative_tensor:
-            if real_batch_size == 0:
-                value = pad_value
-            else:
-                value = unpadded_tensor[real_batch_size - 1]
-        else:
-            value = pad_value
-        tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
-        tensor_buf[real_batch_size:padded_batch_size] = value
-        return tensor_buf
-
-    def __str__(self):
-        """
-        Return a string representation of the metadata.
-        """
-        return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])