megatron-core 0.16.0rc0.dev126546__tar.gz → 0.16.0rc0.dev126744__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/PKG-INFO +14 -7
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/megatron_tokenizer.py +9 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/dynamic_context.py +80 -1
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/engines/dynamic_engine.py +72 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/safe_globals.py +2 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/attention.py +14 -3
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/cuda_graphs.py +5 -1
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/dot_product_attention.py +2 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/router.py +2 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/pipeline_parallel_layer_layout.py +5 -2
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron_core.egg-info/PKG-INFO +14 -7
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron_core.egg-info/requires.txt +13 -6
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/pyproject.toml +13 -6
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/unified_memory.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron_core.egg-info/SOURCES.txt +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/setup.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev126546
+Version: 0.16.0rc0.dev126744
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

@@ -41,7 +41,7 @@ Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
-Requires-Dist: nvidia-resiliency-ext
+Requires-Dist: nvidia-resiliency-ext; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"

@@ -59,13 +59,20 @@ Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
-Requires-Dist: einops; extra == "lts"
-Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
-Requires-Dist: nvtx; extra == "lts"
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: einops~=0.8; extra == "lts"
+Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "lts"
+Requires-Dist: nvtx~=0.2; extra == "lts"
+Requires-Dist: multi-storage-client~=0.27; extra == "lts"
+Requires-Dist: opentelemetry-api~=1.33.1; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
+Requires-Dist: mamba-ssm~=2.2; extra == "lts"
+Requires-Dist: causal-conv1d~=1.5; extra == "lts"
+Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
+Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
+Requires-Dist: av<16.0.0; extra == "lts"
+Requires-Dist: flashinfer-python; extra == "lts"
 Requires-Dist: wget; extra == "lts"
+Requires-Dist: onnxscript; extra == "lts"
 
 <div align="center">
 
megatron/core/datasets/megatron_tokenizer.py

@@ -1,11 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 import json
+import logging
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 from typing import Any
 
 import numpy
 
+logger = logging.getLogger(__name__)
+
 
 class MegatronLegacyTokenizer(ABC):
     """Abstract class for tokenizer

@@ -20,6 +23,12 @@ class MegatronLegacyTokenizer(ABC):
     """
 
     def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
+        # Deprecation warning
+        logger.warning(
+            "You’re using the legacy tokenizer system, which is deprecated "
+            "and will be removed in a future release. Please migrate to the new tokenizer system "
+            "(`megatron.core.tokenizers.MegatronTokenizer`)."
+        )
         self.unique_identifiers = OrderedDict()
         self.unique_identifiers["class"] = type(self).__name__
         self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
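Note on the hunk above (not part of the diff): the deprecation notice is emitted through the standard logging module rather than the warnings module, so it can be silenced or redirected with ordinary logging configuration. A minimal sketch, assuming the logger keeps the module path shown in the file list:

    import logging

    # The module-level logger is named after its module path, so raising that
    # logger's level above WARNING suppresses the legacy-tokenizer message.
    logging.getLogger("megatron.core.datasets.megatron_tokenizer").setLevel(logging.ERROR)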
megatron/core/inference/contexts/dynamic_context.py

@@ -4,7 +4,7 @@ import math
 import warnings
 from contextlib import nullcontext
 from enum import Enum
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import torch
 import torch.nn.functional as F

@@ -49,6 +49,17 @@ try:
 except ImportError:
     HAVE_FLASHINFER = False
 
+try:
+    import wandb  # pylint: disable=unused-import
+
+    HAVE_WANDB = True
+except ImportError:
+    HAVE_WANDB = False
+    wandb = None
+
+if TYPE_CHECKING:
+    import wandb as WandbModule
+
 
 class ContextOverflowError(Exception):
     """Base exception for when a new request does not fit.

@@ -226,6 +237,7 @@ class DynamicInferenceContext(BaseInferenceContext):
             levels will be included to control other tensors within the context.
         use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
             If None, defaults to using flash-infer if available.
+        metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
     """
 
     def __init__(

@@ -251,6 +263,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 0,
+        metrics_writer: Optional['WandbModule'] = None,
     ):
         super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
 
@@ -260,6 +273,8 @@ class DynamicInferenceContext(BaseInferenceContext):
                 block_size_tokens == 64
             ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
 
+        self.metrics_writer = metrics_writer
+
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
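A note on the new metrics_writer parameter (the snippet below is an illustrative sketch, not from the diff): it is typed as the wandb module itself, and the engine later duck-types on metrics_writer.__name__ == "wandb" before calling metrics_writer.define_metric, so the intended wiring is to pass the imported module rather than a run object:

    import wandb

    # Pass the wandb module itself; other DynamicInferenceContext arguments are omitted here.
    context_kwargs = dict(metrics_writer=wandb)
    assert context_kwargs["metrics_writer"].__name__ == "wandb"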
@@ -1569,3 +1584,67 @@
 
         # Convert each log prob tensor into a list
         return [lp.tolist() for lp in selected_log_probs_list]
+
+    def get_kvcache_utilization_stats(self) -> dict:
+        """Compute KV cache buffer utilization stats for the current step.
+
+        Returns a dictionary with counts and percentages for both allocated block
+        usage (overall buffer occupancy) and active usage (blocks referenced by
+        currently active requests this step).
+
+        Return:
+            {
+                'total_blocks': int,
+                'allocated_blocks': int,
+                'active_unique_blocks': int,
+                'allocated_utilization': float,
+                'active_utilization': float,
+                'active_request_count': int,
+                'paused_request_count': int,
+                'gtd_block_count': int,
+            }
+        """
+        # Total usable blocks exclude the reserved dummy block.
+        total_blocks = max(self.block_allocator.block_count_total - 1, 1)
+        block_count_avail = int(self.block_allocator.block_count_avail)
+
+        # Overall allocated blocks in the buffer right now.
+        allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail
+        allocated_blocks = int(max(0, allocated_blocks))
+
+        # Active unique blocks referenced by current active requests only.
+        active_start = self.paused_request_count
+        active_end = self.total_request_count
+        if active_end > active_start:
+            active_rows = self.request_to_kv_block_ids[active_start:active_end]
+            # Filter valid block ids (>= 0) and count unique ids.
+            valid_ids = active_rows[active_rows >= 0]
+            if valid_ids.numel() > 0:
+                unique_ids = torch.unique(valid_ids)
+                active_unique_blocks = int(unique_ids.numel())
+            else:
+                active_unique_blocks = 0
+        else:
+            active_unique_blocks = 0
+
+        allocated_utilization = float(allocated_blocks) / float(total_blocks)
+        active_utilization = float(active_unique_blocks) / float(total_blocks)
+
+        # Diagnostic helpers
+        num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count))
+        total_request_count = int(self.total_request_count)
+        return {
+            'total_blocks': int(total_blocks),
+            'allocated_blocks': int(allocated_blocks),
+            'active_unique_blocks': int(active_unique_blocks),
+            'allocated_utilization': allocated_utilization,
+            'active_utilization': active_utilization,
+            'active_request_count': int(self.get_active_request_count()),
+            'paused_request_count': int(self.paused_request_count),
+            'gtd_block_count': int(self.gtd_block_count),
+            'block_count_avail': int(block_count_avail),
+            'num_non_gtd_blocks': int(num_non_gtd_blocks),
+            'active_token_count': int(self.active_token_count),
+            'total_request_count': int(total_request_count),
+            'max_requests': int(self.max_requests),
+        }
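For reference, a minimal usage sketch of the new method (assuming `context` is an already constructed DynamicInferenceContext; the dictionary keys are the ones returned above):

    # Poll KV cache utilization once per engine step and print a one-line summary.
    stats = context.get_kvcache_utilization_stats()
    print(
        f"KV cache: {stats['allocated_utilization']:.1%} allocated, "
        f"{stats['active_utilization']:.1%} active "
        f"({stats['active_unique_blocks']}/{stats['total_blocks']} blocks; "
        f"{stats['active_request_count']} active / {stats['paused_request_count']} paused requests)"
    )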
megatron/core/inference/engines/dynamic_engine.py

@@ -57,6 +57,14 @@ try:
 except:
     HAVE_MSGPACK = False
 
+try:
+    import wandb
+
+    HAVE_WANDB = True
+except ImportError:
+    HAVE_WANDB = False
+    wandb = None
+
 
 def format_mem_bytes(mem_bytes):
     """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes."""
@@ -89,6 +97,8 @@ class DynamicInferenceEngine(AbstractEngine):
         static_sampling (bool): If True, all requests are assumed to have the same
             sampling parameters. This avoids needing to loop through all requests and
             their sampling parameters every generation step, improving latency.
+        inference_logging_step_interval (int): The step interval at which to log
+            inference metrics to wandb. Defaults to 0, which means no logging.
     """
 
     def __init__(

@@ -101,6 +111,7 @@ class DynamicInferenceEngine(AbstractEngine):
         track_paused_request_events: bool = False,
         enable_chunked_prefill: bool = True,
         static_sampling: bool = False,
+        inference_logging_step_interval: int = 0,
     ):
 
         if enable_cuda_graph is not None:
@@ -137,6 +148,32 @@ class DynamicInferenceEngine(AbstractEngine):
         self.enable_chunked_prefill = enable_chunked_prefill
         self.static_sampling = static_sampling
 
+        self.inference_logging_step_interval = inference_logging_step_interval
+        # Configure wandb to use separate step counter for inference metrics (only once)
+        if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
+            logging.info(
+                f"\033[1;93m[INFERENCE]\033[0m "
+                f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m"
+            )
+            if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
+                # Make all inference/* metrics use inference_step as their x-axis
+                # This allows inference and training to have independent step counters
+                context.metrics_writer.define_metric(
+                    "inference/*", step_metric="inference/inference_step"
+                )
+                # Initialize inference step offset by querying existing run history
+                self.inference_step_offset = 0
+                if wandb.run is not None:
+                    api_run = wandb.Api().run(
+                        f"{wandb.run.entity}/{wandb.run.project}/{wandb.run.id}"
+                    )
+                    max_step = 0
+                    for row in api_run.scan_history(keys=["inference/inference_step"]):
+                        val = row.get("inference/inference_step")
+                        if isinstance(val, (int, float)) and int(val) > max_step:
+                            max_step = int(val)
+                    self.inference_step_offset = int(max_step)
+
         # Initialize the asyncio loop if it has not already been initialized.
         # TODO: Start the engine loop here.
         self._loop = get_asyncio_loop()
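Two wandb features do the heavy lifting in the block above: define_metric routes every inference/* series onto a custom step axis, and the public Api's scan_history recovers the last logged step so the counter survives restarts. A condensed sketch of that pattern on its own, assuming wandb is installed and a run was started elsewhere:

import wandb


def configure_inference_axis() -> int:
    """Give inference/* metrics their own step axis and return the last logged step."""
    # Plot every inference/* series against inference/inference_step instead of wandb's global step.
    wandb.define_metric("inference/*", step_metric="inference/inference_step")

    step_offset = 0
    if wandb.run is not None:
        api_run = wandb.Api().run(f"{wandb.run.entity}/{wandb.run.project}/{wandb.run.id}")
        for row in api_run.scan_history(keys=["inference/inference_step"]):
            val = row.get("inference/inference_step")
            if isinstance(val, (int, float)):
                step_offset = max(step_offset, int(val))
    return step_offset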
@@ -780,6 +817,41 @@ class DynamicInferenceEngine(AbstractEngine):
             self.request_completion_futures[failed_request_id].set_result(failed_request)
         self.failed_request_ids.clear()
 
+        # Log KV cache utilization stats to W&B
+        if (
+            self.inference_logging_step_interval > 0
+            and self.step_count > 0
+            and self.step_count % self.inference_logging_step_interval == 0
+            and self.context.metrics_writer is not None
+        ):
+
+            # Get KV cache utilization stats from dynamic context
+            kv_stats = self.context.get_kvcache_utilization_stats()
+
+            # Prepare metrics dictionary with all stats
+            # Use 'inference/' prefix for all metrics to separate from training metrics
+            metrics = {
+                'inference/inference_step': int(self.inference_step_offset + int(self.step_count)),
+                'inference/step_time_s': float(step_time),
+                'inference/waiting_queue_len': int(len(self.waiting_request_ids)),
+                'inference/total_requests_dict_size': int(len(self.requests)),
+            }
+            # Add KV stats with inference/ prefix
+            # Convert utilization metrics from 0-1 range to 0-100 percentage range for better visualization
+            for key, value in kv_stats.items():
+                if 'utilization' in key:
+                    # Convert to percentage (0-100) and group under kvcache_utilization
+                    metrics[f'inference/{key}'] = float(value * 100.0)
+                else:
+                    metrics[f'inference/{key}'] = value
+
+            if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
+                self.context.metrics_writer.log(metrics, commit=True)
+            else:
+                raise ValueError(
+                    f"Unsupported metrics writer type: {type(self.context.metrics_writer)}"
+                )
+
         # Print context state.
         if verbose:
             context = self.context
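The logging branch above only fires on every inference_logging_step_interval-th step and rescales fractional utilization values to percentages before handing them to the writer. The gating and rescaling can be exercised in isolation; a small sketch with a hypothetical stats dict:

def build_inference_metrics(step_count, interval, kv_stats, step_offset=0):
    """Return the metrics dict that would be logged on this step, or None when skipped."""
    if interval <= 0 or step_count == 0 or step_count % interval != 0:
        return None
    metrics = {"inference/inference_step": step_offset + step_count}
    for key, value in kv_stats.items():
        # Utilization fractions in [0, 1] become percentages for easier dashboard reading.
        metrics[f"inference/{key}"] = value * 100.0 if "utilization" in key else value
    return metrics


print(build_inference_metrics(20, 10, {"active_utilization": 0.25, "total_blocks": 4095}))
# -> {'inference/inference_step': 20, 'inference/active_utilization': 25.0, 'inference/total_blocks': 4095}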
@@ -1,6 +1,7 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
 from argparse import Namespace
+from io import BytesIO
 from pathlib import PosixPath
 from types import SimpleNamespace
 
@@ -26,6 +27,7 @@ SAFE_GLOBALS = [
     RerunDiagnostic,
     RerunMode,
     RerunState,
+    BytesIO,
 ]
 
 
@@ -48,15 +48,26 @@ except ImportError:
     rearrange = None
 
 try:
-    from
-    from
+    from flash_attn_3.flash_attn_interface import _flash_attn_forward
+    from flash_attn_3.flash_attn_interface import (
         flash_attn_with_kvcache as flash_attn3_with_kvcache,
     )
 
     HAVE_FA3 = True
-except:
+except ImportError as e:
     HAVE_FA3 = False
 
+if not HAVE_FA3:
+    try:
+        from flashattn_hopper.flash_attn_interface import _flash_attn_forward
+        from flashattn_hopper.flash_attn_interface import (
+            flash_attn_with_kvcache as flash_attn3_with_kvcache,
+        )
+
+        HAVE_FA3 = True
+    except ImportError as e:
+        pass
+
 try:
     from flash_mla import flash_mla_with_kvcache, get_mla_metadata
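The change above first tries the flash_attn_3 package name and, only if that import fails, falls back to the older flashattn_hopper name while keeping the same HAVE_FA3 flag and alias. The same behavior can also be written as one nested fallback; a sketch:

HAVE_FA3 = False
flash_attn3_with_kvcache = None

try:
    # Preferred, newer package name.
    from flash_attn_3.flash_attn_interface import flash_attn_with_kvcache as flash_attn3_with_kvcache

    HAVE_FA3 = True
except ImportError:
    try:
        # Legacy package name kept as a fallback.
        from flashattn_hopper.flash_attn_interface import (
            flash_attn_with_kvcache as flash_attn3_with_kvcache,
        )

        HAVE_FA3 = True
    except ImportError:
        pass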
@@ -1182,7 +1182,11 @@ class CudaGraphManager(torch.nn.Module):
 
         if runner is None:
             if _CudagraphGlobalRecord.cudagraph_created:
-                assert False
+                assert False, (
+                    f"`cudagraph_created` is set to True but no matching cudagraph "
+                    f"runners were found. This module has {len(self.cudagraph_runners)} "
+                    f"existing runners. Use `get_mismatch_errors` to debug mismatches."
+                )
             else:
                 runner = _CudaGraphRunner(
                     megatron_module,
@@ -66,6 +66,8 @@ class Router(ABC, MegatronModule):
         """Reset the router parameters."""
         if self.config.perform_initialization:
             self.config.init_method(self.weight)
+            if self.bias is not None:
+                self.config.init_method(self.bias)
         self.weight.data = self.weight.data.to(dtype=self.config.params_dtype)
         setattr(self.weight, 'sequence_parallel', self.config.sequence_parallel)
         if self.bias is not None:
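The fix above ensures the router's bias, when configured, goes through the same init_method as the weight before the weight is cast to params_dtype. A standalone sketch of that reset logic with plain torch parameters (hypothetical helper, not the Router class itself):

import torch


def reset_router_parameters(weight, bias, init_method, params_dtype):
    """Initialize the weight, initialize the bias when present, then cast the weight."""
    init_method(weight)
    if bias is not None:
        init_method(bias)
    weight.data = weight.data.to(dtype=params_dtype)


w = torch.nn.Parameter(torch.empty(8, 16))
b = torch.nn.Parameter(torch.empty(8))
reset_router_parameters(w, b, torch.nn.init.zeros_, torch.bfloat16)
print(w.dtype, b.dtype)  # only the weight is cast; the bias keeps its original dtype here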
@@ -15,8 +15,11 @@ logger = logging.getLogger(__name__)
 class PipelineParallelLayerLayout:
     """Configuration of custom pipeline parallel layer partitioning."""
 
-    def __repr__(self):
-
+    def __repr__(self) -> str:
+        if isinstance(self.input_data, str):
+            return self.input_data
+        else:
+            return str(self.input_data)
 
     def __init__(self, layout: str | list, pipeline_model_parallel_size: int):
         """Initialize PipelineParallelLayerLayout from a list or a str.
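With the change above, repr() of a layout simply echoes the original layout specification: a string form is returned verbatim and a list form is stringified. A minimal standalone sketch of that behavior (hypothetical class, only illustrating __repr__):

class LayoutSpec:
    """Tiny stand-in that mirrors the __repr__ behavior added above."""

    def __init__(self, input_data):
        self.input_data = input_data

    def __repr__(self) -> str:
        # Strings come back verbatim; any other spec (e.g. a nested list) is stringified.
        return self.input_data if isinstance(self.input_data, str) else str(self.input_data)


print(LayoutSpec("embedding|decoder*4|loss"))            # hypothetical layout string
print(LayoutSpec([["embedding"], ["decoder"], ["loss"]]))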
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev126546
+Version: 0.16.0rc0.dev126744
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -41,7 +41,7 @@ Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
-Requires-Dist: nvidia-resiliency-ext
+Requires-Dist: nvidia-resiliency-ext; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
@@ -59,13 +59,20 @@ Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
-Requires-Dist: einops; extra == "lts"
-Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
-Requires-Dist: nvtx; extra == "lts"
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: einops~=0.8; extra == "lts"
+Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "lts"
+Requires-Dist: nvtx~=0.2; extra == "lts"
+Requires-Dist: multi-storage-client~=0.27; extra == "lts"
+Requires-Dist: opentelemetry-api~=1.33.1; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
+Requires-Dist: mamba-ssm~=2.2; extra == "lts"
+Requires-Dist: causal-conv1d~=1.5; extra == "lts"
+Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
+Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
+Requires-Dist: av<16.0.0; extra == "lts"
+Requires-Dist: flashinfer-python; extra == "lts"
 Requires-Dist: wget; extra == "lts"
+Requires-Dist: onnxscript; extra == "lts"
 
 <div align="center">
 
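The Requires-Dist entries above are what installers and tooling read back from the package metadata. One way to confirm the pinned lts extras after installation is via importlib.metadata; a short sketch, assuming megatron-core is installed in the current environment:

from importlib.metadata import metadata

md = metadata("megatron-core")
print(md.get_all("Provides-Extra"))  # e.g. ['mlm', 'dev', 'lts']

# Requirements that only apply when the 'lts' extra is requested.
lts_requirements = [
    req for req in (md.get_all("Requires-Dist") or []) if 'extra == "lts"' in req
]
for req in lts_requirements:
    print(req)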
@@ -4,7 +4,7 @@ packaging>=24.2
 
 [dev]
 transformer-engine[pytorch]<2.10.0,>=2.9.0a0
-nvidia-resiliency-ext
+nvidia-resiliency-ext
 tqdm
 einops~=0.8
 tensorstore!=0.1.46,!=0.1.72,~=0.1
@@ -26,13 +26,20 @@ nvidia-modelopt[torch]
 
 [lts]
 tqdm
-einops
-tensorstore!=0.1.46,!=0.1.72
-nvtx
-
-
+einops~=0.8
+tensorstore!=0.1.46,!=0.1.72,~=0.1
+nvtx~=0.2
+multi-storage-client~=0.27
+opentelemetry-api~=1.33.1
 setuptools<80.0.0
+mamba-ssm~=2.2
+causal-conv1d~=1.5
+nv-grouped-gemm~=1.1
+megatron-energon[av_decode]~=6.0
+av<16.0.0
+flashinfer-python
 wget
+onnxscript
 
 [mlm]
 flask-restful
@@ -69,7 +69,7 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]
 dev = [
     "nvidia-modelopt[torch]; sys_platform != 'darwin'",
     "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
@@ -89,13 +89,20 @@ dev = [
 
 lts = [
     "tqdm",
-    "einops",
-    "tensorstore
-    "nvtx",
-    "
-    "
+    "einops~=0.8",
+    "tensorstore~=0.1,!=0.1.46,!=0.1.72",
+    "nvtx~=0.2",
+    "multi-storage-client~=0.27",
+    "opentelemetry-api~=1.33.1",
     "setuptools<80.0.0",
+    "mamba-ssm~=2.2",
+    "causal-conv1d~=1.5",
+    "nv-grouped-gemm~=1.1",
+    "megatron-energon[av_decode]~=6.0",
+    "av<16.0.0",  # At the time, av 16.0.0 is not compatible with Python 3.12
+    "flashinfer-python",
     "wget",
+    "onnxscript",
 ]
 
 [dependency-groups]
{megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/README.md RENAMED (file without changes)
{megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/__init__.py RENAMED (file without changes)
{megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/activations.py RENAMED (file without changes)
{megatron_core-0.16.0rc0.dev126546 → megatron_core-0.16.0rc0.dev126744}/megatron/core/config.py RENAMED (file without changes)