megatron-core 0.16.0rc0.dev102440__tar.gz → 0.16.0rc0.dev116068__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/PKG-INFO +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/exchange_utils.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/mapping.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/validation.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +1 -10
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +6 -3
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_indices_converter.py +2 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +3 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_pad_routing_map.py +2 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/mha_metadata.py +39 -26
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/dynamic_context.py +155 -313
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/data_parallel_inference_coordinator.py +2 -48
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/dynamic_engine.py +139 -193
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/headers.py +1 -3
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/inference_client.py +19 -67
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/inference_request.py +0 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +28 -141
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/unified_memory.py +15 -51
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/rope_utils.py +25 -26
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +4 -11
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/gpt_model.py +18 -6
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/mamba_model.py +14 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/distrib_optimizer.py +1 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/optimizer.py +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/timers.py +6 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +3 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/text_tokenizer.py +6 -10
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/attention.py +6 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/cuda_graphs.py +10 -14
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/dot_product_attention.py +2 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +23 -44
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/moe_utils.py +7 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/router.py +1 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_config.py +2 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/utils.py +3 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/utils.py +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/PKG-INFO +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/SOURCES.txt +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/requires.txt +6 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/pyproject.toml +8 -10
- megatron_core-0.16.0rc0.dev102440/megatron/core/inference/batch_dimensions_utils.py +0 -379
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/inference_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/setup.py +0 -0
PKG-INFO (and megatron_core.egg-info/PKG-INFO):

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev102440
+Version: 0.16.0rc0.dev116068
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -30,7 +30,7 @@ Classifier: Topic :: Utilities
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: torch
-Requires-Dist: numpy
+Requires-Dist: numpy<2.0.0
 Requires-Dist: packaging>=24.2
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
@@ -40,7 +40,7 @@ Requires-Dist: wandb; extra == "mlm"
 Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
-Requires-Dist: transformer-engine[
+Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
 Requires-Dist: nvidia-resiliency-ext; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
@@ -48,16 +48,15 @@ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
 Requires-Dist: multi-storage-client~=0.27; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
+Requires-Dist: setuptools<80.0.0; extra == "dev"
 Requires-Dist: mamba-ssm~=2.2; extra == "dev"
 Requires-Dist: causal-conv1d~=1.5; extra == "dev"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
-Requires-Dist: av; extra == "dev"
+Requires-Dist: av<16.0.0; extra == "dev"
 Requires-Dist: flashinfer-python; extra == "dev"
 Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
-Requires-Dist: fastapi~=0.50; extra == "dev"
-Requires-Dist: datasets; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
 Requires-Dist: einops~=0.8; extra == "lts"
@@ -65,16 +64,15 @@ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "lts"
 Requires-Dist: nvtx~=0.2; extra == "lts"
 Requires-Dist: multi-storage-client~=0.27; extra == "lts"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "lts"
+Requires-Dist: setuptools<80.0.0; extra == "lts"
 Requires-Dist: mamba-ssm~=2.2; extra == "lts"
 Requires-Dist: causal-conv1d~=1.5; extra == "lts"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
-Requires-Dist: av; extra == "lts"
+Requires-Dist: av<16.0.0; extra == "lts"
 Requires-Dist: flashinfer-python; extra == "lts"
 Requires-Dist: wget; extra == "lts"
 Requires-Dist: onnxscript; extra == "lts"
-Requires-Dist: fastapi~=0.50; extra == "lts"
-Requires-Dist: datasets; extra == "lts"
 
 <div align="center">
 
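The metadata changes above tighten several upper bounds (numpy<2.0.0, av<16.0.0, setuptools<80.0.0), pin transformer-engine[pytorch] to >=2.9.0a0,<2.10.0, and drop the fastapi and datasets requirements from the dev/lts extras. A minimal sketch of checking an environment against the new pins with the `packaging` library the package already requires; the installed versions below are illustrative, not taken from the diff:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Pins taken from the diff above.
pins = {
    "numpy": SpecifierSet("<2.0.0"),
    "av": SpecifierSet("<16.0.0"),
    "setuptools": SpecifierSet("<80.0.0"),
}
# Illustrative installed versions (assumptions for this sketch).
installed = {
    "numpy": Version("1.26.4"),
    "av": Version("15.0.0"),
    "setuptools": Version("79.0.1"),
}
for name, spec in pins.items():
    assert installed[name] in spec, f"{name} {installed[name]} violates {spec}"
```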
megatron/core/dist_checkpointing/exchange_utils.py:

@@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple):
 def _shard_size(sh_ten: ShardedTensor):
     """Returns size in bytes of a given sharded tensor."""
     if sh_ten.flattened_range is None:
-        numel = np.
+        numel = np.product(sh_ten.local_shape)
     else:
         numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
     return numel * torch._utils._element_size(sh_ten.dtype)
megatron/core/dist_checkpointing/mapping.py:

@@ -216,7 +216,7 @@ class ShardedTensor(ShardedBase):
         )
 
         # TODO: np.unravel_index?
-        mask = np.zeros(np.
+        mask = np.zeros(np.product(self.local_shape), dtype=bool)
         mask[self.flattened_range] = True
         return np.nonzero(mask.reshape(self.local_shape))
megatron/core/dist_checkpointing/validation.py:

@@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard):
         all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))
 
     starts, stops = map(np.asarray, zip(*sorted(all_slices)))
-    expected_size = np.
+    expected_size = np.product(local_shape)
     if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
         raise CheckpointingException(
             f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
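Worth noting: `np.product` is a legacy alias of `np.prod` that was removed in NumPy 2.0, which is presumably why these call sites land together with the `numpy<2.0.0` pin in the metadata above. A version-agnostic sketch of the same size computation; the shape and element size are illustrative:

```python
import math

import numpy as np

local_shape = (4, 8, 16)  # illustrative shard shape
element_size = 2          # e.g. bytes per fp16 element

# np.prod (or math.prod) works on both NumPy 1.x and 2.x,
# unlike the np.product alias used in the diff above.
numel = int(np.prod(local_shape))
assert numel == math.prod(local_shape) == 512
size_bytes = numel * element_size
assert size_bytes == 1024
```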
megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py:

@@ -25,8 +25,6 @@ from torch.distributed.checkpoint.metadata import (
 from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
 from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard
 
-from .utils import get_mesh_names
-
 
 def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
     """
@@ -274,14 +272,7 @@ def gather_uneven_dtensor_to_full_tensor(
     if not device_mesh.mesh_dim_names:
         process_group = device_mesh.get_group()
     else:
-
-        full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
-        if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
-            # Retrieve the existing flattened DeviceMesh ProcessGroup.
-            process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
-        else:
-            # Create the _-separated flattened DeviceMesh ProcessGroup.
-            process_group = device_mesh._flatten().get_group()
+        process_group = device_mesh._flatten().get_group()
 
     # Collect chunk metadata for uneven shards (update if missing)
     if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):
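The simplification assumes `DeviceMesh._flatten()` can be called unconditionally, without first consulting `get_mesh_names` for an existing flattened sub-mesh. A minimal usage sketch, assuming a torchrun launch with an even world size of at least 2 and a PyTorch build exposing the private `_flatten` API:

```python
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="gloo")
world = dist.get_world_size()

# Illustrative 2-D mesh; the dim names are assumptions for this sketch.
mesh = init_device_mesh("cpu", (2, world // 2), mesh_dim_names=("dp", "tp"))

# One process group spanning every rank of the mesh, mirroring the
# simplified branch above.
group = mesh._flatten().get_group()
assert dist.get_world_size(group=group) == world

dist.destroy_process_group()
```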
megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py:

@@ -167,10 +167,13 @@ def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
         submesh_dim_name
         for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
         for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-
-        if root_mesh == device_mesh and submesh_dim_name not in mesh_dim_names
+        if root_mesh == device_mesh
     ]
-
+    # Combine without duplicate dimensions.
+    for dim_name in submesh_dim_names:
+        if dim_name not in mesh_dim_names:
+            mesh_dim_names.append(dim_name)
+    return mesh_dim_names
 
 
 def contains_submesh(
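The dedup moves out of the list-comprehension filter into an explicit order-preserving merge that also returns the combined list. The same pattern in isolation; the dimension names are hypothetical:

```python
def merge_unique(base: list[str], extra: list[str]) -> list[str]:
    """Order-preserving merge, mirroring the loop added above."""
    merged = list(base)
    for name in extra:
        if name not in merged:
            merged.append(name)
    return merged

# Hypothetical mesh dimension names.
assert merge_unique(["dp", "tp"], ["tp", "cp"]) == ["dp", "tp", "cp"]
```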
megatron/core/fusions/fused_indices_converter.py:

@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 import torch
 from packaging import version
 
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -279,6 +279,7 @@ class IndicesToMultihot(torch.autograd.Function):
         return None, grad_probs_indices, None, None
 
 
+@experimental_fn(introduced_with_version='0.11.0rc0')
 def fused_indices_to_multihot(indices, probs_indices, num_of_local_experts):
     """Convert moe topk indices to multihot representation.
 
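The implementation of `megatron.core.utils.experimental_fn` is not part of this diff. A plausible sketch of such a marker decorator, purely as an assumption about its behavior (the real Megatron implementation may differ):

```python
import functools
import warnings


def experimental_fn(introduced_with_version: str):
    """Hypothetical stand-in for megatron.core.utils.experimental_fn."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Warn at the call site that this API is not yet stable.
            warnings.warn(
                f"{fn.__name__} is experimental (since "
                f"{introduced_with_version}) and may change without notice.",
                UserWarning,
                stacklevel=2,
            )
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@experimental_fn(introduced_with_version="0.13.0")
def demo(x):
    return x * 2
```

The same decorator is stacked above `@jit_fuser` in fused_pad_routing_map.py below, so the experimental warning wraps the already-fused function.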
megatron/core/fusions/fused_mla_yarn_rope_apply.py:

@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 import torch
 from packaging import version
 
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -324,6 +324,7 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
         return grad, None, None, None, None, None, None, None, None
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 def fused_apply_mla_rope_for_q(
     t: torch.Tensor,
     cos: torch.Tensor,
@@ -732,6 +733,7 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
         return d_kv, d_emb, None, None, None, None, None, None, None, None, None
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 def fused_apply_mla_rope_for_kv(
     kv: torch.Tensor,
     k_pos_emb: torch.Tensor,
megatron/core/fusions/fused_pad_routing_map.py:

@@ -6,7 +6,7 @@ import torch
 from packaging import version
 
 from megatron.core.jit import jit_fuser
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -70,6 +70,7 @@ def _pad_routing_map_kernel(
     tl.store(output_row_ptr + token_indices, output_row, mask=token_mask)
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 @jit_fuser
 def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> torch.Tensor:
     """Fused version of pad_routing_map.
megatron/core/inference/contexts/attention_context/mha_metadata.py:

@@ -1,7 +1,8 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-import torch
 
-from
+from typing import Optional
+
+import torch
 
 from .metadata_base import MetadataBase
 
@@ -39,21 +40,23 @@ class MHAMetadata(MetadataBase):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
-
-
-        padded_active_token_count = padded_batch_dimensions.token_count
-        padded_active_request_count = padded_batch_dimensions.req_count
+        if padded_active_request_count is None:
+            padded_active_request_count = real_batch_size
 
         assert real_batch_size <= padded_active_request_count <= self.max_bs
         assert request_query_lengths.shape[0] == real_batch_size
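The update() signature now takes the padding dimensions as explicit scalars instead of the removed batch-dimensions object (batch_dimensions_utils.py is deleted in this release, per the file list above), and the padded request count defaults to the real batch size when omitted. The fallback in isolation; `max_bs` is an illustrative stand-in for `self.max_bs`:

```python
from typing import Optional


def resolve_padded_request_count(
    real_batch_size: int,
    padded_active_request_count: Optional[int] = None,
    max_bs: int = 64,  # illustrative upper bound standing in for self.max_bs
) -> int:
    # Mirrors the default added above: no explicit padding means
    # "pad to exactly the real batch size".
    if padded_active_request_count is None:
        padded_active_request_count = real_batch_size
    assert real_batch_size <= padded_active_request_count <= max_bs
    return padded_active_request_count


assert resolve_padded_request_count(4) == 4
assert resolve_padded_request_count(4, 8) == 8
```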
@@ -98,12 +101,10 @@ class MHAMetadata(MetadataBase):
             is_cumulative_tensor=True,
         )
 
-        if
+        if decode_only:
             self._max_seqlen_q = 1
         else:
-
-            self._max_seqlen_q = max(2, padded_batch_dimensions.token_count)
-
+            self._max_seqlen_q = max(2, padded_active_token_count)
         self._max_seqlen_k = self.max_seqlen
 
         self.state_data = {
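The query-length bound now derives from the explicit `decode_only` flag and the padded token count: decode-only steps contribute exactly one query token per request, and otherwise the bound is floored at 2, as in the branch above. The logic in isolation:

```python
def compute_max_seqlen_q(decode_only: bool, padded_active_token_count: int) -> int:
    # Decode-only: one query token per request. Otherwise the bound is the
    # padded active token count, floored at 2 as in the code above.
    return 1 if decode_only else max(2, padded_active_token_count)


assert compute_max_seqlen_q(True, 256) == 1
assert compute_max_seqlen_q(False, 256) == 256
assert compute_max_seqlen_q(False, 1) == 2
```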
@@ -148,23 +149,29 @@ class GraphedMHAMetadata(MHAMetadata):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
             request_kv_length_offsets,
             request_to_kv_block_ids,
-
-
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
         )
 
     def reset(self):
@@ -181,23 +188,29 @@ class NonGraphedMHAMetadata(MHAMetadata):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
             request_kv_length_offsets,
             request_to_kv_block_ids,
-
-
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
        )
        if len(self.state_data["query_lengths"]) > 0:
            self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()