megatron-core 0.16.0rc0.dev102440__tar.gz → 0.16.0rc0.dev116068__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/PKG-INFO +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/exchange_utils.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/mapping.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/validation.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +1 -10
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +6 -3
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_indices_converter.py +2 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +3 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_pad_routing_map.py +2 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/mha_metadata.py +39 -26
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/dynamic_context.py +155 -313
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/data_parallel_inference_coordinator.py +2 -48
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/dynamic_engine.py +139 -193
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/headers.py +1 -3
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/inference_client.py +19 -67
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/inference_request.py +0 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +28 -141
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/unified_memory.py +15 -51
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/rope_utils.py +25 -26
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +4 -11
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/gpt_model.py +18 -6
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/mamba_model.py +14 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/distrib_optimizer.py +1 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/optimizer.py +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/timers.py +6 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +3 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/text_tokenizer.py +6 -10
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/attention.py +6 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/cuda_graphs.py +10 -14
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/dot_product_attention.py +2 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +23 -44
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/moe_utils.py +7 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/router.py +1 -15
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_config.py +2 -2
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/utils.py +3 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/utils.py +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/PKG-INFO +7 -9
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/SOURCES.txt +0 -1
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/requires.txt +6 -8
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/pyproject.toml +8 -10
- megatron_core-0.16.0rc0.dev102440/megatron/core/inference/batch_dimensions_utils.py +0 -379
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/inference_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev102440 → megatron_core-0.16.0rc0.dev116068}/setup.py +0 -0
PKG-INFO (and megatron_core.egg-info/PKG-INFO):

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev102440
+Version: 0.16.0rc0.dev116068
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -30,7 +30,7 @@ Classifier: Topic :: Utilities
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 Requires-Dist: torch
-Requires-Dist: numpy
+Requires-Dist: numpy<2.0.0
 Requires-Dist: packaging>=24.2
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
@@ -40,7 +40,7 @@ Requires-Dist: wandb; extra == "mlm"
 Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
-Requires-Dist: transformer-engine[
+Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
 Requires-Dist: nvidia-resiliency-ext; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
@@ -48,16 +48,15 @@ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
 Requires-Dist: multi-storage-client~=0.27; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
+Requires-Dist: setuptools<80.0.0; extra == "dev"
 Requires-Dist: mamba-ssm~=2.2; extra == "dev"
 Requires-Dist: causal-conv1d~=1.5; extra == "dev"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
-Requires-Dist: av; extra == "dev"
+Requires-Dist: av<16.0.0; extra == "dev"
 Requires-Dist: flashinfer-python; extra == "dev"
 Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
-Requires-Dist: fastapi~=0.50; extra == "dev"
-Requires-Dist: datasets; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
 Requires-Dist: einops~=0.8; extra == "lts"
@@ -65,16 +64,15 @@ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "lts"
 Requires-Dist: nvtx~=0.2; extra == "lts"
 Requires-Dist: multi-storage-client~=0.27; extra == "lts"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "lts"
+Requires-Dist: setuptools<80.0.0; extra == "lts"
 Requires-Dist: mamba-ssm~=2.2; extra == "lts"
 Requires-Dist: causal-conv1d~=1.5; extra == "lts"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
-Requires-Dist: av; extra == "lts"
+Requires-Dist: av<16.0.0; extra == "lts"
 Requires-Dist: flashinfer-python; extra == "lts"
 Requires-Dist: wget; extra == "lts"
 Requires-Dist: onnxscript; extra == "lts"
-Requires-Dist: fastapi~=0.50; extra == "lts"
-Requires-Dist: datasets; extra == "lts"
 
 <div align="center">
 
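The metadata changes above tighten several upper bounds (numpy<2.0.0, av<16.0.0, setuptools<80.0.0), pin transformer-engine[pytorch] to >=2.9.0a0,<2.10.0, and drop the fastapi and datasets requirements from the dev/lts extras. A minimal sketch of checking an environment against the new pins with the `packaging` library the package already requires; the installed versions below are illustrative, not taken from the diff:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Pins taken from the diff above.
pins = {
    "numpy": SpecifierSet("<2.0.0"),
    "av": SpecifierSet("<16.0.0"),
    "setuptools": SpecifierSet("<80.0.0"),
}
# Illustrative installed versions (assumptions for this sketch).
installed = {
    "numpy": Version("1.26.4"),
    "av": Version("15.0.0"),
    "setuptools": Version("79.0.1"),
}
for name, spec in pins.items():
    assert installed[name] in spec, f"{name} {installed[name]} violates {spec}"
```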
megatron/core/dist_checkpointing/exchange_utils.py:

@@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple):
 def _shard_size(sh_ten: ShardedTensor):
     """Returns size in bytes of a given sharded tensor."""
     if sh_ten.flattened_range is None:
-        numel = np.
+        numel = np.product(sh_ten.local_shape)
     else:
         numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
     return numel * torch._utils._element_size(sh_ten.dtype)
megatron/core/dist_checkpointing/mapping.py:

@@ -216,7 +216,7 @@ class ShardedTensor(ShardedBase):
         )
 
         # TODO: np.unravel_index?
-        mask = np.zeros(np.
+        mask = np.zeros(np.product(self.local_shape), dtype=bool)
         mask[self.flattened_range] = True
         return np.nonzero(mask.reshape(self.local_shape))
megatron/core/dist_checkpointing/validation.py:

@@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard):
         all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))
 
     starts, stops = map(np.asarray, zip(*sorted(all_slices)))
-    expected_size = np.
+    expected_size = np.product(local_shape)
     if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
         raise CheckpointingException(
             f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
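Worth noting: `np.product` is a legacy alias of `np.prod` that was removed in NumPy 2.0, which is presumably why these call sites land together with the `numpy<2.0.0` pin in the metadata above. A version-agnostic sketch of the same size computation; the shape and element size are illustrative:

```python
import math

import numpy as np

local_shape = (4, 8, 16)  # illustrative shard shape
element_size = 2          # e.g. bytes per fp16 element

# np.prod (or math.prod) works on both NumPy 1.x and 2.x,
# unlike the np.product alias used in the diff above.
numel = int(np.prod(local_shape))
assert numel == math.prod(local_shape) == 512
size_bytes = numel * element_size
assert size_bytes == 1024
```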
megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py:

@@ -25,8 +25,6 @@ from torch.distributed.checkpoint.metadata import (
 from torch.distributed.checkpoint.planner import TensorWriteData, WriteItem, WriteItemType
 from torch.distributed.tensor.placement_types import Replicate, Shard, _StridedShard
 
-from .utils import get_mesh_names
-
 
 def gather_and_compute_chunk_metadata(dtensor: DTensor) -> ChunkStorageMetadata:
     """
@@ -274,14 +272,7 @@ def gather_uneven_dtensor_to_full_tensor(
     if not device_mesh.mesh_dim_names:
         process_group = device_mesh.get_group()
     else:
-
-        full_flattened_mesh_dim_name = "_".join(device_mesh.mesh_dim_names)
-        if full_flattened_mesh_dim_name in get_mesh_names(device_mesh):
-            # Retrieve the existing flattened DeviceMesh ProcessGroup.
-            process_group = device_mesh[full_flattened_mesh_dim_name].get_group()
-        else:
-            # Create the _-separated flattened DeviceMesh ProcessGroup.
-            process_group = device_mesh._flatten().get_group()
+        process_group = device_mesh._flatten().get_group()
 
     # Collect chunk metadata for uneven shards (update if missing)
     if not hasattr(dtensor._local_tensor, "__create_chunk_list__"):
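The simplification assumes `DeviceMesh._flatten()` can be called unconditionally, without first consulting `get_mesh_names` for an existing flattened sub-mesh. A minimal usage sketch, assuming a torchrun launch with an even world size of at least 2 and a PyTorch build exposing the private `_flatten` API:

```python
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="gloo")
world = dist.get_world_size()

# Illustrative 2-D mesh; the dim names are assumptions for this sketch.
mesh = init_device_mesh("cpu", (2, world // 2), mesh_dim_names=("dp", "tp"))

# One process group spanning every rank of the mesh, mirroring the
# simplified branch above.
group = mesh._flatten().get_group()
assert dist.get_world_size(group=group) == world

dist.destroy_process_group()
```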
megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py:

@@ -167,10 +167,13 @@ def get_mesh_names(device_mesh: Optional[DeviceMesh] = None) -> list[str]:
         submesh_dim_name
         for child_mesh, root_mesh in _mesh_resources.child_to_root_mapping.items()
         for submesh_dim_name in (child_mesh.mesh_dim_names or [])
-
-        if root_mesh == device_mesh and submesh_dim_name not in mesh_dim_names
+        if root_mesh == device_mesh
     ]
-
+    # Combine without duplicate dimensions.
+    for dim_name in submesh_dim_names:
+        if dim_name not in mesh_dim_names:
+            mesh_dim_names.append(dim_name)
+    return mesh_dim_names
 
 
 def contains_submesh(
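The dedup moves out of the list-comprehension filter into an explicit order-preserving merge that also returns the combined list. The same pattern in isolation; the dimension names are hypothetical:

```python
def merge_unique(base: list[str], extra: list[str]) -> list[str]:
    """Order-preserving merge, mirroring the loop added above."""
    merged = list(base)
    for name in extra:
        if name not in merged:
            merged.append(name)
    return merged

# Hypothetical mesh dimension names.
assert merge_unique(["dp", "tp"], ["tp", "cp"]) == ["dp", "tp", "cp"]
```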
megatron/core/fusions/fused_indices_converter.py:

@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 import torch
 from packaging import version
 
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -279,6 +279,7 @@ class IndicesToMultihot(torch.autograd.Function):
         return None, grad_probs_indices, None, None
 
 
+@experimental_fn(introduced_with_version='0.11.0rc0')
 def fused_indices_to_multihot(indices, probs_indices, num_of_local_experts):
     """Convert moe topk indices to multihot representation.
 
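The implementation of `megatron.core.utils.experimental_fn` is not part of this diff. A plausible sketch of such a marker decorator, purely as an assumption about its behavior (the real Megatron implementation may differ):

```python
import functools
import warnings


def experimental_fn(introduced_with_version: str):
    """Hypothetical stand-in for megatron.core.utils.experimental_fn."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Warn at the call site that this API is not yet stable.
            warnings.warn(
                f"{fn.__name__} is experimental (since "
                f"{introduced_with_version}) and may change without notice.",
                UserWarning,
                stacklevel=2,
            )
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@experimental_fn(introduced_with_version="0.13.0")
def demo(x):
    return x * 2
```

The same decorator is stacked above `@jit_fuser` in fused_pad_routing_map.py below, so the experimental warning wraps the already-fused function.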
megatron/core/fusions/fused_mla_yarn_rope_apply.py:

@@ -6,7 +6,7 @@ from unittest.mock import MagicMock
 import torch
 from packaging import version
 
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -324,6 +324,7 @@ class ApplyMLARotaryEmbQ(torch.autograd.Function):
         return grad, None, None, None, None, None, None, None, None
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 def fused_apply_mla_rope_for_q(
     t: torch.Tensor,
     cos: torch.Tensor,
@@ -732,6 +733,7 @@ class ApplyMLARotaryEmbKV(torch.autograd.Function):
         return d_kv, d_emb, None, None, None, None, None, None, None, None, None
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 def fused_apply_mla_rope_for_kv(
     kv: torch.Tensor,
     k_pos_emb: torch.Tensor,
megatron/core/fusions/fused_pad_routing_map.py:

@@ -6,7 +6,7 @@ import torch
 from packaging import version
 
 from megatron.core.jit import jit_fuser
-from megatron.core.utils import null_decorator
+from megatron.core.utils import experimental_fn, null_decorator
 
 try:
     import triton
@@ -70,6 +70,7 @@ def _pad_routing_map_kernel(
     tl.store(output_row_ptr + token_indices, output_row, mask=token_mask)
 
 
+@experimental_fn(introduced_with_version="0.13.0")
 @jit_fuser
 def fused_pad_routing_map(routing_map: torch.Tensor, pad_multiple: int) -> torch.Tensor:
     """Fused version of pad_routing_map.
megatron/core/inference/contexts/attention_context/mha_metadata.py:

@@ -1,7 +1,8 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-import torch
 
-from
+from typing import Optional
+
+import torch
 
 from .metadata_base import MetadataBase
 
@@ -39,21 +40,23 @@ class MHAMetadata(MetadataBase):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
-
-
-        padded_active_token_count = padded_batch_dimensions.token_count
-        padded_active_request_count = padded_batch_dimensions.req_count
+        if padded_active_request_count is None:
+            padded_active_request_count = real_batch_size
 
         assert real_batch_size <= padded_active_request_count <= self.max_bs
         assert request_query_lengths.shape[0] == real_batch_size
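The update() signature now takes the padding dimensions as explicit scalars instead of the removed batch-dimensions object (batch_dimensions_utils.py is deleted in this release, per the file list above), and the padded request count defaults to the real batch size when omitted. The fallback in isolation; `max_bs` is an illustrative stand-in for `self.max_bs`:

```python
from typing import Optional


def resolve_padded_request_count(
    real_batch_size: int,
    padded_active_request_count: Optional[int] = None,
    max_bs: int = 64,  # illustrative upper bound standing in for self.max_bs
) -> int:
    # Mirrors the default added above: no explicit padding means
    # "pad to exactly the real batch size".
    if padded_active_request_count is None:
        padded_active_request_count = real_batch_size
    assert real_batch_size <= padded_active_request_count <= max_bs
    return padded_active_request_count


assert resolve_padded_request_count(4) == 4
assert resolve_padded_request_count(4, 8) == 8
```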
@@ -98,12 +101,10 @@ class MHAMetadata(MetadataBase):
             is_cumulative_tensor=True,
         )
 
-        if
+        if decode_only:
             self._max_seqlen_q = 1
         else:
-
-            self._max_seqlen_q = max(2, padded_batch_dimensions.token_count)
-
+            self._max_seqlen_q = max(2, padded_active_token_count)
         self._max_seqlen_k = self.max_seqlen
 
         self.state_data = {
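The query-length bound now derives from the explicit `decode_only` flag and the padded token count: decode-only steps contribute exactly one query token per request, and otherwise the bound is floored at 2, as in the branch above. The logic in isolation:

```python
def compute_max_seqlen_q(decode_only: bool, padded_active_token_count: int) -> int:
    # Decode-only: one query token per request. Otherwise the bound is the
    # padded active token count, floored at 2 as in the code above.
    return 1 if decode_only else max(2, padded_active_token_count)


assert compute_max_seqlen_q(True, 256) == 1
assert compute_max_seqlen_q(False, 256) == 256
assert compute_max_seqlen_q(False, 1) == 2
```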
@@ -148,23 +149,29 @@ class GraphedMHAMetadata(MHAMetadata):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
             request_kv_length_offsets,
             request_to_kv_block_ids,
-
-
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
         )
 
     def reset(self):
@@ -181,23 +188,29 @@ class NonGraphedMHAMetadata(MHAMetadata):
         request_query_lengths: torch.Tensor,
         request_kv_length_offsets: torch.Tensor,
         request_to_kv_block_ids: torch.Tensor,
-
-
+        padded_active_token_count: int,
+        real_batch_size: int,
+        padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
             request_query_lengths: (>real_batch_size,)
             request_kv_length_offsets: (>real_batch_size,)
             request_to_kv_block_ids: (>real_batch_size, max_kv_blocks)
-
-
+            padded_active_token_count: int
+            real_batch_size: int
+            padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
             request_kv_length_offsets,
             request_to_kv_block_ids,
-
-
+            padded_active_token_count,
+            real_batch_size,
+            padded_active_request_count,
+            decode_only,
        )
        if len(self.state_data["query_lengths"]) > 0:
            self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()