megatron-core 0.16.0rc0.dev129362__tar.gz → 0.16.0rc0.dev129924__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
The registry has flagged this version of megatron-core as a potentially problematic release.
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/attention_context/mha_metadata.py +12 -2
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/dynamic_context.py +95 -8
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/dynamic_engine.py +72 -0
- megatron_core-0.16.0rc0.dev129924/megatron/core/inference/unified_memory.py +127 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/gpt_model.py +1 -2
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/attention.py +14 -3
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/cuda_graphs.py +5 -1
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/dot_product_attention.py +2 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/router.py +2 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/pipeline_parallel_layer_layout.py +5 -2
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/PKG-INFO +1 -1
- megatron_core-0.16.0rc0.dev129362/megatron/core/inference/unified_memory.py +0 -89
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/SOURCES.txt +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/pyproject.toml +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/setup.py +0 -0
```diff
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev129362
+Version: 0.16.0rc0.dev129924
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
```
```diff
--- a/megatron/core/inference/contexts/attention_context/mha_metadata.py
+++ b/megatron/core/inference/contexts/attention_context/mha_metadata.py
@@ -43,6 +43,7 @@ class MHAMetadata(MetadataBase):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
@@ -52,6 +53,7 @@ class MHAMetadata(MetadataBase):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         if padded_active_request_count is None:
             padded_active_request_count = real_batch_size
@@ -98,9 +100,11 @@ class MHAMetadata(MetadataBase):
             padded_active_request_count,
             is_cumulative_tensor=True,
         )
-
-        if
+
+        if decode_only:
             self._max_seqlen_q = 1
+        else:
+            self._max_seqlen_q = max(2, padded_active_token_count)
         self._max_seqlen_k = self.max_seqlen
 
         self.state_data = {
@@ -148,6 +152,7 @@ class GraphedMHAMetadata(MHAMetadata):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
@@ -157,6 +162,7 @@ class GraphedMHAMetadata(MHAMetadata):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
@@ -165,6 +171,7 @@ class GraphedMHAMetadata(MHAMetadata):
             padded_active_token_count,
             real_batch_size,
             padded_active_request_count,
+            decode_only,
         )
 
     def reset(self):
@@ -184,6 +191,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
+        decode_only: bool = False,
     ):
         """
         Args:
@@ -193,6 +201,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
+            decode_only: bool
         """
         super().update(
             request_query_lengths,
@@ -201,6 +210,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
             padded_active_token_count,
             real_batch_size,
             padded_active_request_count,
+            decode_only,
         )
         if len(self.state_data["query_lengths"]) > 0:
             self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()
```
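The effect of the new flag is easiest to see in isolation: on a decode-only step every active request contributes exactly one query token, so the metadata can pin the query sequence length to 1. A minimal sketch of the branch added above (the helper name is invented for illustration):

```python
# Invented helper mirroring the decode_only branch in MHAMetadata.update():
# decode-only steps clamp max_seqlen_q to 1, while mixed prefill/decode steps
# fall back to max(2, padded_active_token_count).
def compute_max_seqlen_q(decode_only: bool, padded_active_token_count: int) -> int:
    if decode_only:
        return 1
    return max(2, padded_active_token_count)

assert compute_max_seqlen_q(True, 512) == 1
assert compute_max_seqlen_q(False, 512) == 512
```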
```diff
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -4,7 +4,7 @@ import math
 import warnings
 from contextlib import nullcontext
 from enum import Enum
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -16,7 +16,10 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
-from megatron.core.inference.unified_memory import
+from megatron.core.inference.unified_memory import (
+    UnifiedMemoryUnsupportedError,
+    create_unified_mempool,
+)
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
@@ -46,6 +49,17 @@ try:
 except ImportError:
     HAVE_FLASHINFER = False
 
+try:
+    import wandb  # pylint: disable=unused-import
+
+    HAVE_WANDB = True
+except ImportError:
+    HAVE_WANDB = False
+    wandb = None
+
+if TYPE_CHECKING:
+    import wandb as WandbModule
+
 
 class ContextOverflowError(Exception):
     """Base exception for when a new request does not fit.
```
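The guarded import keeps wandb optional at runtime, while the `TYPE_CHECKING` alias gives type checkers a name to resolve in annotations such as `Optional['WandbModule']` without importing anything when the package is missing. The pattern in isolation, with an invented helper to show how a caller might produce the writer:

```python
from typing import TYPE_CHECKING, Optional

try:
    import wandb  # optional dependency; only needed when metrics are logged

    HAVE_WANDB = True
except ImportError:
    HAVE_WANDB = False
    wandb = None

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime.
    import wandb as WandbModule


def pick_writer(enabled: bool) -> Optional["WandbModule"]:
    """Invented helper: return the wandb module itself, or None."""
    return wandb if (enabled and HAVE_WANDB) else None
```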
```diff
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -223,6 +237,7 @@ class DynamicInferenceContext(BaseInferenceContext):
             levels will be included to control other tensors within the context.
         use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
             If None, defaults to using flash-infer if available.
+        metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
     """
 
     def __init__(
@@ -248,6 +263,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 0,
+        metrics_writer: Optional['WandbModule'] = None,
     ):
         super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
 
@@ -257,6 +273,8 @@ class DynamicInferenceContext(BaseInferenceContext):
             block_size_tokens == 64
         ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
 
+        self.metrics_writer = metrics_writer
+
         # Per partition num heads and hidden size.
         projection_size = kv_channels * num_attention_heads
         if tensor_model_parallel_size is None:
```
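Note that `metrics_writer` is typed as the wandb module itself rather than a `Run` object; the engine diff further down checks `metrics_writer.__name__ == "wandb"` before calling `define_metric()`. A hedged sketch of resolving the writer before constructing the context (the helper is invented):

```python
from types import ModuleType
from typing import Optional


def resolve_metrics_writer(enable_logging: bool) -> Optional[ModuleType]:
    """Invented helper: return the wandb module to pass as metrics_writer.

    Returning the module (rather than a Run) matches the engine-side guard,
    which only configures metrics when the writer's __name__ is "wandb".
    """
    if not enable_logging:
        return None
    try:
        import wandb
    except ImportError:
        return None
    return wandb
```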
```diff
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -323,16 +341,20 @@
         self.params_dtype = params_dtype
         self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
+
+        # Unified memory.
         self.unified_memory_level = unified_memory_level
         if unified_memory_level > 0:
-
-            warnings.warn(
-                "Unified memory requested but not available; defaulting to GPU memory."
-            )
-            self.unified_memory_level = 0
-        else:
+            try:
                 self.unified_memory_mempool = create_unified_mempool()
+            except UnifiedMemoryUnsupportedError:
+                if torch.distributed.get_rank() == 0:
+                    warnings.warn(
+                        "Unified memory requested but not available; defaulting to GPU memory."
+                    )
+                self.unified_memory_level = 0
 
+        # Request and token counts.
         self.total_request_count = 0
         self.active_token_count = 0
         self.paused_request_count = 0
```
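Where the old code warned unconditionally and disabled the feature, the new code actually attempts `create_unified_mempool()` and only falls back to GPU memory on `UnifiedMemoryUnsupportedError`, warning once on rank 0. The same fallback shape in isolation, with the pool factory left abstract and the rank-0 gating omitted; both names below are stand-ins for the real `megatron.core.inference.unified_memory` exports:

```python
import warnings


class UnifiedMemoryUnsupportedError(RuntimeError):
    """Stand-in for the exception exported by unified_memory.py."""


def init_unified_memory(unified_memory_level: int, create_pool):
    """Sketch of the fallback: try the unified-memory pool, else GPU memory.

    `create_pool` stands in for create_unified_mempool(); it is assumed to
    raise UnifiedMemoryUnsupportedError on platforms without managed memory.
    Returns (effective_level, mempool_or_None).
    """
    if unified_memory_level > 0:
        try:
            return unified_memory_level, create_pool()
        except UnifiedMemoryUnsupportedError:
            warnings.warn(
                "Unified memory requested but not available; defaulting to GPU memory."
            )
    return 0, None
```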
@@ -951,6 +973,7 @@ class DynamicInferenceContext(BaseInferenceContext):
|
|
|
951
973
|
padded_active_token_count=self.padded_active_token_count,
|
|
952
974
|
real_batch_size=real_req_batch_size,
|
|
953
975
|
padded_active_request_count=self.padded_active_request_count,
|
|
976
|
+
decode_only=self.is_decode_only(),
|
|
954
977
|
)
|
|
955
978
|
# All attention metadata calculations are now handled by MHAMetadata.update()
|
|
956
979
|
|
|
@@ -1561,3 +1584,67 @@ class DynamicInferenceContext(BaseInferenceContext):
 
         # Convert each log prob tensor into a list
         return [lp.tolist() for lp in selected_log_probs_list]
+
+    def get_kvcache_utilization_stats(self) -> dict:
+        """Compute KV cache buffer utilization stats for the current step.
+
+        Returns a dictionary with counts and percentages for both allocated block
+        usage (overall buffer occupancy) and active usage (blocks referenced by
+        currently active requests this step).
+
+        Returns:
+            {
+                'total_blocks': int,
+                'allocated_blocks': int,
+                'active_unique_blocks': int,
+                'allocated_utilization': float,
+                'active_utilization': float,
+                'active_request_count': int,
+                'paused_request_count': int,
+                'gtd_block_count': int,
+            }
+        """
+        # Total usable blocks exclude the reserved dummy block.
+        total_blocks = max(self.block_allocator.block_count_total - 1, 1)
+        block_count_avail = int(self.block_allocator.block_count_avail)
+
+        # Overall allocated blocks in the buffer right now.
+        allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail
+        allocated_blocks = int(max(0, allocated_blocks))
+
+        # Active unique blocks referenced by current active requests only.
+        active_start = self.paused_request_count
+        active_end = self.total_request_count
+        if active_end > active_start:
+            active_rows = self.request_to_kv_block_ids[active_start:active_end]
+            # Filter valid block ids (>= 0) and count unique ids.
+            valid_ids = active_rows[active_rows >= 0]
+            if valid_ids.numel() > 0:
+                unique_ids = torch.unique(valid_ids)
+                active_unique_blocks = int(unique_ids.numel())
+            else:
+                active_unique_blocks = 0
+        else:
+            active_unique_blocks = 0
+
+        allocated_utilization = float(allocated_blocks) / float(total_blocks)
+        active_utilization = float(active_unique_blocks) / float(total_blocks)
+
+        # Diagnostic helpers.
+        num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count))
+        total_request_count = int(self.total_request_count)
+        return {
+            'total_blocks': int(total_blocks),
+            'allocated_blocks': int(allocated_blocks),
+            'active_unique_blocks': int(active_unique_blocks),
+            'allocated_utilization': allocated_utilization,
+            'active_utilization': active_utilization,
+            'active_request_count': int(self.get_active_request_count()),
+            'paused_request_count': int(self.paused_request_count),
+            'gtd_block_count': int(self.gtd_block_count),
+            'block_count_avail': int(block_count_avail),
+            'num_non_gtd_blocks': int(num_non_gtd_blocks),
+            'active_token_count': int(self.active_token_count),
+            'total_request_count': int(total_request_count),
+            'max_requests': int(self.max_requests),
+        }
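Taken together, 'allocated_utilization' measures how full the KV block buffer is, while 'active_utilization' measures how much of it the current step actually references; the gap between the two is capacity held by paused or not-yet-freed requests. A minimal consumer-side sketch (the `ctx` variable and the 0.9 threshold are illustrative, not part of the diff):

    # Hypothetical monitoring snippet; assumes `ctx` is a DynamicInferenceContext.
    stats = ctx.get_kvcache_utilization_stats()

    # allocated >= active always holds, since active blocks are a subset of
    # allocated blocks; the difference is capacity pinned by inactive requests.
    pinned = stats['allocated_utilization'] - stats['active_utilization']
    if stats['allocated_utilization'] > 0.9:  # illustrative threshold
        print(
            f"KV cache nearly full: {stats['allocated_blocks']}/{stats['total_blocks']} "
            f"blocks allocated ({pinned:.1%} pinned by paused/inactive requests)"
        )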
@@ -57,6 +57,14 @@ try:
 except:
     HAVE_MSGPACK = False
 
+try:
+    import wandb
+
+    HAVE_WANDB = True
+except ImportError:
+    HAVE_WANDB = False
+    wandb = None
+
 
 def format_mem_bytes(mem_bytes):
     """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes."""
@@ -89,6 +97,8 @@ class DynamicInferenceEngine(AbstractEngine):
         static_sampling (bool): If True, all requests are assumed to have the same
             sampling parameters. This avoids needing to loop through all requests and
             their sampling parameters every generation step, improving latency.
+        inference_logging_step_interval (int): The step interval at which to log
+            inference metrics to wandb. Defaults to 0, which means no logging.
     """
 
     def __init__(
@@ -101,6 +111,7 @@ class DynamicInferenceEngine(AbstractEngine):
         track_paused_request_events: bool = False,
         enable_chunked_prefill: bool = True,
         static_sampling: bool = False,
+        inference_logging_step_interval: int = 0,
     ):
 
         if enable_cuda_graph is not None:
@@ -137,6 +148,32 @@ class DynamicInferenceEngine(AbstractEngine):
         self.enable_chunked_prefill = enable_chunked_prefill
         self.static_sampling = static_sampling
 
+        self.inference_logging_step_interval = inference_logging_step_interval
+        # Configure wandb to use a separate step counter for inference metrics (only once).
+        if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
+            logging.info(
+                f"\033[1;93m[INFERENCE]\033[0m "
+                f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m"
+            )
+            if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
+                # Make all inference/* metrics use inference_step as their x-axis.
+                # This allows inference and training to have independent step counters.
+                context.metrics_writer.define_metric(
+                    "inference/*", step_metric="inference/inference_step"
+                )
+                # Initialize the inference step offset by querying existing run history.
+                self.inference_step_offset = 0
+                if wandb.run is not None:
+                    api_run = wandb.Api().run(
+                        f"{wandb.run.entity}/{wandb.run.project}/{wandb.run.id}"
+                    )
+                    max_step = 0
+                    for row in api_run.scan_history(keys=["inference/inference_step"]):
+                        val = row.get("inference/inference_step")
+                        if isinstance(val, (int, float)) and int(val) > max_step:
+                            max_step = int(val)
+                    self.inference_step_offset = int(max_step)
+
         # Initialize the asyncio loop if it has not already been initialized.
         # TODO: Start the engine loop here.
         self._loop = get_asyncio_loop()
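The `define_metric` call above is standard wandb API for giving a metric family its own x-axis, which is what lets inference steps advance independently of training steps. A self-contained sketch of the pattern (project name is a placeholder):

    import wandb

    run = wandb.init(project="example-project")  # placeholder project name

    # Declare the custom step counter, then route all inference/* metrics onto it.
    wandb.define_metric("inference/inference_step")
    wandb.define_metric("inference/*", step_metric="inference/inference_step")

    # Subsequent logs carry their own step value, independent of the global step.
    wandb.log({"inference/inference_step": 1, "inference/step_time_s": 0.12})
    run.finish()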
@@ -780,6 +817,41 @@ class DynamicInferenceEngine(AbstractEngine):
             self.request_completion_futures[failed_request_id].set_result(failed_request)
         self.failed_request_ids.clear()
 
+        # Log KV cache utilization stats to W&B.
+        if (
+            self.inference_logging_step_interval > 0
+            and self.step_count > 0
+            and self.step_count % self.inference_logging_step_interval == 0
+            and self.context.metrics_writer is not None
+        ):
+
+            # Get KV cache utilization stats from the dynamic context.
+            kv_stats = self.context.get_kvcache_utilization_stats()
+
+            # Prepare a metrics dictionary with all stats, using the 'inference/'
+            # prefix on every metric to separate it from training metrics.
+            metrics = {
+                'inference/inference_step': int(self.inference_step_offset + int(self.step_count)),
+                'inference/step_time_s': float(step_time),
+                'inference/waiting_queue_len': int(len(self.waiting_request_ids)),
+                'inference/total_requests_dict_size': int(len(self.requests)),
+            }
+            # Add KV stats with the inference/ prefix, converting utilization
+            # metrics from the 0-1 range to 0-100 percentages for better visualization.
+            for key, value in kv_stats.items():
+                if 'utilization' in key:
+                    # Convert to a percentage (0-100) and group under kvcache_utilization.
+                    metrics[f'inference/{key}'] = float(value * 100.0)
+                else:
+                    metrics[f'inference/{key}'] = value
+
+            if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
+                self.context.metrics_writer.log(metrics, commit=True)
+            else:
+                raise ValueError(
+                    f"Unsupported metrics writer type: {type(self.context.metrics_writer)}"
+                )
+
         # Print context state.
         if verbose:
             context = self.context
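The prefixing and 0-1 to 0-100 rescaling above is a pure transformation of the stats dict; factored out for clarity (the helper name is illustrative, not in the diff):

    def to_wandb_metrics(kv_stats: dict, prefix: str = "inference/") -> dict:
        """Prefix keys and rescale 0-1 utilization ratios to 0-100 percentages."""
        metrics = {}
        for key, value in kv_stats.items():
            if 'utilization' in key:
                metrics[f'{prefix}{key}'] = float(value * 100.0)
            else:
                metrics[f'{prefix}{key}'] = value
        return metrics

    # Example: {'allocated_utilization': 0.25, 'total_blocks': 1024}
    # maps to  {'inference/allocated_utilization': 25.0, 'inference/total_blocks': 1024}.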
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import os
+import warnings
+from enum import Enum, auto
+from pathlib import Path
+
+from torch.cuda.memory import CUDAPluggableAllocator
+from torch.utils.cpp_extension import CUDA_HOME, load_inline
+
+from megatron.core.utils import is_torch_min_version
+
+try:
+    if is_torch_min_version("2.8.0"):
+        from torch.cuda.memory import MemPool
+    else:
+        from torch.cuda import MemPool
+    _has_mem_pool = True
+except ImportError:
+    _has_mem_pool = False
+
+
+class CompilationState(Enum):
+    """Enum to distinguish between unified memory (UVM) compilation states."""
+
+    UNATTEMPTED = auto()  # Compilation has not been attempted.
+    FAILURE = auto()  # Compilation attempted, but failed.
+    SUCCESS = auto()  # Compilation attempted, and succeeded.
+
+
+# Compilation vars.
+_compilation_state = CompilationState.UNATTEMPTED
+_alloc = None  # must remain global until process exit.
+_mod = None  # must remain global until process exit.
+
+
+class UnifiedMemoryUnsupportedError(Exception):
+    """Unified memory is not supported on this system."""
+
+    pass
+
+
+def compile_allocator():
+    """Attempt to compile the UVM allocator."""
+
+    global _compilation_state, _alloc, _mod
+
+    if _compilation_state != CompilationState.UNATTEMPTED:
+        return
+
+    _mempool_c_src = r"""
+    #include <cuda_runtime_api.h>
+    #include <cstddef>
+
+    #define EXPORT extern "C"
+
+    EXPORT void* managed_malloc(size_t size, int device, void* stream) {
+        (void)stream;
+        int cur = -1;
+        cudaGetDevice(&cur);
+        if (device != cur && device >= 0) cudaSetDevice(device);
+
+        // cudaMallocManaged allows for more memory to be allocated than the device memory size.
+        // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
+        void* ptr = nullptr;
+        cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
+        if (err != cudaSuccess) return nullptr;
+
+        if (device >= 0) {
+            // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
+            // This is a hint that tries to prevent data from being migrated away from the device.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+            // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
+            // Even if the memory has to be migrated away from the device, it still does not page fault.
+            // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
+            // but there is no harm in adding this flag as well for future-proofing.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+        }
+        return ptr;
+    }
+
+    EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
+        // Memory allocated with cudaMallocManaged should be released with cudaFree.
+        (void)size; (void)device; (void)stream;
+        if (ptr) cudaFree(ptr);
+    }
+    """
+
+    # Build the .so upon import; this avoids issues.
+    if _has_mem_pool:
+        _extra_ldflags = ["-lcudart"]
+        if CUDA_HOME:
+            _cuda_lib = os.path.join(CUDA_HOME, "lib64")
+            if os.path.isdir(_cuda_lib):
+                _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
+        try:
+            _mod = load_inline(
+                name="managed_alloc_runtime",
+                cpp_sources=[_mempool_c_src],
+                functions=[],
+                with_cuda=True,
+                extra_ldflags=_extra_ldflags,
+                verbose=False,
+            )
+            _so_path = Path(_mod.__file__).as_posix()
+            _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
+            _compilation_state = CompilationState.SUCCESS
+        except (RuntimeError, ImportError, OSError):
+            warnings.warn("Failed to create unified memory mempool.")
+            _compilation_state = CompilationState.FAILURE
+
+
+def create_unified_mempool() -> "MemPool":
+    """Create a unified memory mempool using CUDA managed memory.
+
+    Returns:
+        (MemPool) Unified memory mempool.
+    """
+
+    # Attempt to compile allocator.
+    compile_allocator()
+
+    # Return mempool.
+    if _compilation_state != CompilationState.SUCCESS:
+        raise UnifiedMemoryUnsupportedError()
+    else:
+        return MemPool(allocator=_alloc)
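A minimal usage sketch for the new module, assuming a CUDA build where the inline compilation succeeds; `torch.cuda.use_mem_pool` is the stock PyTorch context manager (recent releases) for routing allocations into a pool, and the buffer sizes are arbitrary:

    import torch

    from megatron.core.inference.unified_memory import (
        UnifiedMemoryUnsupportedError,
        create_unified_mempool,
    )

    try:
        pool = create_unified_mempool()
        # Allocations inside this context come from cudaMallocManaged, so they
        # may exceed device memory and migrate between host and device.
        with torch.cuda.use_mem_pool(pool):
            kv_buffer = torch.empty(2**30, dtype=torch.uint8, device="cuda")
    except UnifiedMemoryUnsupportedError:
        # Fall back to ordinary device memory.
        kv_buffer = torch.empty(2**30, dtype=torch.uint8, device="cuda")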
@@ -375,14 +375,13 @@ class GPTModel(LanguageModule):
                 )
                 or self.config.flash_decode
             )
-            and rotary_pos_cos is not None
             and inference_context.is_static_batching()
         ):
             current_batch_size = input_ids.shape[0]
             sequence_len_offset = torch.tensor(
                 [inference_context.sequence_len_offset] * current_batch_size,
                 dtype=torch.int32,
-                device=
+                device=torch.cuda.current_device(),
             )
         else:
            sequence_len_offset = None
@@ -48,15 +48,26 @@ except ImportError:
     rearrange = None
 
 try:
-    from
-    from
+    from flash_attn_3.flash_attn_interface import _flash_attn_forward
+    from flash_attn_3.flash_attn_interface import (
         flash_attn_with_kvcache as flash_attn3_with_kvcache,
     )
 
     HAVE_FA3 = True
-except:
+except ImportError as e:
     HAVE_FA3 = False
 
+if not HAVE_FA3:
+    try:
+        from flashattn_hopper.flash_attn_interface import _flash_attn_forward
+        from flashattn_hopper.flash_attn_interface import (
+            flash_attn_with_kvcache as flash_attn3_with_kvcache,
+        )
+
+        HAVE_FA3 = True
+    except ImportError as e:
+        pass
+
 try:
     from flash_mla import flash_mla_with_kvcache, get_mla_metadata
 
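The chained try/except above implements a preference-ordered import: `flash_attn_3` first, then the older `flashattn_hopper` package layout. The same pattern, generalized into a small helper (the helper is illustrative, not part of the diff):

    import importlib

    def import_first_available(module_names):
        """Return the first module in `module_names` that imports cleanly, else None."""
        for name in module_names:
            try:
                return importlib.import_module(name)
            except ImportError:
                continue
        return None

    fa3_interface = import_first_available(
        ["flash_attn_3.flash_attn_interface", "flashattn_hopper.flash_attn_interface"]
    )
    HAVE_FA3 = fa3_interface is not None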
@@ -1182,7 +1182,11 @@ class CudaGraphManager(torch.nn.Module):
 
         if runner is None:
             if _CudagraphGlobalRecord.cudagraph_created:
-                assert False
+                assert False, (
+                    f"`cudagraph_created` is set to True but no matching cudagraph "
+                    f"runners were found. This module has {len(self.cudagraph_runners)} "
+                    f"existing runners. Use `get_mismatch_errors` to debug mismatches."
+                )
             else:
                 runner = _CudaGraphRunner(
                     megatron_module,
@@ -66,6 +66,8 @@ class Router(ABC, MegatronModule):
         """Reset the router parameters."""
         if self.config.perform_initialization:
             self.config.init_method(self.weight)
+            if self.bias is not None:
+                self.config.init_method(self.bias)
         self.weight.data = self.weight.data.to(dtype=self.config.params_dtype)
         setattr(self.weight, 'sequence_parallel', self.config.sequence_parallel)
         if self.bias is not None:
@@ -15,8 +15,11 @@ logger = logging.getLogger(__name__)
 class PipelineParallelLayerLayout:
     """Configuration of custom pipeline parallel layer partitioning."""
 
-    def __repr__(self):
-
+    def __repr__(self) -> str:
+        if isinstance(self.input_data, str):
+            return self.input_data
+        else:
+            return str(self.input_data)
 
     def __init__(self, layout: str | list, pipeline_model_parallel_size: int):
         """Initialize PipelineParallelLayerLayout from a list or a str.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev129362
+Version: 0.16.0rc0.dev129924
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>