megatron-core 0.16.0rc0.dev111286__tar.gz → 0.16.0rc0.dev111655__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/dynamic_context.py +133 -69
- megatron_core-0.16.0rc0.dev111655/megatron/core/inference/unified_memory.py +127 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/SOURCES.txt +0 -2
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -72
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -210
- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/unified_memory.py +0 -89
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/dynamic_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron/core/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/pyproject.toml +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev111655}/setup.py +0 -0
--- megatron_core-0.16.0rc0.dev111286/PKG-INFO
+++ megatron_core-0.16.0rc0.dev111655/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev111655
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
--- megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/dynamic_context.py
+++ megatron_core-0.16.0rc0.dev111655/megatron/core/inference/contexts/dynamic_context.py
@@ -16,14 +16,16 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
-from megatron.core.inference.unified_memory import create_unified_mempool
+from megatron.core.inference.unified_memory import (
+    UnifiedMemoryUnsupportedError,
+    create_unified_mempool,
+)
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
 
-from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
 
@@ -323,16 +325,20 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.params_dtype = params_dtype
         self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
+
+        # Unified memory.
         self.unified_memory_level = unified_memory_level
         if unified_memory_level > 0:
-
-                warnings.warn(
-                    "Unified memory requested but not available; defaulting to GPU memory."
-                )
-                self.unified_memory_level = 0
-            else:
+            try:
                 self.unified_memory_mempool = create_unified_mempool()
+            except UnifiedMemoryUnsupportedError:
+                if torch.distributed.get_rank() == 0:
+                    warnings.warn(
+                        "Unified memory requested but not available; defaulting to GPU memory."
+                    )
+                self.unified_memory_level = 0
 
+        # Request and token counts.
         self.total_request_count = 0
         self.active_token_count = 0
         self.paused_request_count = 0
@@ -448,26 +454,30 @@ class DynamicInferenceContext(BaseInferenceContext):
             num_cuda_graphs is not None
         )
 
-        #
+        # `*_cudagraph_only` tensors are for use with cuda graphs to maintain
+        # consistent input shapes, which is required to use cuda graphs.
+        # During these steps, the `*_cudagraph_only`
+        # tensors are used, otherwise their same-name but un-suffixed
+        # corresponding tensors are used.
 
-        self.
-
-
-
-
-
-
-
-
-
+        self.query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
         )
 
-        self.
-
-
-
-
-            max_seqlen=self.max_sequence_length,
+        self.request_to_kv_block_ids_cudagraph_only = torch.full(
+            (self.max_requests, self.max_kv_block_count),
+            0,
+            dtype=torch.int,
+            device=torch.cuda.current_device(),
         )
 
         # Guaranteed active requests.
@@ -617,18 +627,11 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def cu_query_lengths(self) -> Tuple[Tensor, int]:
         """Cumulative query sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"],
-        )
+        return self.cu_query_seq_lengths, self.max_seqlen_q
 
-    def cu_kv_lengths(self) ->
+    def cu_kv_lengths(self) -> Tensor:
         """Cumulative key/value sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_k"],
-        )
+        return (self.cu_kv_seq_lengths, self.kv_seq_lengths, self.max_seqlen_k)
 
     def get_active_sequence_lengths(self) -> Tensor:
         """Total sequence length (query + key) for active requests."""
@@ -706,16 +709,12 @@ class DynamicInferenceContext(BaseInferenceContext):
         to blocks within the block-level memory buffer.
         """
         if self.cache_mla_latent:
-            return (
-                self.memory_buffer[layer_number - 1],
-                None,
-                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
-            )
+            return (self.memory_buffer[layer_number - 1], None, self.block_table)
         else:
             return (
                 self.memory_buffer[0, layer_number - 1],
                 self.memory_buffer[1, layer_number - 1],
-                self.
+                self.block_table,
             )
 
     def apply_fused_qk_rotary_emb(
@@ -825,12 +824,17 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def reset_attention_state(self) -> None:
         """Reset state used within attention, after each step."""
-
-
-
-
-
-        self.
+        self.max_seqlen_q = None
+        self.max_seqlen_k = None
+        self.cu_query_seq_lengths = None
+        self.cu_query_seq_lengths_cudagraph_only.fill_(0)
+        self.query_seq_lengths_cudagraph_only.fill_(0)
+        self.cu_kv_seq_lengths = None
+        self.cu_kv_seq_lengths_cudagraph_only.fill_(0)
+        self.kv_seq_lengths = None
+        self.kv_seq_lengths_cudagraph_only.fill_(0)
+        self.request_to_kv_block_ids_cudagraph_only.fill_(0)
+        self.block_table = None
 
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""

@@ -930,29 +934,89 @@ class DynamicInferenceContext(BaseInferenceContext):
             self.active_token_count : self.padded_active_token_count
         ] = 0
 
-        real_req_batch_size = (
-            self.total_request_count - self.paused_request_count
-        )  # how many requests are indeed active
-        self.active_attn_metadata = (
-            self.graph_attn_metadata
-            if self.using_cuda_graph_this_step()
-            else self.non_graph_attn_metadata
-        )
-
         # Update cu_query_seq_lengths, max_seqlen_q.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        query_lengths = self.request_query_lengths[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.query_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = query_lengths
+            if self.is_decode_only():
+                self.cu_query_seq_lengths = None  # ensure no accidental use
+                self.max_seqlen_q = 1
+            else:
+                self.cu_query_seq_lengths_cudagraph_only[
+                    1 : self.padded_active_request_count + 1
+                ] = torch.cumsum(
+                    self.query_seq_lengths_cudagraph_only[: self.padded_active_request_count], dim=0
+                )
+
+                # The following will be passed to the FA kernel.
+                self.cu_query_seq_lengths = self.cu_query_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+                self.max_seqlen_q = self.padded_active_token_count
+        else:
+            cu_query_lengths = torch.cumsum(query_lengths, dim=0)
+            self.cu_query_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_query_seq_lengths[1:] = cu_query_lengths
+            self.max_seqlen_q = query_lengths.max().item()
+
+        kv_seq_lengths = self.request_kv_length_offsets + self.request_query_lengths
+        self.kv_seq_lengths = kv_seq_lengths[self.paused_request_count : self.total_request_count]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            # Re-assign `kv_seq_lengths` to be a view of the first
+            # `padded_active_request_count` entries of `kv_seq_lengths_cudagraph_only`,
+            # such that `kv_seq_lengths` has a static memory address and is therefore
+            # cuda graph compatible. This allows `kv_seq_lengths` to transition between
+            # cuda graph sizes, which makes multi-batch-size cuda graphs possible.
+            self.kv_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = self.kv_seq_lengths
+            self.kv_seq_lengths = self.kv_seq_lengths_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+            self.max_seqlen_k = self.max_sequence_length
+            if self.is_decode_only():
+                self.cu_kv_seq_lengths = None  # ensure no accidental use
+            else:
+                cu_kv_lengths = torch.cumsum(self.kv_seq_lengths, dim=0)
+                # The following will be passed to the FA kernel.
+                self.cu_kv_seq_lengths_cudagraph_only[1 : cu_kv_lengths.size(0) + 1] = cu_kv_lengths
+                self.cu_kv_seq_lengths = self.cu_kv_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+        else:
+            self.cu_kv_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_kv_seq_lengths[1:] = torch.cumsum(self.kv_seq_lengths, dim=0)
+            self.max_seqlen_k = self.kv_seq_lengths.max().item()
+
+        # Update KV block IDs, block table.
+        request_to_kv_block_ids = self.request_to_kv_block_ids[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.request_to_kv_block_ids_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = request_to_kv_block_ids
+            self.block_table = self.request_to_kv_block_ids_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+        else:
+            self.block_table = self.request_to_kv_block_ids[
+                self.paused_request_count : self.total_request_count
+            ]
 
     def reset(self) -> None:
         """Reset entire context.
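
The hunk above drops the separate attention-metadata container and instead writes the query/KV cumulative lengths and the block table directly into preallocated `*_cudagraph_only` buffers, returning fixed-address views whenever CUDA graphs (or decode-only mode) are in use. A minimal standalone sketch of that buffer-reuse pattern follows; the names (`MAX_REQUESTS`, `build_cu_seqlens`) and sizes are illustrative assumptions, not part of the megatron-core API.

    import torch

    # Static buffer: its storage never moves, so a CUDA-graph-captured kernel can
    # keep reading from the same address while the contents change step to step.
    MAX_REQUESTS = 8
    _cu_seqlens_buf = torch.zeros(MAX_REQUESTS + 1, dtype=torch.int32)

    def build_cu_seqlens(query_lengths: torch.Tensor, padded_count: int) -> torch.Tensor:
        """Write the prefix sums of per-request query lengths into the static
        buffer and return a view sized to the padded request count."""
        n = query_lengths.numel()
        _cu_seqlens_buf[1 : n + 1] = torch.cumsum(query_lengths, dim=0)
        # Padding slots repeat the last prefix sum, so padded requests add no tokens.
        _cu_seqlens_buf[n + 1 : padded_count + 1] = _cu_seqlens_buf[n]
        return _cu_seqlens_buf[: padded_count + 1]

    lengths = torch.tensor([3, 1, 4], dtype=torch.int32)
    print(build_cu_seqlens(lengths, padded_count=4))  # tensor([0, 3, 4, 8, 8], dtype=torch.int32)

The eager (non-graph) branch in the diff allocates a fresh `cu_query_seq_lengths` tensor instead, since a stable address is only needed when the step is captured or replayed.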

@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import os
+import warnings
+from enum import Enum, auto
+from pathlib import Path
+
+from torch.cuda.memory import CUDAPluggableAllocator
+from torch.utils.cpp_extension import CUDA_HOME, load_inline
+
+from megatron.core.utils import is_torch_min_version
+
+try:
+    if is_torch_min_version("2.8.0"):
+        from torch.cuda.memory import MemPool
+    else:
+        from torch.cuda import MemPool
+    _has_mem_pool = True
+except ImportError:
+    _has_mem_pool = False
+
+
+class CompilationState(Enum):
+    """Enum to distinguish between unified memory (UVM) compilation states."""
+
+    UNATTEMPTED = auto()  # Compilation has not been attempted.
+    FAILURE = auto()  # Compilation attempted, but failed.
+    SUCCESS = auto()  # Compilation attempted, and succeeded.
+
+
+# Compilation vars.
+_compilation_state = CompilationState.UNATTEMPTED
+_alloc = None  # must remain global until process exit.
+_mod = None  # must remain global until process exit.
+
+
+class UnifiedMemoryUnsupportedError(Exception):
+    """Unified memory is not supported on this system."""
+
+    pass
+
+
+def compile_allocator():
+    """Attempt to compile the UVM allocator."""
+
+    global _compilation_state, _alloc, _mod
+
+    if _compilation_state != CompilationState.UNATTEMPTED:
+        return
+
+    _mempool_c_src = r"""
+    #include <cuda_runtime_api.h>
+    #include <cstddef>
+
+    #define EXPORT extern "C"
+
+    EXPORT void* managed_malloc(size_t size, int device, void* stream) {
+        (void)stream;
+        int cur = -1;
+        cudaGetDevice(&cur);
+        if (device != cur && device >= 0) cudaSetDevice(device);
+
+        // cudaMallocManaged allows for more memory to be allocated than the device memory size.
+        // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
+        void* ptr = nullptr;
+        cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
+        if (err != cudaSuccess) return nullptr;
+
+        if (device >= 0) {
+            // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
+            // This is a hint that tries to prevent data from being migrated away from the device.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+            // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
+            // Even if the memory has to be migrated away from the device, it still does not page fault.
+            // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
+            // but there is no harm in adding this flag as well for future-proofing.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+        }
+        return ptr;
+    }
+
+    EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
+        // Memory allocated with cudaMallocManaged should be released with cudaFree.
+        (void)size; (void)device; (void)stream;
+        if (ptr) cudaFree(ptr);
+    }
+    """
+
+    # Build the .so once; subsequent calls reuse the cached compilation state.
+    if _has_mem_pool:
+        _extra_ldflags = ["-lcudart"]
+        if CUDA_HOME:
+            _cuda_lib = os.path.join(CUDA_HOME, "lib64")
+            if os.path.isdir(_cuda_lib):
+                _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
+        try:
+            _mod = load_inline(
+                name="managed_alloc_runtime",
+                cpp_sources=[_mempool_c_src],
+                functions=[],
+                with_cuda=True,
+                extra_ldflags=_extra_ldflags,
+                verbose=False,
+            )
+            _so_path = Path(_mod.__file__).as_posix()
+            _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
+            _compilation_state = CompilationState.SUCCESS
+        except (RuntimeError, ImportError, OSError):
+            warnings.warn("Failed to create unified memory mempool.")
+            _compilation_state = CompilationState.FAILURE
+
+
+def create_unified_mempool() -> MemPool:
+    """Create a unified memory mempool using CUDA managed memory.
+
+    Returns:
+        (MemPool) Unified memory mempool.
+    """
+
+    # Attempt to compile allocator.
+    compile_allocator()
+
+    # Return mempool.
+    if _compilation_state != CompilationState.SUCCESS:
+        raise UnifiedMemoryUnsupportedError()
+    else:
+        return MemPool(allocator=_alloc)
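
The new `@@ -0,0 +1,127 @@` hunk adds a pluggable allocator backed by cudaMallocManaged, so tensors can be placed in unified (UVM) memory that may exceed device capacity. A possible way to consume it is sketched below; it assumes a PyTorch build that exposes the MemPool API and the `torch.cuda.use_mem_pool` context manager, and the tensor shown is purely illustrative.

    import torch

    from megatron.core.inference.unified_memory import (
        UnifiedMemoryUnsupportedError,
        create_unified_mempool,
    )

    try:
        # Compiles the managed-memory allocator on first use and wraps it in a MemPool.
        pool = create_unified_mempool()
    except UnifiedMemoryUnsupportedError:
        pool = None  # e.g. MemPool unavailable or the inline extension failed to build

    if pool is not None:
        # Allocations made inside this context are served by cudaMallocManaged and can
        # be migrated between host and device by the CUDA driver on demand.
        with torch.cuda.use_mem_pool(pool):
            kv_cache = torch.empty(1 << 20, dtype=torch.bfloat16, device="cuda")

Because `_alloc` and `_mod` are module-level globals, the compiled allocator outlives any pools created from it, which is why the module keeps them alive until process exit.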

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev111655
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>

@@ -169,8 +169,6 @@ megatron/core/inference/contexts/dynamic_block_allocator.py
 megatron/core/inference/contexts/dynamic_context.py
 megatron/core/inference/contexts/fused_kv_append_kernel.py
 megatron/core/inference/contexts/static_context.py
-megatron/core/inference/contexts/attention_context/metadata_base.py
-megatron/core/inference/contexts/attention_context/mha_metadata.py
 megatron/core/inference/engines/__init__.py
 megatron/core/inference/engines/abstract_engine.py
 megatron/core/inference/engines/dynamic_engine.py

@@ -1,72 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
-
-class MetadataBase:
-    """
-    Base class for attention metadata.
-    High-performance attention kernels often require input metadata in specific
-    formats—such as cumulative query lengths, cumulative key/value lengths,
-    and similar structures. Moreover, when using CUDA Graphs, these metadata
-    buffers must be statically allocated. This class serves as a unified container
-    that manages all such metadata in one place.
-    """
-
-    def __init__(self):
-        """
-        Initialize the metadata.
-        """
-        self.state_data = {}
-
-    def update(self, *args, **kwargs):
-        """
-        Construct the metadata from request states.
-        """
-        pass
-
-    def reset(self):
-        """
-        Reset the metadata.
-        """
-        pass
-
-    def tensor_copy_and_pad(
-        self,
-        tensor_buf,
-        unpadded_tensor,
-        real_batch_size,
-        padded_batch_size,
-        is_cumulative_tensor=False,
-        pad_value=0,
-    ):
-        """
-        Copy the unpadded tensor to the tensor_buf,
-        pad the tensor_buf with zero or the last value of the tensor,
-        depending on whether the tensor is cumulative.
-        Args:
-            tensor_buf: The destination tensor, at least padded_batch_size long.
-            unpadded_tensor: The tensor to copy, at least real_batch_size long.
-            real_batch_size: The real batch size.
-            padded_batch_size: Padded boundary of the tensor.
-            is_cumulative_tensor: Whether the tensor is cumulative.
-                If True, we pad the tensor_buf with the last value of the unpadded_tensor.
-            pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
-        """
-        assert real_batch_size <= padded_batch_size
-        assert tensor_buf.shape[0] >= padded_batch_size
-        assert unpadded_tensor.shape[0] >= real_batch_size
-        if is_cumulative_tensor:
-            if real_batch_size == 0:
-                value = pad_value
-            else:
-                value = unpadded_tensor[real_batch_size - 1]
-        else:
-            value = pad_value
-        tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
-        tensor_buf[real_batch_size:padded_batch_size] = value
-        return tensor_buf
-
-    def __str__(self):
-        """
-        Return a string representation of the metadata.
-        """
-        return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])