megatron-core 0.16.0rc0.dev127802__tar.gz → 0.16.0rc0.dev128858__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/async_stream.py +2 -8
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_context.py +32 -188
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/fused_kv_append_kernel.py +2 -2
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -7
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/dynamic_engine.py +13 -27
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/static_engine.py +7 -3
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_client.py +1 -3
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +2 -4
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/utils.py +0 -28
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_model.py +3 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_model.py +1 -30
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_block.py +25 -16
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +2 -29
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_layer.py +5 -5
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_mixer.py +57 -301
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/utils.py +1 -143
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/PKG-INFO +1 -1
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/SOURCES.txt +0 -1
- megatron_core-0.16.0rc0.dev127802/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -106
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/unified_memory.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/pyproject.toml +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.py +0 -0
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev127802
+Version: 0.16.0rc0.dev128858
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
```
megatron/core/inference/async_stream.py:

```diff
@@ -9,7 +9,6 @@ import asyncio
 from typing import Any, AsyncGenerator, Callable, Optional, Type, Union
 
 from megatron.core.inference.inference_request import InferenceRequest
-from megatron.core.utils import get_asyncio_loop
 
 STOP_ITERATION = Exception()
 
```
```diff
@@ -21,17 +20,12 @@ class AsyncStream:
     Adopted from https://github.com/vllm-project/vllm/blob/eb881ed006ca458b052905e33f0d16dbb428063a/vllm/v1/engine/async_stream.py # pylint: disable=line-too-long
     """
 
-    def __init__(
-        self,
-        request_id: int,
-        cancel: Callable[[str], None],
-        loop: Optional[asyncio.AbstractEventLoop] = None,
-    ) -> None:
+    def __init__(self, request_id: int, cancel: Callable[[str], None]) -> None:
         self._request_id = request_id
         self._cancel = cancel
         self._queue: asyncio.Queue = asyncio.Queue()
         self._finished = False
-        self._loop = …
+        self._loop = asyncio.get_running_loop()
 
     def put(self, item: Union[InferenceRequest, Exception]) -> None:
         """Adds a new value to the stream"""
```
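The `AsyncStream` change above removes the optional `loop` argument and instead captures the running event loop at construction time. A minimal sketch (not code from the package; `StreamSketch` is a hypothetical stand-in) of why `asyncio.get_running_loop()` requires the object to be created from inside a running loop:

```python
import asyncio


class StreamSketch:
    """Hypothetical stand-in illustrating the loop-capture pattern above."""

    def __init__(self, request_id: int) -> None:
        self.request_id = request_id
        # get_running_loop() raises RuntimeError when no loop is running,
        # so instances must be created inside a coroutine or loop callback.
        self._loop = asyncio.get_running_loop()


async def main() -> None:
    stream = StreamSketch(request_id=0)  # fine: a loop is running here
    print(stream._loop.is_running())     # True


asyncio.run(main())
# Constructing StreamSketch(...) at module level, outside any running loop,
# would raise RuntimeError instead of silently deferring the loop lookup.
```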
megatron/core/inference/contexts/dynamic_context.py:

```diff
@@ -23,14 +23,9 @@ from megatron.core.inference.unified_memory import (
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
-from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-    Symbols,
-    get_layer_maps_from_layer_type_list,
-)
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
 
-from .attention_context.mamba_metadata import MambaMetadata
 from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
```
```diff
@@ -232,17 +227,8 @@ class DynamicInferenceContext(BaseInferenceContext):
             where the cuda graph batch sizes range from 1 to `max_requests` (as
             computed below). Due to rounding, the actual number of cuda graphs may
             not equal this argument.
-        materialize_only_last_token_logits (…
-            …
-            if returning log probs.
-        layer_type_list (Optional[List[str]]): A list of strings that indicates
-            the layer type (Mamba / Attention / MLP) for each layer.
-            See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list
-            of symbols. This must be provided for hybrid models.
-        mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request.
-            This must be provided for hybrid models.
-        mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request.
-            This must be provided for hybrid models.
+        materialize_only_last_token_logits (bool): If True, only the last token logits
+            are materialized in the context.
         use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode
             engine steps.
         unified_memory_level (Optional[int]): Set unified memory usage within the
```
```diff
@@ -273,10 +259,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         kv_lora_rank: Optional[int] = None,
         qk_pos_emb_head_dim: Optional[int] = None,
         num_cuda_graphs: Optional[int] = None,
-        materialize_only_last_token_logits: …
-        layer_type_list: Optional[List[str]] = None,
-        mamba_conv_states_shape: Optional[Tuple[int]] = None,
-        mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+        materialize_only_last_token_logits: bool = True,
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 0,
```
```diff
@@ -300,41 +283,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tp_size = tensor_model_parallel_size
         hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
         num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
-
-        # Mamba states.
-        self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list
-        if self.is_hybrid_model:
-            assert (
-                mamba_conv_states_shape is not None
-            ), "`mamba_conv_states_shape` must be specified for hybrid models"
-            assert (
-                mamba_ssm_states_shape is not None
-            ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-            assert (
-                not use_cuda_graphs_for_non_decode_steps
-            ), "Non-decode CUDA graphs not yet supported for hybrid models"
-
-            # For hybrid models, the layer map converts the global layer index to the
-            # corresponding attention layer index or Mamba layer index depending on the
-            # layer type.
-            attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list(
-                layer_type_list
-            )
-            self.num_attention_layers = len(attention_layer_map)
-            self.num_mamba_layers = len(mamba_layer_map)
-            self.layer_map = attention_layer_map | mamba_layer_map
-        else:
-            # The layer map is the identity function for pure Transformer models.
-            self.num_attention_layers = num_layers
-            self.num_mamba_layers = 0
-            (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None)
-            self.layer_map = {i: i for i in range(self.num_attention_layers)}
-
-        if self.num_attention_layers == 0:
-            raise NotImplementedError(
-                f"Using `DynamicInferenceContext` with no attention is not supported."
-            )
-
         # Block size tokens, bytes.
         dtype_size_bytes = params_dtype.itemsize
         self.block_size_tokens = block_size_tokens
```
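The block removed above maintained a layer map translating a global layer index into a per-type (attention or Mamba) index for hybrid models. As a rough, hedged illustration of that mapping idea only — the helper name and the 'M'/'*' symbols below are assumptions, not the package's `get_layer_maps_from_layer_type_list`:

```python
from typing import Dict, List, Tuple


def build_layer_maps(layer_type_list: List[str]) -> Tuple[Dict[int, int], Dict[int, int]]:
    """Map global layer index -> per-type layer index.

    Assumes 'M' marks Mamba layers and '*' marks attention layers
    (hypothetical symbols used here purely for illustration).
    """
    attention_map: Dict[int, int] = {}
    mamba_map: Dict[int, int] = {}
    for global_idx, layer_type in enumerate(layer_type_list):
        if layer_type == "M":
            mamba_map[global_idx] = len(mamba_map)
        elif layer_type == "*":
            attention_map[global_idx] = len(attention_map)
    return attention_map, mamba_map


# Example: a 5-layer hybrid stack M * M M *
attn_map, mamba_map = build_layer_maps(["M", "*", "M", "M", "*"])
# attn_map  == {1: 0, 4: 1}
# mamba_map == {0: 0, 2: 1, 3: 2}
```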
```diff
@@ -349,38 +297,24 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.block_size_bytes = (
             dtype_size_bytes
             * 2  # key, value
-            * …
+            * num_layers
             * self.block_size_tokens
             * num_attention_heads_per_partition
             * hidden_size_per_attention_head
         )
-        assert self.block_size_bytes > 0
 
         # Adjust buffer to be a multiple of block size.
         buffer_size_bytes = int(buffer_size_gb * 1024**3)
         buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes
         buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem
 
-
-        if self.is_hybrid_model:
-            mamba_states_memory_per_request += math.prod(mamba_conv_states_shape)
-            mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape)
-            mamba_states_memory_per_request *= self.num_mamba_layers
-            mamba_states_memory_per_request *= dtype_size_bytes
-
-        # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size.
+        # Compute max_requets, max_tokens from buffer size and overflow factor.
         def bytes_to_max_requests_and_tokens(n_bytes):
-            …
-            …
-            …
+            n_tokens = n_bytes / self.block_size_bytes * self.block_size_tokens
+            n_requests = n_tokens / max_sequence_length
+            return self.round_up_requests(int(n_requests), tp_size=tp_size), self.round_up_tokens(
+                int(n_tokens), tp_size=tp_size
             )
-            # TODO(ksanthanam): Leave room for an extra request in the event of padding
-            # for non-decode CUDA graphs
-            n_requests = n_bytes / cost_per_request_bytes
-            n_tokens = n_requests * max_sequence_length
-            n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size)
-            n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size)
-            return n_requests, n_tokens
 
         self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes)
         if buffer_overflow_factor is not None:
```
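The new sizing path above derives capacity purely from the KV block size. A worked example of the arithmetic with made-up model dimensions (and without the `round_up_requests` / `round_up_tokens` helpers), to make the numbers concrete:

```python
# Hypothetical dimensions, chosen only to make the arithmetic concrete.
dtype_size_bytes = 2          # bf16
num_layers = 32
block_size_tokens = 256
heads_per_partition = 8       # attention heads on this TP rank
head_dim = 128
max_sequence_length = 4096
buffer_size_gb = 20.0

block_size_bytes = (
    dtype_size_bytes
    * 2                        # key and value
    * num_layers
    * block_size_tokens
    * heads_per_partition
    * head_dim
)                              # = 2 * 2 * 32 * 256 * 8 * 128 = 33,554,432 bytes per block

buffer_size_bytes = int(buffer_size_gb * 1024**3)
buffer_size_bytes -= buffer_size_bytes % block_size_bytes   # trim to whole blocks (640 here)

# Same shape as bytes_to_max_requests_and_tokens, minus the rounding helpers.
n_tokens = buffer_size_bytes / block_size_bytes * block_size_tokens
n_requests = n_tokens / max_sequence_length
print(int(n_tokens), int(n_requests))   # 163840 tokens, 40 requests
```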
```diff
@@ -405,6 +339,7 @@ class DynamicInferenceContext(BaseInferenceContext):
 
         # Initialize context state.
         self.params_dtype = params_dtype
+        self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
 
         # Unified memory.
```
```diff
@@ -455,11 +390,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids)
         self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids)
 
-        # Calculate the total number of …
-        …
-        block_count_total = (
-            max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes
-        )
+        # Calculate the total number of blocks available in the buffer
+        block_count_total = buffer_size_bytes // self.block_size_bytes
 
         # Memory buffer.
         ctx_manager = (
```
```diff
@@ -470,12 +402,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         with ctx_manager:
             if cache_mla_latent:
                 self.memory_buffer = torch.full(
-                    (
-                        self.num_attention_layers,
-                        block_count_total,
-                        self.block_size_tokens,
-                        kv_reduced_dim,
-                    ),
+                    (self.num_layers, block_count_total, self.block_size_tokens, kv_reduced_dim),
                     -1,
                     dtype=self.params_dtype,
                     device=torch.cuda.current_device(),
```
@@ -484,7 +411,7 @@ class DynamicInferenceContext(BaseInferenceContext):
                 self.memory_buffer = torch.full(
                     (
                         2,  # key and value
-                        self.
+                        self.num_layers,
                         block_count_total,
                         self.block_size_tokens,
                         num_attention_heads_per_partition,
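With the hybrid-model bookkeeping gone, the KV cache is one tensor indexed as (2, layer, block, token-in-block, head, head-dim), where index 0/1 on the first axis selects keys/values. A small CPU-only sketch of that layout (the sizes here are made up for illustration; the real buffer is allocated on the current CUDA device):

    import torch

    num_layers, block_count_total, block_size_tokens = 4, 16, 8
    num_heads, head_dim = 2, 16

    memory_buffer = torch.full(
        (2, num_layers, block_count_total, block_size_tokens, num_heads, head_dim),
        -1.0,
        dtype=torch.float16,
    )
    # Keys of layer 3 (1-based) live at [0, 2]; values at [1, 2].
    print(memory_buffer[0, 2].shape)  # torch.Size([16, 8, 2, 16])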
@@ -589,34 +516,14 @@ class DynamicInferenceContext(BaseInferenceContext):
             block_count_total=block_count_total, gtd_block_count=self.gtd_block_count
         )

-        # Optional state tensors for hybrid models
-        if self.is_hybrid_model:
-            self.mamba_metadata = MambaMetadata(max_requests=self.max_requests)
-
-            with ctx_manager:
-                self.mamba_conv_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-                self.mamba_ssm_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-
-        else:
-            self.mamba_metadata = None
-
         # Store the dummy block idx reference for convenience
         self.dummy_block_idx = self.block_allocator.dummy_block_idx

         # Deal with chunked prefill
         self.chunked_prefill_request_id = -1

-        # Reset attention
+        # Reset attention state.
         self.reset_attention_state()
-        self.reset_mamba_state()

         if use_flashinfer_fused_rope is True:
             assert HAVE_FLASHINFER, "flashinfer is not installed"
@@ -721,8 +628,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         """Test if all active requests are in decode phase.

         For a request in prefill phase active_tokens = query length
-        Once the request moves to decode phase active tokens is 1 for that request.
-        So if all active requests are in decode phase, they will be equal to active token count.
+        Once the request moves to decode phase active tokens is 1 for that request. So if all active requests are in decode phase, they will be equal to active token count.
         """
         total_active_requests = self.total_request_count - self.paused_request_count
         return total_active_requests == self.active_token_count
@@ -758,7 +664,11 @@ class DynamicInferenceContext(BaseInferenceContext):

     def get_active_request_count(self):
         """Returns the current number of active requests."""
-
+        active_sequence_lengths = self.get_active_sequence_lengths()
+        max_sequence_lengths = self.get_max_sequence_lengths()
+        active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
+        active_request_count = (active_requests_mask == 1).sum().item()
+        return active_request_count

     def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) -> None:
         """Append to KV cache.
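The new get_active_request_count derives the count from sequence lengths instead of cached bookkeeping: a request stays active while its current length is below its allowed maximum. A standalone sketch of the same computation on made-up lengths:

    import torch

    active_sequence_lengths = torch.tensor([10, 4096, 57, 4096])
    max_sequence_lengths = torch.tensor([4096, 4096, 4096, 4096])

    # Requests that have not yet reached their maximum length are still active.
    active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
    active_request_count = (active_requests_mask == 1).sum().item()
    print(active_request_count)  # 2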
@@ -768,12 +678,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             key (Tensor): Key tensor.
             value (Tensor): Value tensor.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
-
         if triton_append_key_value_cache is not None and not self.cache_mla_latent:
             # currently does not support MLA latent cache
             return triton_append_key_value_cache(
-                layer_number=
+                layer_number=layer_number,
                 key=key,
                 value=value,
                 memory_buffer=self.memory_buffer,
@@ -798,14 +706,14 @@ class DynamicInferenceContext(BaseInferenceContext):
         if self.cache_mla_latent:
             # We pass the kv_concat as the key in cache_mla_latent
             kv_concat = key
-            self.memory_buffer[
+            self.memory_buffer[layer_number - 1, block_idx, local_kv_seq_idx] = kv_concat[
                 : self.padded_active_token_count
             ]
         else:
-            self.memory_buffer[0,
+            self.memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key[
                 : self.padded_active_token_count
             ]
-            self.memory_buffer[1,
+            self.memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value[
                 : self.padded_active_token_count
             ]

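The append path now indexes the unified buffer directly with the 1-based layer_number shifted to a 0-based axis, scattering each token's key/value into its (block, local offset) slot. A toy sketch of that scatter (names and sizes are illustrative, not the class internals):

    import torch

    num_layers, blocks, block_tokens, heads, head_dim = 2, 4, 4, 1, 8
    memory_buffer = torch.zeros(2, num_layers, blocks, block_tokens, heads, head_dim)

    layer_number = 2                              # 1-based layer index
    block_idx = torch.tensor([0, 0, 1])           # destination block per token
    local_kv_seq_idx = torch.tensor([2, 3, 0])    # offset of each token inside its block
    key = torch.randn(3, heads, head_dim)
    value = torch.randn(3, heads, head_dim)

    memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key
    memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value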
@@ -819,30 +727,19 @@ class DynamicInferenceContext(BaseInferenceContext):
             (Tuple[Tensor, Tensor]) The key and value pointer tensors that point
             to blocks within the block-level memory buffer.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
         if self.cache_mla_latent:
             return (
-                self.memory_buffer[
+                self.memory_buffer[layer_number - 1],
                 None,
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )
         else:
             return (
-                self.memory_buffer[0,
-                self.memory_buffer[1,
+                self.memory_buffer[0, layer_number - 1],
+                self.memory_buffer[1, layer_number - 1],
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )

-    def mamba_states_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]:
-        """Returns the Mamba state tensors for the given layer."""
-        assert self.is_hybrid_model, "Only hybrid models have Mamba state tensors"
-
-        mamba_layer_number = self.layer_map[layer_number - 1]
-        conv_state = self.mamba_conv_states[mamba_layer_number]
-        ssm_state = self.mamba_ssm_states[mamba_layer_number]
-
-        return (conv_state, ssm_state)
-
     def apply_fused_qk_rotary_emb(
         self, query: Tensor, key: Tensor, cos_sin_emb: Tensor, config: TransformerConfig
     ) -> Tuple[Tensor, Tensor]:
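Reading the cache follows the same convention: the per-layer key and value views are plain slices of the shared buffer at layer_number - 1, with no layer_map indirection left. A minimal sketch of that lookup:

    import torch

    num_layers, blocks, block_tokens, heads, head_dim = 3, 4, 8, 2, 16
    memory_buffer = torch.zeros(2, num_layers, blocks, block_tokens, heads, head_dim)

    def key_value_cache(layer_number):
        # layer_number is 1-based in the API, hence the -1.
        return memory_buffer[0, layer_number - 1], memory_buffer[1, layer_number - 1]

    key_cache, value_cache = key_value_cache(layer_number=1)
    print(key_cache.shape, value_cache.shape)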
@@ -957,16 +854,6 @@ class DynamicInferenceContext(BaseInferenceContext):
             attn_metadata.reset()
         self.active_attn_metadata = None

-        if self.is_hybrid_model:
-            self.mamba_metadata.reset_cudagraph_mapping()
-
-    def reset_mamba_state(self) -> None:
-        """Reset state used within Mamba layers."""
-        if self.is_hybrid_model:
-            self.mamba_conv_states.fill_(0)
-            self.mamba_ssm_states.fill_(0)
-            self.mamba_metadata.reset()
-
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""
         has_cuda_graphs = self.cuda_graph_token_counts is not None
@@ -1090,17 +977,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         )
         # All attention metadata calculations are now handled by MHAMetadata.update()

-        # Create Mamba state block table if it's a hybrid model
-        if self.is_hybrid_model:
-            active_mamba_indices = self.mamba_metadata.request_to_mamba_state_idx[
-                self.paused_request_count : self.total_request_count
-            ]
-
-            if self.is_decode_only() or self.using_cuda_graph_this_step():
-                self.mamba_metadata.update_cudagraph_mapping(
-                    active_mamba_indices, self.total_request_count - self.paused_request_count
-                )
-
     def reset(self) -> None:
         """Reset entire context.

@@ -1142,13 +1018,15 @@ class DynamicInferenceContext(BaseInferenceContext):

         # Reset available block count.
         self.reset_attention_state()
-        self.reset_mamba_state()
         self.block_allocator.reset()
         self.request_to_kv_block_ids.fill_(-1)

         # Reset chunked prefill state
         self.chunked_prefill_request_id = -1

+        # Reset chunked prefill state
+        self.chunked_prefill_request_id = -1
+
     def current_input_and_position_ids(
         self, *, num_warmup_tokens: Optional[int] = None
     ) -> Tuple[Tensor, Tensor]:
@@ -1320,18 +1198,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_local_position_within_kv_block[
             self.active_token_count : self.active_token_count + chunk_length
         ] = (token_offset_range % self.block_size_tokens)
-
-        if self.is_hybrid_model and not is_chunked_prefill:
-            # Allocate a slot for Mamba states
-            mamba_idx = self.mamba_metadata.allocate_slot()
-            if mamba_idx is None:
-                raise ContextOverflowError(req.request_id, "No Mamba slots available")
-
-            # Initialize the allocated Mamba state
-            self.mamba_conv_states[:, mamba_idx] = 0.0
-            self.mamba_ssm_states[:, mamba_idx] = 0.0
-            self.mamba_metadata.request_to_mamba_state_idx[self.total_request_count] = mamba_idx
-
         self.active_token_count += chunk_length
         self.total_request_count += 0 if req.finished_chunk_token_count > 0 else 1

@@ -1350,11 +1216,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.request_last_kv_block_id[dst_idxs] = self.request_last_kv_block_id[src_idxs]
         self.request_last_kv_block_offset[dst_idxs] = self.request_last_kv_block_offset[src_idxs]

-        if self.is_hybrid_model:
-            self.mamba_metadata.request_to_mamba_state_idx[dst_idxs] = (
-                self.mamba_metadata.request_to_mamba_state_idx[src_idxs]
-            )
-
     def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         """
         Swaps all the relevent booking tensors with src idxs to dst idxs
@@ -1369,9 +1230,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tensor_swap(self.request_last_kv_block_id, src_idxs, dst_idxs)
         tensor_swap(self.request_last_kv_block_offset, src_idxs, dst_idxs)

-        if self.is_hybrid_model:
-            tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)
-
     # TODO: see if we can compile this function
     def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
         """Update context state after calling engine.step().
@@ -1443,17 +1301,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
             self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)

-            if self.is_hybrid_model:
-                self.mamba_metadata.free_slots(finished_idxs)
-
             # Reset request/token counts.
             self.request_to_kv_block_ids.fill_(-1)
             self.total_request_count = 0
             self.active_token_count = 0
-
-            # Reset Mamba state.
-            self.reset_mamba_state()
-
             return

         # 3. Concatenate the paused tokens to the active tokens if present.
@@ -1481,10 +1332,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         # and updates it instead of the original tensor.
         self.request_to_kv_block_ids[finished_idxs] = -1

-        if self.is_hybrid_model:
-            # Get the Mamba state indices for finished requests and free them
-            self.mamba_metadata.free_slots(finished_idxs)
-
         if active_request_count > 0:
             finished_idxs_on_left = (
                 torch.nonzero(active_requests_mask[:active_request_count] == 0, as_tuple=True)[
@@ -1504,10 +1351,8 @@ class DynamicInferenceContext(BaseInferenceContext):
                 next_tokens=next_tokens,
             )

-            # Reset
+            # Reset block ids for recently moved requests.
             self.request_to_kv_block_ids[active_idxs_on_right] = -1
-            if self.is_hybrid_model:
-                self.mamba_metadata.request_to_mamba_state_idx[active_idxs_on_right] = -1

         # 5. We identify requests that require a new block and add them to the paused requests (i.e move them left) :-
         # a) Put requests that have filled their current block and require a new one in a pause state temporarily
@@ -1605,7 +1450,6 @@ class DynamicInferenceContext(BaseInferenceContext):

         # 7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
         self.total_request_count = active_request_count + self.paused_request_count
-
         # All these active requests are in decode phase, so they need only 1 token per request
         self.active_token_count = active_request_count
         # Always the first section of token input ids are only used.
@@ -119,8 +119,8 @@ def triton_append_key_value_cache(

     _, num_heads, h_dim = key.shape

-    key_cache = memory_buffer[0, layer_number]
-    value_cache = memory_buffer[1, layer_number]
+    key_cache = memory_buffer[0, layer_number - 1]
+    value_cache = memory_buffer[1, layer_number - 1]

     key_to_cache = key[:n_tokens]
     value_to_cache = value[:n_tokens]
@@ -1,8 +1,6 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

-import faulthandler
 import logging
-import signal
 from collections import deque
 from itertools import cycle
 from multiprocessing import Event
@@ -25,11 +23,6 @@ try:
 except:
     HAVE_MSGPACK = False

-# Register faulthandler to emit stack traces upon process kill.
-faulthandler.enable()
-faulthandler.register(signal.SIGTERM, all_threads=False, chain=True)
-faulthandler.register(signal.SIGINT, all_threads=False, chain=True)
-

 class DataParallelInferenceCoordinator:
     """
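For reference, the module-level registration deleted above follows the standard faulthandler pattern: once registered, the interpreter prints Python tracebacks when the process receives the given signals. A minimal sketch of that pattern (not the coordinator's code; faulthandler.register is unavailable on Windows):

    import faulthandler
    import signal

    faulthandler.enable()  # dump tracebacks on fatal errors (SIGSEGV, SIGABRT, ...)
    faulthandler.register(signal.SIGTERM, all_threads=False, chain=True)
    faulthandler.register(signal.SIGINT, all_threads=False, chain=True)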
@@ -33,8 +33,8 @@ from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.inference.utils import Counter
-from megatron.core.utils import get_asyncio_loop
+from megatron.core.inference.utils import Counter
+from megatron.core.utils import get_asyncio_loop

 try:
     from tqdm import tqdm
@@ -293,11 +293,7 @@ class DynamicInferenceEngine(AbstractEngine):
         self.capture_stats = capture_stats

     async def start_listening_to_data_parallel_coordinator(
-        self,
-        inference_coordinator_port: int,
-        launch_inference_coordinator: bool = True,
-        *,
-        loop: Optional[asyncio.AbstractEventLoop] = None,
+        self, inference_coordinator_port: int, launch_inference_coordinator: bool = True
     ):
         """Initializes ZMQ communication to connect the engine with an inference coordinator.

@@ -411,14 +407,12 @@ class DynamicInferenceEngine(AbstractEngine):
         torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group())

         if launch_inference_coordinator and torch.distributed.get_rank() == 0:
-
+            coordinator_ready_event.wait()
             logging.info("Inference co-ordinator is ready to receive requests!")

         # Finally run the engine infinite loop
-
-        self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop))
+        self.engine_loop_task = asyncio.create_task(self.run_engine_with_coordinator())

-    @trace_async_exceptions
     async def _notify_cond_for_new_request(self):
         """Helper function to notify condition variable when a new request is added."""
         async with self._cond:
@@ -472,7 +466,7 @@ class DynamicInferenceEngine(AbstractEngine):
         self.waiting_request_ids.append(request_id)

         # Create a new asyncio Future to notify the user when the request has completed.
-        self.request_completion_futures[request_id] =
+        self.request_completion_futures[request_id] = asyncio.Future()
         return self.request_completion_futures[request_id]

     def add_request(
@@ -647,7 +641,7 @@ class DynamicInferenceEngine(AbstractEngine):
             if request_can_be_added and request_tokens_can_be_added and kv_cache_available:
                 self.context.add_request(req)
                 self._loop.call_soon_threadsafe(
-
+                    asyncio.create_task, self._notify_cond_for_new_request()
                 )
                 req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
                 req.add_event_add()
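This and the following scheduling hunks hand the notification coroutine to the event loop with loop.call_soon_threadsafe(asyncio.create_task, coro), which is the thread-safe way to spawn a task on a loop that may be running in another thread. A small self-contained sketch of that pattern:

    import asyncio

    async def notify():
        print("new request added")

    async def main():
        loop = asyncio.get_running_loop()
        # Queue task creation on the loop itself; safe to call from other threads.
        loop.call_soon_threadsafe(asyncio.create_task, notify())
        await asyncio.sleep(0.01)  # give the spawned task a chance to run

    asyncio.run(main())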
@@ -708,7 +702,7 @@ class DynamicInferenceEngine(AbstractEngine):

         # is_continuing_chunked_prefill is True if we are scheduling next
         # chunk of a existing chunked prefill request
-        is_continuing_chunked_prefill = self.context.chunked_prefill_request_id
+        is_continuing_chunked_prefill = self.context.chunked_prefill_request_id > 0

         # Use remaining prompt tokens for scheduling decisions
         remaining_len = len(req.remaining_prompt_tokens)
@@ -726,7 +720,7 @@ class DynamicInferenceEngine(AbstractEngine):
             self.context.chunked_prefill_request_id = -1
             self.context.add_request(req)
             self._loop.call_soon_threadsafe(
-
+                asyncio.create_task, self._notify_cond_for_new_request()
             )
             req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
             req.add_event_add()
@@ -738,7 +732,7 @@ class DynamicInferenceEngine(AbstractEngine):
             chunk_length = self.context.max_tokens - self.context.active_token_count
             self.context.add_request(req, chunk_length=chunk_length)
             self._loop.call_soon_threadsafe(
-
+                asyncio.create_task, self._notify_cond_for_new_request()
             )
             self.context.chunked_prefill_request_id = req.request_id
             req.remaining_prompt_tokens = req.remaining_prompt_tokens[chunk_length:]
@@ -945,7 +939,7 @@ class DynamicInferenceEngine(AbstractEngine):
             result = self.step_modern()
             finished_requests_list.extend(result["finished_requests"])

-        # Ensure requests are returned in the same order they were passed in
+        # Ensure requests are returned in the same order they were passed in.
         finished_requests_list.sort(key=lambda x: x.request_id)

         return finished_requests_list
@@ -1045,12 +1039,8 @@ class DynamicInferenceEngine(AbstractEngine):
         self.zmq_context.term()
         parallel_state.destroy_model_parallel()

-
-    async def run_engine(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
-    ):
+    async def run_engine(self, *, verbose: Optional[bool] = False):
         """Continually steps the engine asynchronously."""
-        self._loop = get_asyncio_loop(loop)
         try:
             while True:
                 # Wait until there are active requests before proceeding.
@@ -1064,12 +1054,8 @@ class DynamicInferenceEngine(AbstractEngine):
         except asyncio.CancelledError:
             pass

-
-    async def run_engine_with_coordinator(
-        self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
-    ):
+    async def run_engine_with_coordinator(self, *, verbose: Optional[bool] = False):
         """Continually steps the engine asynchronously."""
-        self._loop = get_asyncio_loop(loop)
         try:
             while True:
                 self.schedule_requests()
@@ -17,7 +17,6 @@ from megatron.core.inference.scheduler import Scheduler
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.utils import get_asyncio_loop

 try:
     from tqdm import tqdm
@@ -218,6 +217,11 @@ class StaticInferenceEngine(AbstractEngine):
             generated tokens, texts and log probs if required
         """
         assert hasattr(self, 'dynamic_engine'), "Dynamic engine not initialized"
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:  # 'RuntimeError: There is no current event loop...'
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)

         if common_inference_params:
             sampling_params = common_inference_params
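The added try/except mirrors a common pattern for code that must work both inside and outside a running event loop: reuse the current loop when one exists, otherwise create and register a new one. A standalone sketch of that fallback (the helper name here is hypothetical):

    import asyncio

    def get_or_create_event_loop():
        try:
            return asyncio.get_running_loop()
        except RuntimeError:  # no event loop is running in this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    loop = get_or_create_event_loop()
    print(loop.run_until_complete(asyncio.sleep(0, result="ok")))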
@@ -381,8 +385,8 @@ class StaticInferenceEngine(AbstractEngine):
         torch.cuda.set_device(cuda_device)
         self.run_engine()

-    async def run_engine_async(self
+    async def run_engine_async(self):
         """Runs the engine asynchronously using asyncio"""
-        loop =
+        loop = asyncio.get_running_loop()

         await loop.run_in_executor(None, self._wrapped_run_engine, torch.cuda.current_device())