megatron-core 0.16.0rc0.dev115691__tar.gz → 0.16.0rc0.dev115944__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/PKG-INFO +7 -14
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/megatron_tokenizer.py +0 -9
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fp8_utils.py +0 -49
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/dynamic_context.py +32 -188
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/fused_kv_append_kernel.py +2 -2
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/engines/dynamic_engine.py +2 -2
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/gpt_model.py +3 -1
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mamba/mamba_model.py +1 -30
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/package_info.py +1 -1
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/safe_globals.py +0 -2
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mamba_block.py +25 -16
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +2 -29
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mamba_layer.py +5 -5
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mamba_mixer.py +57 -301
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/dot_product_attention.py +0 -2
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/router.py +0 -2
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/utils.py +0 -85
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron_core.egg-info/PKG-INFO +7 -14
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron_core.egg-info/SOURCES.txt +0 -1
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron_core.egg-info/requires.txt +6 -13
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/pyproject.toml +6 -13
- megatron_core-0.16.0rc0.dev115691/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -106
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/MANIFEST.in +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/README.md +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/README.md +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/activations.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/extensions/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/inference_request.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/unified_memory.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/jit.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/distrib_optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/timers.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/cuda_graphs.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/transformer_config.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/setup.cfg +0 -0
- {megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/setup.py +0 -0
{megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev115691
+Version: 0.16.0rc0.dev115944
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -41,7 +41,7 @@ Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
-Requires-Dist: nvidia-resiliency-ext; extra == "dev"
+Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
@@ -59,20 +59,13 @@ Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
-Requires-Dist: einops
-Requires-Dist: tensorstore!=0.1.46,!=0.1.72
-Requires-Dist: nvtx
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: einops; extra == "lts"
+Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
+Requires-Dist: nvtx; extra == "lts"
+Requires-Dist: transformers; extra == "lts"
+Requires-Dist: zarr; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
-Requires-Dist: mamba-ssm~=2.2; extra == "lts"
-Requires-Dist: causal-conv1d~=1.5; extra == "lts"
-Requires-Dist: nv-grouped-gemm~=1.1; extra == "lts"
-Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "lts"
-Requires-Dist: av<16.0.0; extra == "lts"
-Requires-Dist: flashinfer-python; extra == "lts"
 Requires-Dist: wget; extra == "lts"
-Requires-Dist: onnxscript; extra == "lts"
 
 <div align="center">
 
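The lts hunk above is the substance of this metadata change: requirements such as einops, tensorstore, and nvtx previously carried no `extra == "lts"` environment marker, which makes installers treat them as unconditional dependencies, and the mamba-ssm, causal-conv1d, nv-grouped-gemm, megatron-energon, av, flashinfer-python, and onnxscript pins are dropped from the extra entirely. One way to see which requirements a given extra actually gates is to evaluate the markers in the installed metadata. The snippet below is an illustrative sketch only; it assumes megatron-core and the third-party packaging library are installed.

# Illustrative only: list the requirements gated behind the "lts" extra.
from importlib.metadata import requires
from packaging.requirements import Requirement

for line in requires("megatron-core") or []:
    req = Requirement(line)
    # Keep requirements whose marker is satisfied when the "lts" extra is requested.
    if req.marker is not None and req.marker.evaluate({"extra": "lts"}):
        print(req.name, str(req.specifier) or "(any version)")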
{megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/datasets/megatron_tokenizer.py
@@ -1,14 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 import json
-import logging
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 from typing import Any
 
 import numpy
 
-logger = logging.getLogger(__name__)
-
 
 class MegatronLegacyTokenizer(ABC):
     """Abstract class for tokenizer
@@ -23,12 +20,6 @@ class MegatronLegacyTokenizer(ABC):
     """
 
     def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
-        # Deprecation warning
-        logger.warning(
-            "You’re using the legacy tokenizer system, which is deprecated "
-            "and will be removed in a future release. Please migrate to the new tokenizer system "
-            "(`megatron.core.tokenizers.MegatronTokenizer`)."
-        )
         self.unique_identifiers = OrderedDict()
         self.unique_identifiers["class"] = type(self).__name__
         self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
{megatron_core-0.16.0rc0.dev115691 → megatron_core-0.16.0rc0.dev115944}/megatron/core/fp8_utils.py
RENAMED
@@ -10,12 +10,6 @@ from typing import List, Optional
 import torch
 
 from megatron.core.enums import Fp8Recipe
-from megatron.core.tensor_parallel import (
-    ColumnParallelLinear,
-    RowParallelLinear,
-    gather_from_sequence_parallel_region,
-    reduce_scatter_to_sequence_parallel_region,
-)
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import get_te_version, is_te_min_version
 
@@ -118,27 +112,6 @@ def get_fp8_align_size(fp8_recipe: Fp8Recipe) -> int:
     return 16
 
 
-def is_column_parallel_linear(module):
-    """Returns whether the given module is a ColumnParallelLinear layer."""
-    if HAVE_TE and (
-        isinstance(module, TEColumnParallelLinear)
-        or isinstance(module, TELayerNormColumnParallelLinear)
-    ):
-        return True
-    elif isinstance(module, ColumnParallelLinear):
-        return True
-    return False
-
-
-def is_row_parallel_linear(module):
-    """Returns whether the given module is a RowParallelLinear layer."""
-    if HAVE_TE and isinstance(module, TERowParallelLinear):
-        return True
-    elif isinstance(module, RowParallelLinear):
-        return True
-    return False
-
-
 """
 The code below abstracts the functionalities needed for implementing "--fp8-param-gather" into
 several functions. It provides different implementations for each function based on different
@@ -614,18 +587,6 @@ if HAVE_TE:
         if not FP8GlobalStateManager.is_fp8_enabled():
             return original_forward(input_tensor, *args, **kwargs)
 
-        # With sequence parallelism we need to all-gather before padding
-        # and reduce-scatter after unpadding
-        if is_sequence_parallel := getattr(module, "sequence_parallel", False):
-            if is_column_parallel_linear(module):
-                input_tensor = gather_from_sequence_parallel_region(
-                    input_tensor, group=module.tp_group
-                )
-
-            # Disable sequence parallelism on the module because we are handling the
-            # all-gather and reduce-scatter externally
-            module.sequence_parallel = False
-
         seq_len, batch_size, hidden_size = input_tensor.shape
         # Reshape to (S, B*H) to pad sequence dimension
         input_2d = input_tensor.reshape(seq_len, -1)
@@ -651,16 +612,6 @@ if HAVE_TE:
         unpadded_output_2d = _unpad_func(output_2d, [seq_len])
         unpadded_output = unpadded_output_2d.reshape(seq_len, batch_size, output_hidden_size)
 
-        if is_sequence_parallel:
-            # Reduce-scatter after unpadding
-            if is_row_parallel_linear(module):
-                unpadded_output = reduce_scatter_to_sequence_parallel_region(
-                    unpadded_output, group=module.tp_group
-                )
-
-            # Reset sequence parallelism flag on the module
-            module.sequence_parallel = True
-
         if other_outputs:
             return (unpadded_output,) + other_outputs
         else:
@@ -23,14 +23,9 @@ from megatron.core.inference.unified_memory import (
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
-from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-    Symbols,
-    get_layer_maps_from_layer_type_list,
-)
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide

-from .attention_context.mamba_metadata import MambaMetadata
 from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
@@ -232,17 +227,8 @@ class DynamicInferenceContext(BaseInferenceContext):
             where the cuda graph batch sizes range from 1 to `max_requests` (as
             computed below). Due to rounding, the actual number of cuda graphs may
             not equal this argument.
-        materialize_only_last_token_logits (
-
-            if returning log probs.
-        layer_type_list (Optional[List[str]]): A list of strings that indicates
-            the layer type (Mamba / Attention / MLP) for each layer.
-            See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list
-            of symbols. This must be provided for hybrid models.
-        mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request.
-            This must be provided for hybrid models.
-        mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request.
-            This must be provided for hybrid models.
+        materialize_only_last_token_logits (bool): If True, only the last token logits
+            are materialized in the context.
         use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode
             engine steps.
         unified_memory_level (Optional[int]): Set unified memory usage within the
@@ -273,10 +259,7 @@ class DynamicInferenceContext(BaseInferenceContext):
        kv_lora_rank: Optional[int] = None,
        qk_pos_emb_head_dim: Optional[int] = None,
        num_cuda_graphs: Optional[int] = None,
-       materialize_only_last_token_logits:
-       layer_type_list: Optional[List[str]] = None,
-       mamba_conv_states_shape: Optional[Tuple[int]] = None,
-       mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+       materialize_only_last_token_logits: bool = True,
        use_cuda_graphs_for_non_decode_steps: bool = True,
        use_flashinfer_fused_rope: bool = False,
        unified_memory_level: Optional[int] = 0,
@@ -300,41 +283,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tp_size = tensor_model_parallel_size
         hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
         num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
-
-        # Mamba states.
-        self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list
-        if self.is_hybrid_model:
-            assert (
-                mamba_conv_states_shape is not None
-            ), "`mamba_conv_states_shape` must be specified for hybrid models"
-            assert (
-                mamba_ssm_states_shape is not None
-            ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-            assert (
-                not use_cuda_graphs_for_non_decode_steps
-            ), "Non-decode CUDA graphs not yet supported for hybrid models"
-
-            # For hybrid models, the layer map converts the global layer index to the
-            # corresponding attention layer index or Mamba layer index depending on the
-            # layer type.
-            attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list(
-                layer_type_list
-            )
-            self.num_attention_layers = len(attention_layer_map)
-            self.num_mamba_layers = len(mamba_layer_map)
-            self.layer_map = attention_layer_map | mamba_layer_map
-        else:
-            # The layer map is the identity function for pure Transformer models.
-            self.num_attention_layers = num_layers
-            self.num_mamba_layers = 0
-            (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None)
-            self.layer_map = {i: i for i in range(self.num_attention_layers)}
-
-        if self.num_attention_layers == 0:
-            raise NotImplementedError(
-                f"Using `DynamicInferenceContext` with no attention is not supported."
-            )
-
         # Block size tokens, bytes.
         dtype_size_bytes = params_dtype.itemsize
         self.block_size_tokens = block_size_tokens
@@ -349,38 +297,24 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.block_size_bytes = (
             dtype_size_bytes
             * 2  # key, value
-            *
+            * num_layers
             * self.block_size_tokens
             * num_attention_heads_per_partition
             * hidden_size_per_attention_head
         )
-        assert self.block_size_bytes > 0

         # Adjust buffer to be a multiple of block size.
         buffer_size_bytes = int(buffer_size_gb * 1024**3)
         buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes
         buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem

-
-        if self.is_hybrid_model:
-            mamba_states_memory_per_request += math.prod(mamba_conv_states_shape)
-            mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape)
-            mamba_states_memory_per_request *= self.num_mamba_layers
-            mamba_states_memory_per_request *= dtype_size_bytes
-
-        # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size.
+        # Compute max_requets, max_tokens from buffer size and overflow factor.
         def bytes_to_max_requests_and_tokens(n_bytes):
-
-
-
+            n_tokens = n_bytes / self.block_size_bytes * self.block_size_tokens
+            n_requests = n_tokens / max_sequence_length
+            return self.round_up_requests(int(n_requests), tp_size=tp_size), self.round_up_tokens(
+                int(n_tokens), tp_size=tp_size
             )
-            # TODO(ksanthanam): Leave room for an extra request in the event of padding
-            # for non-decode CUDA graphs
-            n_requests = n_bytes / cost_per_request_bytes
-            n_tokens = n_requests * max_sequence_length
-            n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size)
-            n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size)
-            return n_requests, n_tokens

         self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes)
         if buffer_overflow_factor is not None:
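As a rough illustration of the simplified sizing above, the new code derives both limits directly from the buffer size. The standalone helper and numbers below are hypothetical, with the rounding helpers replaced by plain int() for brevity:

    # Minimal sketch of bytes -> (max_requests, max_tokens), mirroring the added lines.
    def bytes_to_limits(n_bytes, block_size_bytes, block_size_tokens, max_sequence_length):
        n_tokens = n_bytes / block_size_bytes * block_size_tokens
        n_requests = n_tokens / max_sequence_length
        return int(n_requests), int(n_tokens)

    # Example: 20 GiB buffer, 2 MiB per 256-token block, 4096-token max sequences.
    print(bytes_to_limits(20 * 1024**3, 2 * 1024**2, 256, 4096))  # (640, 2621440)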
@@ -405,6 +339,7 @@ class DynamicInferenceContext(BaseInferenceContext):

         # Initialize context state.
         self.params_dtype = params_dtype
+        self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length

         # Unified memory.
@@ -455,11 +390,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids)
         self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids)

-        # Calculate the total number of
-
-        block_count_total = (
-            max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes
-        )
+        # Calculate the total number of blocks available in the buffer
+        block_count_total = buffer_size_bytes // self.block_size_bytes

         # Memory buffer.
         ctx_manager = (
@@ -470,12 +402,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         with ctx_manager:
             if cache_mla_latent:
                 self.memory_buffer = torch.full(
-                    (
-                        self.num_attention_layers,
-                        block_count_total,
-                        self.block_size_tokens,
-                        kv_reduced_dim,
-                    ),
+                    (self.num_layers, block_count_total, self.block_size_tokens, kv_reduced_dim),
                     -1,
                     dtype=self.params_dtype,
                     device=torch.cuda.current_device(),
@@ -484,7 +411,7 @@ class DynamicInferenceContext(BaseInferenceContext):
                 self.memory_buffer = torch.full(
                     (
                         2,  # key and value
-                        self.num_attention_layers,
+                        self.num_layers,
                         block_count_total,
                         self.block_size_tokens,
                         num_attention_heads_per_partition,
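For orientation, a small sketch of the non-MLA buffer layout allocated above; the sizes are made up and the tensor is kept on CPU in float16, whereas the real buffer uses block_count_total derived from the buffer size, params_dtype, and the current CUDA device:

    import torch

    # Illustrative sizes only.
    num_layers, block_count_total, block_size_tokens = 4, 8, 16
    num_attention_heads_per_partition, hidden_size_per_attention_head = 2, 64

    memory_buffer = torch.full(
        (2, num_layers, block_count_total, block_size_tokens,
         num_attention_heads_per_partition, hidden_size_per_attention_head),
        -1.0,
        dtype=torch.float16,
    )

    # `layer_number` is 1-based, so per-layer caches are read with `layer_number - 1`.
    layer_number = 1
    key_cache = memory_buffer[0, layer_number - 1]
    value_cache = memory_buffer[1, layer_number - 1]
    print(key_cache.shape)  # torch.Size([8, 16, 2, 64])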
@@ -589,34 +516,14 @@ class DynamicInferenceContext(BaseInferenceContext):
             block_count_total=block_count_total, gtd_block_count=self.gtd_block_count
         )

-        # Optional state tensors for hybrid models
-        if self.is_hybrid_model:
-            self.mamba_metadata = MambaMetadata(max_requests=self.max_requests)
-
-            with ctx_manager:
-                self.mamba_conv_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-                self.mamba_ssm_states = torch.zeros(
-                    (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape,
-                    dtype=self.params_dtype,
-                    device=torch.cuda.current_device(),
-                )
-
-        else:
-            self.mamba_metadata = None
-
         # Store the dummy block idx reference for convenience
         self.dummy_block_idx = self.block_allocator.dummy_block_idx

         # Deal with chunked prefill
         self.chunked_prefill_request_id = -1

-        # Reset attention
+        # Reset attention state.
         self.reset_attention_state()
-        self.reset_mamba_state()

         if use_flashinfer_fused_rope is True:
             assert HAVE_FLASHINFER, "flashinfer is not installed"
@@ -721,8 +628,7 @@ class DynamicInferenceContext(BaseInferenceContext):
         """Test if all active requests are in decode phase.

         For a request in prefill phase active_tokens = query length
-        Once the request moves to decode phase active tokens is 1 for that request.
-        So if all active requests are in decode phase, they will be equal to active token count.
+        Once the request moves to decode phase active tokens is 1 for that request. So if all active requests are in decode phase, they will be equal to active token count.
         """
         total_active_requests = self.total_request_count - self.paused_request_count
         return total_active_requests == self.active_token_count
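A toy illustration of the decode-only check documented above, with made-up counts (the real values live on the context object):

    # Three requests tracked, one paused; each unpaused request contributes one
    # active token only once every request has reached the decode phase.
    total_request_count, paused_request_count, active_token_count = 3, 1, 2

    total_active_requests = total_request_count - paused_request_count
    print(total_active_requests == active_token_count)  # True -> decode-only step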
@@ -758,7 +664,11 @@ class DynamicInferenceContext(BaseInferenceContext):

     def get_active_request_count(self):
         """Returns the current number of active requests."""
-
+        active_sequence_lengths = self.get_active_sequence_lengths()
+        max_sequence_lengths = self.get_max_sequence_lengths()
+        active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
+        active_request_count = (active_requests_mask == 1).sum().item()
+        return active_request_count

     def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) -> None:
         """Append to KV cache.
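A minimal sketch of the mask-based counting added above, with made-up lengths standing in for the context's get_active_sequence_lengths() / get_max_sequence_lengths() accessors:

    import torch

    active_sequence_lengths = torch.tensor([10, 4096, 512])
    max_sequence_lengths = torch.tensor([4096, 4096, 4096])

    active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
    active_request_count = (active_requests_mask == 1).sum().item()
    print(active_request_count)  # 2: the request already at its max length is excluded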
@@ -768,12 +678,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             key (Tensor): Key tensor.
             value (Tensor): Value tensor.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
-
         if triton_append_key_value_cache is not None and not self.cache_mla_latent:
             # currently does not support MLA latent cache
             return triton_append_key_value_cache(
-                layer_number=attention_layer_number,
+                layer_number=layer_number,
                 key=key,
                 value=value,
                 memory_buffer=self.memory_buffer,
@@ -798,14 +706,14 @@ class DynamicInferenceContext(BaseInferenceContext):
         if self.cache_mla_latent:
             # We pass the kv_concat as the key in cache_mla_latent
             kv_concat = key
-            self.memory_buffer[attention_layer_number, block_idx, local_kv_seq_idx] = kv_concat[
+            self.memory_buffer[layer_number - 1, block_idx, local_kv_seq_idx] = kv_concat[
                 : self.padded_active_token_count
             ]
         else:
-            self.memory_buffer[0, attention_layer_number, block_idx, local_kv_seq_idx] = key[
+            self.memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key[
                 : self.padded_active_token_count
             ]
-            self.memory_buffer[1, attention_layer_number, block_idx, local_kv_seq_idx] = value[
+            self.memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value[
                 : self.padded_active_token_count
             ]

@@ -819,30 +727,19 @@ class DynamicInferenceContext(BaseInferenceContext):
             (Tuple[Tensor, Tensor]) The key and value pointer tensors that point
             to blocks within the block-level memory buffer.
         """
-        attention_layer_number = self.layer_map[layer_number - 1]
         if self.cache_mla_latent:
             return (
-                self.memory_buffer[attention_layer_number],
+                self.memory_buffer[layer_number - 1],
                 None,
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )
         else:
             return (
-                self.memory_buffer[0, attention_layer_number],
-                self.memory_buffer[1, attention_layer_number],
+                self.memory_buffer[0, layer_number - 1],
+                self.memory_buffer[1, layer_number - 1],
                 self.active_attn_metadata["mha_metadata"].state_data["block_table"],
             )

-    def mamba_states_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]:
-        """Returns the Mamba state tensors for the given layer."""
-        assert self.is_hybrid_model, "Only hybrid models have Mamba state tensors"
-
-        mamba_layer_number = self.layer_map[layer_number - 1]
-        conv_state = self.mamba_conv_states[mamba_layer_number]
-        ssm_state = self.mamba_ssm_states[mamba_layer_number]
-
-        return (conv_state, ssm_state)
-
     def apply_fused_qk_rotary_emb(
         self, query: Tensor, key: Tensor, cos_sin_emb: Tensor, config: TransformerConfig
     ) -> Tuple[Tensor, Tensor]:
@@ -957,16 +854,6 @@ class DynamicInferenceContext(BaseInferenceContext):
             attn_metadata.reset()
         self.active_attn_metadata = None

-        if self.is_hybrid_model:
-            self.mamba_metadata.reset_cudagraph_mapping()
-
-    def reset_mamba_state(self) -> None:
-        """Reset state used within Mamba layers."""
-        if self.is_hybrid_model:
-            self.mamba_conv_states.fill_(0)
-            self.mamba_ssm_states.fill_(0)
-            self.mamba_metadata.reset()
-
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""
         has_cuda_graphs = self.cuda_graph_token_counts is not None
@@ -1090,17 +977,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         )
         # All attention metadata calculations are now handled by MHAMetadata.update()

-        # Create Mamba state block table if it's a hybrid model
-        if self.is_hybrid_model:
-            active_mamba_indices = self.mamba_metadata.request_to_mamba_state_idx[
-                self.paused_request_count : self.total_request_count
-            ]
-
-            if self.is_decode_only() or self.using_cuda_graph_this_step():
-                self.mamba_metadata.update_cudagraph_mapping(
-                    active_mamba_indices, self.total_request_count - self.paused_request_count
-                )
-
     def reset(self) -> None:
         """Reset entire context.

@@ -1142,13 +1018,15 @@ class DynamicInferenceContext(BaseInferenceContext):

         # Reset available block count.
         self.reset_attention_state()
-        self.reset_mamba_state()
         self.block_allocator.reset()
         self.request_to_kv_block_ids.fill_(-1)

         # Reset chunked prefill state
         self.chunked_prefill_request_id = -1

+        # Reset chunked prefill state
+        self.chunked_prefill_request_id = -1
+
     def current_input_and_position_ids(
         self, *, num_warmup_tokens: Optional[int] = None
     ) -> Tuple[Tensor, Tensor]:
@@ -1320,18 +1198,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.token_to_local_position_within_kv_block[
             self.active_token_count : self.active_token_count + chunk_length
         ] = (token_offset_range % self.block_size_tokens)
-
-        if self.is_hybrid_model and not is_chunked_prefill:
-            # Allocate a slot for Mamba states
-            mamba_idx = self.mamba_metadata.allocate_slot()
-            if mamba_idx is None:
-                raise ContextOverflowError(req.request_id, "No Mamba slots available")
-
-            # Initialize the allocated Mamba state
-            self.mamba_conv_states[:, mamba_idx] = 0.0
-            self.mamba_ssm_states[:, mamba_idx] = 0.0
-            self.mamba_metadata.request_to_mamba_state_idx[self.total_request_count] = mamba_idx
-
         self.active_token_count += chunk_length
         self.total_request_count += 0 if req.finished_chunk_token_count > 0 else 1

@@ -1350,11 +1216,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.request_last_kv_block_id[dst_idxs] = self.request_last_kv_block_id[src_idxs]
         self.request_last_kv_block_offset[dst_idxs] = self.request_last_kv_block_offset[src_idxs]

-        if self.is_hybrid_model:
-            self.mamba_metadata.request_to_mamba_state_idx[dst_idxs] = (
-                self.mamba_metadata.request_to_mamba_state_idx[src_idxs]
-            )
-
     def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         """
         Swaps all the relevent booking tensors with src idxs to dst idxs
@@ -1369,9 +1230,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         tensor_swap(self.request_last_kv_block_id, src_idxs, dst_idxs)
         tensor_swap(self.request_last_kv_block_offset, src_idxs, dst_idxs)

-        if self.is_hybrid_model:
-            tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)
-
     # TODO: see if we can compile this function
     def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
         """Update context state after calling engine.step().
@@ -1443,17 +1301,10 @@ class DynamicInferenceContext(BaseInferenceContext):
             non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
             self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)

-            if self.is_hybrid_model:
-                self.mamba_metadata.free_slots(finished_idxs)
-
             # Reset request/token counts.
             self.request_to_kv_block_ids.fill_(-1)
             self.total_request_count = 0
             self.active_token_count = 0
-
-            # Reset Mamba state.
-            self.reset_mamba_state()
-
             return

         # 3. Concatenate the paused tokens to the active tokens if present.
@@ -1481,10 +1332,6 @@ class DynamicInferenceContext(BaseInferenceContext):
         # and updates it instead of the original tensor.
         self.request_to_kv_block_ids[finished_idxs] = -1

-        if self.is_hybrid_model:
-            # Get the Mamba state indices for finished requests and free them
-            self.mamba_metadata.free_slots(finished_idxs)
-
         if active_request_count > 0:
             finished_idxs_on_left = (
                 torch.nonzero(active_requests_mask[:active_request_count] == 0, as_tuple=True)[
@@ -1504,10 +1351,8 @@ class DynamicInferenceContext(BaseInferenceContext):
                 next_tokens=next_tokens,
             )

-            # Reset
+            # Reset block ids for recently moved requests.
             self.request_to_kv_block_ids[active_idxs_on_right] = -1
-            if self.is_hybrid_model:
-                self.mamba_metadata.request_to_mamba_state_idx[active_idxs_on_right] = -1

         # 5. We identify requests that require a new block and add them to the paused requests (i.e move them left) :-
         # a) Put requests that have filled their current block and require a new one in a pause state temporarily
@@ -1605,7 +1450,6 @@ class DynamicInferenceContext(BaseInferenceContext):

         # 7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
         self.total_request_count = active_request_count + self.paused_request_count
-
         # All these active requests are in decode phase, so they need only 1 token per request
         self.active_token_count = active_request_count
         # Always the first section of token input ids are only used.
@@ -119,8 +119,8 @@ def triton_append_key_value_cache(

     _, num_heads, h_dim = key.shape

-    key_cache = memory_buffer[0, layer_number]
-    value_cache = memory_buffer[1, layer_number]
+    key_cache = memory_buffer[0, layer_number - 1]
+    value_cache = memory_buffer[1, layer_number - 1]

     key_to_cache = key[:n_tokens]
     value_to_cache = value[:n_tokens]
@@ -702,7 +702,7 @@ class DynamicInferenceEngine(AbstractEngine):

             # is_continuing_chunked_prefill is True if we are scheduling next
             # chunk of a existing chunked prefill request
-            is_continuing_chunked_prefill = self.context.chunked_prefill_request_id
+            is_continuing_chunked_prefill = self.context.chunked_prefill_request_id > 0

             # Use remaining prompt tokens for scheduling decisions
             remaining_len = len(req.remaining_prompt_tokens)
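The comparison added above matters because chunked_prefill_request_id is initialized to -1 in the context (see the earlier hunks), and -1 is truthy in Python; if the raw id were ever used as a boolean, the flag would read as True even with no chunked prefill in flight. A quick illustration:

    chunked_prefill_request_id = -1  # sentinel: no chunked prefill in flight

    print(bool(chunked_prefill_request_id))  # True  (why a bare truthiness check misleads)
    print(chunked_prefill_request_id > 0)    # False (explicit comparison used above)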
@@ -939,7 +939,7 @@ class DynamicInferenceEngine(AbstractEngine):
             result = self.step_modern()
             finished_requests_list.extend(result["finished_requests"])

-        # Ensure requests are returned in the same order they were passed in
+        # Ensure requests are returned in the same order they were passed in.
         finished_requests_list.sort(key=lambda x: x.request_id)

         return finished_requests_list
@@ -588,6 +588,8 @@ class GPTModel(LanguageModule):
                 # Perform the sequence parallel gather here instead of after the output layer
                 # because we need to slice the last token logits from the full view of the
                 # packed logits across all requests.
+                # TODO(ksanthanam): Make the equivalent change in the `MambaModel` code after
+                # merging in !3722.
                 hidden_states = gather_from_sequence_parallel_region(
                     hidden_states, group=self.pg_collection.tp
                 )
@@ -595,7 +597,7 @@ class GPTModel(LanguageModule):
                 sequence_parallel_override = True

             # Reshape [B, 1, H] to [1, B, H] → extract each sample’s true last‐token hidden
-            # state ([B, H]) → unsqueeze back to [
+            # state ([B, H]) → unsqueeze back to [1, B, H]
             # (so that the output layer, which expects S×B×H, receives only the final token)
             hidden_states = inference_context.last_token_logits(
                 hidden_states.squeeze(1).unsqueeze(0)