megatron-core 0.15.0rc7__tar.gz → 0.16.0rc0.dev104455__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of megatron-core has been flagged as possibly problematic.
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/MANIFEST.in +1 -0
- {megatron_core-0.15.0rc7/megatron_core.egg-info → megatron_core-0.16.0rc0.dev104455}/PKG-INFO +67 -49
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/README.md +64 -45
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/config_logger.py +13 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/bert_dataset.py +8 -8
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/blended_megatron_dataset_config.py +11 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/gpt_dataset.py +1 -9
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/megatron_dataset.py +47 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/t5_dataset.py +11 -4
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/mapping.py +0 -9
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/torch.py +11 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/zarr.py +11 -4
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/distributed_data_parallel.py +1 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/distributed_data_parallel_config.py +5 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +126 -7
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +24 -5
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +12 -4
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +2 -2
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +96 -46
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +3 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +98 -32
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/param_and_grad_buffer.py +24 -3
- megatron_core-0.16.0rc0.dev104455/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +92 -0 (new module; an illustrative sketch of the pattern follows this file list)
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/extensions/transformer_engine.py +16 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/communication_utils.py +75 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/contexts/__init__.py +2 -2
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/contexts/dynamic_block_allocator.py +92 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/contexts/dynamic_context.py +277 -185
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/contexts/fused_kv_append_kernel.py +14 -14
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/data_parallel_inference_coordinator.py +31 -162
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/engines/dynamic_engine.py +171 -168
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/engines/static_engine.py +154 -24
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/inference_client.py +4 -3
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/inference_request.py +119 -4
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/sampling_params.py +3 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +281 -70
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/__init__.py +3 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/endpoints/common.py +14 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/endpoints/completions.py +212 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/run_mcore_engine.py +111 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/text_generation_server.py +211 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/text_generation_server/tokenization.py +110 -0
- megatron_core-0.16.0rc0.dev104455/megatron/core/inference/utils.py +135 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/model_parallel_config.py +2 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/rope_utils.py +82 -18
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +12 -12
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/fine_grained_callables.py +2 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/gpt_model.py +4 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/nccl_allocator.py +70 -34
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/__init__.py +29 -5
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/distrib_optimizer.py +2 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/package_info.py +2 -2
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/parallel_state.py +62 -11
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/schedules.py +6 -6
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/process_groups_config.py +84 -19
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mamba_block.py +4 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mamba_layer.py +1 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mamba_mixer.py +185 -140
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/attention.py +2 -2
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/cuda_graphs.py +38 -18
- megatron_core-0.16.0rc0.dev104455/megatron/core/transformer/fsdp_dtensor_checkpoint.py +455 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/module.py +18 -24
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/moe_layer.py +21 -15
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/moe_utils.py +29 -5
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/transformer_block.py +50 -2
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/transformer_config.py +64 -10
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/transformer_layer.py +9 -22
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/utils.py +10 -9
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/utils.py +7 -6
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455/megatron_core.egg-info}/PKG-INFO +67 -49
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron_core.egg-info/SOURCES.txt +8 -3
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron_core.egg-info/requires.txt +2 -1
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/pyproject.toml +10 -6
- megatron_core-0.15.0rc7/LICENSE +0 -273
- megatron_core-0.15.0rc7/megatron/core/datasets/utils_object_storage.py +0 -277
- megatron_core-0.15.0rc7/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -92
- megatron_core-0.15.0rc7/megatron/core/inference/utils.py +0 -41
- megatron_core-0.15.0rc7/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -195
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/README.md +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/activations.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/enums.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference/unified_memory.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/jit.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/timers.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/setup.cfg +0 -0
- {megatron_core-0.15.0rc7 → megatron_core-0.16.0rc0.dev104455}/setup.py +0 -0
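The most notable addition in the listing above is the new `megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py` module. Its name points at a standard mixed-precision trick: upcast low-precision (e.g. bf16) gradient partials to fp32 before the reduce-scatter so the cross-rank sum accumulates at full precision. The sketch below illustrates that general pattern in plain PyTorch; the function name, shapes, and integration point are assumptions for illustration, not Megatron Core's actual API.

```python
# Illustrative sketch only (hypothetical names): reduce-scatter a bf16
# gradient buffer while accumulating the cross-rank sum in fp32.
import torch
import torch.distributed as dist

def reduce_scatter_with_fp32_accum(grad_bf16: torch.Tensor, group=None) -> torch.Tensor:
    """Return this rank's fp32 shard of the summed gradient buffer.

    Upcasting before the reduction avoids the precision loss of adding
    many bf16 partial gradients directly; only the local shard stays fp32.
    """
    world_size = dist.get_world_size(group)
    flat = grad_bf16.flatten()
    assert flat.numel() % world_size == 0, "buffer must divide evenly across ranks"
    flat_fp32 = flat.float()  # upcast so the sum accumulates in fp32
    shard = torch.empty(flat.numel() // world_size,
                        dtype=torch.float32, device=flat.device)
    dist.reduce_scatter_tensor(shard, flat_fp32, op=dist.ReduceOp.SUM, group=group)
    return shard
```

The real implementation presumably hooks into the distributed-optimizer / `param_and_grad_buffer.py` path that this release also modifies, but the diff listing alone does not show the integration.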
{megatron_core-0.15.0rc7/megatron_core.egg-info → megatron_core-0.16.0rc0.dev104455}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: megatron-core
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.16.0rc0.dev104455
|
|
4
4
|
Summary: Megatron Core - a library for efficient and scalable training of transformer based models
|
|
5
5
|
Author-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
6
6
|
Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
@@ -29,7 +29,6 @@ Classifier: Topic :: Software Development :: Libraries
|
|
|
29
29
|
Classifier: Topic :: Utilities
|
|
30
30
|
Requires-Python: >=3.10
|
|
31
31
|
Description-Content-Type: text/markdown
|
|
32
|
-
License-File: LICENSE
|
|
33
32
|
Requires-Dist: torch
|
|
34
33
|
Requires-Dist: numpy<2.0.0
|
|
35
34
|
Requires-Dist: packaging>=24.2
|
|
@@ -50,10 +49,11 @@ Requires-Dist: setuptools<80.0.0; extra == "dev"
|
|
|
50
49
|
Requires-Dist: mamba-ssm~=2.2; extra == "dev"
|
|
51
50
|
Requires-Dist: causal-conv1d~=1.5; extra == "dev"
|
|
52
51
|
Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
|
|
53
|
-
Requires-Dist: transformer-engine[pytorch]<2.
|
|
52
|
+
Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.7.0a0; extra == "dev"
|
|
54
53
|
Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
|
|
55
54
|
Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
|
|
56
55
|
Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
|
|
56
|
+
Requires-Dist: av<16.0.0; extra == "dev"
|
|
57
57
|
Requires-Dist: flashinfer-python; extra == "dev"
|
|
58
58
|
Requires-Dist: wget; extra == "dev"
|
|
59
59
|
Requires-Dist: onnxscript; extra == "dev"
|
|
@@ -66,12 +66,12 @@ Requires-Dist: transformers; extra == "lts"
|
|
|
66
66
|
Requires-Dist: zarr; extra == "lts"
|
|
67
67
|
Requires-Dist: setuptools<80.0.0; extra == "lts"
|
|
68
68
|
Requires-Dist: wget; extra == "lts"
|
|
69
|
-
Dynamic: license-file
|
|
70
69
|
|
|
71
70
|
<div align="center">
|
|
72
71
|
|
|
73
72
|
Megatron-LM & Megatron Core
|
|
74
73
|
===========================
|
|
74
|
+
|
|
75
75
|
<h4>GPU-optimized library for training transformer models at scale</h4>
|
|
76
76
|
|
|
77
77
|
[](https://docs.nvidia.com/Megatron-Core/developer-guide/latest/index.html)
|
|
@@ -84,28 +84,29 @@ Megatron-LM & Megatron Core
|
|
|
84
84
|
|
|
85
85
|
```bash
|
|
86
86
|
# 1. Install Megatron Core with required dependencies
|
|
87
|
-
pip install megatron-core
|
|
88
|
-
pip install --no-build-isolation transformer-engine[pytorch]
|
|
87
|
+
pip install --no-build-isolation megatron-core[mlm,dev]
|
|
89
88
|
|
|
90
89
|
# 2. Clone repository for examples
|
|
91
90
|
git clone https://github.com/NVIDIA/Megatron-LM.git
|
|
92
91
|
cd Megatron-LM
|
|
92
|
+
pip install --no-build-isolation .[mlm,dev]
|
|
93
93
|
```
|
|
94
94
|
|
|
95
95
|
**→ [Complete Installation Guide](#installation)** - Docker, pip variants (dev,lts,etc.), source installation, and system requirements
|
|
96
96
|
|
|
97
97
|
# Latest News
|
|
98
98
|
|
|
99
|
-
-
|
|
100
|
-
-
|
|
101
|
-
-
|
|
99
|
+
- 📣 NEW! **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features.
|
|
100
|
+
- 🔄 **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
|
|
101
|
+
- **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
|
|
102
|
+
- **[2025/08]** **[GPT-OSS Model](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
|
|
102
103
|
- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
|
|
103
|
-
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
104
|
+
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
104
105
|
|
|
105
106
|
<details>
|
|
106
107
|
<summary>Previous News</summary>
|
|
107
108
|
|
|
108
|
-
- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
|
|
109
|
+
- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
|
|
109
110
|
- **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
|
|
110
111
|
- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#Megatron Core) for more details.
|
|
111
112
|
|
|
@@ -115,25 +116,28 @@ cd Megatron-LM
|
|
|
115
116
|
<summary>Table of Contents</summary>
|
|
116
117
|
|
|
117
118
|
**Getting Started**
|
|
119
|
+
|
|
118
120
|
- [Quick Start](#-quick-start)
|
|
119
121
|
- [Latest News](#latest-news)
|
|
120
122
|
- [Megatron Overview](#megatron-overview)
|
|
121
123
|
- [Project Structure](#project-structure)
|
|
122
124
|
- [Megatron-LM: Reference Implementation](#megatron-lm-reference-implementation)
|
|
123
125
|
- [Megatron Core: Production Library](#megatron-core-production-library)
|
|
124
|
-
- [Installation](#installation)
|
|
126
|
+
- [Installation](#installation)
|
|
125
127
|
- [Docker (Recommended)](#-docker-recommended)
|
|
126
128
|
- [Pip Installation](#-pip-installation)
|
|
127
129
|
- [Source Installation](#-source-installation)
|
|
128
130
|
- [System Requirements](#system-requirements)
|
|
129
131
|
|
|
130
132
|
**Core Features**
|
|
133
|
+
|
|
131
134
|
- [Performance Benchmarking](#performance-benchmarking)
|
|
132
135
|
- [Weak Scaling Results](#weak-scaling-results)
|
|
133
136
|
- [Strong Scaling Results](#strong-scaling-results)
|
|
134
137
|
- [Ecosystem Libraries](#ecosystem-libraries)
|
|
135
138
|
|
|
136
139
|
**Training**
|
|
140
|
+
|
|
137
141
|
- [Training](#training)
|
|
138
142
|
- [Getting Started](#getting-started)
|
|
139
143
|
- [Data Preparation](#data-preparation)
|
|
@@ -147,6 +151,7 @@ cd Megatron-LM
|
|
|
147
151
|
- [Performance Optimizations](#performance-optimizations)
|
|
148
152
|
|
|
149
153
|
**Resources**
|
|
154
|
+
|
|
150
155
|
- [Examples](./examples/) - Training scripts and tutorials
|
|
151
156
|
- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
|
|
152
157
|
- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
|
|
@@ -160,6 +165,7 @@ cd Megatron-LM
|
|
|
160
165
|
# Megatron Overview
|
|
161
166
|
|
|
162
167
|
## Project Structure
|
|
168
|
+
|
|
163
169
|
```
|
|
164
170
|
Megatron-LM/
|
|
165
171
|
├── megatron/
|
|
@@ -184,28 +190,34 @@ Megatron-LM/
|
|
|
184
190
|
```
|
|
185
191
|
|
|
186
192
|
### Megatron-LM: Reference Implementation
|
|
193
|
+
|
|
187
194
|
**Reference implementation** that includes Megatron Core plus everything needed to train models.
|
|
188
195
|
|
|
189
196
|
**Best for:**
|
|
197
|
+
|
|
190
198
|
- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
|
|
191
199
|
- **Research teams** exploring new architectures and training techniques
|
|
192
200
|
- **Learning distributed training** concepts and best practices
|
|
193
201
|
- **Quick experimentation** with proven model configurations
|
|
194
202
|
|
|
195
203
|
**What you get:**
|
|
204
|
+
|
|
196
205
|
- Pre-configured training scripts for GPT, LLama, DeepSeek, Qwen, and more.
|
|
197
206
|
- End-to-end examples from data prep to evaluation
|
|
198
207
|
- Research-focused tools and utilities
|
|
199
208
|
|
|
200
209
|
### Megatron Core: Composable Library
|
|
210
|
+
|
|
201
211
|
**Composable library** with GPU-optimized building blocks for custom training frameworks.
|
|
202
212
|
|
|
203
213
|
**Best for:**
|
|
214
|
+
|
|
204
215
|
- **Framework developers** building on top of modular and optimized components
|
|
205
216
|
- **Research teams** needing custom training loops, optimizers, or data pipelines
|
|
206
217
|
- **ML engineers** requiring fault-tolerant training pipelines
|
|
207
218
|
|
|
208
219
|
**What you get:**
|
|
220
|
+
|
|
209
221
|
- Composable transformer building blocks (attention, MLP, etc.)
|
|
210
222
|
- Advanced parallelism strategies (TP, PP, DP, EP, CP)
|
|
211
223
|
- Pipeline schedules and distributed optimizers
|
|
@@ -237,6 +249,8 @@ Megatron-LM/
|
|
|
237
249
|
|
|
238
250
|
We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
|
|
239
251
|
|
|
252
|
+
**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
|
|
253
|
+
|
|
240
254
|
This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
|
|
241
255
|
|
|
242
256
|
- PyTorch (latest stable version)
|
|
@@ -250,6 +264,7 @@ docker run --runtime --nvidia --gpus all -it --rm \
|
|
|
250
264
|
-v /path/to/megatron:/workspace/megatron \
|
|
251
265
|
-v /path/to/dataset:/workspace/dataset \
|
|
252
266
|
-v /path/to/checkpoints:/workspace/checkpoints \
|
|
267
|
+
-e PIP_CONSTRAINT= \
|
|
253
268
|
nvcr.io/nvidia/pytorch:25.04-py3
|
|
254
269
|
```
|
|
255
270
|
|
|
@@ -263,13 +278,21 @@ Megatron Core offers support for two NGC PyTorch containers:
|
|
|
263
278
|
Both containers can be combined with `mlm` which adds package dependencies for Megatron-LM on top of Megatron Core.
|
|
264
279
|
|
|
265
280
|
```bash
|
|
266
|
-
# Install the latest release
|
|
267
|
-
pip install
|
|
281
|
+
# Install the latest release dependencies
|
|
282
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
283
|
+
pip install --no-build-isolation megatron-core[dev]
|
|
284
|
+
# For running an M-LM application:
|
|
285
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
286
|
+
pip install --no-build-isolation megatron-core[mlm,dev]
|
|
268
287
|
```
|
|
269
288
|
|
|
270
289
|
```bash
|
|
271
290
|
# Install packages for LTS support NGC PyTorch 24.01
|
|
272
|
-
pip install
|
|
291
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
292
|
+
pip install --no-build-isolation megatron-core[lts]
|
|
293
|
+
# For running an M-LM application:
|
|
294
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
295
|
+
pip install --no-build-isolation megatron-core[mlm,lts]
|
|
273
296
|
```
|
|
274
297
|
|
|
275
298
|
For a version of Megatron Core with only torch, run:
|
|
@@ -278,47 +301,15 @@ For a version of Megatron Core with only torch, run:
 pip install megatron-core
 ```
 
-For dependencies required by Megatron-LM, please run:
-
-```bash
-pip install megatron-core[mlm]
-```
-
-## Source Installation
-
-For development or latest features:
-
-For Hybrid models, Megatron Core requires [mamba](https://github.com/state-spaces/mamba). If the pre-built wheel on PyPI does not fit your environment, you can fall back to the install script Megatron Core uses in its CI system. For this, please install `uv` first:
-
-```bash
-export UV_VERSION=0.7.2
-export PATH="$HOME/.local/bin:$PATH"
-curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
-export UV_PROJECT_ENVIRONMENT=./venv
-export PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
-export UV_LINK_MODE=copy
-```
-
-Run the following command to build upstream dependencies from source:
-
-```bash
-# Clone and install
-git clone https://github.com/NVIDIA/Megatron-LM.git
-cd Megatron-LM
-
-# Optional: checkout specific release
-git checkout core_r0.13.0
-
-bash docker/common/install.sh --environment {dev,lts}
-```
-
 ## System Requirements
 
 ### Hardware Requirements
+
 - **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
 - **Recommended**: NVIDIA Turing architecture or later
 
 ### Software Requirements
+
 - **CUDA/cuDNN/NCCL**: Latest stable versions
 - **PyTorch**: Latest stable version
 - **Transformer Engine**: Latest stable version
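Whichever variant you install, a quick import confirms the wheel is usable. A minimal sketch (the `__version__` attribute is an assumption based on the package's `package_info.py`, not something this README documents):

```bash
# Sanity check after any of the pip installs above
python -c "import megatron.core; print(megatron.core.__version__)"
```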
@@ -333,12 +324,14 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs
 
 
 **Benchmark Configuration:**
+
 - **Vocabulary size**: 131,072 tokens
 - **Sequence length**: 4096 tokens
 - **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts
 - **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)
 
 **Key Results:**
+
 - **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training
 - **Superlinear scaling**: MFU increases from 41% to 47-48% with model size
 - **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging)
@@ -346,11 +339,13 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs
 - *Note: Performance results measured without training to convergence*
 
 ## Weak Scaling Results
+
 Our weak-scaled results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute.
 
 
 
 ## Strong Scaling Results
+
 We also strong-scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to a larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%.
 
 
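MFU here is model FLOPs utilization: achieved model FLOPs per second per GPU divided by the hardware peak. A back-of-envelope sketch with illustrative numbers (989 TFLOP/s is the approximate dense BF16 peak of an H100; the achieved figure is hypothetical):

```bash
# Hypothetical per-GPU throughput vs. approximate H100 BF16 dense peak
achieved_tflops=415
peak_tflops=989
echo "MFU = $(echo "scale=3; $achieved_tflops / $peak_tflops * 100" | bc)%"
# -> MFU = 41.900%, roughly the 42% reported at 4608 GPUs
```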
@@ -360,12 +355,14 @@ We also strong-scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to a larger vocabulary size)
 ## Getting Started
 
 ### Simple Training Example
+
 ```bash
 # Distributed training example (2 GPUs, mock data)
 torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
 ```
 
 ### Llama-3 Training Example
+
 ```bash
 # 8 GPUs, FP8 precision, mock data
 ./examples/llama/train_llama3_8b_fp8.sh
@@ -374,12 +371,14 @@ torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
 ## Data Preparation
 
 ### JSONL Data Format
+
 ```json
 {"text": "Your training text here..."}
 {"text": "Another training sample..."}
 ```
 
 ### Basic Preprocessing
+
 ```bash
 python tools/preprocess_data.py \
   --input data.jsonl \
@@ -391,6 +390,7 @@ python tools/preprocess_data.py \
 ```
 
 ### Key Arguments
+
 - `--input`: Path to input JSON/JSONL file
 - `--output-prefix`: Prefix for output binary files (.bin and .idx)
 - `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.)
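Putting the key arguments together, a fuller invocation might look like the sketch below. The output prefix and tokenizer model are illustrative, and some tokenizer types additionally require vocab/merge files; `--tokenizer-model` and `--workers` are assumptions based on the tool's argument set rather than flags this README documents:

```bash
python tools/preprocess_data.py \
  --input data.jsonl \
  --output-prefix my_dataset \
  --tokenizer-type HuggingFaceTokenizer \
  --tokenizer-model gpt2 \
  --workers 4   # parallel preprocessing workers
```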
@@ -405,6 +405,7 @@ python tools/preprocess_data.py \
 ## Data Parallelism (DP)
 
 ### Standard Data Parallel
+
 ```bash
 # Standard DDP - replicate model on each GPU
 torchrun --nproc_per_node=8 pretrain_gpt.py \
@@ -412,6 +413,7 @@ torchrun --nproc_per_node=8 pretrain_gpt.py \
 ```
 
 ### Fully Sharded Data Parallel (FSDP)
+
 ```bash
 # Megatron's optimized FSDP (~15% faster than PyTorch FSDP2)
 --use-custom-fsdp
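The FSDP snippet above shows only the distinguishing flag. A minimal launch sketch might pair it with the distributed optimizer, since FSDP shards optimizer state across data-parallel ranks (the pairing is an assumption, not a requirement stated in this README):

```bash
# Same launch as the DDP example, with sharding enabled
torchrun --nproc_per_node=8 pretrain_gpt.py \
  --use-custom-fsdp \
  --use-distributed-optimizer
```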
@@ -426,21 +428,27 @@ torchrun --nproc_per_node=8 pretrain_gpt.py \
 ```
 
 ## Tensor Parallelism (TP)
+
 Split individual model layers across GPUs:
+
 ```bash
 --tensor-model-parallel-size 4   # 4-way tensor parallelism
 --sequence-parallel              # Enable sequence parallelism (recommended with TP)
 ```
 
 ## Pipeline Parallelism (PP)
+
 Split model depth across GPUs:
+
 ```bash
 --pipeline-model-parallel-size 8           # 8 pipeline stages
 --virtual-pipeline-model-parallel-size 4   # Virtual pipeline for better load balancing
 ```
 
 ## Context Parallelism (CP)
+
 Split long sequences across GPUs for handling long contexts:
+
 ```bash
 --context-parallel-size 2   # 2-way context parallelism
 --cp-comm-type p2p          # Communication: p2p, a2a, allgather, a2a+p2p
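These degrees compose multiplicatively: total GPUs = TP × PP × CP × DP, with the data-parallel size inferred from whatever remains. A hypothetical 16-GPU layout as a sketch:

```bash
# 16 GPUs total: TP=4 x PP=2 (CP defaults to 1) -> DP = 16 / (4 * 2) = 2
torchrun --nnodes=2 --nproc_per_node=8 pretrain_gpt.py \
  --tensor-model-parallel-size 4 \
  --sequence-parallel \
  --pipeline-model-parallel-size 2
```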
@@ -448,7 +456,9 @@ Split long sequences across GPUs for handling long contexts:
 ```
 
 ## Expert Parallelism (EP)
+
 For Mixture of Experts (MoE) models:
+
 ```bash
 --expert-model-parallel-size 4   # 4-way expert parallelism
 --num-experts 8                  # 8 experts per MoE layer
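With the settings above, the 8 experts are distributed over the 4 expert-parallel ranks, i.e. 2 experts per rank; the natural constraint that `--num-experts` be divisible by `--expert-model-parallel-size` is stated here as an assumption. A launch sketch:

```bash
# Hypothetical 8-GPU MoE run: 8 experts over EP=4 -> 2 experts per rank
torchrun --nproc_per_node=8 pretrain_gpt.py \
  --num-experts 8 \
  --expert-model-parallel-size 4
```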
@@ -488,9 +498,11 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 **→ [NVIDIA NeMo Framework Performance Tuning Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#performance-tuning-guide)** - Comprehensive performance optimization guide covering advanced tuning techniques, communication overlaps, memory optimizations, and profiling options.
 
 ### FlashAttention
+
 [FlashAttention](https://github.com/Dao-AILab/flash-attention) is a fast and memory-efficient attention algorithm. We recommend the default usage, which uses cuDNN for attention via Transformer Engine and provides up to 50% speedups on forward and 84% on backward propagation with FP8 kernels. The `flash-attn` package is also supported via `--use-flash-attn`.
 
 ### Mixed Precision Training
+
 ```bash
 --fp16   # Standard FP16
 --bf16   # BFloat16 (recommended for large models)
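As a sketch, a common pairing for large models combines BF16 with the flash-attention path mentioned above; both flags come from this README, though whether the combination suits a given model is workload-dependent:

```bash
torchrun --nproc_per_node=8 pretrain_gpt.py \
  --bf16 \
  --use-flash-attn
```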
@@ -498,6 +510,7 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 ```
 
 ### Activation Checkpointing and Recomputation
+
 ```bash
 # For limited memory
 --recompute-activations
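If selective activation recomputation is not enough, Megatron-LM also exposes full-granularity recomputation; the flag names in this sketch are assumptions based on its argument set rather than flags documented in this README:

```bash
# More aggressive memory savings at the cost of extra recompute (flag names assumed)
--recompute-granularity full \
--recompute-method uniform \
--recompute-num-layers 1
```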
@@ -515,6 +528,7 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 ```
 
 ### Distributed Optimizer
+
 ```bash
 --use-distributed-optimizer
 ```
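The distributed optimizer shards optimizer state across data-parallel ranks; a sketch of a common combination pairs it with the communication-overlap flags already cited in the benchmark configuration above:

```bash
--use-distributed-optimizer \
--overlap-grad-reduce \
--overlap-param-gather
```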
@@ -531,11 +545,14 @@ Stay up-to-date with our development roadmaps and planned features:
 # Community & Support
 
 ## Getting Help
+
 - 📖 **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation
 - 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests
 
 ## Contributing
+
 We ❤️ contributions! Ways to contribute:
+
 - 🐛 **Report bugs** - Help us improve reliability
 - 💡 **Suggest features** - Shape the future of Megatron Core
 - 📝 **Improve docs** - Make Megatron Core more accessible
@@ -544,6 +561,7 @@ We ❤️ contributions! Ways to contribute:
 **→ [Contributing Guide](./CONTRIBUTING.md)**
 
 ## Citation
+
 ```bibtex
 @article{megatron-lm,
   title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},