megatron-core 0.15.0rc6__tar.gz → 0.16.0rc0.dev100093__tar.gz
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. The information is provided for informational purposes only.
Potentially problematic release.
This version of megatron-core has been flagged as potentially problematic; see the registry's advisory page for details.
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/MANIFEST.in +1 -0
- {megatron_core-0.15.0rc6/megatron_core.egg-info → megatron_core-0.16.0rc0.dev100093}/PKG-INFO +72 -53
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/README.md +64 -45
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/config_logger.py +13 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/bert_dataset.py +8 -8
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/blended_megatron_dataset_config.py +11 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/gpt_dataset.py +1 -9
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/megatron_dataset.py +47 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/t5_dataset.py +11 -4
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/mapping.py +0 -9
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/async_utils.py +3 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +17 -9
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +13 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/torch.py +11 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/zarr.py +11 -4
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/distributed_data_parallel.py +2 -6
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/distributed_data_parallel_config.py +5 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +126 -7
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +24 -5
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +12 -4
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +2 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +96 -46
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +3 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +98 -32
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/param_and_grad_buffer.py +24 -3
- megatron_core-0.16.0rc0.dev100093/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +92 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/extensions/transformer_engine.py +16 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/async_stream.py +1 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/communication_utils.py +75 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/contexts/__init__.py +2 -2
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/contexts/dynamic_block_allocator.py +92 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/contexts/dynamic_context.py +507 -252
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/contexts/fused_kv_append_kernel.py +174 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/contexts/static_context.py +3 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/data_parallel_inference_coordinator.py +41 -115
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/engines/dynamic_engine.py +360 -208
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/engines/static_engine.py +155 -24
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/inference_client.py +4 -3
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/inference_request.py +136 -5
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/sampling_params.py +6 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/scheduler.py +12 -12
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +303 -87
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/__init__.py +3 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/endpoints/common.py +14 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/endpoints/completions.py +212 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/run_mcore_engine.py +111 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/text_generation_server.py +211 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/text_generation_server/tokenization.py +110 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/unified_memory.py +89 -0
- megatron_core-0.16.0rc0.dev100093/megatron/core/inference/utils.py +135 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/model_parallel_config.py +2 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/rope_utils.py +82 -18
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +49 -17
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/fine_grained_callables.py +2 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/gpt_model.py +94 -20
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/nccl_allocator.py +70 -34
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/__init__.py +29 -5
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/distrib_optimizer.py +2 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/package_info.py +2 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/parallel_state.py +62 -11
- megatron_core-0.16.0rc0.dev100093/megatron/core/pipeline_parallel/bridge_communicator.py +922 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/pipeline_parallel/schedules.py +6 -6
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/process_groups_config.py +84 -19
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mamba_block.py +4 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mamba_layer.py +1 -1
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mamba_mixer.py +185 -140
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/attention.py +121 -39
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/cuda_graphs.py +38 -18
- megatron_core-0.16.0rc0.dev100093/megatron/core/transformer/fsdp_dtensor_checkpoint.py +455 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/module.py +18 -24
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/moe_layer.py +21 -15
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/moe_utils.py +29 -5
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/router.py +2 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/multi_latent_attention.py +2 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/multi_token_prediction.py +13 -10
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/transformer_block.py +56 -2
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/transformer_config.py +67 -27
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/transformer_layer.py +15 -22
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/utils.py +10 -9
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/utils.py +23 -10
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093/megatron_core.egg-info}/PKG-INFO +72 -53
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron_core.egg-info/SOURCES.txt +10 -3
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron_core.egg-info/requires.txt +4 -3
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/pyproject.toml +15 -10
- megatron_core-0.15.0rc6/LICENSE +0 -273
- megatron_core-0.15.0rc6/megatron/core/datasets/utils_object_storage.py +0 -277
- megatron_core-0.15.0rc6/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -92
- megatron_core-0.15.0rc6/megatron/core/inference/utils.py +0 -34
- megatron_core-0.15.0rc6/megatron/core/pipeline_parallel/bridge_communicator.py +0 -399
- megatron_core-0.15.0rc6/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -195
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/README.md +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/activations.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/enums.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/jit.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/optimizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer/optimizer_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/timers.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/mlp.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/setup.cfg +0 -0
- {megatron_core-0.15.0rc6 → megatron_core-0.16.0rc0.dev100093}/setup.py +0 -0
{megatron_core-0.15.0rc6/megatron_core.egg-info → megatron_core-0.16.0rc0.dev100093}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: megatron-core
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.16.0rc0.dev100093
|
|
4
4
|
Summary: Megatron Core - a library for efficient and scalable training of transformer based models
|
|
5
5
|
Author-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
6
6
|
Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
@@ -17,8 +17,9 @@ Classifier: License :: OSI Approved :: BSD License
|
|
|
17
17
|
Classifier: Natural Language :: English
|
|
18
18
|
Classifier: Operating System :: OS Independent
|
|
19
19
|
Classifier: Programming Language :: Python :: 3
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
24
|
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
24
25
|
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
@@ -28,7 +29,6 @@ Classifier: Topic :: Software Development :: Libraries
|
|
|
28
29
|
Classifier: Topic :: Utilities
|
|
29
30
|
Requires-Python: >=3.10
|
|
30
31
|
Description-Content-Type: text/markdown
|
|
31
|
-
License-File: LICENSE
|
|
32
32
|
Requires-Dist: torch
|
|
33
33
|
Requires-Dist: numpy<2.0.0
|
|
34
34
|
Requires-Dist: packaging>=24.2
|
|
@@ -39,6 +39,9 @@ Requires-Dist: tiktoken; extra == "mlm"
|
|
|
39
39
|
Requires-Dist: wandb; extra == "mlm"
|
|
40
40
|
Requires-Dist: transformers; extra == "mlm"
|
|
41
41
|
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
|
|
43
|
+
Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
|
|
44
|
+
Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
|
|
42
45
|
Requires-Dist: tqdm; extra == "dev"
|
|
43
46
|
Requires-Dist: einops~=0.8; extra == "dev"
|
|
44
47
|
Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
|
|
@@ -49,10 +52,8 @@ Requires-Dist: setuptools<80.0.0; extra == "dev"
|
|
|
49
52
|
Requires-Dist: mamba-ssm~=2.2; extra == "dev"
|
|
50
53
|
Requires-Dist: causal-conv1d~=1.5; extra == "dev"
|
|
51
54
|
Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
|
|
52
|
-
Requires-Dist: transformer-engine[pytorch]<2.8.0,>=2.6.0a0; extra == "dev"
|
|
53
|
-
Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
|
|
54
|
-
Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
|
|
55
55
|
Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
|
|
56
|
+
Requires-Dist: av<16.0.0; extra == "dev"
|
|
56
57
|
Requires-Dist: flashinfer-python; extra == "dev"
|
|
57
58
|
Requires-Dist: wget; extra == "dev"
|
|
58
59
|
Requires-Dist: onnxscript; extra == "dev"
|
|
@@ -65,12 +66,12 @@ Requires-Dist: transformers; extra == "lts"
|
|
|
65
66
|
Requires-Dist: zarr; extra == "lts"
|
|
66
67
|
Requires-Dist: setuptools<80.0.0; extra == "lts"
|
|
67
68
|
Requires-Dist: wget; extra == "lts"
|
|
68
|
-
Dynamic: license-file
|
|
69
69
|
|
|
70
70
|
<div align="center">
|
|
71
71
|
|
|
72
72
|
Megatron-LM & Megatron Core
|
|
73
73
|
===========================
|
|
74
|
+
|
|
74
75
|
<h4>GPU-optimized library for training transformer models at scale</h4>
|
|
75
76
|
|
|
76
77
|
[](https://docs.nvidia.com/Megatron-Core/developer-guide/latest/index.html)
|
|
@@ -83,28 +84,29 @@ Megatron-LM & Megatron Core
|
|
|
83
84
|
|
|
84
85
|
```bash
|
|
85
86
|
# 1. Install Megatron Core with required dependencies
|
|
86
|
-
pip install megatron-core
|
|
87
|
-
pip install --no-build-isolation transformer-engine[pytorch]
|
|
87
|
+
pip install --no-build-isolation megatron-core[mlm,dev]
|
|
88
88
|
|
|
89
89
|
# 2. Clone repository for examples
|
|
90
90
|
git clone https://github.com/NVIDIA/Megatron-LM.git
|
|
91
91
|
cd Megatron-LM
|
|
92
|
+
pip install --no-build-isolation .[mlm,dev]
|
|
92
93
|
```
|
|
93
94
|
|
|
94
95
|
**→ [Complete Installation Guide](#installation)** - Docker, pip variants (dev,lts,etc.), source installation, and system requirements
|
|
95
96
|
|
|
96
97
|
# Latest News
|
|
97
98
|
|
|
98
|
-
-
|
|
99
|
-
-
|
|
100
|
-
-
|
|
99
|
+
- 📣 NEW! **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features.
|
|
100
|
+
- 🔄 **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
|
|
101
|
+
- **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
|
|
102
|
+
- **[2025/08]** **[GPT-OSS Model](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
|
|
101
103
|
- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
|
|
102
|
-
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
104
|
+
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
103
105
|
|
|
104
106
|
<details>
|
|
105
107
|
<summary>Previous News</summary>
|
|
106
108
|
|
|
107
|
-
- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
|
|
109
|
+
- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
|
|
108
110
|
- **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
|
|
109
111
|
- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#Megatron Core) for more details.
|
|
110
112
|
|
|
@@ -114,25 +116,28 @@ cd Megatron-LM
|
|
|
114
116
|
<summary>Table of Contents</summary>
|
|
115
117
|
|
|
116
118
|
**Getting Started**
|
|
119
|
+
|
|
117
120
|
- [Quick Start](#-quick-start)
|
|
118
121
|
- [Latest News](#latest-news)
|
|
119
122
|
- [Megatron Overview](#megatron-overview)
|
|
120
123
|
- [Project Structure](#project-structure)
|
|
121
124
|
- [Megatron-LM: Reference Implementation](#megatron-lm-reference-implementation)
|
|
122
125
|
- [Megatron Core: Production Library](#megatron-core-production-library)
|
|
123
|
-
- [Installation](#installation)
|
|
126
|
+
- [Installation](#installation)
|
|
124
127
|
- [Docker (Recommended)](#-docker-recommended)
|
|
125
128
|
- [Pip Installation](#-pip-installation)
|
|
126
129
|
- [Source Installation](#-source-installation)
|
|
127
130
|
- [System Requirements](#system-requirements)
|
|
128
131
|
|
|
129
132
|
**Core Features**
|
|
133
|
+
|
|
130
134
|
- [Performance Benchmarking](#performance-benchmarking)
|
|
131
135
|
- [Weak Scaling Results](#weak-scaling-results)
|
|
132
136
|
- [Strong Scaling Results](#strong-scaling-results)
|
|
133
137
|
- [Ecosystem Libraries](#ecosystem-libraries)
|
|
134
138
|
|
|
135
139
|
**Training**
|
|
140
|
+
|
|
136
141
|
- [Training](#training)
|
|
137
142
|
- [Getting Started](#getting-started)
|
|
138
143
|
- [Data Preparation](#data-preparation)
|
|
@@ -146,6 +151,7 @@ cd Megatron-LM
|
|
|
146
151
|
- [Performance Optimizations](#performance-optimizations)
|
|
147
152
|
|
|
148
153
|
**Resources**
|
|
154
|
+
|
|
149
155
|
- [Examples](./examples/) - Training scripts and tutorials
|
|
150
156
|
- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
|
|
151
157
|
- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
|
|
@@ -159,6 +165,7 @@ cd Megatron-LM
|
|
|
159
165
|
# Megatron Overview
|
|
160
166
|
|
|
161
167
|
## Project Structure
|
|
168
|
+
|
|
162
169
|
```
|
|
163
170
|
Megatron-LM/
|
|
164
171
|
├── megatron/
|
|
@@ -183,28 +190,34 @@ Megatron-LM/
|
|
|
183
190
|
```
|
|
184
191
|
|
|
185
192
|
### Megatron-LM: Reference Implementation
|
|
193
|
+
|
|
186
194
|
**Reference implementation** that includes Megatron Core plus everything needed to train models.
|
|
187
195
|
|
|
188
196
|
**Best for:**
|
|
197
|
+
|
|
189
198
|
- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
|
|
190
199
|
- **Research teams** exploring new architectures and training techniques
|
|
191
200
|
- **Learning distributed training** concepts and best practices
|
|
192
201
|
- **Quick experimentation** with proven model configurations
|
|
193
202
|
|
|
194
203
|
**What you get:**
|
|
204
|
+
|
|
195
205
|
- Pre-configured training scripts for GPT, LLama, DeepSeek, Qwen, and more.
|
|
196
206
|
- End-to-end examples from data prep to evaluation
|
|
197
207
|
- Research-focused tools and utilities
|
|
198
208
|
|
|
199
209
|
### Megatron Core: Composable Library
|
|
210
|
+
|
|
200
211
|
**Composable library** with GPU-optimized building blocks for custom training frameworks.
|
|
201
212
|
|
|
202
213
|
**Best for:**
|
|
214
|
+
|
|
203
215
|
- **Framework developers** building on top of modular and optimized components
|
|
204
216
|
- **Research teams** needing custom training loops, optimizers, or data pipelines
|
|
205
217
|
- **ML engineers** requiring fault-tolerant training pipelines
|
|
206
218
|
|
|
207
219
|
**What you get:**
|
|
220
|
+
|
|
208
221
|
- Composable transformer building blocks (attention, MLP, etc.)
|
|
209
222
|
- Advanced parallelism strategies (TP, PP, DP, EP, CP)
|
|
210
223
|
- Pipeline schedules and distributed optimizers
|
|
@@ -236,6 +249,8 @@ Megatron-LM/
|
|
|
236
249
|
|
|
237
250
|
We strongly recommend using the previous releases of [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one for optimal compatibility with Megatron Core release and testing. Our releases are always based on the previous month's NGC container, so this ensures compatibility and stability.
|
|
238
251
|
|
|
252
|
+
**Note:** The NGC PyTorch container constraints the python environment globally via `PIP_CONSTRAINT`. In the following examples we will unset the variable.
|
|
253
|
+
|
|
239
254
|
This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
|
|
240
255
|
|
|
241
256
|
- PyTorch (latest stable version)
|
|
@@ -249,6 +264,7 @@ docker run --runtime --nvidia --gpus all -it --rm \
|
|
|
249
264
|
-v /path/to/megatron:/workspace/megatron \
|
|
250
265
|
-v /path/to/dataset:/workspace/dataset \
|
|
251
266
|
-v /path/to/checkpoints:/workspace/checkpoints \
|
|
267
|
+
-e PIP_CONSTRAINT= \
|
|
252
268
|
nvcr.io/nvidia/pytorch:25.04-py3
|
|
253
269
|
```
|
|
254
270
|
|
|
@@ -262,13 +278,21 @@ Megatron Core offers support for two NGC PyTorch containers:
|
|
|
262
278
|
Both containers can be combined with `mlm` which adds package dependencies for Megatron-LM on top of Megatron Core.
|
|
263
279
|
|
|
264
280
|
```bash
|
|
265
|
-
# Install the latest release
|
|
266
|
-
pip install
|
|
281
|
+
# Install the latest release dependencies
|
|
282
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
283
|
+
pip install --no-build-isolation megatron-core[dev]
|
|
284
|
+
# For running an M-LM application:
|
|
285
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
286
|
+
pip install --no-build-isolation megatron-core[mlm,dev]
|
|
267
287
|
```
|
|
268
288
|
|
|
269
289
|
```bash
|
|
270
290
|
# Install packages for LTS support NGC PyTorch 24.01
|
|
271
|
-
pip install
|
|
291
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
292
|
+
pip install --no-build-isolation megatron-core[lts]
|
|
293
|
+
# For running an M-LM application:
|
|
294
|
+
pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
|
|
295
|
+
pip install --no-build-isolation megatron-core[mlm,lts]
|
|
272
296
|
```
|
|
273
297
|
|
|
274
298
|
For a version of Megatron Core with only torch, run:
|
|
@@ -277,47 +301,15 @@ For a version of Megatron Core with only torch, run:
|
|
|
277
301
|
pip install megatron-core
|
|
278
302
|
```
|
|
279
303
|
|
|
280
|
-
For dependencies required by Megatron-LM, please run:
|
|
281
|
-
|
|
282
|
-
```bash
|
|
283
|
-
pip install megatron-core[mlm]
|
|
284
|
-
```
|
|
285
|
-
|
|
286
|
-
## Source Installation
|
|
287
|
-
|
|
288
|
-
For development or latest features:
|
|
289
|
-
|
|
290
|
-
For Hybrid models, Megatron Core requires [mamba](https://github.com/state-spaces/mamba). If the pre-built wheel in PyPI does not fit your environment, you can fall back to an install script Megatron Core uses in its CI system. For this, please install `uv` first:
|
|
291
|
-
|
|
292
|
-
```bash
|
|
293
|
-
export UV_VERSION=0.7.2
|
|
294
|
-
export PATH="$HOME/.local/bin:$PATH"
|
|
295
|
-
curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
|
|
296
|
-
export UV_PROJECT_ENVIRONMENT=./venv
|
|
297
|
-
export PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
|
|
298
|
-
export UV_LINK_MODE=copy
|
|
299
|
-
```
|
|
300
|
-
|
|
301
|
-
Run the following command to build upstream dependencies from source:
|
|
302
|
-
|
|
303
|
-
```bash
|
|
304
|
-
# Clone and install
|
|
305
|
-
git clone https://github.com/NVIDIA/Megatron-LM.git
|
|
306
|
-
cd Megatron-LM
|
|
307
|
-
|
|
308
|
-
# Optional: checkout specific release
|
|
309
|
-
git checkout core_r0.13.0
|
|
310
|
-
|
|
311
|
-
bash docker/common/install.sh --environment {dev,lts}
|
|
312
|
-
```
|
|
313
|
-
|
|
314
304
|
## System Requirements
|
|
315
305
|
|
|
316
306
|
### Hardware Requirements
|
|
307
|
+
|
|
317
308
|
- **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
|
|
318
309
|
- **Recommended**: NVIDIA Turing architecture or later
|
|
319
310
|
|
|
320
311
|
### Software Requirements
|
|
312
|
+
|
|
321
313
|
- **CUDA/cuDNN/NCCL**: Latest stable versions
|
|
322
314
|
- **PyTorch**: Latest stable version
|
|
323
315
|
- **Transformer Engine**: Latest stable version
|
|
@@ -332,12 +324,14 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousan
|
|
|
332
324
|

|
|
333
325
|
|
|
334
326
|
**Benchmark Configuration:**
|
|
327
|
+
|
|
335
328
|
- **Vocabulary size**: 131,072 tokens
|
|
336
329
|
- **Sequence length**: 4096 tokens
|
|
337
330
|
- **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts
|
|
338
331
|
- **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)
|
|
339
332
|
|
|
340
333
|
**Key Results:**
|
|
334
|
+
|
|
341
335
|
- **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training
|
|
342
336
|
- **Superlinear scaling**: MFU increases from 41% to 47-48% with model size
|
|
343
337
|
- **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging)
|
|
@@ -345,11 +339,13 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousan
|
|
|
345
339
|
- *Note: Performance results measured without training to convergence*
|
|
346
340
|
|
|
347
341
|
## Weak Scaling Results
|
|
342
|
+
|
|
348
343
|
Our weak scaled results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute.
|
|
349
344
|
|
|
350
345
|

|
|
351
346
|
|
|
352
347
|
## Strong Scaling Results
|
|
348
|
+
|
|
353
349
|
We also strong scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%.
|
|
354
350
|
|
|
355
351
|

|
|
@@ -359,12 +355,14 @@ We also strong scaled the standard GPT-3 model (our version has slightly more th
|
|
|
359
355
|
## Getting Started
|
|
360
356
|
|
|
361
357
|
### Simple Training Example
|
|
358
|
+
|
|
362
359
|
```bash
|
|
363
360
|
# Distributed training example (2 GPUs, mock data)
|
|
364
361
|
torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
|
|
365
362
|
```
|
|
366
363
|
|
|
367
364
|
### LLama-3 Training Example
|
|
365
|
+
|
|
368
366
|
```bash
|
|
369
367
|
# 8 GPUs, FP8 precision, mock data
|
|
370
368
|
./examples/llama/train_llama3_8b_fp8.sh
|
|
@@ -373,12 +371,14 @@ torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
|
|
|
373
371
|
## Data Preparation
|
|
374
372
|
|
|
375
373
|
### JSONL Data Format
|
|
374
|
+
|
|
376
375
|
```json
|
|
377
376
|
{"text": "Your training text here..."}
|
|
378
377
|
{"text": "Another training sample..."}
|
|
379
378
|
```
|
|
380
379
|
|
|
381
380
|
### Basic Preprocessing
|
|
381
|
+
|
|
382
382
|
```bash
|
|
383
383
|
python tools/preprocess_data.py \
|
|
384
384
|
--input data.jsonl \
|
|
@@ -390,6 +390,7 @@ python tools/preprocess_data.py \
|
|
|
390
390
|
```
|
|
391
391
|
|
|
392
392
|
### Key Arguments
|
|
393
|
+
|
|
393
394
|
- `--input`: Path to input JSON/JSONL file
|
|
394
395
|
- `--output-prefix`: Prefix for output binary files (.bin and .idx)
|
|
395
396
|
- `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.)
|
|
@@ -404,6 +405,7 @@ python tools/preprocess_data.py \
|
|
|
404
405
|
## Data Parallelism (DP)
|
|
405
406
|
|
|
406
407
|
### Standard Data Parallel
|
|
408
|
+
|
|
407
409
|
```bash
|
|
408
410
|
# Standard DDP - replicate model on each GPU
|
|
409
411
|
torchrun --nproc_per_node=8 pretrain_gpt.py \
|
|
@@ -411,6 +413,7 @@ torchrun --nproc_per_node=8 pretrain_gpt.py \
|
|
|
411
413
|
```
|
|
412
414
|
|
|
413
415
|
### Fully Sharded Data Parallel (FSDP)
|
|
416
|
+
|
|
414
417
|
```bash
|
|
415
418
|
# Megatron's optimized FSDP (~15% faster than PyTorch FSDP2)
|
|
416
419
|
--use-custom-fsdp
|
|
@@ -425,21 +428,27 @@ torchrun --nproc_per_node=8 pretrain_gpt.py \
|
|
|
425
428
|
```
|
|
426
429
|
|
|
427
430
|
## Tensor Parallelism (TP)
|
|
431
|
+
|
|
428
432
|
Split individual model layers across GPUs:
|
|
433
|
+
|
|
429
434
|
```bash
|
|
430
435
|
--tensor-model-parallel-size 4 # 4-way tensor parallelism
|
|
431
436
|
--sequence-parallel # Enable sequence parallelism (recommended with TP)
|
|
432
437
|
```
|
|
433
438
|
|
|
434
439
|
## Pipeline Parallelism (PP)
|
|
440
|
+
|
|
435
441
|
Split model depth across GPUs:
|
|
442
|
+
|
|
436
443
|
```bash
|
|
437
444
|
--pipeline-model-parallel-size 8 # 8 pipeline stages
|
|
438
445
|
--virtual-pipeline-model-parallel-size 4 # Virtual pipeline for better load balancing
|
|
439
446
|
```
|
|
440
447
|
|
|
441
448
|
## Context Parallelism (CP)
|
|
449
|
+
|
|
442
450
|
Split long sequences across GPUs for handling long contexts:
|
|
451
|
+
|
|
443
452
|
```bash
|
|
444
453
|
--context-parallel-size 2 # 2-way context parallelism
|
|
445
454
|
--cp-comm-type p2p # Communication: p2p, a2a, allgather, a2a+p2p
|
|
@@ -447,7 +456,9 @@ Split long sequences across GPUs for handling long contexts:
|
|
|
447
456
|
```
|
|
448
457
|
|
|
449
458
|
## Expert Parallelism (EP)
|
|
459
|
+
|
|
450
460
|
For Mixture of Experts (MoE) models:
|
|
461
|
+
|
|
451
462
|
```bash
|
|
452
463
|
--expert-model-parallel-size 4 # 4-way expert parallelism
|
|
453
464
|
--num-experts 8 # 8 experts per MoE layer
|
|
@@ -487,9 +498,11 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
|
|
|
487
498
|
**→ [NVIDIA NeMo Framework Performance Tuning Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#performance-tuning-guide)** - Comprehensive performance optimization guide covering advanced tuning techniques, communication overlaps, memory optimizations, and profiling options.
|
|
488
499
|
|
|
489
500
|
### FlashAttention
|
|
501
|
+
|
|
490
502
|
[FlashAttention](https://github.com/Dao-AILab/flash-attention) is a fast and memory-efficient attention algorithm. We recommend the default usage, which uses cuDNN for attention via Transformer Engine and provides up to 50% speedups on forward and 84% on backward propagation with FP8 kernels. The `flash-attn` package is also supported via `--use-flash-attn`.
|
|
491
503
|
|
|
492
504
|
### Mixed Precision Training
|
|
505
|
+
|
|
493
506
|
```bash
|
|
494
507
|
--fp16 # Standard FP16
|
|
495
508
|
--bf16 # BFloat16 (recommended for large models)
|
|
@@ -497,6 +510,7 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
|
|
|
497
510
|
```
|
|
498
511
|
|
|
499
512
|
### Activation Checkpointing and Recomputation
|
|
513
|
+
|
|
500
514
|
```bash
|
|
501
515
|
# For limited memory
|
|
502
516
|
--recompute-activations
|
|
@@ -514,6 +528,7 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
|
|
|
514
528
|
```
|
|
515
529
|
|
|
516
530
|
### Distributed Optimizer
|
|
531
|
+
|
|
517
532
|
```bash
|
|
518
533
|
--use-distributed-optimizer
|
|
519
534
|
```
|
|
@@ -530,11 +545,14 @@ Stay up-to-date with our development roadmaps and planned features:
|
|
|
530
545
|
# Community & Support
|
|
531
546
|
|
|
532
547
|
## Getting Help
|
|
548
|
+
|
|
533
549
|
- 📖 **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation
|
|
534
550
|
- 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests
|
|
535
551
|
|
|
536
552
|
## Contributing
|
|
553
|
+
|
|
537
554
|
We ❤️ contributions! Ways to contribute:
|
|
555
|
+
|
|
538
556
|
- 🐛 **Report bugs** - Help us improve reliability
|
|
539
557
|
- 💡 **Suggest features** - Shape the future of Megatron Core
|
|
540
558
|
- 📝 **Improve docs** - Make Megatron Core more accessible
|
|
@@ -543,6 +561,7 @@ We ❤️ contributions! Ways to contribute:
|
|
|
543
561
|
**→ [Contributing Guide](./CONTRIBUTING.md)**
|
|
544
562
|
|
|
545
563
|
## Citation
|
|
564
|
+
|
|
546
565
|
```bibtex
|
|
547
566
|
@article{megatron-lm,
|
|
548
567
|
title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
|