megatron-core 0.14.0rc5__tar.gz → 0.16.0rc0.dev115842__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.

Files changed (379)
  1. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/MANIFEST.in +1 -0
  2. megatron_core-0.16.0rc0.dev115842/PKG-INFO +572 -0
  3. megatron_core-0.16.0rc0.dev115842/README.md +503 -0
  4. megatron_core-0.16.0rc0.dev115842/megatron/core/README.md +51 -0
  5. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/__init__.py +17 -0
  6. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/config_logger.py +13 -1
  7. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/bert_dataset.py +8 -8
  8. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/blended_megatron_dataset_builder.py +19 -11
  9. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/blended_megatron_dataset_config.py +20 -3
  10. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/gpt_dataset.py +5 -17
  11. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/helpers.cpp +3 -1
  12. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/indexed_dataset.py +10 -7
  13. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/megatron_dataset.py +47 -0
  14. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/megatron_tokenizer.py +1 -1
  15. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/config/tokenizers.py +3 -3
  16. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/t5_dataset.py +11 -4
  17. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/dict_utils.py +13 -5
  18. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/mapping.py +22 -11
  19. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/optimizer.py +6 -0
  20. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/async_utils.py +55 -14
  21. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/base.py +1 -5
  22. megatron_core-0.16.0rc0.dev115842/megatron/core/dist_checkpointing/strategies/checkpointable.py +196 -0
  23. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/common.py +6 -6
  24. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +17 -9
  25. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +13 -2
  26. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/torch.py +53 -16
  27. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/zarr.py +17 -5
  28. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/validation.py +13 -3
  29. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/__init__.py +1 -0
  30. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/distributed_data_parallel.py +62 -102
  31. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/distributed_data_parallel_config.py +155 -0
  32. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/finalize_model_grads.py +488 -0
  33. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/__init__.py +3 -0
  34. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +431 -0
  35. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/__init__.py +13 -0
  36. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +51 -0
  37. {megatron_core-0.14.0rc5/megatron/core/distributed → megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp}/distributed_data_parallel_config.py +26 -7
  38. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +540 -0
  39. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +1222 -0
  40. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +27 -0
  41. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +3809 -0
  42. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +460 -0
  43. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +992 -0
  44. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/param_and_grad_buffer.py +71 -12
  45. megatron_core-0.16.0rc0.dev115842/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +92 -0
  46. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +8 -0
  47. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/enums.py +6 -0
  48. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +47 -24
  49. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/extensions/kitchen.py +4 -0
  50. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/extensions/transformer_engine.py +495 -111
  51. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/extensions/transformer_engine_spec_provider.py +5 -0
  52. megatron_core-0.16.0rc0.dev115842/megatron/core/fp4_utils.py +136 -0
  53. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fp8_utils.py +83 -64
  54. megatron_core-0.16.0rc0.dev115842/megatron/core/full_cuda_graph.py +198 -0
  55. megatron_core-0.16.0rc0.dev115842/megatron/core/fusions/fused_bias_geglu.py +442 -0
  56. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_softmax.py +149 -10
  57. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/async_stream.py +1 -1
  58. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/communication_utils.py +75 -0
  59. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/contexts/__init__.py +2 -2
  60. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/contexts/attention_context/metadata_base.py +72 -0
  61. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/contexts/attention_context/mha_metadata.py +210 -0
  62. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/contexts/dynamic_block_allocator.py +92 -0
  63. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/contexts/dynamic_context.py +1563 -0
  64. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/contexts/fused_kv_append_kernel.py +174 -0
  65. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/contexts/static_context.py +3 -1
  66. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/data_parallel_inference_coordinator.py +248 -0
  67. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/engines/dynamic_engine.py +1024 -0
  68. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/engines/static_engine.py +155 -24
  69. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/headers.py +17 -0
  70. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/inference_client.py +191 -0
  71. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/inference_request.py +324 -0
  72. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +20 -10
  73. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
  74. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/sampling_params.py +17 -1
  75. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/scheduler.py +12 -12
  76. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +358 -89
  77. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/__init__.py +3 -0
  78. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/endpoints/common.py +14 -0
  79. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/endpoints/completions.py +212 -0
  80. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/run_mcore_engine.py +111 -0
  81. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/text_generation_server.py +211 -0
  82. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/text_generation_server/tokenization.py +110 -0
  83. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/unified_memory.py +89 -0
  84. megatron_core-0.16.0rc0.dev115842/megatron/core/inference/utils.py +135 -0
  85. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/model_parallel_config.py +8 -4
  86. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/T5/t5_model.py +8 -8
  87. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/backends.py +9 -0
  88. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/rope_utils.py +82 -18
  89. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +65 -31
  90. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/language_module/language_module.py +31 -13
  91. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/model_chunk_schedule_plan.py +115 -109
  92. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/fine_grained_callables.py +119 -8
  93. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/gpt_layer_specs.py +23 -9
  94. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/gpt_model.py +172 -36
  95. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +11 -3
  96. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/moe_module_specs.py +7 -1
  97. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/huggingface/clip_model.py +1 -1
  98. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/huggingface/qwen_model.py +1 -1
  99. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mamba/mamba_model.py +8 -8
  100. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/multimodal/context_parallel.py +25 -13
  101. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/multimodal/llava_model.py +17 -12
  102. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/base_attention.py +4 -4
  103. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/decoder_attention.py +5 -5
  104. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/decoder_spec.py +8 -2
  105. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/vision/clip_vit_model.py +5 -5
  106. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/vision/multimodal_projector.py +35 -30
  107. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/vision/radio.py +30 -4
  108. megatron_core-0.16.0rc0.dev115842/megatron/core/nccl_allocator.py +316 -0
  109. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/__init__.py +68 -151
  110. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/clip_grads.py +19 -4
  111. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/distrib_optimizer.py +584 -251
  112. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/optimizer.py +73 -15
  113. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/optimizer_config.py +23 -0
  114. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/package_info.py +4 -6
  115. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/parallel_state.py +128 -22
  116. megatron_core-0.16.0rc0.dev115842/megatron/core/pipeline_parallel/bridge_communicator.py +922 -0
  117. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/pipeline_parallel/combined_1f1b.py +179 -66
  118. megatron_core-0.16.0rc0.dev115842/megatron/core/pipeline_parallel/p2p_communication.py +645 -0
  119. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/pipeline_parallel/schedules.py +611 -373
  120. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/pipeline_parallel/utils.py +12 -18
  121. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +1 -0
  122. megatron_core-0.16.0rc0.dev115842/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
  123. megatron_core-0.16.0rc0.dev115842/megatron/core/process_groups_config.py +554 -0
  124. megatron_core-0.16.0rc0.dev115842/megatron/core/safe_globals.py +35 -0
  125. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mamba_block.py +12 -9
  126. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mamba_layer.py +36 -25
  127. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mamba_mixer.py +194 -149
  128. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mlp_layer.py +3 -3
  129. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/layers.py +27 -20
  130. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/random.py +5 -2
  131. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/timers.py +14 -1
  132. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/__init__.py +4 -0
  133. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/base_tokenizer.py +48 -0
  134. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/megatron_tokenizer.py +171 -0
  135. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/__init__.py +3 -0
  136. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/__init__.py +8 -0
  137. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +147 -0
  138. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +164 -0
  139. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/chat_template.py +71 -0
  140. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +335 -0
  141. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +179 -0
  142. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/null_tokenizer.py +79 -0
  143. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +411 -0
  144. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +303 -0
  145. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/__init__.py +8 -0
  146. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/bert_tokenizer.py +12 -0
  147. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/default_tokenizer.py +12 -0
  148. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/gpt_tokenizer.py +12 -0
  149. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/mamba_tokenizer.py +12 -0
  150. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/retro_tokenizer.py +12 -0
  151. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/models/t5_tokenizer.py +12 -0
  152. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/text_tokenizer.py +254 -0
  153. megatron_core-0.16.0rc0.dev115842/megatron/core/tokenizers/text/utils/build_tokenizer.py +58 -0
  154. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/attention.py +235 -77
  155. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/cuda_graphs.py +597 -76
  156. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/dot_product_attention.py +56 -17
  157. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/enums.py +1 -0
  158. megatron_core-0.16.0rc0.dev115842/megatron/core/transformer/fsdp_dtensor_checkpoint.py +455 -0
  159. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/mlp.py +49 -8
  160. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/module.py +198 -3
  161. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/experts.py +93 -79
  162. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/moe_layer.py +65 -32
  163. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/moe_utils.py +242 -157
  164. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/router.py +248 -149
  165. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/shared_experts.py +36 -5
  166. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/token_dispatcher.py +25 -20
  167. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/multi_latent_attention.py +281 -29
  168. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/multi_token_prediction.py +355 -125
  169. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/pipeline_parallel_layer_layout.py +56 -17
  170. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/transformer_block.py +182 -65
  171. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/transformer_config.py +252 -54
  172. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/transformer_layer.py +192 -212
  173. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/utils.py +152 -4
  174. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/utils.py +59 -63
  175. megatron_core-0.16.0rc0.dev115842/megatron_core.egg-info/PKG-INFO +572 -0
  176. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron_core.egg-info/SOURCES.txt +55 -6
  177. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron_core.egg-info/requires.txt +9 -6
  178. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/pyproject.toml +44 -21
  179. megatron_core-0.14.0rc5/LICENSE +0 -273
  180. megatron_core-0.14.0rc5/PKG-INFO +0 -836
  181. megatron_core-0.14.0rc5/README.md +0 -769
  182. megatron_core-0.14.0rc5/megatron/core/README.md +0 -14
  183. megatron_core-0.14.0rc5/megatron/core/datasets/utils_object_storage.py +0 -277
  184. megatron_core-0.14.0rc5/megatron/core/distributed/custom_fsdp/__init__.py +0 -3
  185. megatron_core-0.14.0rc5/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -835
  186. megatron_core-0.14.0rc5/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -2551
  187. megatron_core-0.14.0rc5/megatron/core/distributed/finalize_model_grads.py +0 -361
  188. megatron_core-0.14.0rc5/megatron/core/fusions/fused_bias_geglu.py +0 -85
  189. megatron_core-0.14.0rc5/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -92
  190. megatron_core-0.14.0rc5/megatron/core/inference/contexts/dynamic_context.py +0 -1180
  191. megatron_core-0.14.0rc5/megatron/core/inference/engines/dynamic_engine.py +0 -420
  192. megatron_core-0.14.0rc5/megatron/core/inference/inference_request.py +0 -81
  193. megatron_core-0.14.0rc5/megatron/core/inference/utils.py +0 -34
  194. megatron_core-0.14.0rc5/megatron/core/pipeline_parallel/p2p_communication.py +0 -628
  195. megatron_core-0.14.0rc5/megatron/core/process_groups_config.py +0 -178
  196. megatron_core-0.14.0rc5/megatron/core/transformer/moe/__init__.py +0 -0
  197. megatron_core-0.14.0rc5/megatron_core.egg-info/PKG-INFO +0 -836
  198. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/activations.py +0 -0
  199. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/config.py +0 -0
  200. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/__init__.py +0 -0
  201. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/blended_dataset.py +0 -0
  202. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/helpers.py +0 -0
  203. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/masked_dataset.py +0 -0
  204. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/multimodal_dataset.py +0 -0
  205. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/object_storage_utils.py +0 -0
  206. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/__init__.py +0 -0
  207. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/config/__init__.py +0 -0
  208. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  209. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/config/config.py +0 -0
  210. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  211. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/db/__init__.py +0 -0
  212. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/db/build.py +0 -0
  213. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/db/dataset.py +0 -0
  214. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/db/utils.py +0 -0
  215. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/external_libs.py +0 -0
  216. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/__init__.py +0 -0
  217. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/build.py +0 -0
  218. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/factory.py +0 -0
  219. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/index.py +0 -0
  220. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  221. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  222. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  223. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/utils.py +0 -0
  224. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/index/validate.py +0 -0
  225. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/__init__.py +0 -0
  226. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  227. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  228. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/query.py +0 -0
  229. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  230. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/query/utils.py +0 -0
  231. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/retro/utils.py +0 -0
  232. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/utils.py +0 -0
  233. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/datasets/utils_s3.py +0 -0
  234. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/__init__.py +0 -0
  235. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/core.py +0 -0
  236. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  237. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/serialization.py +0 -0
  238. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  239. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  240. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  241. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  242. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  243. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  244. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  245. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  246. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/dist_checkpointing/utils.py +0 -0
  247. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/data_parallel_base.py +0 -0
  248. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  249. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/energy_monitor.py +0 -0
  250. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/__init__.py +0 -0
  251. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/data_type.py +0 -0
  252. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/export_config.py +0 -0
  253. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/model_type.py +0 -0
  254. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/__init__.py +0 -0
  255. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  256. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  257. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  258. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  259. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  260. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  261. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  262. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  263. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  264. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  265. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  266. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/extensions/__init__.py +0 -0
  267. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/__init__.py +0 -0
  268. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  269. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  270. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  271. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  272. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_indices_converter.py +0 -0
  273. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_layer_norm.py +0 -0
  274. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  275. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  276. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  277. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/hyper_comm_grid.py +0 -0
  278. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/__init__.py +0 -0
  279. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/common_inference_params.py +0 -0
  280. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/contexts/base_context.py +0 -0
  281. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/engines/__init__.py +0 -0
  282. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/engines/abstract_engine.py +0 -0
  283. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/engines/mcore_engine.py +0 -0
  284. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  285. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  286. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  287. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  288. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  289. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  290. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  291. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  292. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  293. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  294. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/inference_params.py +0 -0
  295. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/jit.py +0 -0
  296. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/T5/__init__.py +0 -0
  297. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/T5/t5_spec.py +0 -0
  298. {megatron_core-0.14.0rc5/megatron/core/post_training → megatron_core-0.16.0rc0.dev115842/megatron/core/models}/__init__.py +0 -0
  299. {megatron_core-0.14.0rc5/megatron/core/models → megatron_core-0.16.0rc0.dev115842/megatron/core/models/bert}/__init__.py +0 -0
  300. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  301. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/bert/bert_lm_head.py +0 -0
  302. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/bert/bert_model.py +0 -0
  303. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/bert/pooler.py +0 -0
  304. {megatron_core-0.14.0rc5/megatron/core/models/bert → megatron_core-0.16.0rc0.dev115842/megatron/core/models/common}/__init__.py +0 -0
  305. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/__init__.py +0 -0
  306. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  307. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  308. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  309. {megatron_core-0.14.0rc5/megatron/core/models/common → megatron_core-0.16.0rc0.dev115842/megatron/core/models/common/language_module}/__init__.py +0 -0
  310. {megatron_core-0.14.0rc5/megatron/core/models/common/language_module → megatron_core-0.16.0rc0.dev115842/megatron/core/models/common/vision_module}/__init__.py +0 -0
  311. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  312. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/gpt/__init__.py +0 -0
  313. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/huggingface/__init__.py +0 -0
  314. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/huggingface/module.py +0 -0
  315. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mamba/__init__.py +0 -0
  316. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  317. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/__init__.py +0 -0
  318. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/config/__init__.py +0 -0
  319. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/config/base_configs.py +0 -0
  320. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/model/__init__.py +0 -0
  321. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/model/base.py +0 -0
  322. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/submodules/audio.py +0 -0
  323. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/submodules/base.py +0 -0
  324. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/mimo/submodules/vision.py +0 -0
  325. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/multimodal/__init__.py +0 -0
  326. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/multimodal/llava_spec.py +0 -0
  327. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/__init__.py +0 -0
  328. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/config.py +0 -0
  329. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/encoder_attention.py +0 -0
  330. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/encoder_spec.py +0 -0
  331. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/model.py +0 -0
  332. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/retro/utils.py +0 -0
  333. {megatron_core-0.14.0rc5/megatron/core/models/common/vision_module → megatron_core-0.16.0rc0.dev115842/megatron/core/models/vision}/__init__.py +0 -0
  334. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  335. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/msc_utils.py +0 -0
  336. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/num_microbatches_calculator.py +0 -0
  337. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  338. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  339. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer/grad_scaler.py +0 -0
  340. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/optimizer_param_scheduler.py +0 -0
  341. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/packed_seq_params.py +0 -0
  342. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/pipeline_parallel/__init__.py +0 -0
  343. {megatron_core-0.14.0rc5/megatron/core/post_training/modelopt/mamba → megatron_core-0.16.0rc0.dev115842/megatron/core/post_training}/__init__.py +0 -0
  344. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/__init__.py +0 -0
  345. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  346. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  347. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/layers.py +0 -0
  348. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  349. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/quantization/__init__.py +0 -0
  350. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/quantization/quant_config.py +0 -0
  351. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/quantization/utils.py +0 -0
  352. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/requirements.txt +0 -0
  353. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/rerun_state_machine.py +0 -0
  354. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/__init__.py +0 -0
  355. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  356. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  357. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/ssm/triton_cache_manager.py +0 -0
  358. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/__init__.py +0 -0
  359. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  360. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/data.py +0 -0
  361. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/mappings.py +0 -0
  362. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/tensor_parallel/utils.py +0 -0
  363. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/__init__.py +0 -0
  364. {megatron_core-0.14.0rc5/megatron/core/models/vision → megatron_core-0.16.0rc0.dev115842/megatron/core/transformer/custom_layers}/__init__.py +0 -0
  365. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  366. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  367. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  368. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/identity_op.py +0 -0
  369. {megatron_core-0.14.0rc5/megatron/core/transformer/custom_layers → megatron_core-0.16.0rc0.dev115842/megatron/core/transformer/moe}/__init__.py +0 -0
  370. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  371. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  372. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  373. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/spec_utils.py +0 -0
  374. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/torch_layer_norm.py +0 -0
  375. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron/core/transformer/torch_norm.py +0 -0
  376. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron_core.egg-info/dependency_links.txt +0 -0
  377. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/megatron_core.egg-info/top_level.txt +0 -0
  378. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/setup.cfg +0 -0
  379. {megatron_core-0.14.0rc5 → megatron_core-0.16.0rc0.dev115842}/setup.py +0 -0
--- megatron_core-0.14.0rc5/MANIFEST.in
+++ megatron_core-0.16.0rc0.dev115842/MANIFEST.in
@@ -1,4 +1,5 @@
  include megatron/core/requirements.txt
  include megatron/core/README.md
  include megatron/core/package_info.py
+ global-exclude LICENSE
  recursive-include requirements *
--- /dev/null
+++ megatron_core-0.16.0rc0.dev115842/PKG-INFO
@@ -0,0 +1,572 @@
+ Metadata-Version: 2.4
+ Name: megatron-core
+ Version: 0.16.0rc0.dev115842
+ Summary: Megatron Core - a library for efficient and scalable training of transformer based models
+ Author-email: NVIDIA <nemo-toolkit@nvidia.com>
+ Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
+ License: Apache 2.0
+ Project-URL: Download, https://github.com/NVIDIA/Megatron-LM/releases
+ Project-URL: Homepage, https://github.com/NVIDIA/Megatron-LM/megatron/core
+ Keywords: NLP,NLU,deep,gpu,language,learning,learning,machine,nvidia,pytorch,torch,transformer
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Environment :: Console
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: BSD License
+ Classifier: Natural Language :: English
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
+ Classifier: Topic :: Scientific/Engineering
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Utilities
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: torch
+ Requires-Dist: numpy<2.0.0
+ Requires-Dist: packaging>=24.2
+ Provides-Extra: mlm
+ Requires-Dist: flask-restful; extra == "mlm"
+ Requires-Dist: sentencepiece; extra == "mlm"
+ Requires-Dist: tiktoken; extra == "mlm"
+ Requires-Dist: wandb; extra == "mlm"
+ Requires-Dist: transformers; extra == "mlm"
+ Provides-Extra: dev
+ Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+ Requires-Dist: transformer-engine[pytorch]<2.10.0,>=2.9.0a0; extra == "dev"
+ Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
+ Requires-Dist: tqdm; extra == "dev"
+ Requires-Dist: einops~=0.8; extra == "dev"
+ Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
+ Requires-Dist: nvtx~=0.2; extra == "dev"
+ Requires-Dist: multi-storage-client~=0.27; extra == "dev"
+ Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
+ Requires-Dist: setuptools<80.0.0; extra == "dev"
+ Requires-Dist: mamba-ssm~=2.2; extra == "dev"
+ Requires-Dist: causal-conv1d~=1.5; extra == "dev"
+ Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
+ Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
+ Requires-Dist: av<16.0.0; extra == "dev"
+ Requires-Dist: flashinfer-python; extra == "dev"
+ Requires-Dist: wget; extra == "dev"
+ Requires-Dist: onnxscript; extra == "dev"
+ Provides-Extra: lts
+ Requires-Dist: tqdm; extra == "lts"
+ Requires-Dist: einops; extra == "lts"
+ Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
+ Requires-Dist: nvtx; extra == "lts"
+ Requires-Dist: transformers; extra == "lts"
+ Requires-Dist: zarr; extra == "lts"
+ Requires-Dist: setuptools<80.0.0; extra == "lts"
+ Requires-Dist: wget; extra == "lts"
+
+ <div align="center">
+
+ Megatron-LM & Megatron Core
+ ===========================
+
+ <h4>GPU-optimized library for training transformer models at scale</h4>
+
+ [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/Megatron-Core/developer-guide/latest/index.html)
+ [![version](https://img.shields.io/badge/release-0.12.0-green)](./CHANGELOG.md)
+ [![license](https://img.shields.io/badge/license-Apache-blue)](./LICENSE)
+
+ <div align="left">
+
+ ## ⚡ Quick Start
+
+ ```bash
+ # 1. Install Megatron Core with required dependencies
+ pip install --no-build-isolation megatron-core[mlm,dev]
+
+ # 2. Clone repository for examples
+ git clone https://github.com/NVIDIA/Megatron-LM.git
+ cd Megatron-LM
+ pip install --no-build-isolation .[mlm,dev]
+ ```
+
+ **→ [Complete Installation Guide](#installation)** - Docker, pip variants (dev,lts,etc.), source installation, and system requirements
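A quick way to confirm the install before going further is to import the package and read its version from the packaged metadata module. This is a minimal sketch; it assumes only that `megatron/core/package_info.py` (present in the file list above) defines `__version__`, which is this package's usual layout:

```python
# Post-install sanity check: import megatron-core and print its version.
# Assumes megatron.core.package_info defines __version__ (standard layout for this package).
from megatron.core.package_info import __version__

print(f"megatron-core {__version__} is importable")
```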
+
+ # Latest News
+
+ - 📣 NEW! **[Megatron Dev Branch](https://github.com/NVIDIA/Megatron-LM/tree/dev)** - early access branch with experimental features.
+ - 🔄 **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+ - **[2025/08]** **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+ - **[2025/08]** **[GPT-OSS Model](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+ - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
+ - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
+
+ <details>
+ <summary>Previous News</summary>
+
+ - **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency, and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
+ - **[2024/06]** Megatron Core added support for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
+ - **[2024/01 Announcement]** NVIDIA has released the core capabilities of **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron Core intro](#megatron-core-composable-library) for more details.
+
+ </details>
+
+ <details>
+ <summary>Table of Contents</summary>
+
+ **Getting Started**
+
+ - [Quick Start](#-quick-start)
+ - [Latest News](#latest-news)
+ - [Megatron Overview](#megatron-overview)
+ - [Project Structure](#project-structure)
+ - [Megatron-LM: Reference Implementation](#megatron-lm-reference-implementation)
+ - [Megatron Core: Composable Library](#megatron-core-composable-library)
+ - [Installation](#installation)
+ - [Docker (Recommended)](#-docker-recommended)
+ - [Pip Installation](#-pip-installation)
+ - [Source Installation](#-source-installation)
+ - [System Requirements](#system-requirements)
+
+ **Core Features**
+
+ - [Performance Benchmarking](#performance-benchmarking)
+ - [Weak Scaling Results](#weak-scaling-results)
+ - [Strong Scaling Results](#strong-scaling-results)
+ - [Ecosystem Libraries](#ecosystem-libraries)
+
+ **Training**
+
+ - [Training](#training)
+ - [Getting Started](#getting-started)
+ - [Data Preparation](#data-preparation)
+ - [Parallelism Strategies](#parallelism-strategies)
+ - [Data Parallelism (DP)](#data-parallelism-dp)
+ - [Tensor Parallelism (TP)](#tensor-parallelism-tp)
+ - [Pipeline Parallelism (PP)](#pipeline-parallelism-pp)
+ - [Context Parallelism (CP)](#context-parallelism-cp)
+ - [Expert Parallelism (EP)](#expert-parallelism-ep)
+ - [Parallelism Selection Guide](#parallelism-selection-guide)
+ - [Performance Optimizations](#performance-optimizations)
+
+ **Resources**
+
+ - [Examples](./examples/) - Training scripts and tutorials
+ - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+ - [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
+ - [Community & Support](#-community--support) - Get help and contribute
+ - [Getting Help](#getting-help)
+ - [Contributing](#contributing)
+ - [Citation](#citation)
+
+ </details>
+
+ # Megatron Overview
+
+ ## Project Structure
+
+ ```
+ Megatron-LM/
+ ├── megatron/
+ │ ├── core/ # Megatron Core (kernels, parallelism, building blocks)
+ │ │ ├── models/ # Transformer models
+ │ │ ├── transformer/ # Transformer building blocks
+ │ │ ├── tensor_parallel/ # Tensor parallelism
+ │ │ ├── pipeline_parallel/ # Pipeline parallelism
+ │ │ ├── distributed/ # Distributed training (FSDP, DDP)
+ │ │ ├── optimizer/ # Optimizers
+ │ │ ├── datasets/ # Dataset loaders
+ │ │ ├── inference/ # Inference engines
+ │ │ └── export/ # Model export (e.g. TensorRT-LLM)
+ │ ├── training/ # Training scripts
+ │ ├── inference/ # Inference server
+ │ ├── legacy/ # Legacy components
+ │ └── post_training/ # Post-training (RLHF, etc.)
+ ├── examples/ # Ready-to-use training examples
+ ├── tools/ # Utility tools
+ ├── tests/ # Comprehensive test suite
+ └── docs/ # Documentation
+ ```
+
+ ### Megatron-LM: Reference Implementation
+
+ **Reference implementation** that includes Megatron Core plus everything needed to train models.
+
+ **Best for:**
+
+ - **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
+ - **Research teams** exploring new architectures and training techniques
+ - **Learning distributed training** concepts and best practices
+ - **Quick experimentation** with proven model configurations
+
+ **What you get:**
+
+ - Pre-configured training scripts for GPT, Llama, DeepSeek, Qwen, and more.
+ - End-to-end examples from data prep to evaluation
+ - Research-focused tools and utilities
+
+ ### Megatron Core: Composable Library
+
+ **Composable library** with GPU-optimized building blocks for custom training frameworks. A minimal usage sketch follows the feature list below.
+
+ **Best for:**
+
+ - **Framework developers** building on top of modular and optimized components
+ - **Research teams** needing custom training loops, optimizers, or data pipelines
+ - **ML engineers** requiring fault-tolerant training pipelines
+
+ **What you get:**
+
+ - Composable transformer building blocks (attention, MLP, etc.)
+ - Advanced parallelism strategies (TP, PP, DP, EP, CP)
+ - Pipeline schedules and distributed optimizers
+ - Mixed precision support (FP16, BF16, FP8)
+ - GPU-optimized kernels and memory management
+ - High-performance dataloaders and dataset utilities
+ - Model architectures (LLaMA, Qwen, GPT, Mixtral, Mamba, etc.)
+
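To make the list above concrete, here is a minimal sketch of composing those building blocks into a toy GPT model. It assumes the `TransformerConfig`, `get_gpt_layer_local_spec`, and `GPTModel` APIs under `megatron.core`, plus a single process with one GPU; exact signatures can shift between releases, so treat it as an illustration rather than a maintained recipe:

```python
# Illustrative only: build a tiny GPT model from Megatron Core building blocks.
# Assumes a single process with one visible GPU; APIs may differ across releases.
import os

import torch

from megatron.core import parallel_state
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig

# Single-rank process group so model-parallel state can be initialized (TP=1, PP=1).
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1)
parallel_state.initialize_model_parallel(
    tensor_model_parallel_size=1, pipeline_model_parallel_size=1
)
model_parallel_cuda_manual_seed(123)

# Deliberately tiny configuration; real configs set many more fields.
config = TransformerConfig(
    num_layers=2,
    hidden_size=64,
    num_attention_heads=4,
    use_cpu_initialization=True,
    pipeline_dtype=torch.float32,
)

gpt_model = GPTModel(
    config=config,
    transformer_layer_spec=get_gpt_layer_local_spec(),
    vocab_size=128,
    max_sequence_length=64,
)
print(sum(p.numel() for p in gpt_model.parameters()), "parameters")
```

Scaling the same pattern out is mostly a matter of launching one process per GPU (for example with `torchrun`) and passing larger tensor and pipeline sizes to `initialize_model_parallel`.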
+ ## Ecosystem Libraries
+
+ **Libraries used by Megatron Core:**
+
+ - **[Megatron Energon](https://github.com/NVIDIA/Megatron-Energon)** 📣 **NEW!** - Multi-modal data loader (text, images, video, audio) with distributed loading and dataset blending
+ - **[Transformer Engine](https://github.com/NVIDIA/TransformerEngine)** - Optimized kernels and FP8 mixed precision support
+ - **[Resiliency Extension (NVRx)](https://github.com/NVIDIA/nvidia-resiliency-ext)** - Fault tolerant training with failure detection and recovery
+
+ **Libraries using Megatron Core:**
+
+ - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+ - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
+ - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
+ - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
+
+ **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+
+ # Installation
+
+ ## 🐳 Docker (Recommended)
+
+ We strongly recommend using a previous release of the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) rather than the latest one, for optimal compatibility with Megatron Core releases and our testing. Our releases are always based on the previous month's NGC container, which ensures compatibility and stability.
251
+
252
+ **Note:** The NGC PyTorch container constrains the Python environment globally via `PIP_CONSTRAINT`. In the following examples, we unset this variable.
253
+
254
+ This container comes with all dependencies pre-installed with compatible versions and optimized configurations for NVIDIA GPUs:
255
+
256
+ - PyTorch (latest stable version)
257
+ - CUDA, cuDNN, NCCL (latest stable versions)
258
+ - Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
259
+ - For best performance, use the NVIDIA Turing GPU architecture generation or later
260
+
261
+ ```bash
262
+ # Run container with mounted directories
263
+ docker run --runtime=nvidia --gpus all -it --rm \
264
+ -v /path/to/megatron:/workspace/megatron \
265
+ -v /path/to/dataset:/workspace/dataset \
266
+ -v /path/to/checkpoints:/workspace/checkpoints \
267
+ -e PIP_CONSTRAINT= \
268
+ nvcr.io/nvidia/pytorch:25.04-py3
269
+ ```
270
+
271
+ ## Pip Installation
272
+
273
+ Megatron Core provides two optional dependency sets, each targeting a different NGC PyTorch container:
274
+
275
+ - `dev`: A moving target that tracks the most recent upstream dependencies
276
+ - `lts`: Long-term support of NGC PyTorch 24.01
277
+
278
+ Either one can be combined with `mlm`, which adds the package dependencies for Megatron-LM on top of Megatron Core.
279
+
280
+ ```bash
281
+ # Install the latest release dependencies
282
+ pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
283
+ pip install --no-build-isolation megatron-core[dev]
284
+ # For running a Megatron-LM application:
285
+ pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
286
+ pip install --no-build-isolation megatron-core[mlm,dev]
287
+ ```
288
+
289
+ ```bash
290
+ # Install packages for LTS support (NGC PyTorch 24.01)
291
+ pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
292
+ pip install --no-build-isolation megatron-core[lts]
293
+ # For running a Megatron-LM application:
294
+ pip install "setuptools<80.0.0,>=77.0.0" "packaging>=24.2"
295
+ pip install --no-build-isolation megatron-core[mlm,lts]
296
+ ```
297
+
298
+ For a minimal version of Megatron Core that depends only on PyTorch, run:
299
+
300
+ ```bash
301
+ pip install megatron-core
302
+ ```
303
+
304
+ ## System Requirements
305
+
306
+ ### Hardware Requirements
307
+
308
+ - **FP8 Support**: NVIDIA Hopper, Ada, Blackwell GPUs
309
+ - **Recommended**: NVIDIA Turing architecture or later
310
+
311
+ ### Software Requirements
312
+
313
+ - **CUDA/cuDNN/NCCL**: Latest stable versions
314
+ - **PyTorch**: Latest stable version
315
+ - **Transformer Engine**: Latest stable version
316
+ - **Python**: 3.12 recommended
317
+
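+ A quick sanity check of the environment (a minimal sketch; it assumes the `torch` and `transformer_engine` Python packages shipped in the NGC container):
+
+ ```bash
+ # Print the installed Python, PyTorch/CUDA, and Transformer Engine versions
+ python --version
+ python -c "import torch; print('torch', torch.__version__, '| cuda', torch.version.cuda)"
+ python -c "import transformer_engine; print('transformer_engine', getattr(transformer_engine, '__version__', 'unknown'))"
+ ```
+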
318
+ # Performance Benchmarking
319
+
320
+ For our latest performance benchmarking results, please refer to [NVIDIA NeMo Framework Performance Summary](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
321
+
322
+ Our codebase efficiently trains models from 2B to 462B parameters across thousands of GPUs, achieving up to **47% Model FLOP Utilization (MFU)** on H100 clusters.
323
+
324
+ ![Model table](images/model_table.png)
325
+
326
+ **Benchmark Configuration:**
327
+
328
+ - **Vocabulary size**: 131,072 tokens
329
+ - **Sequence length**: 4096 tokens
330
+ - **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts
331
+ - **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)
332
+
333
+ **Key Results:**
334
+
335
+ - **6144 H100 GPUs**: Successfully benchmarked 462B parameter model training
336
+ - **Superlinear scaling**: MFU increases from 41% to 47-48% with model size
337
+ - **End-to-end measurement**: Throughputs include all operations (data loading, optimizer steps, communication, logging)
338
+ - **Production ready**: Full training pipeline with checkpointing and fault tolerance
339
+ - *Note: Performance results measured without training to convergence*
340
+
341
+ ## Weak Scaling Results
342
+
343
+ Our weak-scaling results show superlinear scaling (MFU increases from 41% for the smallest model considered to 47-48% for the largest models); this is because larger GEMMs have higher arithmetic intensity and are consequently more efficient to execute.
344
+
345
+ ![Weak scaling](images/weak_scaling.png)
346
+
347
+ ## Strong Scaling Results
348
+
349
+ We also strong-scaled the standard GPT-3 model (our version has slightly more than 175 billion parameters due to the larger vocabulary size) from 96 H100 GPUs to 4608 GPUs, using the same batch size of 1152 sequences throughout. Communication becomes more exposed at larger scale, leading to a reduction in MFU from 47% to 42%.
350
+
351
+ ![Strong scaling](images/strong_scaling.png)
352
+
353
+ # Training
354
+
355
+ ## Getting Started
356
+
357
+ ### Simple Training Example
358
+
359
+ ```bash
360
+ # Distributed training example (2 GPUs, mock data)
361
+ torchrun --nproc_per_node=2 examples/run_simple_mcore_train_loop.py
362
+ ```
363
+
364
+ ### Llama-3 Training Example
365
+
366
+ ```bash
367
+ # 8 GPUs, FP8 precision, mock data
368
+ ./examples/llama/train_llama3_8b_fp8.sh
369
+ ```
370
+
371
+ ## Data Preparation
372
+
373
+ ### JSONL Data Format
374
+
375
+ ```json
376
+ {"text": "Your training text here..."}
377
+ {"text": "Another training sample..."}
378
+ ```
379
+
380
+ ### Basic Preprocessing
381
+
382
+ ```bash
383
+ python tools/preprocess_data.py \
384
+ --input data.jsonl \
385
+ --output-prefix processed_data \
386
+ --tokenizer-type HuggingFaceTokenizer \
387
+ --tokenizer-model /path/to/tokenizer.model \
388
+ --workers 8 \
389
+ --append-eod
390
+ ```
391
+
392
+ ### Key Arguments
393
+
394
+ - `--input`: Path to input JSON/JSONL file
395
+ - `--output-prefix`: Prefix for output binary files (.bin and .idx)
396
+ - `--tokenizer-type`: Tokenizer type (`HuggingFaceTokenizer`, `GPT2BPETokenizer`, etc.)
397
+ - `--tokenizer-model`: Path to tokenizer model file
398
+ - `--workers`: Number of parallel workers for processing
399
+ - `--append-eod`: Add end-of-document token
400
+
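+ The preprocessing command above writes one `.bin`/`.idx` pair per JSON key. Assuming the default `--json-keys text`, the resulting prefix is then passed to training via `--data-path` (a sketch, not a full launch command):
+
+ ```bash
+ # Expected outputs (assuming the default --json-keys text):
+ #   processed_data_text_document.bin   # tokenized documents
+ #   processed_data_text_document.idx   # index into the .bin file
+
+ # Point training at the common prefix (no .bin/.idx suffix):
+ --data-path processed_data_text_document
+ ```
+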
401
+ <!-- **→ [Complete Data Preparation Guide](./docs/data-preparation.md)** - Comprehensive guide covering advanced preprocessing, dataset collection, deduplication, and optimization strategies -->
402
+
403
+ # Parallelism Strategies
404
+
405
+ ## Data Parallelism (DP)
406
+
407
+ ### Standard Data Parallel
408
+
409
+ ```bash
410
+ # Standard DDP - replicate model on each GPU
411
+ torchrun --nproc_per_node=8 pretrain_gpt.py \
412
+ --data-parallel-sharding-strategy no_shard
413
+ ```
414
+
415
+ ### Fully Sharded Data Parallel (FSDP)
416
+
417
+ ```bash
418
+ # Megatron's optimized FSDP (~15% faster than PyTorch FSDP2)
419
+ --use-custom-fsdp
420
+
421
+ # PyTorch FSDP2
422
+ --use-torch-fsdp2
423
+
424
+ # Sharding strategies
425
+ --data-parallel-sharding-strategy optim # Shard optimizer states (ZeRO-1)
426
+ --data-parallel-sharding-strategy optim_grads # Shard gradients + optimizer (ZeRO-2)
427
+ --data-parallel-sharding-strategy optim_grads_params # Shard parameters + gradients + optimizer (ZeRO-3)
428
+ ```
429
+
430
+ ## Tensor Parallelism (TP)
431
+
432
+ Split individual model layers across GPUs:
433
+
434
+ ```bash
435
+ --tensor-model-parallel-size 4 # 4-way tensor parallelism
436
+ --sequence-parallel # Enable sequence parallelism (recommended with TP)
437
+ ```
438
+
439
+ ## Pipeline Parallelism (PP)
440
+
441
+ Split model depth across GPUs:
442
+
443
+ ```bash
444
+ --pipeline-model-parallel-size 8 # 8 pipeline stages
445
+ --virtual-pipeline-model-parallel-size 4 # Virtual pipeline for better load balancing
446
+ ```
447
+
448
+ ## Context Parallelism (CP)
449
+
450
+ Split long sequences across GPUs for handling long contexts:
451
+
452
+ ```bash
453
+ --context-parallel-size 2 # 2-way context parallelism
454
+ --cp-comm-type p2p # Communication: p2p, a2a, allgather, a2a+p2p
455
+ --hierarchical-context-parallel-sizes 2 4 # Hierarchical context parallelism
456
+ ```
457
+
458
+ ## Expert Parallelism (EP)
459
+
460
+ For Mixture of Experts (MoE) models:
461
+
462
+ ```bash
463
+ --expert-model-parallel-size 4 # 4-way expert parallelism
464
+ --num-experts 8 # 8 experts per MoE layer
465
+ --moe-grouped-gemm # Optimize expert computation
466
+ ```
467
+
468
+ ## Combining Parallelism Strategies
469
+
470
+ ### Parallelism Selection Guide
471
+
472
+ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance/recommended_model_configs):
473
+
474
+ | Model | Size | GPUs | TP | PP | CP | EP | Notes |
475
+ |-------|------|------|----|----|----|----|-------|
476
+ | **Llama-3** | 8B | 8 | 1 | 1 | 2 | 1 | CP for long seqlen (8K) |
477
+ | **Llama-3** | 70B | 64 | 4 | 4 | 2 | 1 | TP+PP |
478
+ | **Llama-3.1** | 405B | 1024 | 8 | 8 | 2 | 1 | 3D parallelism for scale |
479
+ | **GPT-3** | 175B | 128-512 | 4 | 8 | 1 | 1 | Large model config |
480
+ | **Mixtral** | 8x7B | 64 | 1 | 4 | 1 | 8 | EP for MoE |
481
+ | **Mixtral** | 8x22B | 256 | 4 | 4 | 8 | 8 | Combined TP+EP for large MoE |
482
+ | **DeepSeek-V3** | 671B | 1024 | 2 | 16 | 1 | 64 | Large MoE config |
483
+
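+ As an illustration (not an official recipe), the Llama-3 70B row above maps onto the per-dimension flags introduced earlier roughly as follows:
+
+ ```bash
+ # Llama-3 70B on 64 GPUs: TP=4, PP=4, CP=2 (illustrative combination of the flags above)
+ --tensor-model-parallel-size 4 \
+ --pipeline-model-parallel-size 4 \
+ --context-parallel-size 2 \
+ --sequence-parallel \
+ --bf16
+ ```
+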
484
+ ### MoE-Specific Requirements
485
+
486
+ **Important**: When combining Expert Parallelism (EP) with Tensor Parallelism (TP), **Sequence Parallelism (SP) must be enabled**.
487
+
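+ For example, a combined EP + TP configuration must also enable SP (a minimal sketch using only the flags shown above):
+
+ ```bash
+ # MoE with 4-way expert parallelism, 2-way tensor parallelism, and sequence parallelism
+ --num-experts 8 \
+ --expert-model-parallel-size 4 \
+ --tensor-model-parallel-size 2 \
+ --sequence-parallel \
+ --moe-grouped-gemm
+ ```
+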
488
+ ## Performance Optimizations
489
+
490
+ | Feature | Flag | Benefit |
491
+ |---------|------|---------|
492
+ | **FlashAttention** | `--attention-backend` | Faster attention and lower memory usage |
493
+ | **FP8 Training** | `--fp8-hybrid` | Faster training |
494
+ | **Activation Checkpointing** | `--recompute-activations` | Reduced memory usage |
495
+ | **Data Parallelism Communication Overlap** | `--overlap-grad-reduce` | Faster distributed training |
496
+ | **Distributed Optimizer** | `--use-distributed-optimizer` | Reduced checkpointing time |
497
+
498
+ **→ [NVIDIA NeMo Framework Performance Tuning Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#performance-tuning-guide)** - Comprehensive performance optimization guide covering advanced tuning techniques, communication overlaps, memory optimizations, and profiling options.
499
+
500
+ ### FlashAttention
501
+
502
+ [FlashAttention](https://github.com/Dao-AILab/flash-attention) is a fast and memory-efficient attention algorithm. We recommend the default usage, which uses cuDNN for attention via Transformer Engine and provides up to 50% speedups on forward and 84% on backward propagation with FP8 kernels. The `flash-attn` package is also supported via `--use-flash-attn`.
503
+
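+ In practice this is a single flag choice; the cuDNN path via Transformer Engine is the default and needs no extra flag:
+
+ ```bash
+ # Default: cuDNN fused attention via Transformer Engine (no flag required)
+ # Optional: use the flash-attn package instead
+ --use-flash-attn
+ ```
+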
504
+ ### Mixed Precision Training
505
+
506
+ ```bash
507
+ --fp16 # Standard FP16
508
+ --bf16 # BFloat16 (recommended for large models)
509
+ --fp8-hybrid # FP8 training (Hopper, Ada, and Blackwell GPUs)
510
+ ```
511
+
512
+ ### Activation Checkpointing and Recomputation
513
+
514
+ ```bash
515
+ # For limited memory
516
+ --recompute-activations
517
+
518
+ # For extreme memory constraints
519
+ --recompute-granularity full \
520
+ --recompute-method uniform
521
+ ```
522
+
523
+ ### Data Parallelism Communication Overlap
524
+
525
+ ```bash
526
+ --overlap-grad-reduce
527
+ --overlap-param-gather
528
+ ```
529
+
530
+ ### Distributed Optimizer
531
+
532
+ ```bash
533
+ --use-distributed-optimizer
534
+ ```
535
+
536
+ # Roadmaps
537
+
538
+ Stay up-to-date with our development roadmaps and planned features:
539
+
540
+ - **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
541
+ - **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
542
+
543
+ *More roadmap trackers will be added soon.*
544
+
545
+ # Community & Support
546
+
547
+ ## Getting Help
548
+
549
+ - 📖 **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation
550
+ - 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests
551
+
552
+ ## Contributing
553
+
554
+ We ❤️ contributions! Ways to contribute:
555
+
556
+ - 🐛 **Report bugs** - Help us improve reliability
557
+ - 💡 **Suggest features** - Shape the future of Megatron Core
558
+ - 📝 **Improve docs** - Make Megatron Core more accessible
559
+ - 🔧 **Submit PRs** - Contribute code improvements
560
+
561
+ **→ [Contributing Guide](./CONTRIBUTING.md)**
562
+
563
+ ## Citation
564
+
565
+ ```bibtex
566
+ @article{megatron-lm,
567
+ title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
568
+ author={Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
569
+ journal={arXiv preprint arXiv:1909.08053},
570
+ year={2019}
571
+ }
572
+ ```