megatron-core 0.11.0__tar.gz → 0.12.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (290)
  1. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/LICENSE +3 -2
  2. {megatron_core-0.11.0/megatron_core.egg-info → megatron_core-0.12.0rc2}/PKG-INFO +7 -7
  3. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/__init__.py +1 -0
  4. megatron_core-0.12.0rc2/megatron/core/config.py +3 -0
  5. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/exchange_utils.py +8 -2
  6. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/mapping.py +7 -1
  7. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/serialization.py +4 -3
  8. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/state_dict_utils.py +28 -1
  9. megatron_core-0.12.0rc2/megatron/core/dist_checkpointing/strategies/async_utils.py +543 -0
  10. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/base.py +1 -0
  11. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +64 -38
  12. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/torch.py +55 -9
  13. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/__init__.py +3 -0
  14. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +694 -0
  15. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1966 -0
  16. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/data_parallel_base.py +2 -2
  17. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/distributed_data_parallel.py +43 -6
  18. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/distributed_data_parallel_config.py +30 -1
  19. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/finalize_model_grads.py +22 -8
  20. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/param_and_grad_buffer.py +59 -20
  21. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +30 -12
  22. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/enums.py +10 -0
  23. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_layers.py +4 -1
  24. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/extensions/transformer_engine.py +104 -42
  25. megatron_core-0.12.0rc2/megatron/core/fp8_utils.py +449 -0
  26. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_cross_entropy.py +28 -23
  27. megatron_core-0.12.0rc2/megatron/core/inference/contexts/__init__.py +11 -0
  28. megatron_core-0.12.0rc2/megatron/core/inference/contexts/base_context.py +20 -0
  29. megatron_core-0.12.0rc2/megatron/core/inference/contexts/dynamic_context.py +1004 -0
  30. megatron_core-0.11.0/megatron/core/inference_params.py → megatron_core-0.12.0rc2/megatron/core/inference/contexts/static_context.py +52 -8
  31. megatron_core-0.12.0rc2/megatron/core/inference/engines/__init__.py +5 -0
  32. megatron_core-0.12.0rc2/megatron/core/inference/engines/dynamic_engine.py +176 -0
  33. megatron_core-0.12.0rc2/megatron/core/inference/engines/mcore_engine.py +5 -0
  34. megatron_core-0.11.0/megatron/core/inference/engines/mcore_engine.py → megatron_core-0.12.0rc2/megatron/core/inference/engines/static_engine.py +45 -10
  35. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/inference_request.py +13 -1
  36. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +80 -27
  37. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +51 -18
  38. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
  39. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +28 -22
  40. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +7 -3
  41. megatron_core-0.12.0rc2/megatron/core/inference/modelopt_support/gpt/__init__.py +8 -0
  42. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/sampling_params.py +3 -1
  43. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/scheduler.py +25 -7
  44. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +203 -52
  45. megatron_core-0.12.0rc2/megatron/core/inference_params.py +5 -0
  46. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/model_parallel_config.py +10 -0
  47. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/t5_model.py +15 -9
  48. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_model.py +12 -3
  49. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/__init__.py +1 -1
  50. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/relative_pos_embedding.py +12 -6
  51. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/rope_utils.py +12 -2
  52. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +102 -7
  53. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/language_module/language_module.py +41 -8
  54. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/gpt_layer_specs.py +59 -2
  55. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/gpt_model.py +171 -19
  56. megatron_core-0.12.0rc2/megatron/core/models/huggingface/__init__.py +2 -0
  57. megatron_core-0.12.0rc2/megatron/core/models/huggingface/clip_model.py +26 -0
  58. megatron_core-0.12.0rc2/megatron/core/models/huggingface/module.py +63 -0
  59. megatron_core-0.12.0rc2/megatron/core/models/huggingface/qwen_model.py +42 -0
  60. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/mamba_layer_specs.py +2 -1
  61. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/mamba_model.py +32 -14
  62. megatron_core-0.12.0rc2/megatron/core/models/multimodal/context_parallel.py +99 -0
  63. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/multimodal/llava_model.py +189 -100
  64. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/decoder_attention.py +18 -9
  65. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/encoder_attention.py +8 -3
  66. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/model.py +13 -5
  67. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/clip_vit_model.py +28 -7
  68. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/__init__.py +103 -9
  69. megatron_core-0.12.0rc2/megatron/core/optimizer/cpu_offloading/__init__.py +2 -0
  70. megatron_core-0.12.0rc2/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +465 -0
  71. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/distrib_optimizer.py +324 -141
  72. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/optimizer.py +9 -4
  73. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/optimizer_config.py +32 -1
  74. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/package_info.py +2 -2
  75. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/parallel_state.py +137 -25
  76. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/p2p_communication.py +20 -3
  77. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/schedules.py +176 -70
  78. megatron_core-0.12.0rc2/megatron/core/post_training/__init__.py +1 -0
  79. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/__init__.py +10 -0
  80. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt/model_specs.py +245 -0
  81. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +133 -0
  82. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/layers.py +246 -0
  83. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
  84. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/mamba/model_specs.py +90 -0
  85. megatron_core-0.12.0rc2/megatron/core/process_groups_config.py +113 -0
  86. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/rerun_state_machine.py +237 -61
  87. megatron_core-0.12.0rc2/megatron/core/ssm/__init__.py +1 -0
  88. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_block.py +120 -50
  89. megatron_core-0.12.0rc2/megatron/core/ssm/mamba_config.py +22 -0
  90. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_layer.py +33 -9
  91. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_mixer.py +41 -14
  92. megatron_core-0.12.0rc2/megatron/core/ssm/mlp_layer.py +25 -0
  93. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/random.py +27 -9
  94. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/attention.py +277 -93
  95. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/cuda_graphs.py +44 -21
  96. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/mlp.py +13 -2
  97. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/experts.py +47 -7
  98. megatron_core-0.12.0rc2/megatron/core/transformer/moe/fused_a2a.py +202 -0
  99. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/moe_layer.py +7 -3
  100. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/moe_utils.py +40 -6
  101. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/router.py +87 -12
  102. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/token_dispatcher.py +420 -67
  103. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/multi_latent_attention.py +67 -21
  104. megatron_core-0.12.0rc2/megatron/core/transformer/multi_token_prediction.py +737 -0
  105. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/transformer_block.py +70 -127
  106. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/transformer_config.py +167 -36
  107. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/transformer_layer.py +279 -41
  108. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/utils.py +9 -2
  109. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/utils.py +289 -34
  110. {megatron_core-0.11.0 → megatron_core-0.12.0rc2/megatron_core.egg-info}/PKG-INFO +7 -7
  111. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/SOURCES.txt +33 -1
  112. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/requires.txt +1 -5
  113. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/requirements/pytorch_24.10/requirements.txt +0 -11
  114. megatron_core-0.12.0rc2/requirements/pytorch_25.03/requirements.txt +15 -0
  115. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/setup.py +1 -1
  116. megatron_core-0.11.0/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -228
  117. megatron_core-0.11.0/megatron/core/models/multimodal/__init__.py +0 -1
  118. megatron_core-0.11.0/megatron/core/transformer/moe/__init__.py +0 -0
  119. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/MANIFEST.in +0 -0
  120. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/README.md +0 -0
  121. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/README.md +0 -0
  122. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/config_logger.py +0 -0
  123. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/__init__.py +0 -0
  124. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/bert_dataset.py +0 -0
  125. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_dataset.py +0 -0
  126. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  127. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  128. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/gpt_dataset.py +0 -0
  129. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/helpers.cpp +0 -0
  130. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/helpers.py +0 -0
  131. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/indexed_dataset.py +0 -0
  132. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/masked_dataset.py +0 -0
  133. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/megatron_dataset.py +0 -0
  134. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  135. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/multimodal_dataset.py +0 -0
  136. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/__init__.py +0 -0
  137. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/__init__.py +0 -0
  138. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  139. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/config.py +0 -0
  140. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  141. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  142. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/__init__.py +0 -0
  143. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/build.py +0 -0
  144. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/dataset.py +0 -0
  145. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/utils.py +0 -0
  146. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/external_libs.py +0 -0
  147. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/__init__.py +0 -0
  148. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/build.py +0 -0
  149. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/factory.py +0 -0
  150. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/index.py +0 -0
  151. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  152. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  153. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  154. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/utils.py +0 -0
  155. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/validate.py +0 -0
  156. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/__init__.py +0 -0
  157. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  158. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  159. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/query.py +0 -0
  160. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  161. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/utils.py +0 -0
  162. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/utils.py +0 -0
  163. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/t5_dataset.py +0 -0
  164. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/utils.py +0 -0
  165. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/utils_s3.py +0 -0
  166. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/__init__.py +0 -0
  167. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/core.py +0 -0
  168. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  169. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  170. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  171. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  172. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  173. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  174. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  175. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  176. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  177. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  178. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  179. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  180. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/utils.py +0 -0
  181. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/validation.py +0 -0
  182. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/__init__.py +0 -0
  183. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/__init__.py +0 -0
  184. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/data_type.py +0 -0
  185. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/export_config.py +0 -0
  186. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/model_type.py +0 -0
  187. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/__init__.py +0 -0
  188. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  189. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  190. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  191. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  192. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  193. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  194. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  195. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  196. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  197. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  198. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/extensions/__init__.py +0 -0
  199. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/__init__.py +0 -0
  200. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  201. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  202. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  203. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  204. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_layer_norm.py +0 -0
  205. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_softmax.py +0 -0
  206. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/__init__.py +0 -0
  207. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/async_stream.py +0 -0
  208. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/common_inference_params.py +0 -0
  209. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/communication_utils.py +0 -0
  210. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/engines/abstract_engine.py +0 -0
  211. {megatron_core-0.11.0/megatron/core/inference/engines → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers}/__init__.py +0 -0
  212. {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/gpt}/__init__.py +0 -0
  213. {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers/gpt → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/t5}/__init__.py +0 -0
  214. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/__init__.py +0 -0
  215. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/gpt/model_specs.py +0 -0
  216. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py +0 -0
  217. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/mamba/__init__.py +0 -0
  218. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/mamba/model_specs.py +0 -0
  219. {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers/t5 → megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers}/__init__.py +0 -0
  220. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  221. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  222. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  223. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/inference/utils.py +0 -0
  224. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/jit.py +0 -0
  225. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/__init__.py +0 -0
  226. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/t5_spec.py +0 -0
  227. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/__init__.py +0 -0
  228. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/__init__.py +0 -0
  229. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  230. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_lm_head.py +0 -0
  231. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/pooler.py +0 -0
  232. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/__init__.py +0 -0
  233. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  234. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  235. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/language_module/__init__.py +0 -0
  236. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/vision_module/__init__.py +0 -0
  237. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  238. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/__init__.py +0 -0
  239. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  240. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/__init__.py +0 -0
  241. {megatron_core-0.11.0/megatron/core/inference/modelopt_support/gpt → megatron_core-0.12.0rc2/megatron/core/models/multimodal}/__init__.py +0 -0
  242. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/multimodal/llava_spec.py +0 -0
  243. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/__init__.py +0 -0
  244. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/base_attention.py +0 -0
  245. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/config.py +0 -0
  246. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/decoder_spec.py +0 -0
  247. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/encoder_spec.py +0 -0
  248. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/utils.py +0 -0
  249. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/__init__.py +0 -0
  250. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/multimodal_projector.py +0 -0
  251. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/radio.py +0 -0
  252. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  253. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/num_microbatches_calculator.py +0 -0
  254. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/clip_grads.py +0 -0
  255. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/grad_scaler.py +0 -0
  256. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer_param_scheduler.py +0 -0
  257. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/packed_seq_params.py +0 -0
  258. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/__init__.py +0 -0
  259. {megatron_core-0.11.0/megatron/core/inference/text_generation_controllers → megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt}/__init__.py +0 -0
  260. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/requirements.txt +0 -0
  261. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  262. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/triton_cache_manager.py +0 -0
  263. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/__init__.py +0 -0
  264. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  265. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/data.py +0 -0
  266. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/layers.py +0 -0
  267. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/mappings.py +0 -0
  268. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/utils.py +0 -0
  269. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/timers.py +0 -0
  270. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/__init__.py +0 -0
  271. {megatron_core-0.11.0/megatron/core/ssm → megatron_core-0.12.0rc2/megatron/core/transformer/custom_layers}/__init__.py +0 -0
  272. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  273. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/dot_product_attention.py +0 -0
  274. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/enums.py +0 -0
  275. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/identity_op.py +0 -0
  276. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/module.py +0 -0
  277. {megatron_core-0.11.0/megatron/core/transformer/custom_layers → megatron_core-0.12.0rc2/megatron/core/transformer/moe}/__init__.py +0 -0
  278. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  279. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py +0 -0
  280. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/shared_experts.py +0 -0
  281. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  282. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/spec_utils.py +0 -0
  283. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/torch_layer_norm.py +0 -0
  284. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/torch_norm.py +0 -0
  285. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/dependency_links.txt +0 -0
  286. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/top_level.txt +0 -0
  287. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/pyproject.toml +0 -0
  288. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/requirements/pytorch_24.01/requirements.txt +0 -0
  289. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/requirements/pytorch_24.07/requirements.txt +0 -0
  290. {megatron_core-0.11.0 → megatron_core-0.12.0rc2}/setup.cfg +0 -0
@@ -247,8 +247,9 @@ LICENSE FOR
247
247
  Facebook, Inc. and its affiliates,
248
248
  Meta Platforms, Inc. and its affiliates,
249
249
  Microsoft Corporation,
250
- OpenGVLab/InternVL, and
251
- Triton language and compiler.
250
+ OpenGVLab/InternVL,
251
+ Triton language and compiler,
252
+ and DeepSeek.
252
253
 
253
254
  MIT License
254
255
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.11.0
3
+ Version: 0.12.0rc2
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Home-page: https://github.com/NVIDIA/Megatron-LM/megatron/core
6
6
  Download-URL: https://github.com/NVIDIA/Megatron-LM/releases
@@ -257,8 +257,9 @@ License: The following applies to all files unless otherwise noted:
257
257
  Facebook, Inc. and its affiliates,
258
258
  Meta Platforms, Inc. and its affiliates,
259
259
  Microsoft Corporation,
260
- OpenGVLab/InternVL, and
261
- Triton language and compiler.
260
+ OpenGVLab/InternVL,
261
+ Triton language and compiler,
262
+ and DeepSeek.
262
263
 
263
264
  MIT License
264
265
 
@@ -308,7 +309,6 @@ Requires-Dist: einops
308
309
  Requires-Dist: flask-restful
309
310
  Requires-Dist: nltk
310
311
  Requires-Dist: pytest
311
- Requires-Dist: pytest_asyncio
312
312
  Requires-Dist: pytest-cov
313
313
  Requires-Dist: pytest_mock
314
314
  Requires-Dist: pytest-random-order
@@ -319,13 +319,13 @@ Requires-Dist: zarr
319
319
  Requires-Dist: wandb
320
320
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72
321
321
  Requires-Dist: torch
322
- Requires-Dist: nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin"
323
- Requires-Dist: nvidia-resiliency-ext; platform_machine == "x86_64"
322
+ Requires-Dist: nvidia-modelopt[torch]>=0.23.2; sys_platform != "darwin"
324
323
  Requires-Dist: torch
325
324
  Requires-Dist: packaging
326
325
  Dynamic: author
327
326
  Dynamic: download-url
328
327
  Dynamic: home-page
328
+ Dynamic: license-file
329
329
  Dynamic: maintainer
330
330
  Dynamic: requires-dist
331
331
 
@@ -1,4 +1,5 @@
1
1
  # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+
2
3
  import megatron.core.tensor_parallel
3
4
  import megatron.core.utils
4
5
  from megatron.core import parallel_state
@@ -0,0 +1,3 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ ENABLE_EXPERIMENTAL = False
@@ -325,7 +325,10 @@ def exchange_loaded_tensors_gather_rounds(
325
325
  # this during state dict load.
326
326
  # TODO: remove it once the bug is fixed
327
327
  if is_float8tensor(local_ten):
328
- local_ten = local_ten.from_float8()
328
+ try:
329
+ local_ten = local_ten.from_float8()
330
+ except Exception as e:
331
+ local_ten = local_ten.dequantize()
329
332
  all_loaded_tensors[shard_id] = local_ten
330
333
 
331
334
  round_tensors.append(local_ten)
@@ -483,7 +486,10 @@ def exchange_loaded_tensors_broadcast(
483
486
  # this during state dict load.
484
487
  # TODO: remove it once the bug is fixed
485
488
  if is_float8tensor(local_ten):
486
- local_ten = local_ten.from_float8()
489
+ try:
490
+ local_ten = local_ten.from_float8()
491
+ except Exception as e:
492
+ local_ten = local_ten.dequantize()
487
493
  all_loaded_tensors[shard_id] = local_ten
488
494
 
489
495
  global_src_rank = (
@@ -136,7 +136,13 @@ class ShardedTensor(ShardedBase):
136
136
  )
137
137
 
138
138
  for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
139
- if off % sh != 0:
139
+ # NOTE: In custom FSDP, we have a case where a new parameter shard is created locally.
140
+ # For example, consider parameters [p0, p1, p2] sharded across GPU0 and GPU1.
141
+ # GPU0 receives p0 and a portion of p1, while GPU1 receives the
142
+ # remaining portion of p1 and p2.
143
+ # As a result, there is no parameter shard of p2 on GPU0, and
144
+ # the shape of p2 on GPU0 is zero.
145
+ if sh != 0 and off % sh != 0:
140
146
  raise CheckpointingException(
141
147
  f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.'
142
148
  )
@@ -351,9 +351,10 @@ def save(
351
351
  )
352
352
 
353
353
  if next(checkpoint_dir.iterdir(), None) is not None:
354
- raise CheckpointingException(
355
- f'Checkpoint destination directory ({checkpoint_dir}) is not empty'
356
- )
354
+ # Don't throw exception here since this could cause a cascade of failures
355
+ # without human intervention in cases where multiple jobs are queued up.
356
+ if torch.distributed.get_rank() == 0:
357
+ logger.warning("Overwriting old incomplete / corrupted checkpoint...")
357
358
 
358
359
  if common_strategy is not None:
359
360
  raise NotImplementedError('The only supported common strategy is torch')
@@ -2,12 +2,13 @@
2
2
 
3
3
  """ Utilities for transforming state_dict."""
4
4
 
5
- from typing import Callable
5
+ from typing import Callable, Union
6
6
 
7
7
  from .dict_utils import dict_list_map_inplace, extract_matching_values
8
8
  from .mapping import (
9
9
  CommonStateDict,
10
10
  ShardedStateDict,
11
+ ShardedTensor,
11
12
  ShardedTensorFactory,
12
13
  StateDict,
13
14
  apply_factories,
@@ -39,6 +40,7 @@ def save_preprocess(
39
40
  apply_factories(sharded_state_dict)
40
41
  _, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
41
42
  sharded_part, common_state_dict = extract_sharded_base(sharded_state_dict)
43
+ sharded_part = filter_out_empty_flatten_tensor(sharded_part)
42
44
  if validate_access_integrity:
43
45
  preprocessed_common_state_dict = common_state_dict
44
46
  if preprocess_common_before_consistancy_check:
@@ -69,6 +71,7 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
69
71
  # Create a copy of sharded_state_dict as the passed in state dict may have
70
72
  # references that prevent tensors from being deallocated
71
73
  sharded_state_dict, _ = extract_matching_values(sharded_state_dict, lambda x: True)
74
+ sharded_state_dict = filter_out_empty_flatten_tensor(sharded_state_dict)
72
75
 
73
76
  sh_ten_factories, _ = extract_matching_values(
74
77
  sharded_state_dict,
@@ -83,3 +86,27 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
83
86
  nonpersistent_state_dict, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
84
87
  dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict)
85
88
  return sharded_state_dict, nonpersistent_state_dict, sh_ten_factories
89
+
90
+
91
+ def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]):
92
+ """
93
+ Filter out ShardedTensors with an empty flattened_range.
94
+ These tensors can cause the PyTorch check in `TorchShardedTensor._init_from_local_shards_and_global_metadata` to fail.
95
+
96
+ Args:
97
+ sharded_state_dict: state dict possibly containing ShardedTensor objects
98
+ """
99
+ # Filter out ShardedTensors with empty flatten_range.
100
+ # These tensors can cause the PyTorch check in
101
+ # `TorchShardedTensor._init_from_local_shards_and_global_metadata` to fail.
102
+ # This situation may occur in custom Fully Sharded Data Parallel (FSDP) cases.
103
+ sharded_state_dict, _ = extract_matching_values(
104
+ sharded_state_dict,
105
+ lambda v: not (
106
+ isinstance(v, ShardedTensor)
107
+ and v.flattened_range
108
+ and v.flattened_range.start == v.flattened_range.stop
109
+ ),
110
+ )
111
+
112
+ return sharded_state_dict