megatron-core 0.10.0.tar.gz → 0.12.0rc2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of megatron-core has been flagged as potentially problematic.

Files changed (308)
  1. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/LICENSE +5 -4
  2. {megatron_core-0.10.0/megatron_core.egg-info → megatron_core-0.12.0rc2}/PKG-INFO +75 -13
  3. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/README.md +62 -5
  4. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/__init__.py +1 -0
  5. megatron_core-0.12.0rc2/megatron/core/config.py +3 -0
  6. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_dataset.py +3 -3
  7. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_megatron_dataset_builder.py +99 -48
  8. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -8
  9. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/__init__.py +1 -1
  10. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/exchange_utils.py +110 -79
  11. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/mapping.py +7 -5
  12. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/serialization.py +5 -4
  13. megatron_core-0.12.0rc2/megatron/core/dist_checkpointing/state_dict_utils.py +112 -0
  14. megatron_core-0.12.0rc2/megatron/core/dist_checkpointing/strategies/async_utils.py +543 -0
  15. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/base.py +2 -1
  16. megatron_core-0.12.0rc2/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +38 -0
  17. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/common.py +3 -3
  18. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +113 -49
  19. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +172 -96
  20. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/resharding.py +15 -12
  21. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +90 -5
  22. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/torch.py +170 -53
  23. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/two_stage.py +20 -6
  24. megatron_core-0.12.0rc2/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +347 -0
  25. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/utils.py +101 -1
  26. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/__init__.py +3 -0
  27. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +694 -0
  28. megatron_core-0.12.0rc2/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1966 -0
  29. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/data_parallel_base.py +2 -2
  30. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/distributed_data_parallel.py +64 -22
  31. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/distributed_data_parallel_config.py +30 -1
  32. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/finalize_model_grads.py +54 -8
  33. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/param_and_grad_buffer.py +85 -41
  34. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +37 -11
  35. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/enums.py +10 -0
  36. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +4 -0
  37. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_layers.py +4 -1
  38. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/extensions/transformer_engine.py +240 -93
  39. megatron_core-0.12.0rc2/megatron/core/fp8_utils.py +449 -0
  40. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_cross_entropy.py +28 -23
  41. megatron_core-0.12.0rc2/megatron/core/inference/async_stream.py +67 -0
  42. megatron_core-0.12.0rc2/megatron/core/inference/common_inference_params.py +4 -0
  43. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/communication_utils.py +4 -0
  44. megatron_core-0.12.0rc2/megatron/core/inference/contexts/__init__.py +11 -0
  45. megatron_core-0.12.0rc2/megatron/core/inference/contexts/base_context.py +20 -0
  46. megatron_core-0.12.0rc2/megatron/core/inference/contexts/dynamic_context.py +1004 -0
  47. megatron_core-0.12.0rc2/megatron/core/inference/contexts/static_context.py +133 -0
  48. megatron_core-0.12.0rc2/megatron/core/inference/engines/__init__.py +5 -0
  49. megatron_core-0.12.0rc2/megatron/core/inference/engines/dynamic_engine.py +176 -0
  50. megatron_core-0.12.0rc2/megatron/core/inference/engines/mcore_engine.py +5 -0
  51. megatron_core-0.12.0rc2/megatron/core/inference/engines/static_engine.py +245 -0
  52. megatron_core-0.12.0rc2/megatron/core/inference/inference_request.py +64 -0
  53. megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +364 -0
  54. megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +135 -0
  55. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
  56. megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +218 -0
  57. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +72 -58
  58. megatron_core-0.12.0rc2/megatron/core/inference/modelopt_support/__init__.py +10 -0
  59. megatron_core-0.12.0rc2/megatron/core/inference/modelopt_support/gpt/__init__.py +8 -0
  60. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/gpt/model_specs.py +9 -4
  61. megatron_core-0.12.0rc2/megatron/core/inference/modelopt_support/mamba/__init__.py +1 -0
  62. megatron_core-0.12.0rc2/megatron/core/inference/modelopt_support/mamba/model_specs.py +89 -0
  63. megatron_core-0.12.0rc2/megatron/core/inference/sampling_params.py +38 -0
  64. megatron_core-0.12.0rc2/megatron/core/inference/scheduler.py +193 -0
  65. megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +38 -0
  66. megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +5 -0
  67. megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers/text_generation_controller.py +816 -0
  68. megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +40 -0
  69. megatron_core-0.12.0rc2/megatron/core/inference_params.py +5 -0
  70. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/model_parallel_config.py +10 -0
  71. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/t5_model.py +84 -11
  72. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_model.py +29 -13
  73. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/__init__.py +1 -1
  74. megatron_core-0.12.0rc2/megatron/core/models/common/embeddings/relative_pos_embedding.py +179 -0
  75. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/rope_utils.py +12 -2
  76. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +107 -10
  77. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/language_module/language_module.py +81 -8
  78. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/gpt_layer_specs.py +151 -68
  79. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/gpt_model.py +191 -17
  80. megatron_core-0.12.0rc2/megatron/core/models/gpt/moe_module_specs.py +81 -0
  81. megatron_core-0.12.0rc2/megatron/core/models/huggingface/__init__.py +2 -0
  82. megatron_core-0.12.0rc2/megatron/core/models/huggingface/clip_model.py +26 -0
  83. megatron_core-0.12.0rc2/megatron/core/models/huggingface/module.py +63 -0
  84. megatron_core-0.12.0rc2/megatron/core/models/huggingface/qwen_model.py +42 -0
  85. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/mamba_layer_specs.py +2 -1
  86. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/mamba_model.py +32 -14
  87. megatron_core-0.12.0rc2/megatron/core/models/multimodal/context_parallel.py +99 -0
  88. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/multimodal/llava_model.py +297 -216
  89. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/multimodal/llava_spec.py +8 -6
  90. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/config.py +3 -0
  91. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/decoder_attention.py +18 -9
  92. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/encoder_attention.py +8 -3
  93. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/model.py +13 -5
  94. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/clip_vit_model.py +30 -7
  95. megatron_core-0.12.0rc2/megatron/core/models/vision/radio.py +325 -0
  96. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/vit_layer_specs.py +1 -1
  97. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/__init__.py +154 -32
  98. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/clip_grads.py +21 -9
  99. megatron_core-0.12.0rc2/megatron/core/optimizer/cpu_offloading/__init__.py +2 -0
  100. megatron_core-0.12.0rc2/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +465 -0
  101. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/distrib_optimizer.py +515 -224
  102. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/optimizer.py +157 -103
  103. megatron_core-0.12.0rc2/megatron/core/optimizer/optimizer_config.py +212 -0
  104. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/package_info.py +2 -2
  105. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/parallel_state.py +282 -58
  106. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/p2p_communication.py +20 -3
  107. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/schedules.py +186 -70
  108. megatron_core-0.12.0rc2/megatron/core/post_training/__init__.py +1 -0
  109. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/__init__.py +10 -0
  110. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt/model_specs.py +245 -0
  111. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +133 -0
  112. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/layers.py +246 -0
  113. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
  114. megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/mamba/model_specs.py +90 -0
  115. megatron_core-0.12.0rc2/megatron/core/process_groups_config.py +113 -0
  116. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/rerun_state_machine.py +241 -65
  117. megatron_core-0.12.0rc2/megatron/core/ssm/__init__.py +1 -0
  118. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_block.py +120 -50
  119. megatron_core-0.12.0rc2/megatron/core/ssm/mamba_config.py +22 -0
  120. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_layer.py +64 -11
  121. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_mixer.py +41 -14
  122. megatron_core-0.12.0rc2/megatron/core/ssm/mlp_layer.py +25 -0
  123. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/layers.py +16 -0
  124. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/random.py +163 -28
  125. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/timers.py +35 -7
  126. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/attention.py +290 -92
  127. megatron_core-0.12.0rc2/megatron/core/transformer/cuda_graphs.py +916 -0
  128. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/enums.py +20 -0
  129. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/mlp.py +13 -2
  130. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/experts.py +53 -12
  131. megatron_core-0.12.0rc2/megatron/core/transformer/moe/fused_a2a.py +202 -0
  132. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py +9 -6
  133. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/moe_layer.py +12 -21
  134. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/moe_utils.py +315 -33
  135. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/router.py +167 -33
  136. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/shared_experts.py +4 -5
  137. megatron_core-0.12.0rc2/megatron/core/transformer/moe/token_dispatcher.py +996 -0
  138. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/multi_latent_attention.py +113 -40
  139. megatron_core-0.12.0rc2/megatron/core/transformer/multi_token_prediction.py +737 -0
  140. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/transformer_block.py +128 -139
  141. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/transformer_config.py +476 -62
  142. megatron_core-0.12.0rc2/megatron/core/transformer/transformer_layer.py +740 -0
  143. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/utils.py +9 -2
  144. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/utils.py +324 -31
  145. {megatron_core-0.10.0 → megatron_core-0.12.0rc2/megatron_core.egg-info}/PKG-INFO +75 -13
  146. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/SOURCES.txt +48 -6
  147. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/requires.txt +5 -2
  148. {megatron_core-0.10.0/requirements/pytorch:24.01 → megatron_core-0.12.0rc2/requirements/pytorch_24.01}/requirements.txt +3 -2
  149. {megatron_core-0.10.0/requirements/pytorch:24.07 → megatron_core-0.12.0rc2/requirements/pytorch_24.07}/requirements.txt +3 -1
  150. megatron_core-0.12.0rc2/requirements/pytorch_24.10/requirements.txt +6 -0
  151. megatron_core-0.12.0rc2/requirements/pytorch_25.03/requirements.txt +15 -0
  152. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/setup.py +7 -2
  153. megatron_core-0.10.0/megatron/core/dist_checkpointing/state_dict_transformation.py +0 -270
  154. megatron_core-0.10.0/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -224
  155. megatron_core-0.10.0/megatron/core/inference/ammo_support/__init__.py +0 -8
  156. megatron_core-0.10.0/megatron/core/inference/ammo_support/gpt/model_specs.py +0 -2
  157. megatron_core-0.10.0/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py +0 -5
  158. megatron_core-0.10.0/megatron/core/inference/common_inference_params.py +0 -29
  159. megatron_core-0.10.0/megatron/core/inference/engines/mcore_engine.py +0 -113
  160. megatron_core-0.10.0/megatron/core/inference/inference_request.py +0 -39
  161. megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -238
  162. megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -90
  163. megatron_core-0.10.0/megatron/core/inference/modelopt_support/__init__.py +0 -8
  164. megatron_core-0.10.0/megatron/core/inference/scheduler.py +0 -127
  165. megatron_core-0.10.0/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -35
  166. megatron_core-0.10.0/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -402
  167. megatron_core-0.10.0/megatron/core/inference_params.py +0 -31
  168. megatron_core-0.10.0/megatron/core/models/multimodal/__init__.py +0 -1
  169. megatron_core-0.10.0/megatron/core/optimizer/optimizer_config.py +0 -116
  170. megatron_core-0.10.0/megatron/core/transformer/cuda_graphs.py +0 -313
  171. megatron_core-0.10.0/megatron/core/transformer/moe/__init__.py +0 -0
  172. megatron_core-0.10.0/megatron/core/transformer/moe/token_dispatcher.py +0 -594
  173. megatron_core-0.10.0/megatron/core/transformer/transformer_layer.py +0 -397
  174. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/MANIFEST.in +0 -0
  175. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/README.md +0 -0
  176. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/config_logger.py +0 -0
  177. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/__init__.py +0 -0
  178. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/bert_dataset.py +0 -0
  179. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/gpt_dataset.py +0 -0
  180. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/helpers.cpp +0 -0
  181. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/helpers.py +0 -0
  182. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/indexed_dataset.py +0 -0
  183. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/masked_dataset.py +0 -0
  184. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/megatron_dataset.py +0 -0
  185. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  186. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/multimodal_dataset.py +0 -0
  187. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/__init__.py +0 -0
  188. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/__init__.py +0 -0
  189. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  190. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/config.py +0 -0
  191. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  192. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  193. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/__init__.py +0 -0
  194. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/build.py +0 -0
  195. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/dataset.py +0 -0
  196. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/db/utils.py +0 -0
  197. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/external_libs.py +0 -0
  198. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/__init__.py +0 -0
  199. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/build.py +0 -0
  200. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/factory.py +0 -0
  201. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/index.py +0 -0
  202. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  203. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  204. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  205. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/utils.py +0 -0
  206. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/index/validate.py +0 -0
  207. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/__init__.py +0 -0
  208. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  209. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  210. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/query.py +0 -0
  211. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  212. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/query/utils.py +0 -0
  213. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/retro/utils.py +0 -0
  214. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/t5_dataset.py +0 -0
  215. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/utils.py +0 -0
  216. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/datasets/utils_s3.py +0 -0
  217. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/core.py +0 -0
  218. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  219. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  220. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  221. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  222. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  223. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/dist_checkpointing/validation.py +0 -0
  224. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/distributed/__init__.py +0 -0
  225. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/__init__.py +0 -0
  226. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/data_type.py +0 -0
  227. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/export_config.py +0 -0
  228. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/model_type.py +0 -0
  229. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/__init__.py +0 -0
  230. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  231. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  232. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  233. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  234. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  235. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  236. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  237. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  238. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  239. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/extensions/__init__.py +0 -0
  240. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/__init__.py +0 -0
  241. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  242. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  243. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  244. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  245. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_layer_norm.py +0 -0
  246. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/fusions/fused_softmax.py +0 -0
  247. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/__init__.py +0 -0
  248. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/engines/abstract_engine.py +0 -0
  249. {megatron_core-0.10.0/megatron/core/inference/engines → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers}/__init__.py +0 -0
  250. {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/gpt}/__init__.py +0 -0
  251. {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/gpt → megatron_core-0.12.0rc2/megatron/core/inference/model_inference_wrappers/t5}/__init__.py +0 -0
  252. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py +0 -0
  253. {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/t5 → megatron_core-0.12.0rc2/megatron/core/inference/text_generation_controllers}/__init__.py +0 -0
  254. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/inference/utils.py +0 -0
  255. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/jit.py +0 -0
  256. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/__init__.py +0 -0
  257. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/T5/t5_spec.py +0 -0
  258. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/__init__.py +0 -0
  259. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/__init__.py +0 -0
  260. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  261. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/bert_lm_head.py +0 -0
  262. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/bert/pooler.py +0 -0
  263. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/__init__.py +0 -0
  264. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  265. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  266. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/language_module/__init__.py +0 -0
  267. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/vision_module/__init__.py +0 -0
  268. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  269. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/gpt/__init__.py +0 -0
  270. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/mamba/__init__.py +0 -0
  271. {megatron_core-0.10.0/megatron/core/inference/modelopt_support/gpt → megatron_core-0.12.0rc2/megatron/core/models/multimodal}/__init__.py +0 -0
  272. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/__init__.py +0 -0
  273. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/base_attention.py +0 -0
  274. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/decoder_spec.py +0 -0
  275. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/encoder_spec.py +0 -0
  276. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/retro/utils.py +0 -0
  277. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/__init__.py +0 -0
  278. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/models/vision/multimodal_projector.py +0 -0
  279. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/num_microbatches_calculator.py +0 -0
  280. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer/grad_scaler.py +0 -0
  281. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/optimizer_param_scheduler.py +0 -0
  282. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/packed_seq_params.py +0 -0
  283. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/pipeline_parallel/__init__.py +0 -0
  284. {megatron_core-0.10.0/megatron/core/inference/text_generation_controllers → megatron_core-0.12.0rc2/megatron/core/post_training/modelopt/gpt}/__init__.py +0 -0
  285. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/requirements.txt +0 -0
  286. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  287. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/ssm/triton_cache_manager.py +0 -0
  288. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/__init__.py +0 -0
  289. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  290. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/data.py +0 -0
  291. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/mappings.py +0 -0
  292. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/tensor_parallel/utils.py +0 -0
  293. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/__init__.py +0 -0
  294. {megatron_core-0.10.0/megatron/core/ssm → megatron_core-0.12.0rc2/megatron/core/transformer/custom_layers}/__init__.py +0 -0
  295. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  296. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/dot_product_attention.py +0 -0
  297. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/identity_op.py +0 -0
  298. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/module.py +0 -0
  299. {megatron_core-0.10.0/megatron/core/transformer/custom_layers → megatron_core-0.12.0rc2/megatron/core/transformer/moe}/__init__.py +0 -0
  300. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  301. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  302. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/spec_utils.py +0 -0
  303. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/torch_layer_norm.py +0 -0
  304. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron/core/transformer/torch_norm.py +0 -0
  305. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/dependency_links.txt +0 -0
  306. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/megatron_core.egg-info/top_level.txt +0 -0
  307. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/pyproject.toml +0 -0
  308. {megatron_core-0.10.0 → megatron_core-0.12.0rc2}/setup.cfg +0 -0
@@ -36,8 +36,8 @@ OpenAI). Files from these organizations have notices at the top of each file.
  Below are licenses used in those files, as indicated.


- --------------------------------------------------------------------------------
- -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code --
+ --------------------------------------------------------------------------------------
+ -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --


  Apache License
@@ -247,8 +247,9 @@ LICENSE FOR
  Facebook, Inc. and its affiliates,
  Meta Platforms, Inc. and its affiliates,
  Microsoft Corporation,
- OpenGVLab/InternVL, and
- Triton language and compiler.
+ OpenGVLab/InternVL,
+ Triton language and compiler,
+ and DeepSeek.

  MIT License

@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.10.0
+ Version: 0.12.0rc2
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Home-page: https://github.com/NVIDIA/Megatron-LM/megatron/core
  Download-URL: https://github.com/NVIDIA/Megatron-LM/releases
@@ -46,8 +46,8 @@ License: The following applies to all files unless otherwise noted:
  Below are licenses used in those files, as indicated.


- --------------------------------------------------------------------------------
- -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and Mamba code --
+ --------------------------------------------------------------------------------------
+ -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --


  Apache License
@@ -257,8 +257,9 @@ License: The following applies to all files unless otherwise noted:
  Facebook, Inc. and its affiliates,
  Meta Platforms, Inc. and its affiliates,
  Microsoft Corporation,
- OpenGVLab/InternVL, and
- Triton language and compiler.
+ OpenGVLab/InternVL,
+ Triton language and compiler,
+ and DeepSeek.

  MIT License

@@ -316,11 +317,15 @@ Requires-Dist: tiktoken
  Requires-Dist: wrapt
  Requires-Dist: zarr
  Requires-Dist: wandb
- Requires-Dist: tensorstore==0.1.45
- Requires-Dist: nvidia-modelopt[torch]>=0.19.0; sys_platform != "darwin"
+ Requires-Dist: tensorstore!=0.1.46,!=0.1.72
+ Requires-Dist: torch
+ Requires-Dist: nvidia-modelopt[torch]>=0.23.2; sys_platform != "darwin"
+ Requires-Dist: torch
+ Requires-Dist: packaging
  Dynamic: author
  Dynamic: download-url
  Dynamic: home-page
+ Dynamic: license-file
  Dynamic: maintainer
  Dynamic: requires-dist

@@ -342,9 +347,8 @@ Megatron-LM & Megatron-Core
  - **[2024/6]** Megatron-Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
  - **[2024/1 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron-Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron-Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron-Core intro](#megatron-core) for more details.

-
-
  # Table of Contents
+
  - [Megatron-LM \& Megatron-Core](#megatron-lm--megatron-core)
  - [Latest News](#latest-news)
  - [Table of Contents](#table-of-contents)
@@ -368,7 +372,6 @@ Megatron-LM & Megatron-Core
  - [Retro and InstructRetro](#retro-and-instructretro)
  - [Mamba-based Language Models](#mamba-based-language-models)
  - [Mixture of Experts](#mixture-of-experts)
- - [Key Features of MoE](#key-features-of-moe)
  - [Evaluation and Tasks](#evaluation-and-tasks)
  - [GPT Text Generation](#gpt-text-generation)
  - [Detoxify GPT via Self-generation](#detoxify-gpt-via-self-generation)
@@ -385,7 +388,10 @@ Megatron-LM & Megatron-Core
  - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
  - [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
  - [Reproducibility](#reproducibility)
- - [Projects Using Megatron](#projects-using-megatron)
+ - [Checkpoint conversion](#checkpoint-conversion)
+ - [Model class conversion](#model-class-conversion)
+ - [Checkpoint format conversion](#checkpoint-format-conversion)
+ - [Projects Using Megatron](#projects-using-megatron)

  # Megatron Overview
  This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a research-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
@@ -915,7 +921,63 @@ There are currently three known Megatron optimizations that break reproducibility

  In addition, determinisim has only been verified in NGC PyTorch containers up to and newer than 23.12. If you observe nondeterminism in Megatron training under other circumstances please open an issue.

- ## Projects Using Megatron
+ # Checkpoint conversion
+
+ We support two forms of model conversion:
+
+ 1. Model class conversion (i.e., the `GPTModel` in `model.legacy` vs. `model.core`)
+ 2. Checkpoint format conversion (i.e., distributed vs. non-distributed checkpoint)
+
+ ## Model class conversion
+
+ Megatron supports converting between different model classes, including internal model classes (we currently have the older `legacy` models, and the newer `core` models) and external model classes (such as Meta, Huggingface, Mistral, and Mixtral models). Additionally, during this conversion, one can update the parallel state of the model (i.e., changing tensor and pipeline model parallelism).
+
+ We provide the tool `tools/checkpoint/convert.py` to convert between model classes. Some important arguments include:
+
+ - `--model-type`: `GPT` or `BERT`
+ - `--loader`: format of the existing checkpoint. Supported formats include:
+   - `legacy`: our older model classes (under `megatron.legacy.model`)
+   - `core`: our newer model classes (under `megatron.core.models`)
+   - `llama_mistral`: for loading Llama and Mistral models (supports Meta and Huggingface formats)
+   - `mixtral_hf`: for loading Mixtral models (Huggingface only)
+ - `--load-dir`: directory for loading the existing checkpoint
+ - `--saver`: `legacy` or `core` (see descriptions under `--loader`)
+ - `--save-dir`: directory for saving the new checkpoint
+ - `--target-tensor-parallel-size`: new tensor model parallel size
+ - `--target-pipeline-parallel-size`: new pipeline model parallel size
+
+ For more argument details, please see the main script (`convert.py`), loader scripts (`loader_core.py`, `loader_legacy.py`, `loader_llama_mistral.py`, `loader_mixtral_hf.py`), or saver scripts (`saver_core.py`, `saver_legacy.py`).
+
+ An example command for converting a GPT model from the old format (`legacy`) to the new format (`core`) would look as follows:
+
+ ```
+ python tools/checkpoint/convert.py \
+ >   --model-type GPT \
+ >   --loader legacy \
+ >   --load-dir ${LEGACY_FORMAT_DIR} \
+ >   --saver core \
+ >   --save-dir ${CORE_FORMAT_DIR} \
+ >   --target-tensor-parallel-size ${TP} \
+ >   --target-pipeline-parallel-size ${PP} \
+ ```
+
+ For examples of converting Llama/Mistral models into Megatron, please see [here](docs/llama_mistral.md).
+
+ ## Checkpoint format conversion
+
+ Megatron offers multiple checkpoint formats, including:
+
+ - `torch`: Basic checkpoint format with sequential read & writes, and is tied to a specific tensor/pipeline model parallel state (TP/PP states, respectively). (While a specific checkpoint is tied to a specific TP/PP state, a checkpoint can still be manually converted via the model class converter described above).
+ - `torch_dist`: Distributed checkpoint format, for fast parallel reads & writes, and also is parallel state agnostic (i.e., one can load the same checkpoint to different TP/PP setups).
+
+ Generally speaking, `torch_dist` is the more modern and recommended checkpoint format due to its speed. However, depending on the use case, it may be desirable to convert between these two formats. To do so, launch your *training* script (e.g., via `pretrain_gpt.py`) as you normally would, but with two additional arguments:
+
+ - `--ckpt-convert-format ${FORMAT}`: `${FORMAT}` can be one of `torch` or `torch_dist`, as described above.
+ - `--ckpt-convert-save ${PATH_TO_SAVE_NEW_FORMAT}`: this path should be different than your existing `--load`/`--save` paths, to avoid overwriting the existing checkpoint. After converting, use this new path for your `--load`/`--save` paths.
+
+ The general idea of this checkpoint format converter is that it launches the model just as one normally would for training, but before running any training iterations, it saves to the new checkpoint format, and then exits. It is important to note that all other launch args should remain the same, in order for the system to understand the previous checkpoint format.
+
+ # Projects Using Megatron
  Below are some of the projects where we have directly used Megatron:
  * [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
  * [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf)
@@ -16,9 +16,8 @@ Megatron-LM & Megatron-Core
  - **[2024/6]** Megatron-Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
  - **[2024/1 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron-Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron-Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron-Core intro](#megatron-core) for more details.

-
-
  # Table of Contents
+
  - [Megatron-LM \& Megatron-Core](#megatron-lm--megatron-core)
  - [Latest News](#latest-news)
  - [Table of Contents](#table-of-contents)
@@ -42,7 +41,6 @@ Megatron-LM & Megatron-Core
  - [Retro and InstructRetro](#retro-and-instructretro)
  - [Mamba-based Language Models](#mamba-based-language-models)
  - [Mixture of Experts](#mixture-of-experts)
- - [Key Features of MoE](#key-features-of-moe)
  - [Evaluation and Tasks](#evaluation-and-tasks)
  - [GPT Text Generation](#gpt-text-generation)
  - [Detoxify GPT via Self-generation](#detoxify-gpt-via-self-generation)
@@ -59,7 +57,10 @@ Megatron-LM & Megatron-Core
  - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
  - [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
  - [Reproducibility](#reproducibility)
- - [Projects Using Megatron](#projects-using-megatron)
+ - [Checkpoint conversion](#checkpoint-conversion)
+ - [Model class conversion](#model-class-conversion)
+ - [Checkpoint format conversion](#checkpoint-format-conversion)
+ - [Projects Using Megatron](#projects-using-megatron)

  # Megatron Overview
  This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a research-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
@@ -589,7 +590,63 @@ There are currently three known Megatron optimizations that break reproducibility

  In addition, determinisim has only been verified in NGC PyTorch containers up to and newer than 23.12. If you observe nondeterminism in Megatron training under other circumstances please open an issue.

- ## Projects Using Megatron
+ # Checkpoint conversion
+
+ We support two forms of model conversion:
+
+ 1. Model class conversion (i.e., the `GPTModel` in `model.legacy` vs. `model.core`)
+ 2. Checkpoint format conversion (i.e., distributed vs. non-distributed checkpoint)
+
+ ## Model class conversion
+
+ Megatron supports converting between different model classes, including internal model classes (we currently have the older `legacy` models, and the newer `core` models) and external model classes (such as Meta, Huggingface, Mistral, and Mixtral models). Additionally, during this conversion, one can update the parallel state of the model (i.e., changing tensor and pipeline model parallelism).
+
+ We provide the tool `tools/checkpoint/convert.py` to convert between model classes. Some important arguments include:
+
+ - `--model-type`: `GPT` or `BERT`
+ - `--loader`: format of the existing checkpoint. Supported formats include:
+   - `legacy`: our older model classes (under `megatron.legacy.model`)
+   - `core`: our newer model classes (under `megatron.core.models`)
+   - `llama_mistral`: for loading Llama and Mistral models (supports Meta and Huggingface formats)
+   - `mixtral_hf`: for loading Mixtral models (Huggingface only)
+ - `--load-dir`: directory for loading the existing checkpoint
+ - `--saver`: `legacy` or `core` (see descriptions under `--loader`)
+ - `--save-dir`: directory for saving the new checkpoint
+ - `--target-tensor-parallel-size`: new tensor model parallel size
+ - `--target-pipeline-parallel-size`: new pipeline model parallel size
+
+ For more argument details, please see the main script (`convert.py`), loader scripts (`loader_core.py`, `loader_legacy.py`, `loader_llama_mistral.py`, `loader_mixtral_hf.py`), or saver scripts (`saver_core.py`, `saver_legacy.py`).
+
+ An example command for converting a GPT model from the old format (`legacy`) to the new format (`core`) would look as follows:
+
+ ```
+ python tools/checkpoint/convert.py \
+ >   --model-type GPT \
+ >   --loader legacy \
+ >   --load-dir ${LEGACY_FORMAT_DIR} \
+ >   --saver core \
+ >   --save-dir ${CORE_FORMAT_DIR} \
+ >   --target-tensor-parallel-size ${TP} \
+ >   --target-pipeline-parallel-size ${PP} \
+ ```
+
+ For examples of converting Llama/Mistral models into Megatron, please see [here](docs/llama_mistral.md).
+
+ ## Checkpoint format conversion
+
+ Megatron offers multiple checkpoint formats, including:
+
+ - `torch`: Basic checkpoint format with sequential read & writes, and is tied to a specific tensor/pipeline model parallel state (TP/PP states, respectively). (While a specific checkpoint is tied to a specific TP/PP state, a checkpoint can still be manually converted via the model class converter described above).
+ - `torch_dist`: Distributed checkpoint format, for fast parallel reads & writes, and also is parallel state agnostic (i.e., one can load the same checkpoint to different TP/PP setups).
+
+ Generally speaking, `torch_dist` is the more modern and recommended checkpoint format due to its speed. However, depending on the use case, it may be desirable to convert between these two formats. To do so, launch your *training* script (e.g., via `pretrain_gpt.py`) as you normally would, but with two additional arguments:
+
+ - `--ckpt-convert-format ${FORMAT}`: `${FORMAT}` can be one of `torch` or `torch_dist`, as described above.
+ - `--ckpt-convert-save ${PATH_TO_SAVE_NEW_FORMAT}`: this path should be different than your existing `--load`/`--save` paths, to avoid overwriting the existing checkpoint. After converting, use this new path for your `--load`/`--save` paths.
+
+ The general idea of this checkpoint format converter is that it launches the model just as one normally would for training, but before running any training iterations, it saves to the new checkpoint format, and then exits. It is important to note that all other launch args should remain the same, in order for the system to understand the previous checkpoint format.
+
+ # Projects Using Megatron
  Below are some of the projects where we have directly used Megatron:
  * [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
  * [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf)
@@ -1,4 +1,5 @@
  # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
  import megatron.core.tensor_parallel
  import megatron.core.utils
  from megatron.core import parallel_state
@@ -0,0 +1,3 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+
+ ENABLE_EXPERIMENTAL = False
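
The new `megatron/core/config.py` module above contains a single module-level switch, `ENABLE_EXPERIMENTAL = False`. A minimal usage sketch follows; the consuming function is hypothetical, since the diff itself does not show how the flag is read:

```
# Hypothetical sketch: only ENABLE_EXPERIMENTAL itself comes from the diff above.
from megatron.core import config

def pick_code_path() -> str:
    if config.ENABLE_EXPERIMENTAL:
        # Opt into a not-yet-stable code path.
        return "experimental"
    return "stable"
```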
@@ -29,7 +29,8 @@ class BlendedDataset(torch.utils.data.Dataset):

  weights (List[Union[int, float]]): The weights that determine the dataset blend ratios

- size (Optional[int]): The number of samples to draw from the blend. If None, for each dataset index idx draw exactly weights[idx] samples from datasets[idx].
+ size (Optional[int]): The number of samples to draw from the blend. If None, for each
+ dataset index idx draw exactly weights[idx] samples from datasets[idx].

  config (BlendedMegatronDatasetConfig): The config

@@ -74,7 +75,6 @@ class BlendedDataset(torch.utils.data.Dataset):
  unique_identifiers["split"] = self.split.name
  unique_identifiers["weights"] = self.weights
  unique_identifiers["size"] = self.size
- unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights

  self.unique_description = json.dumps(
  unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
@@ -168,7 +168,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  log_single_rank(
  logger,
  logging.WARNING,
- f"Unable to save the {type(self).__name__} indexes because path_to_cache is None",
+ f"Cannot save the {type(self).__name__} indexes because path_to_cache is None",
  )

  t_end = time.time()
@@ -34,7 +34,9 @@ class BlendedMegatronDatasetBuilder(object):

  sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split

- is_built_on_rank (Callable): A callable which returns True if the dataset should be built on the current rank and False otherwise. It should be Megatron Core parallelism aware i.e. global rank, local group rank, and virtual rank may inform its return value.
+ is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
+ the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
+ global rank, local group rank, and virtual rank may inform its return value.

  config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
  """
@@ -54,7 +56,7 @@ class BlendedMegatronDatasetBuilder(object):
  log_single_rank(
  logger,
  logging.INFO,
- f"Building dataset splits with cls={cls.__name__}, sizes={self.sizes}, and config={self.config}",
+ f"Building {cls.__name__} splits with sizes={self.sizes} and config={self.config}",
  )

  if not self.config.mock:
@@ -96,7 +98,8 @@ class BlendedMegatronDatasetBuilder(object):
  (2) The split has one contributing dataset, and...

  (a) 'size' is not None
- - Build a mid-level dataset with low-level dataset sampling in proportion to the size
+ - Build a mid-level dataset with low-level dataset sampling in proportion to the
+ size

  (b) 'size' is None
  - Build mid-level datasets with no excess low-level dataset sampling
@@ -104,24 +107,27 @@ class BlendedMegatronDatasetBuilder(object):
  (3) The split has multiple contributing datasets, and...

  (a) 'weights' is not None and 'size' is not None
- - Build mid-level datasets with low-level dataset sampling in proportion to their weights and the size
- - Build a top-level dataset of length marginally greater than 'size' with mid-level dataset sampling in proportion to their weights and the size
+ - Build mid-level datasets with low-level dataset sampling in proportion to their
+ weights and the size
+ - Build a top-level dataset of length marginally greater than 'size' with mid-level
+ dataset sampling in proportion to their weights and the size

  (b) 'weights' is not None and 'size' is None
  - Error

  (c) 'weights' is None and 'size' is not None
  - Build mid-level datasets with no excess low-level dataset sampling
- - Build a top-level dataset of length 'size' with mid-level dataset sampling in proportion to their lengths and the size
-
- - The 'size' of the top-level dataset is capped at the sum of the mid-level dataset lengths
+ - Build a top-level dataset of length 'size' (capped at the sum of the mid-level
+ dataset lengths) with mid-level dataset sampling in proportion to their lengths
+ and the size

  (d) 'weights' is None and 'size' is None
  - Build mid-level datasets with no excess low-level dataset sampling
  - Build a top-level dataset with no excess mid-level dataset sampling

  Returns:
- List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
+ List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+ split
  """
  datasets = self._build_blended_dataset_splits()

@@ -134,24 +140,35 @@ class BlendedMegatronDatasetBuilder(object):
  log_single_rank(
  logger,
  logging.INFO,
- f"Verifying NumPy indices for {type(dataset).__name__} {dataset.split.name} split",
+ (
+ f"Verifying NumPy indices for {type(dataset).__name__} "
+ f"{dataset.split.name} split"
+ ),
  )
  else:
  log_single_rank(
  logger,
  logging.INFO,
- f"NumPy indices for {type(dataset).__name__} {dataset.split.name} split are fully cached, skipping verification",
+ (
+ f"NumPy indices for {type(dataset).__name__} {dataset.split.name} "
+ f"split are fully cached, skipping verification"
+ ),
  )
  continue
  # Check blend size
  assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
  # Check blend access of mid-level datasets
- _, sizes = numpy.unique(dataset.dataset_index, return_counts=True)
- for i, dataset_and_size in enumerate(zip(dataset.datasets, sizes)):
- if len(dataset_and_size[0]) < dataset_and_size[1]:
+ dataset_indices, dataset_sizes = numpy.unique(
+ dataset.dataset_index, return_counts=True
+ )
+ for i, (index, size) in enumerate(zip(dataset_indices, dataset_sizes)):
+ if len(dataset.datasets[index]) < size:
  raise IndexError(
- f"The {dataset.split.name} blend oversamples (N = {dataset_and_size[1]}) {type(dataset_and_size[0]).__name__} {i} (len = {len(dataset_and_size[0])}). "
- f"Set renormalize_blend_weights to True and re-run. File an issue if the problem is not resolved."
+ f"The {dataset.split.name} blend oversamples the contributing "
+ f"datasets and, e.g., requests {size} samples from "
+ f"{type(dataset.datasets[index]).__name__} {i} with size "
+ f"{len(dataset.datasets[index])}. This is unexpected. "
+ f"Please file an issue."
  )

  return datasets
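For reference, the oversampling check introduced above can be exercised in isolation. This is a standalone sketch with made-up data, not the package's verification code:

```python
import numpy

# Hypothetical blend index: entry k names the mid-level dataset that sample k is drawn from.
dataset_index = numpy.array([0, 1, 1, 0, 1, 1])
dataset_lengths = [3, 2]  # lengths of the contributing mid-level datasets

# Count how many samples the blend requests from each contributing dataset.
indices, counts = numpy.unique(dataset_index, return_counts=True)
for index, count in zip(indices, counts):
    if dataset_lengths[index] < count:
        # Dataset 1 is requested 4 times but only has 2 samples, so this raises.
        raise IndexError(
            f"The blend requests {count} samples from dataset {index}, "
            f"which has only {dataset_lengths[index]}"
        )
```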
@@ -162,7 +179,8 @@ class BlendedMegatronDatasetBuilder(object):
  See the BlendedMegatronDatasetBuilder.build alias for more information.

  Returns:
- List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per split
+ List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+ split
  """
  ##
  # Return fake "mock" datasets
@@ -192,13 +210,19 @@ class BlendedMegatronDatasetBuilder(object):

  # Build the mid-level datasets
  if weights is None:
- sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
+ # Build only one "epoch"
+ sizes_per_dataset_buffer = [[None for split in Split] for prefix in prefixes]
  else:
- sizes_per_dataset = _get_size_per_split_per_dataset(weights, self.sizes)
+ # The number of samples we plan to use per dataset
+ sizes_per_dataset_target = _get_size_per_split_per_dataset(weights, self.sizes)
+ # The number of samples we plan to build per dataset
+ sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
+ weights, self.sizes, margin=0.5
+ )

- # build each dataset in parallel
+ # Build each dataset in parallel
  megatron_datasets = self._build_megatron_datasets_parallel(
- prefixes, split, sizes_per_dataset
+ prefixes, split, sizes_per_dataset_buffer
  )

  # Build the top-level datasets
@@ -207,11 +231,11 @@ class BlendedMegatronDatasetBuilder(object):
  if split[i] is not None:
  weights_i = weights
  if weights_i is not None and self.sizes[i] is not None:
- size_per_dataset = list(zip(*sizes_per_dataset))[i]
+ # Blend according to client-specified weights and client-specified size
+ size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
  size_i = sum(size_per_dataset)
- if self.config.renormalize_blend_weights:
- weights_i = list(map(lambda _size: _size / size_i, size_per_dataset))
  elif weights_i is None:
+ # Blend according to dataset sizes as-is and (maybe) client-specified size
  try:
  weights_i = [
  len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
@@ -221,9 +245,12 @@ class BlendedMegatronDatasetBuilder(object):
  if self.sizes[i] is not None:
  size_i = min(self.sizes[i], sum(weights_i))
  else:
- size_i = None # => the size will be sum(weights_i)
+ # Build exhaustive indices
+ size_i = None
  else:
- raise RuntimeError
+ raise ValueError(
+ "Using client-specified weights requires client-specified size"
+ )
  blended_datasets[i] = self.build_generic_dataset(
  BlendedDataset,
  self.is_built_on_rank,
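The branches in the two hunks above can be summarized with a small, self-contained helper. This is a simplified sketch of the decision logic only; the helper name is invented and it is not the builder's actual code:

```python
import math
from typing import List, Optional

def resolve_blend(weights: Optional[List[float]], size: Optional[int],
                  dataset_lengths: List[int]):
    """Return (blend weights, top-level size) following the branch structure above."""
    if weights is not None and size is not None:
        # Client-specified weights and size: per-dataset targets drive the blend.
        per_dataset = [math.ceil(w * size) for w in weights]
        return weights, sum(per_dataset)
    elif weights is None:
        # Weights fall back to the as-built dataset lengths; cap the size if one was given.
        lengths = list(dataset_lengths)
        return lengths, (min(size, sum(lengths)) if size is not None else None)
    else:
        raise ValueError("Using client-specified weights requires client-specified size")
```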
@@ -263,22 +290,31 @@ class BlendedMegatronDatasetBuilder(object):

  # Build mid-level datasets
  if weights is None:
- sizes_per_dataset = [[None for split in Split] for prefix in prefixes]
+ sizes_per_dataset_buffer = [
+ [None for split in Split] for prefix in prefixes
+ ]
  else:
- sizes_per_dataset = _get_size_per_split_per_dataset(weights, sizes_spoof)
+ # The number of samples we plan to use per dataset
+ sizes_per_dataset_target = _get_size_per_split_per_dataset(
+ weights, sizes_spoof
+ )
+ # The number of samples we plan to build per dataset
+ sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
+ weights, sizes_spoof, margin=0.5
+ )

- # build each dataset in parallel
+ # Build each dataset in parallel
  megatron_datasets = self._build_megatron_datasets_parallel(
- prefixes, split_spoof, sizes_per_dataset
+ prefixes, split_spoof, sizes_per_dataset_buffer
  )[i]

  # Build top-level dataset
  if weights is not None and self.sizes[i] is not None:
- size_per_dataset = list(zip(*sizes_per_dataset))[i]
+ # Blend according to client-specified weights and client-specified size
+ size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
  size = sum(size_per_dataset)
- if self.config.renormalize_blend_weights:
- weights = list(map(lambda _size: _size / size, size_per_dataset))
  elif weights is None:
+ # Blend according to dataset sizes as-is and (maybe) client-specified size
  try:
  weights = [
  len(megatron_dataset) for megatron_dataset in megatron_datasets
@@ -288,7 +324,8 @@ class BlendedMegatronDatasetBuilder(object):
  if self.sizes[i] is not None:
  size = min(self.sizes[i], sum(weights))
  else:
- size = None # => the size will be sum(weights)
+ # Build exhaustive indices
+ size = None
  else:
  raise RuntimeError
  blended_datasets[i] = self.build_generic_dataset(
@@ -395,13 +432,15 @@ class BlendedMegatronDatasetBuilder(object):
  """Build each MidLevelDataset split from a single LowLevelDataset

  Args:
- dataset_path (Optional[str]): The path on disk which defines the underlying LowLevelDataset, or None for mock dataset classes
+ dataset_path (Optional[str]): The path on disk which defines the underlying
+ LowLevelDataset, or None for mock dataset classes

  split (List[Tuple[float, float]]): The dataset split matrix

  sizes (List[int]): The number of total samples to draw from each split

- synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
+ synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+ behavior. Set to False when we enforce this behavior at higher level.

  Returns:
  List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
@@ -462,17 +501,22 @@ class BlendedMegatronDatasetBuilder(object):
  and torch.distributed is initialized.

  Args:
- cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be built. In special cases, e.g. when we are building the low level dataset for a RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
+ cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
+ built. In special cases, e.g. when we are building the low level dataset for a
+ RawMegatronDataset instance, we can accept a Callable which returns an Iterable.

- synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks behavior. Set to False when we enforce this behavior at higher level.
+ synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+ behavior. Set to False when we enforce this behavior at higher level.

- args (Tuple[Any]): The positional arguments used to build the provided DistributedDataset class
+ args (Tuple[Any]): The positional arguments used to build the provided
+ DistributedDataset class

  Raises:
  Exception: When the dataset constructor raises an OSError

  Returns:
- Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the Iterable instantiation, or None
+ Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the
+ Iterable instantiation, or None
  """
  if torch.distributed.is_initialized():
  rank = torch.distributed.get_rank()
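For orientation, the rank-0 / barrier / other-ranks behavior this docstring refers to follows a common pattern, sketched below in simplified, hypothetical form (not the package implementation):

```python
import torch

def build_with_rank_zero_first(build_fn, synchronize_ranks: bool = True):
    """Rank 0 builds (and caches) first; the remaining ranks build after the barrier."""
    if not torch.distributed.is_initialized():
        return build_fn()
    rank = torch.distributed.get_rank()
    dataset = build_fn() if rank == 0 else None
    if synchronize_ranks:
        torch.distributed.barrier()
    if rank != 0:
        dataset = build_fn()  # typically re-reads indices that rank 0 wrote to the cache
    return dataset
```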
@@ -485,10 +529,10 @@ class BlendedMegatronDatasetBuilder(object):
  dataset = cls(*args)
  except OSError as err:
  log = (
- f"Failed to write dataset materials to the data cache directory. "
- + f"Please supply a directory to which you have write access via "
- + f"the path_to_cache attribute in BlendedMegatronDatasetConfig and "
- + f"retry. Refer to the preserved traceback above for more information."
+ f"Failed to write dataset materials to the data cache directory. Please "
+ f"supply a directory to which you have write access via the path_to_cache "
+ f"attribute in BlendedMegatronDatasetConfig and retry. Refer to the "
+ f"preserved traceback above for more information."
  )
  raise Exception(log) from err
 
@@ -505,23 +549,30 @@ class BlendedMegatronDatasetBuilder(object):


  def _get_size_per_split_per_dataset(
- normalized_weights: List[float], target_size_per_split: List[int]
+ normalized_weights: List[float], target_size_per_split: List[int], margin: float = 0.0
  ) -> List[List[int]]:
  """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits

  Args:
  normalized_weights (List[float]): e.g. [0.3, 0.7]

- target_size_per_split (List[int]): The number of samples to target for each BlendedDataset split
+ target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
+ split
+
+ margin (float): The relative quantity of extra samples to build per per split per dataset,
+ as a percentage

  Returns:
  List[List[int]]: The number of samples to request per MegatronDataset per split
  """
  assert numpy.isclose(sum(normalized_weights), 1.0)

- # Use 0.5% target margin to ensure we satiate the request
+ # Use margin as buffer to ensure we satiate the request
  sizes_per_dataset = [
- [int(math.ceil(target_size * weight * 1.005)) for target_size in target_size_per_split]
+ [
+ int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))
+ for target_size in target_size_per_split
+ ]
  for weight in normalized_weights
  ]
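As a quick sanity check of the buffered-size expression above (the wrapper function below is illustrative, but the formula is the one shown in the diff):

```python
import math

def size_with_margin(target_size: int, weight: float, margin: float = 0.0) -> int:
    # Buffered request: round the weighted target up, then add `margin` percent and round up again.
    return int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))

# For a 1000-sample split with weights [0.3, 0.7] and the builder's margin of 0.5 (i.e. 0.5%):
print(size_with_margin(1000, 0.3, margin=0.5))  # 302 == ceil(300 * 1.005)
print(size_with_margin(1000, 0.7, margin=0.5))  # 704 == ceil(700 * 1.005)
```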
 
@@ -34,12 +34,6 @@ class BlendedMegatronDatasetConfig:
  'blend'. Defauls to None.
  """

- renormalize_blend_weights: bool = False
- """Renormalize the blend weights to account for mid-level dataset oversampling done to ensure
- fulfillmenet of the of the requested number of samples. Defaults to False for backward
- comparability in the data sample order.
- """
-
  split: Optional[str] = None
  """The split string, a comma separated weighting for the dataset splits when drawing samples
  from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
@@ -67,7 +61,7 @@ class BlendedMegatronDatasetConfig:
  """

  tokenizer: Optional[MegatronTokenizer] = None
- """The MegatronTokenizer instance or None. Required for datasets which do online tokenization."""
+ """The MegatronTokenizer instance. Required for datasets that do online tokenization."""

  def __post_init__(self) -> None:
  """Do asserts and set fields post init"""
@@ -149,7 +143,8 @@ def convert_split_vector_to_split_matrix(
  Args:
  vector_a (List[float]): The primary split vector

- vector_b (Optional[List[float]]): An optional secondary split vector which constrains the primary split vector. Defaults to None.
+ vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
+ primary split vector. Defaults to None.

  Returns:
  List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order
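To illustrate the "book-ends" return value described above, here is a small, illustrative computation (not the function's actual implementation) of how a normalized split vector maps to a split matrix:

```python
# A split vector is a normalized weighting of the splits; the split matrix
# holds the cumulative book-ends of each split in order.
split_vector = [0.8, 0.1, 0.1]

bounds = [0.0]
for weight in split_vector:
    bounds.append(bounds[-1] + weight)

split_matrix = list(zip(bounds[:-1], bounds[1:]))
print(split_matrix)  # [(0.0, 0.8), (0.8, 0.9), (0.9, 1.0)] (up to floating point)
```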