megatron-core 0.12.1__tar.gz → 0.13.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (332)
  1. {megatron_core-0.12.1 → megatron_core-0.13.0}/MANIFEST.in +1 -0
  2. {megatron_core-0.12.1/megatron_core.egg-info → megatron_core-0.13.0}/PKG-INFO +166 -337
  3. {megatron_core-0.12.1 → megatron_core-0.13.0}/README.md +139 -37
  4. megatron_core-0.13.0/megatron/core/config.py +14 -0
  5. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/config_logger.py +15 -5
  6. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/bert_dataset.py +5 -7
  7. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/blended_dataset.py +17 -6
  8. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/blended_megatron_dataset_builder.py +11 -46
  9. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/blended_megatron_dataset_config.py +8 -0
  10. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/gpt_dataset.py +26 -15
  11. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/helpers.py +3 -1
  12. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/indexed_dataset.py +173 -85
  13. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/masked_dataset.py +1 -3
  14. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/megatron_dataset.py +4 -5
  15. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/megatron_tokenizer.py +0 -1
  16. megatron_core-0.13.0/megatron/core/datasets/object_storage_utils.py +281 -0
  17. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/config/bert_embedders.py +3 -2
  18. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/db/build.py +40 -24
  19. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/db/dataset.py +12 -3
  20. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/db/utils.py +42 -11
  21. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/external_libs.py +1 -3
  22. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/build.py +31 -5
  23. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/index.py +26 -9
  24. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +34 -5
  25. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +54 -9
  26. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/validate.py +15 -12
  27. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +14 -6
  28. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/query.py +71 -15
  29. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/retro_dataset.py +21 -8
  30. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/utils.py +56 -19
  31. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/t5_dataset.py +2 -2
  32. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/utils.py +8 -3
  33. megatron_core-0.13.0/megatron/core/datasets/utils_object_storage.py +277 -0
  34. megatron_core-0.13.0/megatron/core/datasets/utils_s3.py +5 -0
  35. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/__init__.py +1 -0
  36. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/core.py +26 -10
  37. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/dict_utils.py +15 -15
  38. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/exchange_utils.py +75 -54
  39. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/mapping.py +54 -52
  40. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/serialization.py +51 -33
  41. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/base.py +13 -13
  42. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/common.py +58 -22
  43. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +183 -49
  44. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +8 -3
  45. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/resharding.py +9 -7
  46. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +35 -14
  47. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/torch.py +49 -20
  48. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/two_stage.py +3 -3
  49. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/zarr.py +75 -51
  50. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +76 -29
  51. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/validation.py +51 -42
  52. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/__init__.py +5 -1
  53. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +98 -23
  54. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +603 -107
  55. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/distributed_data_parallel.py +137 -33
  56. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/distributed_data_parallel_config.py +36 -0
  57. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/finalize_model_grads.py +76 -46
  58. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/param_and_grad_buffer.py +107 -64
  59. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +8 -3
  60. megatron_core-0.13.0/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +19 -0
  61. megatron_core-0.13.0/megatron/core/energy_monitor.py +91 -0
  62. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/enums.py +2 -1
  63. megatron_core-0.13.0/megatron/core/export/export_config.py +32 -0
  64. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +37 -25
  65. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trt_model_config.py +10 -1
  66. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trtllm_helper.py +62 -54
  67. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +18 -11
  68. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +30 -24
  69. megatron_core-0.13.0/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +8 -0
  70. megatron_core-0.13.0/megatron/core/extensions/kitchen.py +1088 -0
  71. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/extensions/transformer_engine.py +499 -189
  72. megatron_core-0.13.0/megatron/core/extensions/transformer_engine_spec_provider.py +85 -0
  73. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fp8_utils.py +77 -21
  74. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_bias_dropout.py +24 -5
  75. megatron_core-0.13.0/megatron/core/fusions/fused_bias_swiglu.py +255 -0
  76. megatron_core-0.13.0/megatron/core/fusions/fused_indices_converter.py +288 -0
  77. megatron_core-0.13.0/megatron/core/fusions/fused_mla_yarn_rope_apply.py +719 -0
  78. megatron_core-0.13.0/megatron/core/fusions/fused_pad_routing_map.py +98 -0
  79. megatron_core-0.13.0/megatron/core/inference/communication_utils.py +137 -0
  80. megatron_core-0.13.0/megatron/core/inference/contexts/__init__.py +22 -0
  81. megatron_core-0.13.0/megatron/core/inference/contexts/base_context.py +43 -0
  82. megatron_core-0.13.0/megatron/core/inference/contexts/dynamic_chunk_allocator.py +92 -0
  83. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/contexts/dynamic_context.py +363 -364
  84. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/contexts/static_context.py +1 -6
  85. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/engines/dynamic_engine.py +72 -12
  86. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/engines/static_engine.py +17 -1
  87. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/inference_request.py +16 -1
  88. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +41 -22
  89. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +1 -1
  90. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +4 -0
  91. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +7 -2
  92. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +2 -1
  93. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/sampling_params.py +1 -0
  94. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +294 -117
  95. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/utils.py +1 -0
  96. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/jit.py +10 -2
  97. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/model_parallel_config.py +3 -0
  98. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/T5/t5_model.py +21 -3
  99. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/T5/t5_spec.py +8 -5
  100. megatron_core-0.13.0/megatron/core/models/backends.py +112 -0
  101. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/bert/bert_layer_specs.py +7 -5
  102. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/bert/bert_model.py +4 -0
  103. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/language_model_embedding.py +9 -2
  104. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/relative_pos_embedding.py +2 -1
  105. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/rope_utils.py +26 -21
  106. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +12 -3
  107. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +37 -6
  108. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/language_module/language_module.py +12 -3
  109. megatron_core-0.13.0/megatron/core/models/gpt/fine_grained_callables.py +195 -0
  110. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/gpt/gpt_layer_specs.py +205 -107
  111. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/gpt/gpt_model.py +113 -29
  112. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +8 -5
  113. megatron_core-0.13.0/megatron/core/models/gpt/moe_module_specs.py +68 -0
  114. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/huggingface/clip_model.py +19 -3
  115. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/huggingface/module.py +35 -1
  116. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/huggingface/qwen_model.py +20 -3
  117. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/mamba/mamba_model.py +18 -0
  118. megatron_core-0.13.0/megatron/core/models/mimo/__init__.py +16 -0
  119. megatron_core-0.13.0/megatron/core/models/mimo/config/__init__.py +5 -0
  120. megatron_core-0.13.0/megatron/core/models/mimo/config/base_configs.py +34 -0
  121. megatron_core-0.13.0/megatron/core/models/mimo/model/__init__.py +4 -0
  122. megatron_core-0.13.0/megatron/core/models/mimo/model/base.py +290 -0
  123. megatron_core-0.13.0/megatron/core/models/mimo/submodules/audio.py +154 -0
  124. megatron_core-0.13.0/megatron/core/models/mimo/submodules/base.py +193 -0
  125. megatron_core-0.13.0/megatron/core/models/mimo/submodules/vision.py +184 -0
  126. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/multimodal/llava_model.py +10 -6
  127. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/multimodal/llava_spec.py +2 -1
  128. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/base_attention.py +4 -0
  129. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/decoder_attention.py +13 -8
  130. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/decoder_spec.py +9 -5
  131. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/encoder_spec.py +3 -0
  132. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/vision/clip_vit_model.py +12 -2
  133. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/vision/radio.py +30 -8
  134. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/vision/vit_layer_specs.py +6 -5
  135. megatron_core-0.13.0/megatron/core/msc_utils.py +69 -0
  136. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/__init__.py +51 -19
  137. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +7 -0
  138. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/distrib_optimizer.py +115 -56
  139. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/optimizer.py +98 -12
  140. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/optimizer_config.py +33 -1
  141. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer_param_scheduler.py +5 -0
  142. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/package_info.py +2 -2
  143. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/parallel_state.py +558 -282
  144. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/pipeline_parallel/p2p_communication.py +36 -14
  145. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/pipeline_parallel/schedules.py +127 -62
  146. megatron_core-0.13.0/megatron/core/pipeline_parallel/utils.py +166 -0
  147. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/post_training/modelopt/gpt/model_specs.py +20 -67
  148. megatron_core-0.13.0/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +63 -0
  149. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/post_training/modelopt/layers.py +32 -29
  150. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/post_training/modelopt/mamba/model_specs.py +2 -1
  151. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/process_groups_config.py +71 -6
  152. megatron_core-0.13.0/megatron/core/quantization/__init__.py +1 -0
  153. megatron_core-0.13.0/megatron/core/quantization/quant_config.py +219 -0
  154. megatron_core-0.13.0/megatron/core/quantization/utils.py +37 -0
  155. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/rerun_state_machine.py +82 -83
  156. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/mamba_block.py +28 -17
  157. megatron_core-0.13.0/megatron/core/ssm/mamba_context_parallel.py +389 -0
  158. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +5 -5
  159. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/mamba_layer.py +4 -0
  160. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/mamba_mixer.py +280 -143
  161. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/mlp_layer.py +5 -0
  162. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/data.py +13 -17
  163. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/layers.py +166 -103
  164. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/mappings.py +123 -103
  165. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/random.py +42 -8
  166. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/utils.py +20 -12
  167. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/timers.py +6 -3
  168. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/attention.py +230 -98
  169. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/cuda_graphs.py +121 -43
  170. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/dot_product_attention.py +12 -1
  171. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/enums.py +14 -3
  172. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/heterogeneous/linear_replacements.py +4 -0
  173. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/mlp.py +44 -12
  174. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/module.py +71 -8
  175. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/experts.py +71 -54
  176. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/fused_a2a.py +91 -29
  177. megatron_core-0.13.0/megatron/core/transformer/moe/moe_layer.py +276 -0
  178. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/moe_utils.py +197 -13
  179. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/router.py +57 -35
  180. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/shared_experts.py +9 -1
  181. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/token_dispatcher.py +532 -205
  182. megatron_core-0.13.0/megatron/core/transformer/moe/upcycling_utils.py +359 -0
  183. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/multi_latent_attention.py +139 -63
  184. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/multi_token_prediction.py +48 -63
  185. megatron_core-0.13.0/megatron/core/transformer/pipeline_parallel_layer_layout.py +266 -0
  186. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/transformer_block.py +99 -39
  187. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/transformer_config.py +291 -103
  188. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/transformer_layer.py +135 -46
  189. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/utils.py +70 -0
  190. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/utils.py +300 -41
  191. {megatron_core-0.12.1 → megatron_core-0.13.0/megatron_core.egg-info}/PKG-INFO +166 -337
  192. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron_core.egg-info/SOURCES.txt +29 -13
  193. megatron_core-0.13.0/megatron_core.egg-info/requires.txt +30 -0
  194. megatron_core-0.13.0/pyproject.toml +192 -0
  195. megatron_core-0.13.0/setup.py +22 -0
  196. megatron_core-0.12.1/megatron/core/config.py +0 -3
  197. megatron_core-0.12.1/megatron/core/datasets/utils_s3.py +0 -164
  198. megatron_core-0.12.1/megatron/core/export/export_config.py +0 -19
  199. megatron_core-0.12.1/megatron/core/fusions/fused_bias_swiglu.py +0 -146
  200. megatron_core-0.12.1/megatron/core/inference/communication_utils.py +0 -54
  201. megatron_core-0.12.1/megatron/core/inference/contexts/__init__.py +0 -11
  202. megatron_core-0.12.1/megatron/core/inference/contexts/base_context.py +0 -20
  203. megatron_core-0.12.1/megatron/core/inference/modelopt_support/gpt/__init__.py +0 -8
  204. megatron_core-0.12.1/megatron/core/inference/modelopt_support/gpt/model_specs.py +0 -68
  205. megatron_core-0.12.1/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py +0 -133
  206. megatron_core-0.12.1/megatron/core/inference/modelopt_support/mamba/model_specs.py +0 -89
  207. megatron_core-0.12.1/megatron/core/models/gpt/moe_module_specs.py +0 -81
  208. megatron_core-0.12.1/megatron/core/post_training/modelopt/__init__.py +0 -10
  209. megatron_core-0.12.1/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -133
  210. megatron_core-0.12.1/megatron/core/post_training/modelopt/mamba/__init__.py +0 -1
  211. megatron_core-0.12.1/megatron/core/ssm/mamba_config.py +0 -22
  212. megatron_core-0.12.1/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py +0 -324
  213. megatron_core-0.12.1/megatron/core/transformer/moe/moe_layer.py +0 -161
  214. megatron_core-0.12.1/megatron/core/transformer/moe/upcycling_utils.py +0 -196
  215. megatron_core-0.12.1/megatron_core.egg-info/requires.txt +0 -19
  216. megatron_core-0.12.1/pyproject.toml +0 -72
  217. megatron_core-0.12.1/requirements/pytorch_24.01/requirements.txt +0 -16
  218. megatron_core-0.12.1/requirements/pytorch_24.07/requirements.txt +0 -16
  219. megatron_core-0.12.1/requirements/pytorch_24.10/requirements.txt +0 -6
  220. megatron_core-0.12.1/requirements/pytorch_25.03/requirements.txt +0 -15
  221. megatron_core-0.12.1/setup.py +0 -128
  222. {megatron_core-0.12.1 → megatron_core-0.13.0}/LICENSE +0 -0
  223. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/README.md +0 -0
  224. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/__init__.py +0 -0
  225. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/__init__.py +0 -0
  226. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/helpers.cpp +0 -0
  227. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/multimodal_dataset.py +0 -0
  228. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/__init__.py +0 -0
  229. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/config/__init__.py +0 -0
  230. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/config/config.py +0 -0
  231. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  232. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  233. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/db/__init__.py +0 -0
  234. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/__init__.py +0 -0
  235. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/factory.py +0 -0
  236. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  237. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/index/utils.py +0 -0
  238. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/__init__.py +0 -0
  239. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  240. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/datasets/retro/query/utils.py +0 -0
  241. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  242. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  243. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  244. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  245. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  246. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  247. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/dist_checkpointing/utils.py +0 -0
  248. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  249. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/distributed/data_parallel_base.py +0 -0
  250. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/__init__.py +0 -0
  251. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/data_type.py +0 -0
  252. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/model_type.py +0 -0
  253. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/__init__.py +0 -0
  254. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  255. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  256. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  257. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  258. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  259. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  260. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/extensions/__init__.py +0 -0
  261. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/__init__.py +0 -0
  262. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  263. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  264. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  265. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_layer_norm.py +0 -0
  266. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/fusions/fused_softmax.py +0 -0
  267. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/__init__.py +0 -0
  268. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/async_stream.py +0 -0
  269. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/common_inference_params.py +0 -0
  270. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/engines/__init__.py +0 -0
  271. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/engines/abstract_engine.py +0 -0
  272. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/engines/mcore_engine.py +0 -0
  273. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  274. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  275. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  276. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/scheduler.py +0 -0
  277. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  278. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  279. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  280. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  281. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/inference_params.py +0 -0
  282. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/T5/__init__.py +0 -0
  283. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/__init__.py +0 -0
  284. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/bert/__init__.py +0 -0
  285. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/bert/bert_lm_head.py +0 -0
  286. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/bert/pooler.py +0 -0
  287. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/__init__.py +0 -0
  288. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/embeddings/__init__.py +0 -0
  289. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/language_module/__init__.py +0 -0
  290. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/vision_module/__init__.py +0 -0
  291. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  292. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/gpt/__init__.py +0 -0
  293. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/huggingface/__init__.py +0 -0
  294. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/mamba/__init__.py +0 -0
  295. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  296. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/multimodal/__init__.py +0 -0
  297. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/multimodal/context_parallel.py +0 -0
  298. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/__init__.py +0 -0
  299. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/config.py +0 -0
  300. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/encoder_attention.py +0 -0
  301. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/model.py +0 -0
  302. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/retro/utils.py +0 -0
  303. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/vision/__init__.py +0 -0
  304. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/models/vision/multimodal_projector.py +0 -0
  305. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/num_microbatches_calculator.py +0 -0
  306. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/clip_grads.py +0 -0
  307. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  308. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/optimizer/grad_scaler.py +0 -0
  309. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/packed_seq_params.py +0 -0
  310. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/pipeline_parallel/__init__.py +0 -0
  311. {megatron_core-0.12.1/megatron/core/inference/modelopt_support/mamba → megatron_core-0.13.0/megatron/core/post_training}/__init__.py +0 -0
  312. {megatron_core-0.12.1/megatron/core/inference/modelopt_support → megatron_core-0.13.0/megatron/core/post_training/modelopt}/__init__.py +0 -0
  313. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  314. {megatron_core-0.12.1/megatron/core/post_training → megatron_core-0.13.0/megatron/core/post_training/modelopt/mamba}/__init__.py +0 -0
  315. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/requirements.txt +0 -0
  316. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/__init__.py +0 -0
  317. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/ssm/triton_cache_manager.py +0 -0
  318. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/__init__.py +0 -0
  319. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  320. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/__init__.py +0 -0
  321. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  322. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  323. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  324. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/identity_op.py +0 -0
  325. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/__init__.py +0 -0
  326. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  327. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/spec_utils.py +0 -0
  328. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/torch_layer_norm.py +0 -0
  329. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron/core/transformer/torch_norm.py +0 -0
  330. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron_core.egg-info/dependency_links.txt +0 -0
  331. {megatron_core-0.12.1 → megatron_core-0.13.0}/megatron_core.egg-info/top_level.txt +0 -0
  332. {megatron_core-0.12.1 → megatron_core-0.13.0}/setup.cfg +0 -0
@@ -1,3 +1,4 @@
 include megatron/core/requirements.txt
 include megatron/core/README.md
+include megatron/core/package_info.py
 recursive-include requirements *