megatron-core 0.15.0rc0__tar.gz → 0.15.0rc5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.

Files changed (355)
  1. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/LICENSE +1 -1
  2. {megatron_core-0.15.0rc0/megatron_core.egg-info → megatron_core-0.15.0rc5}/PKG-INFO +23 -6
  3. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/README.md +17 -2
  4. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/__init__.py +17 -0
  5. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_builder.py +2 -8
  6. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -3
  7. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/gpt_dataset.py +4 -4
  8. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.cpp +3 -1
  9. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/indexed_dataset.py +10 -7
  10. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_tokenizer.py +1 -1
  11. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/tokenizers.py +3 -3
  12. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/mapping.py +20 -0
  13. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/common.py +6 -6
  14. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/torch.py +10 -5
  15. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel.py +49 -90
  16. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
  17. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/finalize_model_grads.py +36 -20
  18. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +30 -35
  19. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +33 -4
  20. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +8 -3
  21. megatron_core-0.15.0rc5/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +521 -0
  22. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +123 -27
  23. megatron_core-0.15.0rc5/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +27 -0
  24. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +188 -107
  25. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +45 -27
  26. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/param_and_grad_buffer.py +27 -6
  27. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/enums.py +6 -0
  28. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +47 -24
  29. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine.py +214 -209
  30. megatron_core-0.15.0rc5/megatron/core/fp4_utils.py +136 -0
  31. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/full_cuda_graph.py +6 -3
  32. megatron_core-0.15.0rc5/megatron/core/fusions/fused_bias_geglu.py +442 -0
  33. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_softmax.py +149 -10
  34. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_context.py +194 -87
  35. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/dynamic_engine.py +213 -81
  36. megatron_core-0.15.0rc5/megatron/core/inference/inference_request.py +193 -0
  37. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +11 -10
  38. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +30 -12
  39. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/model_parallel_config.py +4 -1
  40. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_model.py +8 -8
  41. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/language_module/language_module.py +13 -12
  42. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/model_chunk_schedule_plan.py +115 -109
  43. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/fine_grained_callables.py +117 -7
  44. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_layer_specs.py +11 -9
  45. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_model.py +55 -17
  46. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +11 -3
  47. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_model.py +8 -8
  48. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_model.py +12 -12
  49. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/base_attention.py +4 -4
  50. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_attention.py +5 -5
  51. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_spec.py +8 -2
  52. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/clip_vit_model.py +5 -5
  53. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/radio.py +4 -4
  54. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/nccl_allocator.py +39 -8
  55. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/__init__.py +16 -122
  56. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/clip_grads.py +4 -4
  57. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/distrib_optimizer.py +31 -11
  58. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer.py +62 -12
  59. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer_config.py +0 -6
  60. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/package_info.py +3 -5
  61. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/parallel_state.py +15 -10
  62. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/combined_1f1b.py +179 -66
  63. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/schedules.py +334 -232
  64. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/utils.py +0 -16
  65. megatron_core-0.15.0rc5/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
  66. megatron_core-0.15.0rc5/megatron/core/process_groups_config.py +489 -0
  67. megatron_core-0.15.0rc5/megatron/core/safe_globals.py +35 -0
  68. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_block.py +8 -8
  69. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_layer.py +4 -4
  70. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_mixer.py +9 -9
  71. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mlp_layer.py +3 -3
  72. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/layers.py +7 -3
  73. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/timers.py +14 -1
  74. megatron_core-0.15.0rc5/megatron/core/tokenizers/__init__.py +4 -0
  75. megatron_core-0.15.0rc5/megatron/core/tokenizers/base_tokenizer.py +48 -0
  76. megatron_core-0.15.0rc5/megatron/core/tokenizers/megatron_tokenizer.py +171 -0
  77. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/__init__.py +3 -0
  78. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/__init__.py +8 -0
  79. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +147 -0
  80. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +164 -0
  81. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/chat_template.py +71 -0
  82. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +335 -0
  83. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +179 -0
  84. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/null_tokenizer.py +79 -0
  85. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +411 -0
  86. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +303 -0
  87. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/__init__.py +8 -0
  88. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/bert_tokenizer.py +12 -0
  89. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/default_tokenizer.py +12 -0
  90. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/gpt_tokenizer.py +12 -0
  91. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/mamba_tokenizer.py +12 -0
  92. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/retro_tokenizer.py +12 -0
  93. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/t5_tokenizer.py +12 -0
  94. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/text_tokenizer.py +254 -0
  95. megatron_core-0.15.0rc5/megatron/core/tokenizers/text/utils/build_tokenizer.py +58 -0
  96. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/attention.py +21 -23
  97. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/cuda_graphs.py +485 -53
  98. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/dot_product_attention.py +56 -17
  99. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/enums.py +1 -0
  100. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/mlp.py +24 -4
  101. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/module.py +32 -3
  102. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/experts.py +36 -21
  103. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_layer.py +19 -19
  104. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_utils.py +20 -16
  105. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/router.py +89 -12
  106. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/shared_experts.py +3 -3
  107. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/token_dispatcher.py +20 -19
  108. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_latent_attention.py +14 -14
  109. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_token_prediction.py +241 -211
  110. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/pipeline_parallel_layer_layout.py +56 -17
  111. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_block.py +126 -63
  112. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_config.py +84 -15
  113. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_layer.py +66 -45
  114. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/utils.py +151 -1
  115. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/utils.py +31 -5
  116. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5/megatron_core.egg-info}/PKG-INFO +23 -6
  117. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/SOURCES.txt +25 -0
  118. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/requires.txt +5 -3
  119. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/pyproject.toml +9 -5
  120. megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -387
  121. megatron_core-0.15.0rc0/megatron/core/fusions/fused_bias_geglu.py +0 -85
  122. megatron_core-0.15.0rc0/megatron/core/inference/inference_request.py +0 -91
  123. megatron_core-0.15.0rc0/megatron/core/process_groups_config.py +0 -233
  124. megatron_core-0.15.0rc0/megatron/core/transformer/moe/__init__.py +0 -0
  125. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/MANIFEST.in +0 -0
  126. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/README.md +0 -0
  127. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/activations.py +0 -0
  128. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/config.py +0 -0
  129. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/config_logger.py +0 -0
  130. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/__init__.py +0 -0
  131. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/bert_dataset.py +0 -0
  132. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_dataset.py +0 -0
  133. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.py +0 -0
  134. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/masked_dataset.py +0 -0
  135. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_dataset.py +0 -0
  136. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/multimodal_dataset.py +0 -0
  137. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/object_storage_utils.py +0 -0
  138. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/__init__.py +0 -0
  139. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/__init__.py +0 -0
  140. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  141. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/config.py +0 -0
  142. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  143. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/__init__.py +0 -0
  144. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/build.py +0 -0
  145. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/dataset.py +0 -0
  146. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/utils.py +0 -0
  147. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/external_libs.py +0 -0
  148. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/__init__.py +0 -0
  149. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/build.py +0 -0
  150. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/factory.py +0 -0
  151. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/index.py +0 -0
  152. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  153. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  154. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  155. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/utils.py +0 -0
  156. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/validate.py +0 -0
  157. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/__init__.py +0 -0
  158. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  159. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  160. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/query.py +0 -0
  161. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  162. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/utils.py +0 -0
  163. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/utils.py +0 -0
  164. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/t5_dataset.py +0 -0
  165. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils.py +0 -0
  166. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_object_storage.py +0 -0
  167. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_s3.py +0 -0
  168. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/__init__.py +0 -0
  169. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/core.py +0 -0
  170. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  171. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  172. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  173. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/serialization.py +0 -0
  174. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  175. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  176. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  177. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  178. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  179. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  180. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  181. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  182. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  183. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  184. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  185. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  186. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  187. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  188. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/utils.py +0 -0
  189. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/validation.py +0 -0
  190. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/__init__.py +0 -0
  191. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/data_parallel_base.py +0 -0
  192. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/__init__.py +0 -0
  193. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  194. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  195. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  196. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  197. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/energy_monitor.py +0 -0
  198. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/__init__.py +0 -0
  199. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/data_type.py +0 -0
  200. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/export_config.py +0 -0
  201. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/model_type.py +0 -0
  202. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/__init__.py +0 -0
  203. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  204. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  205. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  206. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  207. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  208. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  209. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  210. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  211. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  212. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  213. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  214. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/__init__.py +0 -0
  215. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/kitchen.py +0 -0
  216. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  217. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fp8_utils.py +0 -0
  218. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/__init__.py +0 -0
  219. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  220. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  221. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  222. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  223. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_indices_converter.py +0 -0
  224. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_layer_norm.py +0 -0
  225. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  226. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  227. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  228. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/hyper_comm_grid.py +0 -0
  229. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/__init__.py +0 -0
  230. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/async_stream.py +0 -0
  231. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/common_inference_params.py +0 -0
  232. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/communication_utils.py +0 -0
  233. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/__init__.py +0 -0
  234. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/base_context.py +0 -0
  235. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  236. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/static_context.py +0 -0
  237. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  238. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/__init__.py +0 -0
  239. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/abstract_engine.py +0 -0
  240. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/mcore_engine.py +0 -0
  241. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/static_engine.py +0 -0
  242. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/headers.py +0 -0
  243. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/inference_client.py +0 -0
  244. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  245. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  246. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  247. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  248. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  249. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  250. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  251. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/sampling_params.py +0 -0
  252. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/scheduler.py +0 -0
  253. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  254. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  255. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  256. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  257. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/utils.py +0 -0
  258. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference_params.py +0 -0
  259. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/jit.py +0 -0
  260. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/__init__.py +0 -0
  261. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_spec.py +0 -0
  262. {megatron_core-0.15.0rc0/megatron/core/post_training → megatron_core-0.15.0rc5/megatron/core/models}/__init__.py +0 -0
  263. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/backends.py +0 -0
  264. {megatron_core-0.15.0rc0/megatron/core/models → megatron_core-0.15.0rc5/megatron/core/models/bert}/__init__.py +0 -0
  265. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  266. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_lm_head.py +0 -0
  267. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_model.py +0 -0
  268. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/pooler.py +0 -0
  269. {megatron_core-0.15.0rc0/megatron/core/models/bert → megatron_core-0.15.0rc5/megatron/core/models/common}/__init__.py +0 -0
  270. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/__init__.py +0 -0
  271. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  272. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  273. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  274. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  275. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  276. {megatron_core-0.15.0rc0/megatron/core/models/common → megatron_core-0.15.0rc5/megatron/core/models/common/language_module}/__init__.py +0 -0
  277. {megatron_core-0.15.0rc0/megatron/core/models/common/language_module → megatron_core-0.15.0rc5/megatron/core/models/common/vision_module}/__init__.py +0 -0
  278. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  279. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/__init__.py +0 -0
  280. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  281. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/__init__.py +0 -0
  282. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/clip_model.py +0 -0
  283. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/module.py +0 -0
  284. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/qwen_model.py +0 -0
  285. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/__init__.py +0 -0
  286. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  287. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/__init__.py +0 -0
  288. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/__init__.py +0 -0
  289. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/base_configs.py +0 -0
  290. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/__init__.py +0 -0
  291. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/base.py +0 -0
  292. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/audio.py +0 -0
  293. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/base.py +0 -0
  294. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/vision.py +0 -0
  295. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/__init__.py +0 -0
  296. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/context_parallel.py +0 -0
  297. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_spec.py +0 -0
  298. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/__init__.py +0 -0
  299. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/config.py +0 -0
  300. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_attention.py +0 -0
  301. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_spec.py +0 -0
  302. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/model.py +0 -0
  303. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/utils.py +0 -0
  304. {megatron_core-0.15.0rc0/megatron/core/models/common/vision_module → megatron_core-0.15.0rc5/megatron/core/models/vision}/__init__.py +0 -0
  305. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/multimodal_projector.py +0 -0
  306. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  307. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/msc_utils.py +0 -0
  308. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/num_microbatches_calculator.py +0 -0
  309. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  310. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  311. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/grad_scaler.py +0 -0
  312. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer_param_scheduler.py +0 -0
  313. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/packed_seq_params.py +0 -0
  314. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/__init__.py +0 -0
  315. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  316. {megatron_core-0.15.0rc0/megatron/core/post_training/modelopt/mamba → megatron_core-0.15.0rc5/megatron/core/post_training}/__init__.py +0 -0
  317. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/__init__.py +0 -0
  318. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  319. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  320. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  321. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/layers.py +0 -0
  322. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  323. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/__init__.py +0 -0
  324. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/quant_config.py +0 -0
  325. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/utils.py +0 -0
  326. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/requirements.txt +0 -0
  327. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/rerun_state_machine.py +0 -0
  328. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/__init__.py +0 -0
  329. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  330. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  331. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/triton_cache_manager.py +0 -0
  332. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/__init__.py +0 -0
  333. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  334. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/data.py +0 -0
  335. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/mappings.py +0 -0
  336. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/random.py +0 -0
  337. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/utils.py +0 -0
  338. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/__init__.py +0 -0
  339. {megatron_core-0.15.0rc0/megatron/core/models/vision → megatron_core-0.15.0rc5/megatron/core/transformer/custom_layers}/__init__.py +0 -0
  340. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  341. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  342. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  343. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  344. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/identity_op.py +0 -0
  345. {megatron_core-0.15.0rc0/megatron/core/transformer/custom_layers → megatron_core-0.15.0rc5/megatron/core/transformer/moe}/__init__.py +0 -0
  346. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  347. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  348. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  349. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/spec_utils.py +0 -0
  350. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_layer_norm.py +0 -0
  351. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_norm.py +0 -0
  352. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/dependency_links.txt +0 -0
  353. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/top_level.txt +0 -0
  354. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/setup.cfg +0 -0
  355. {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/setup.py +0 -0
--- megatron_core-0.15.0rc0/LICENSE
+++ megatron_core-0.15.0rc5/LICENSE
@@ -37,7 +37,7 @@ Below are licenses used in those files, as indicated.
 
 
  --------------------------------------------------------------------------------------
- -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --
+ -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, TinyZero and vLLM code --
 
 
  Apache License
--- megatron_core-0.15.0rc0/megatron_core.egg-info/PKG-INFO
+++ megatron_core-0.15.0rc5/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.15.0rc0
+ Version: 0.15.0rc5
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -37,23 +37,24 @@ Requires-Dist: flask-restful; extra == "mlm"
  Requires-Dist: sentencepiece; extra == "mlm"
  Requires-Dist: tiktoken; extra == "mlm"
  Requires-Dist: wandb; extra == "mlm"
+ Requires-Dist: transformers; extra == "mlm"
  Provides-Extra: dev
  Requires-Dist: tqdm; extra == "dev"
  Requires-Dist: einops~=0.8; extra == "dev"
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
  Requires-Dist: nvtx~=0.2; extra == "dev"
- Requires-Dist: transformers~=4.53; extra == "dev"
- Requires-Dist: multi-storage-client<0.26,~=0.25; extra == "dev"
+ Requires-Dist: multi-storage-client~=0.27; extra == "dev"
  Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
  Requires-Dist: setuptools<80.0.0; extra == "dev"
  Requires-Dist: mamba-ssm~=2.2; extra == "dev"
  Requires-Dist: causal-conv1d~=1.5; extra == "dev"
  Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
- Requires-Dist: transformer-engine[pytorch]<2.7.0,>=2.6.0a0; extra == "dev"
+ Requires-Dist: transformer-engine[pytorch]<2.8.0,>=2.6.0a0; extra == "dev"
  Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
  Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
  Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
  Requires-Dist: flashinfer-python; extra == "dev"
+ Requires-Dist: wget; extra == "dev"
  Requires-Dist: onnxscript; extra == "dev"
  Provides-Extra: lts
  Requires-Dist: tqdm; extra == "lts"
@@ -63,6 +64,7 @@ Requires-Dist: nvtx; extra == "lts"
  Requires-Dist: transformers; extra == "lts"
  Requires-Dist: zarr; extra == "lts"
  Requires-Dist: setuptools<80.0.0; extra == "lts"
+ Requires-Dist: wget; extra == "lts"
  Dynamic: license-file
 
  <div align="center">
@@ -93,7 +95,10 @@ cd Megatron-LM
 
  # Latest News
 
- - 📣 NEW! **[DeepSeek & MoE Training with FP8](https://github.com/yanring/Megatron-MoE-ModelZoo)** examples are now available, including optimized configurations for `DeepSeek-V3`, `Qwen2` and `Mixtral` models with FP8 precision support.
+ - 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+ - 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+ - 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+ - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
  - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
  <details>
@@ -143,6 +148,7 @@ cd Megatron-LM
  **Resources**
  - [Examples](./examples/) - Training scripts and tutorials
  - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+ - [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
  - [Community & Support](#-community--support) - Get help and contribute
  - [Getting Help](#getting-help)
  - [Contributing](#contributing)
@@ -217,10 +223,12 @@ Megatron-LM/
 
  **Libraries using Megatron Core:**
 
+ - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+ - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
  - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
  - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
- **Compatible with:** [HuggingFace Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+ **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
  # Installation
 
@@ -510,6 +518,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
  --use-distributed-optimizer
  ```
 
+ # Roadmaps
+
+ Stay up-to-date with our development roadmaps and planned features:
+
+ - **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+ - **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+ *More roadmap trackers will be added soon.*
+
  # Community & Support
 
  ## Getting Help
--- megatron_core-0.15.0rc0/README.md
+++ megatron_core-0.15.0rc5/README.md
@@ -26,7 +26,10 @@ cd Megatron-LM
 
  # Latest News
 
- - 📣 NEW! **[DeepSeek & MoE Training with FP8](https://github.com/yanring/Megatron-MoE-ModelZoo)** examples are now available, including optimized configurations for `DeepSeek-V3`, `Qwen2` and `Mixtral` models with FP8 precision support.
+ - 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+ - 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+ - 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+ - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
  - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
  <details>
@@ -76,6 +79,7 @@ cd Megatron-LM
  **Resources**
  - [Examples](./examples/) - Training scripts and tutorials
  - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+ - [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
  - [Community & Support](#-community--support) - Get help and contribute
  - [Getting Help](#getting-help)
  - [Contributing](#contributing)
@@ -150,10 +154,12 @@ Megatron-LM/
 
  **Libraries using Megatron Core:**
 
+ - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+ - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
  - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
  - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
- **Compatible with:** [HuggingFace Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+ **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
  # Installation
 
@@ -443,6 +449,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
  --use-distributed-optimizer
  ```
 
+ # Roadmaps
+
+ Stay up-to-date with our development roadmaps and planned features:
+
+ - **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+ - **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+ *More roadmap trackers will be added soon.*
+
  # Community & Support
 
  ## Getting Help
--- megatron_core-0.15.0rc0/megatron/core/__init__.py
+++ megatron_core-0.15.0rc5/megatron/core/__init__.py
@@ -20,6 +20,7 @@ from megatron.core.package_info import (
      __version__,
  )
  from megatron.core.timers import Timers
+ from megatron.core.utils import is_torch_min_version
 
  # Alias parallel_state as mpu, its legacy name
  mpu = parallel_state
@@ -32,4 +33,20 @@ __all__ = [
      "InferenceParams",
      "ModelParallelConfig",
      "Timers",
+     "__contact_emails__",
+     "__contact_names__",
+     "__description__",
+     "__download_url__",
+     "__homepage__",
+     "__keywords__",
+     "__license__",
+     "__package_name__",
+     "__repository_url__",
+     "__shortversion__",
+     "__version__",
  ]
+
+ from .safe_globals import register_safe_globals
+
+ if is_torch_min_version("2.6a0"):
+     register_safe_globals()
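The new import-time hook registers checkpoint-safe globals only on PyTorch 2.6 or newer, the release in which torch.load began defaulting to weights_only=True. The contents of the new megatron/core/safe_globals.py are not shown in this diff, so the following is only a hedged sketch of the general pattern, using PyTorch's torch.serialization.add_safe_globals; MyCheckpointedConfig is a hypothetical stand-in for whatever classes the real module allowlists.

    # Hypothetical sketch, not the actual megatron/core/safe_globals.py.
    from dataclasses import dataclass

    import torch


    @dataclass
    class MyCheckpointedConfig:  # hypothetical class pickled into checkpoints
        hidden_size: int = 0


    def register_safe_globals() -> None:
        # Allowlist non-tensor classes so torch.load(..., weights_only=True),
        # the default since PyTorch 2.6, can still unpickle them.
        torch.serialization.add_safe_globals([MyCheckpointedConfig])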
--- megatron_core-0.15.0rc0/megatron/core/datasets/blended_megatron_dataset_builder.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -35,7 +35,8 @@ class BlendedMegatronDatasetBuilder(object):
 
          is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
              the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
-             global rank, local group rank, and virtual rank may inform its return value.
+             global rank, local group rank, and virtual rank may inform its return value. Should
+             return true for exactly one process on global rank 0.
 
          config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
      """
@@ -72,13 +73,6 @@ class BlendedMegatronDatasetBuilder(object):
                  for {split.name} split
                  This can occur with multiple validation sets if datasets have weights"""
 
-         if torch.distributed.is_initialized():
-             gb_rank = torch.distributed.get_rank()
-             if gb_rank == 0:
-                 assert (
-                     self.is_built_on_rank()
-                 ), "is_built_on_rank must return True when global rank = 0"
-
      def build(self) -> List[Optional[TopLevelDataset]]:
          """Build all dataset splits according to the provided blend(s)
 
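The constructor no longer asserts at runtime that is_built_on_rank() returns True on global rank 0; the requirement is now only documented in the docstring above. A minimal sketch of a callable that satisfies the documented contract (the rank-0-only policy is illustrative; real setups typically also build on the first rank of each model-parallel group):

    import torch


    def is_built_on_rank() -> bool:
        # Illustrative policy: always build when torch.distributed is not
        # initialized, otherwise build only on global rank 0.
        if not torch.distributed.is_initialized():
            return True
        return torch.distributed.get_rank() == 0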
--- megatron_core-0.15.0rc0/megatron/core/datasets/blended_megatron_dataset_config.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/blended_megatron_dataset_config.py
@@ -6,8 +6,8 @@ import re
  from dataclasses import dataclass, field
  from typing import List, Optional, Tuple
 
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
  from megatron.core.datasets.utils import Split, log_single_rank, normalize
+ from megatron.core.tokenizers import MegatronTokenizerBase
 
  logger = logging.getLogger(__name__)
 
@@ -66,8 +66,8 @@ class BlendedMegatronDatasetConfig:
      constructor.
      """
 
-     tokenizer: Optional[MegatronTokenizer] = None
-     """The MegatronTokenizer instance. Required for datasets that do online tokenization."""
+     tokenizer: Optional[MegatronTokenizerBase] = None
+     """The MegatronTokenizerBase instance. Required for datasets that do online tokenization."""
 
      mid_level_dataset_surplus: float = 0.005
      """The sample surplus to build for the mid-level datasets(s). Defaults arbitrarily to 0.005.
--- megatron_core-0.15.0rc0/megatron/core/datasets/gpt_dataset.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/gpt_dataset.py
@@ -12,9 +12,9 @@ import torch
  from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
  from megatron.core.datasets.indexed_dataset import IndexedDataset
  from megatron.core.datasets.megatron_dataset import MegatronDataset
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
  from megatron.core.datasets.object_storage_utils import ObjectStorageConfig, is_object_storage_path
  from megatron.core.datasets.utils import Split
+ from megatron.core.tokenizers import MegatronTokenizerBase
  from megatron.core.utils import log_single_rank
 
  logger = logging.getLogger(__name__)
@@ -701,8 +701,8 @@ class MockGPTLowLevelDataset:
      we add the end of document token to each element indexed in __getitem__
 
      Args:
-         tokenizer (MegatronTokenizer): The tokenizer the special token information of which we use
-         to augment the mock data.
+         tokenizer (MegatronTokenizerBase): The tokenizer the special token information of which
+         we use to augment the mock data.
      """
 
      seed: int = 0
@@ -714,7 +714,7 @@
      max_sequence_length: int = 4096
      """The hard-coded max sequence length to generate"""
 
-     def __init__(self, tokenizer: MegatronTokenizer) -> None:
+     def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
          self.tokenizer = tokenizer
          rng = numpy.random.default_rng(seed=self.seed)
          self.sequence_lengths = rng.integers(
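Together with the config change above, the GPT dataset module now imports its tokenizer type from the new megatron.core.tokenizers package (files 74-95 in the list above) instead of megatron.core.datasets.megatron_tokenizer, whose old base class is renamed to MegatronLegacyTokenizer later in this diff. A minimal migration sketch for downstream code that annotates a tokenizer field; MyDatasetConfig is hypothetical:

    from dataclasses import dataclass
    from typing import Optional

    # New import location in 0.15.0rc5; the old import was
    # from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer.
    from megatron.core.tokenizers import MegatronTokenizerBase


    @dataclass
    class MyDatasetConfig:  # hypothetical downstream config
        tokenizer: Optional[MegatronTokenizerBase] = None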
--- megatron_core-0.15.0rc0/megatron/core/datasets/helpers.cpp
+++ megatron_core-0.15.0rc5/megatron/core/datasets/helpers.cpp
@@ -3,6 +3,7 @@
  /* Helper methods for fast index mapping builds */
 
  #include <algorithm>
+ #include <cassert>
  #include <iostream>
  #include <limits>
  #include <math.h>
@@ -46,7 +47,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
      while (dataset_unspent_indices.size() > 0) {
          double index_sample_double = std::max(static_cast<double>(index_sample), 1.0);
 
-         int64_t error_argmax;
+         int64_t error_argmax = -1;
          double error_max = std::numeric_limits<double>::lowest();
 
          for (int32_t index_dataset : dataset_unspent_indices) {
@@ -56,6 +57,7 @@
                  error_max = error;
              }
          }
+         assert(error_argmax >= 0);
 
          // Populate the indices.
          dataset_index_ptr[index_sample] = static_cast<int16_t>(error_argmax);
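error_argmax could in principle be read without ever being assigned if the inner loop never took the max branch; initializing it to -1 and asserting after the loop turns that into a hard failure instead of an out-of-range index. The same guard pattern, sketched in Python purely for illustration (not code from the package):

    def pick_argmax(errors):
        # Guarded argmax: sentinel start value plus a post-loop assertion,
        # mirroring the error_argmax fix in helpers.cpp.
        argmax, best = -1, float("-inf")
        for index, error in enumerate(errors):
            if error > best:
                argmax, best = index, error
        assert argmax >= 0, "selection loop never chose a candidate"
        return argmax


    print(pick_argmax([0.25, 0.5, 0.125]))  # prints 1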
--- megatron_core-0.15.0rc0/megatron/core/datasets/indexed_dataset.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/indexed_dataset.py
@@ -12,6 +12,7 @@ import shutil
  import struct
  import time
  from abc import ABC, abstractmethod
+ from collections.abc import Iterable
  from enum import Enum
  from functools import lru_cache
  from itertools import accumulate
@@ -172,9 +173,9 @@
 
      def write(
          self,
-         sequence_lengths: List[int],
-         sequence_modes: Optional[List[int]],
-         document_indices: List[int],
+         sequence_lengths: Iterable[Union[int, numpy.integer]],
+         sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
+         document_indices: Iterable[Union[int, numpy.integer]],
      ) -> None:
          """Write the index (.idx) file
 
@@ -208,7 +209,9 @@
          if sequence_modes is not None:
              self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
 
-     def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
+     def _sequence_pointers(
+         self, sequence_lengths: Iterable[Union[int, numpy.integer]]
+     ) -> List[int]:
          """Build the sequence pointers per the sequence lengths and dtype size
 
          Args:
@@ -217,11 +220,11 @@
 
          Returns:
              List[int]: The pointer to the beginning of each sequence
          """
-         itemsize = DType.size(self.dtype)
-         curr_ptr = 0
+         itemsize = numpy.int64(DType.size(self.dtype))
+         curr_ptr = numpy.int64(0)
          list_ptr = []
          for length in sequence_lengths:
-             list_ptr.append(curr_ptr)
+             list_ptr.append(curr_ptr.item())
              curr_ptr += length * itemsize
          return list_ptr
 
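The writer now accepts any iterable of Python or NumPy integers, and the running byte offset is accumulated as numpy.int64, with .item() converting each stored pointer back to a plain Python int. A small standalone sketch of the updated pointer arithmetic; the free-standing sequence_pointers helper is made up for illustration, in the package this logic lives in _IndexWriter._sequence_pointers:

    import numpy


    def sequence_pointers(sequence_lengths, itemsize):
        # Mirrors the updated _sequence_pointers: int64 accumulation, with
        # .item() keeping the stored offsets as plain Python ints.
        itemsize = numpy.int64(itemsize)
        curr_ptr = numpy.int64(0)
        list_ptr = []
        for length in sequence_lengths:
            list_ptr.append(curr_ptr.item())
            curr_ptr += length * itemsize
        return list_ptr


    print(sequence_pointers(numpy.array([3, 5, 2], dtype=numpy.int32), itemsize=4))
    # prints [0, 12, 32]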
--- megatron_core-0.15.0rc0/megatron/core/datasets/megatron_tokenizer.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/megatron_tokenizer.py
@@ -7,7 +7,7 @@ from typing import Any
 
  import numpy
 
- class MegatronTokenizer(ABC):
+ class MegatronLegacyTokenizer(ABC):
      """Abstract class for tokenizer
 
      Absent a config or class-specific tracking of which objects are uniquely identifying, we must
--- megatron_core-0.15.0rc0/megatron/core/datasets/retro/config/tokenizers.py
+++ megatron_core-0.15.0rc5/megatron/core/datasets/retro/config/tokenizers.py
@@ -4,12 +4,12 @@
 
  from dataclasses import dataclass
 
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
+ from megatron.core.tokenizers import MegatronTokenizerBase
 
 
  @dataclass
  class RetroTokenizers:
      """Container class for GPT and Bert tokenizers."""
 
-     gpt: MegatronTokenizer = None
-     bert: MegatronTokenizer = None
+     gpt: MegatronTokenizerBase = None
+     bert: MegatronTokenizerBase = None
--- megatron_core-0.15.0rc0/megatron/core/dist_checkpointing/mapping.py
+++ megatron_core-0.15.0rc5/megatron/core/dist_checkpointing/mapping.py
@@ -29,6 +29,9 @@ ShardedStateDict = Dict[str, Any]
  ReplicaId = Union[int, Tuple[int, ...]]
 
 
+ _logged_deprecations = {}
+
+
  class ShardedBase(ABC):
      """Base class for ShardedTensor and ShardedStateDict."""
 
@@ -147,6 +150,23 @@ class ShardedTensor(ShardedBase):
                  f"`step` argument in the flattened range of a ShardedTensor is not supported."
              )
 
+         if self.prepend_axis_num:
+             if not _logged_deprecations.get("prepend_axis_num", False):
+                 logger.warning(
+                     "ShardedTensor.prepend_axis_num greater than 0 is deprecated."
+                     " In Megatron-Core this can be prevented by setting sharded_state_dict"
+                     " metadata['singleton_local_shards'] to True."
+                 )
+                 _logged_deprecations["prepend_axis_num"] = True
+
+         if self.flattened_range is not None:
+             if not _logged_deprecations.get("flattened_range", False):
+                 logger.warning(
+                     "ShardedTensor.flattened_range is deprecated."
+                     " Use latest DistributedOptimizer formats."
+                 )
+                 _logged_deprecations["flattened_range"] = True
+
      @property
      def has_regular_grid(self):
          """Alias for having a regular sharding grid."""
--- megatron_core-0.15.0rc0/megatron/core/dist_checkpointing/strategies/common.py
+++ megatron_core-0.15.0rc5/megatron/core/dist_checkpointing/strategies/common.py
@@ -84,9 +84,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
          try:
              if MultiStorageClientFeature.is_enabled():
                  msc = MultiStorageClientFeature.import_package()
-                 return msc.torch.load(load_path, map_location='cpu', weights_only=False)
+                 return msc.torch.load(load_path, map_location='cpu')
              else:
-                 return torch.load(load_path, map_location='cpu', weights_only=False)
+                 return torch.load(load_path, map_location='cpu')
          except FileNotFoundError as e:
              err_msg = f'Common file {load_path} does not exist'
              if MultiStorageClientFeature.is_enabled():
@@ -118,9 +118,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
          try:
              if MultiStorageClientFeature.is_enabled():
                  msc = MultiStorageClientFeature.import_package()
-                 loaded_obj = msc.torch.load(load_path, weights_only=False)
+                 loaded_obj = msc.torch.load(load_path)
              else:
-                 loaded_obj = torch.load(load_path, weights_only=False)
+                 loaded_obj = torch.load(load_path)
          except FileNotFoundError as e:
              # Backward compatible logic: previously the save format was incorrect
              base, _ = os.path.splitext(sh_obj.unique_key)
@@ -128,9 +128,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
              try:
                  if MultiStorageClientFeature.is_enabled():
                      msc = MultiStorageClientFeature.import_package()
-                     loaded_obj = msc.torch.load(old_load_path, weights_only=False)
+                     loaded_obj = msc.torch.load(old_load_path)
                  else:
-                     loaded_obj = torch.load(old_load_path, weights_only=False)
+                     loaded_obj = torch.load(old_load_path)
              except FileNotFoundError:
                  err_msg = f'Object shard {load_path} not found'
                  obj_subdir = os.path.join(checkpoint_dir, sh_obj.key)
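Dropping the explicit weights_only=False means these loads now follow PyTorch's default, which is weights_only=True as of PyTorch 2.6; that is consistent with the safe-globals registration added to megatron/core/__init__.py earlier in this diff. A hedged sketch of what a caller loading such a common-state file by hand might need on newer PyTorch; the file name and the allowlisted Namespace class are illustrative, not taken from this diff:

    import torch
    from argparse import Namespace  # e.g. training args pickled alongside tensors

    # Allowlist the class for this one load instead of registering it globally.
    with torch.serialization.safe_globals([Namespace]):
        common_state = torch.load("common.pt", map_location="cpu")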
--- megatron_core-0.15.0rc0/megatron/core/dist_checkpointing/strategies/torch.py
+++ megatron_core-0.15.0rc5/megatron/core/dist_checkpointing/strategies/torch.py
@@ -340,11 +340,12 @@ def mcore_to_pyt_state_dict(
          if sh_ten.allow_shape_mismatch and is_loading:
              sh_ten.data.zero_()
 
-         if not sh_tens[0].has_regular_grid:
-             if not is_torch_min_version("2.6a0"):
-                 raise CheckpointingException(
-                     f"Uneven sharding not supported for PyTorch version {get_torch_version()}"
-                 )
+         is_pre_mcore_014_sh_ten = (
+             sh_tens[0].prepend_axis_num or sh_tens[0].flattened_range is not None
+         )
+         if (
+             not is_pre_mcore_014_sh_ten or not sh_tens[0].has_regular_grid
+         ) and is_torch_min_version("2.6a0"):
              assert sh_tens[0].flattened_range is None
              if len(sh_tens) > 1:
                  return LocalShardsContainer(
@@ -353,6 +354,10 @@
                  )
              else:
                  return CheckpointableShardedTensor.from_sh_ten(sh_tens[0])
+         if not sh_tens[0].has_regular_grid and not is_torch_min_version("2.6a0"):
+             raise CheckpointingException(
+                 f"Uneven sharding not supported for PyTorch version {get_torch_version()}"
+             )
          torch_sh_ten = sharded_tensor_to_torch_sharded_tensor(
              sh_tens, rank, load_legacy_1d_flatten_tensors
          )