megatron-core 0.14.0rc7__tar.gz → 0.15.0rc4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic.

Files changed (354)
  1. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/LICENSE +1 -1
  2. {megatron_core-0.14.0rc7/megatron_core.egg-info → megatron_core-0.15.0rc4}/PKG-INFO +24 -7
  3. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/README.md +17 -2
  4. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/__init__.py +11 -0
  5. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_builder.py +2 -8
  6. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -3
  7. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/gpt_dataset.py +4 -4
  8. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/helpers.cpp +3 -1
  9. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/megatron_tokenizer.py +1 -1
  10. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/tokenizers.py +3 -3
  11. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/dict_utils.py +13 -5
  12. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/mapping.py +31 -5
  13. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/optimizer.py +6 -0
  14. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/async_utils.py +52 -14
  15. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/base.py +1 -5
  16. megatron_core-0.15.0rc4/megatron/core/dist_checkpointing/strategies/checkpointable.py +196 -0
  17. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/torch.py +42 -14
  18. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/zarr.py +6 -1
  19. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/validation.py +13 -3
  20. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/distributed_data_parallel.py +49 -90
  21. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
  22. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/finalize_model_grads.py +36 -20
  23. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +12 -16
  24. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +31 -0
  25. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +6 -0
  26. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +6 -1
  27. megatron_core-0.15.0rc4/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +27 -0
  28. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +97 -61
  29. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +6 -0
  30. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/param_and_grad_buffer.py +26 -6
  31. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/enums.py +6 -0
  32. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +47 -24
  33. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/kitchen.py +4 -0
  34. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/transformer_engine.py +259 -207
  35. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/transformer_engine_spec_provider.py +5 -0
  36. megatron_core-0.15.0rc4/megatron/core/fp4_utils.py +136 -0
  37. megatron_core-0.15.0rc4/megatron/core/fusions/fused_bias_geglu.py +442 -0
  38. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_softmax.py +51 -7
  39. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/dynamic_context.py +167 -80
  40. megatron_core-0.15.0rc4/megatron/core/inference/data_parallel_inference_coordinator.py +322 -0
  41. megatron_core-0.15.0rc4/megatron/core/inference/engines/dynamic_engine.py +828 -0
  42. megatron_core-0.15.0rc4/megatron/core/inference/headers.py +17 -0
  43. megatron_core-0.15.0rc4/megatron/core/inference/inference_client.py +190 -0
  44. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/inference_request.py +11 -1
  45. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +11 -10
  46. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/sampling_params.py +11 -0
  47. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +43 -9
  48. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/model_parallel_config.py +6 -3
  49. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/t5_model.py +8 -8
  50. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/backends.py +9 -0
  51. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +23 -21
  52. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/language_module/language_module.py +13 -12
  53. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/model_chunk_schedule_plan.py +115 -109
  54. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/fine_grained_callables.py +117 -7
  55. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/gpt_layer_specs.py +23 -9
  56. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/gpt_model.py +55 -17
  57. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +11 -3
  58. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/moe_module_specs.py +7 -1
  59. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/mamba_model.py +8 -8
  60. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/context_parallel.py +25 -13
  61. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/llava_model.py +17 -12
  62. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/base_attention.py +4 -4
  63. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/decoder_attention.py +5 -5
  64. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/decoder_spec.py +8 -2
  65. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/clip_vit_model.py +5 -5
  66. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/multimodal_projector.py +35 -30
  67. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/radio.py +30 -4
  68. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/nccl_allocator.py +39 -8
  69. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/__init__.py +16 -122
  70. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/distrib_optimizer.py +432 -130
  71. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/optimizer.py +61 -9
  72. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/optimizer_config.py +0 -6
  73. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/package_info.py +4 -6
  74. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/parallel_state.py +9 -7
  75. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/combined_1f1b.py +179 -66
  76. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/schedules.py +334 -232
  77. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/utils.py +0 -16
  78. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +1 -0
  79. megatron_core-0.15.0rc4/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
  80. megatron_core-0.15.0rc4/megatron/core/process_groups_config.py +489 -0
  81. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_block.py +8 -8
  82. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_layer.py +4 -4
  83. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_mixer.py +9 -9
  84. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mlp_layer.py +3 -3
  85. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/layers.py +3 -3
  86. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/random.py +5 -2
  87. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/timers.py +14 -1
  88. megatron_core-0.15.0rc4/megatron/core/tokenizers/__init__.py +4 -0
  89. megatron_core-0.15.0rc4/megatron/core/tokenizers/base_tokenizer.py +48 -0
  90. megatron_core-0.15.0rc4/megatron/core/tokenizers/megatron_tokenizer.py +171 -0
  91. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/__init__.py +3 -0
  92. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/__init__.py +8 -0
  93. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +147 -0
  94. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +164 -0
  95. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/chat_template.py +71 -0
  96. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +335 -0
  97. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +179 -0
  98. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/null_tokenizer.py +79 -0
  99. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +411 -0
  100. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +303 -0
  101. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/__init__.py +8 -0
  102. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/bert_tokenizer.py +12 -0
  103. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/default_tokenizer.py +12 -0
  104. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/gpt_tokenizer.py +12 -0
  105. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/mamba_tokenizer.py +12 -0
  106. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/retro_tokenizer.py +12 -0
  107. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/t5_tokenizer.py +12 -0
  108. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/text_tokenizer.py +254 -0
  109. megatron_core-0.15.0rc4/megatron/core/tokenizers/text/utils/build_tokenizer.py +58 -0
  110. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/attention.py +51 -25
  111. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/cuda_graphs.py +183 -61
  112. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/dot_product_attention.py +44 -13
  113. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/mlp.py +44 -6
  114. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/module.py +32 -3
  115. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/experts.py +60 -27
  116. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/moe_layer.py +47 -20
  117. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/moe_utils.py +20 -16
  118. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/router.py +89 -12
  119. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/shared_experts.py +36 -5
  120. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/token_dispatcher.py +20 -19
  121. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/multi_latent_attention.py +42 -17
  122. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/multi_token_prediction.py +241 -211
  123. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/pipeline_parallel_layer_layout.py +46 -11
  124. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_block.py +126 -63
  125. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_config.py +129 -19
  126. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_layer.py +77 -46
  127. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/utils.py +117 -0
  128. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/utils.py +28 -5
  129. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4/megatron_core.egg-info}/PKG-INFO +24 -7
  130. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/SOURCES.txt +28 -0
  131. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/requires.txt +6 -4
  132. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/pyproject.toml +11 -7
  133. megatron_core-0.14.0rc7/megatron/core/fusions/fused_bias_geglu.py +0 -85
  134. megatron_core-0.14.0rc7/megatron/core/inference/engines/dynamic_engine.py +0 -423
  135. megatron_core-0.14.0rc7/megatron/core/process_groups_config.py +0 -233
  136. megatron_core-0.14.0rc7/megatron/core/transformer/moe/__init__.py +0 -0
  137. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/MANIFEST.in +0 -0
  138. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/README.md +0 -0
  139. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/activations.py +0 -0
  140. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/config.py +0 -0
  141. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/config_logger.py +0 -0
  142. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/__init__.py +0 -0
  143. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/bert_dataset.py +0 -0
  144. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_dataset.py +0 -0
  145. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/helpers.py +0 -0
  146. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/indexed_dataset.py +0 -0
  147. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/masked_dataset.py +0 -0
  148. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/megatron_dataset.py +0 -0
  149. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/multimodal_dataset.py +0 -0
  150. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/object_storage_utils.py +0 -0
  151. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/__init__.py +0 -0
  152. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/__init__.py +0 -0
  153. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  154. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/config.py +0 -0
  155. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  156. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/__init__.py +0 -0
  157. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/build.py +0 -0
  158. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/dataset.py +0 -0
  159. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/utils.py +0 -0
  160. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/external_libs.py +0 -0
  161. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/__init__.py +0 -0
  162. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/build.py +0 -0
  163. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/factory.py +0 -0
  164. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/index.py +0 -0
  165. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  166. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  167. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  168. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/utils.py +0 -0
  169. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/validate.py +0 -0
  170. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/__init__.py +0 -0
  171. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  172. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  173. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/query.py +0 -0
  174. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  175. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/utils.py +0 -0
  176. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/utils.py +0 -0
  177. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/t5_dataset.py +0 -0
  178. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils.py +0 -0
  179. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils_object_storage.py +0 -0
  180. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils_s3.py +0 -0
  181. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/__init__.py +0 -0
  182. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/core.py +0 -0
  183. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  184. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/serialization.py +0 -0
  185. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  186. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  187. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  188. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  189. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  190. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  191. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  192. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  193. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  194. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  195. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  196. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/utils.py +0 -0
  197. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/__init__.py +0 -0
  198. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/data_parallel_base.py +0 -0
  199. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/__init__.py +0 -0
  200. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  201. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  202. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  203. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  204. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  205. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/energy_monitor.py +0 -0
  206. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/__init__.py +0 -0
  207. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/data_type.py +0 -0
  208. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/export_config.py +0 -0
  209. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/model_type.py +0 -0
  210. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/__init__.py +0 -0
  211. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  212. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  213. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  214. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  215. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  216. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  217. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  218. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  219. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  220. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  221. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  222. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/__init__.py +0 -0
  223. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fp8_utils.py +0 -0
  224. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/full_cuda_graph.py +0 -0
  225. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/__init__.py +0 -0
  226. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  227. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  228. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  229. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  230. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_indices_converter.py +0 -0
  231. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_layer_norm.py +0 -0
  232. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  233. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  234. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  235. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/hyper_comm_grid.py +0 -0
  236. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/__init__.py +0 -0
  237. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/async_stream.py +0 -0
  238. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/common_inference_params.py +0 -0
  239. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/communication_utils.py +0 -0
  240. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/__init__.py +0 -0
  241. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/base_context.py +0 -0
  242. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  243. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/static_context.py +0 -0
  244. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/__init__.py +0 -0
  245. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/abstract_engine.py +0 -0
  246. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/mcore_engine.py +0 -0
  247. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/static_engine.py +0 -0
  248. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  249. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  250. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  251. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  252. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  253. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  254. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  255. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/scheduler.py +0 -0
  256. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  257. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  258. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  259. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  260. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/utils.py +0 -0
  261. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference_params.py +0 -0
  262. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/jit.py +0 -0
  263. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/__init__.py +0 -0
  264. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/t5_spec.py +0 -0
  265. {megatron_core-0.14.0rc7/megatron/core/post_training → megatron_core-0.15.0rc4/megatron/core/models}/__init__.py +0 -0
  266. {megatron_core-0.14.0rc7/megatron/core/models → megatron_core-0.15.0rc4/megatron/core/models/bert}/__init__.py +0 -0
  267. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  268. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_lm_head.py +0 -0
  269. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_model.py +0 -0
  270. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/pooler.py +0 -0
  271. {megatron_core-0.14.0rc7/megatron/core/models/bert → megatron_core-0.15.0rc4/megatron/core/models/common}/__init__.py +0 -0
  272. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/__init__.py +0 -0
  273. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  274. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  275. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  276. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  277. {megatron_core-0.14.0rc7/megatron/core/models/common → megatron_core-0.15.0rc4/megatron/core/models/common/language_module}/__init__.py +0 -0
  278. {megatron_core-0.14.0rc7/megatron/core/models/common/language_module → megatron_core-0.15.0rc4/megatron/core/models/common/vision_module}/__init__.py +0 -0
  279. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  280. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/__init__.py +0 -0
  281. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/__init__.py +0 -0
  282. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/clip_model.py +0 -0
  283. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/module.py +0 -0
  284. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/qwen_model.py +0 -0
  285. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/__init__.py +0 -0
  286. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  287. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/__init__.py +0 -0
  288. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/config/__init__.py +0 -0
  289. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/config/base_configs.py +0 -0
  290. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/model/__init__.py +0 -0
  291. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/model/base.py +0 -0
  292. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/audio.py +0 -0
  293. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/base.py +0 -0
  294. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/vision.py +0 -0
  295. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/__init__.py +0 -0
  296. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/llava_spec.py +0 -0
  297. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/__init__.py +0 -0
  298. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/config.py +0 -0
  299. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/encoder_attention.py +0 -0
  300. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/encoder_spec.py +0 -0
  301. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/model.py +0 -0
  302. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/utils.py +0 -0
  303. {megatron_core-0.14.0rc7/megatron/core/models/common/vision_module → megatron_core-0.15.0rc4/megatron/core/models/vision}/__init__.py +0 -0
  304. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  305. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/msc_utils.py +0 -0
  306. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/num_microbatches_calculator.py +0 -0
  307. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/clip_grads.py +0 -0
  308. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  309. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  310. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/grad_scaler.py +0 -0
  311. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer_param_scheduler.py +0 -0
  312. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/packed_seq_params.py +0 -0
  313. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/__init__.py +0 -0
  314. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  315. {megatron_core-0.14.0rc7/megatron/core/post_training/modelopt/mamba → megatron_core-0.15.0rc4/megatron/core/post_training}/__init__.py +0 -0
  316. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/__init__.py +0 -0
  317. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  318. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  319. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/layers.py +0 -0
  320. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  321. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/__init__.py +0 -0
  322. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/quant_config.py +0 -0
  323. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/utils.py +0 -0
  324. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/requirements.txt +0 -0
  325. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/rerun_state_machine.py +0 -0
  326. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/safe_globals.py +0 -0
  327. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/__init__.py +0 -0
  328. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  329. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  330. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/triton_cache_manager.py +0 -0
  331. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/__init__.py +0 -0
  332. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  333. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/data.py +0 -0
  334. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/mappings.py +0 -0
  335. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/utils.py +0 -0
  336. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/__init__.py +0 -0
  337. {megatron_core-0.14.0rc7/megatron/core/models/vision → megatron_core-0.15.0rc4/megatron/core/transformer/custom_layers}/__init__.py +0 -0
  338. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  339. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/enums.py +0 -0
  340. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  341. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  342. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  343. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/identity_op.py +0 -0
  344. {megatron_core-0.14.0rc7/megatron/core/transformer/custom_layers → megatron_core-0.15.0rc4/megatron/core/transformer/moe}/__init__.py +0 -0
  345. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  346. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  347. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  348. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/spec_utils.py +0 -0
  349. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/torch_layer_norm.py +0 -0
  350. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/torch_norm.py +0 -0
  351. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/dependency_links.txt +0 -0
  352. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/top_level.txt +0 -0
  353. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/setup.cfg +0 -0
  354. {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/setup.py +0 -0
@@ -37,7 +37,7 @@ Below are licenses used in those files, as indicated.
 
 
 --------------------------------------------------------------------------------------
- -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --
+ -- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, TinyZero and vLLM code --
 
 
 Apache License
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
- Version: 0.14.0rc7
+ Version: 0.15.0rc4
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -31,29 +31,30 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy<2.0.0
- Requires-Dist: packaging
+ Requires-Dist: packaging>=24.2
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
 Requires-Dist: sentencepiece; extra == "mlm"
 Requires-Dist: tiktoken; extra == "mlm"
 Requires-Dist: wandb; extra == "mlm"
+ Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
- Requires-Dist: transformers~=4.53; extra == "dev"
- Requires-Dist: multi-storage-client~=0.20; extra == "dev"
+ Requires-Dist: multi-storage-client~=0.27; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
 Requires-Dist: mamba-ssm~=2.2; extra == "dev"
 Requires-Dist: causal-conv1d~=1.5; extra == "dev"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
- Requires-Dist: transformer-engine[pytorch]<2.7.0,>=2.6.0a0; extra == "dev"
+ Requires-Dist: transformer-engine[pytorch]<2.8.0,>=2.6.0a0; extra == "dev"
 Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
 Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
 Requires-Dist: flashinfer-python; extra == "dev"
+ Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
@@ -63,6 +64,7 @@ Requires-Dist: nvtx; extra == "lts"
 Requires-Dist: transformers; extra == "lts"
 Requires-Dist: zarr; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
+ Requires-Dist: wget; extra == "lts"
 Dynamic: license-file
 
 <div align="center">
@@ -93,7 +95,10 @@ cd Megatron-LM
 
 # Latest News
 
- - 📣 NEW! **[DeepSeek & MoE Training with FP8](https://github.com/yanring/Megatron-MoE-ModelZoo)** examples are now available, including optimized configurations for `DeepSeek-V3`, `Qwen2` and `Mixtral` models with FP8 precision support.
+ - 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+ - 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+ - 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+ - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
 - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
 <details>
@@ -143,6 +148,7 @@ cd Megatron-LM
 **Resources**
 - [Examples](./examples/) - Training scripts and tutorials
 - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+ - [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
 - [Community & Support](#-community--support) - Get help and contribute
 - [Getting Help](#getting-help)
 - [Contributing](#contributing)
@@ -217,10 +223,12 @@ Megatron-LM/
 
 **Libraries using Megatron Core:**
 
+ - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+ - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
 - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
 - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
- **Compatible with:** [HuggingFace Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+ **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
 # Installation
 
@@ -510,6 +518,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 --use-distributed-optimizer
 ```
 
+ # Roadmaps
+
+ Stay up-to-date with our development roadmaps and planned features:
+
+ - **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+ - **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+ *More roadmap trackers will be added soon.*
+
 # Community & Support
 
 ## Getting Help
@@ -26,7 +26,10 @@ cd Megatron-LM
 
 # Latest News
 
- - 📣 NEW! **[DeepSeek & MoE Training with FP8](https://github.com/yanring/Megatron-MoE-ModelZoo)** examples are now available, including optimized configurations for `DeepSeek-V3`, `Qwen2` and `Mixtral` models with FP8 precision support.
+ - 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+ - 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+ - 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+ - **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
 - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
 <details>
@@ -76,6 +79,7 @@ cd Megatron-LM
 **Resources**
 - [Examples](./examples/) - Training scripts and tutorials
 - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+ - [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
 - [Community & Support](#-community--support) - Get help and contribute
 - [Getting Help](#getting-help)
 - [Contributing](#contributing)
@@ -150,10 +154,12 @@ Megatron-LM/
 
 **Libraries using Megatron Core:**
 
+ - **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+ - **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
 - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
 - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
- **Compatible with:** [HuggingFace Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+ **Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
 # Installation
 
@@ -443,6 +449,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 --use-distributed-optimizer
 ```
 
+ # Roadmaps
+
+ Stay up-to-date with our development roadmaps and planned features:
+
+ - **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+ - **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+ *More roadmap trackers will be added soon.*
+
 # Community & Support
 
 ## Getting Help
@@ -33,6 +33,17 @@ __all__ = [
 "InferenceParams",
 "ModelParallelConfig",
 "Timers",
+ "__contact_emails__",
+ "__contact_names__",
+ "__description__",
+ "__download_url__",
+ "__homepage__",
+ "__keywords__",
+ "__license__",
+ "__package_name__",
+ "__repository_url__",
+ "__shortversion__",
+ "__version__",
 ]
 
 from .safe_globals import register_safe_globals
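
The `__all__` additions above re-export the package metadata dunders (presumably defined in `megatron/core/package_info.py`, which is also touched in this release) from the top level of `megatron.core`. A minimal sketch of reading them, assuming they are plain string attributes as the names suggest:

```python
# Hedged sketch: inspect the newly re-exported package metadata.
# Assumes megatron-core >= 0.15.0rc4 is installed and that these dunders
# are simple string attributes; nothing here is confirmed beyond the
# __all__ entries shown in the hunk above.
import megatron.core as mcore

print(mcore.__package_name__)   # package name string
print(mcore.__version__)        # full version string
print(mcore.__shortversion__)   # abbreviated version string
```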
@@ -35,7 +35,8 @@ class BlendedMegatronDatasetBuilder(object):
 
 is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
 the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
- global rank, local group rank, and virtual rank may inform its return value.
+ global rank, local group rank, and virtual rank may inform its return value. Should
+ return true for exactly one process on global rank 0.
 
 config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
 """
@@ -72,13 +73,6 @@ class BlendedMegatronDatasetBuilder(object):
 for {split.name} split
 This can occur with multiple validation sets if datasets have weights"""
 
- if torch.distributed.is_initialized():
- gb_rank = torch.distributed.get_rank()
- if gb_rank == 0:
- assert (
- self.is_built_on_rank()
- ), "is_built_on_rank must return True when global rank = 0"
-
 def build(self) -> List[Optional[TopLevelDataset]]:
 """Build all dataset splits according to the provided blend(s)
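
The new docstring sentence tightens the contract on `is_built_on_rank` now that the runtime assertion removed above is gone: whatever rank policy the callable encodes, it must still return True on global rank 0. A hedged sketch of such a callable (not a helper shipped by the library; the policy shown is only an example):

```python
import torch


def is_built_on_rank() -> bool:
    """Example policy for BlendedMegatronDatasetBuilder's is_built_on_rank argument.

    Hypothetical illustration only: build the dataset on global rank 0
    (and everywhere when torch.distributed is not initialized), which
    trivially satisfies the documented requirement.
    """
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0
```

In practice the callable is usually parallelism-aware (e.g. keyed on tensor- or pipeline-parallel rank, as the docstring notes), but the rank-0 requirement stays the same.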
 
@@ -6,8 +6,8 @@ import re
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from megatron.core.datasets.utils import Split, log_single_rank, normalize
+ from megatron.core.tokenizers import MegatronTokenizerBase
 
 logger = logging.getLogger(__name__)
 
@@ -66,8 +66,8 @@ class BlendedMegatronDatasetConfig:
 constructor.
 """
 
- tokenizer: Optional[MegatronTokenizer] = None
- """The MegatronTokenizer instance. Required for datasets that do online tokenization."""
+ tokenizer: Optional[MegatronTokenizerBase] = None
+ """The MegatronTokenizerBase instance. Required for datasets that do online tokenization."""
 
 mid_level_dataset_surplus: float = 0.005
 """The sample surplus to build for the mid-level datasets(s). Defaults arbitrarily to 0.005.
@@ -12,9 +12,9 @@ import torch
 from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
 from megatron.core.datasets.indexed_dataset import IndexedDataset
 from megatron.core.datasets.megatron_dataset import MegatronDataset
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from megatron.core.datasets.object_storage_utils import ObjectStorageConfig, is_object_storage_path
 from megatron.core.datasets.utils import Split
+ from megatron.core.tokenizers import MegatronTokenizerBase
 from megatron.core.utils import log_single_rank
 
 logger = logging.getLogger(__name__)
@@ -701,8 +701,8 @@ class MockGPTLowLevelDataset:
 we add the end of document token to each element indexed in __getitem__
 
 Args:
- tokenizer (MegatronTokenizer): The tokenizer the special token information of which we use
- to augment the mock data.
+ tokenizer (MegatronTokenizerBase): The tokenizer the special token information of which
+ we use to augment the mock data.
 """
 
 seed: int = 0
@@ -714,7 +714,7 @@ class MockGPTLowLevelDataset:
 max_sequence_length: int = 4096
 """The hard-coded max sequence length to generate"""
 
- def __init__(self, tokenizer: MegatronTokenizer) -> None:
+ def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
 self.tokenizer = tokenizer
 rng = numpy.random.default_rng(seed=self.seed)
 self.sequence_lengths = rng.integers(
@@ -3,6 +3,7 @@
 /* Helper methods for fast index mapping builds */
 
 #include <algorithm>
+ #include <cassert>
 #include <iostream>
 #include <limits>
 #include <math.h>
@@ -46,7 +47,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
 while (dataset_unspent_indices.size() > 0) {
 double index_sample_double = std::max(static_cast<double>(index_sample), 1.0);
 
- int64_t error_argmax;
+ int64_t error_argmax = -1;
 double error_max = std::numeric_limits<double>::lowest();
 
 for (int32_t index_dataset : dataset_unspent_indices) {
@@ -56,6 +57,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
 error_max = error;
 }
 }
+ assert(error_argmax >= 0);
 
 // Populate the indices.
 dataset_index_ptr[index_sample] = static_cast<int16_t>(error_argmax);
@@ -7,7 +7,7 @@ from typing import Any
 import numpy
 
 
- class MegatronTokenizer(ABC):
+ class MegatronLegacyTokenizer(ABC):
 """Abstract class for tokenizer
 
 Absent a config or class-specific tracking of which objects are uniquely identifying, we must
@@ -4,12 +4,12 @@
 
 from dataclasses import dataclass
 
- from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
+ from megatron.core.tokenizers import MegatronTokenizerBase
 
 
 @dataclass
 class RetroTokenizers:
 """Container class for GPT and Bert tokenizers."""
 
- gpt: MegatronTokenizer = None
- bert: MegatronTokenizer = None
+ gpt: MegatronTokenizerBase = None
+ bert: MegatronTokenizerBase = None
@@ -103,11 +103,19 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
 else:
 only_left = []
 only_right = []
+ mismatch_debug_data = [prefix, type(x1), type(x2)]
 if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor):
- if x1.device != x2.device:
- _is_mismatch = not torch.all(x1.cpu() == x2.cpu())
- else:
- _is_mismatch = not torch.all(x1 == x2)
+ try:
+ if x1.device != x2.device:
+ _is_mismatch = not torch.all(x1.cpu() == x2.cpu())
+ else:
+ _is_mismatch = not torch.all(x1 == x2)
+ mismatch_debug_data.extend(
+ [(x1 != x2).sum(), (x1 != x2).shape, (x1 != x2).nonzero().tolist()]
+ )
+ except (RuntimeError, TypeError, ValueError):
+ _is_mismatch = True
+ mismatch_debug_data.extend([x1.shape, x2.shape])
 # TODO: change with concrete type that has both replica_id and data attrs
 elif hasattr(x1, "replica_id") and hasattr(x2, "replica_id"):
 assert type(x1) == type(x2)
@@ -122,7 +130,7 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
 _is_mismatch = True
 
 if _is_mismatch:
- mismatch.append((prefix, type(x1), type(x2)))
+ mismatch.append(tuple(mismatch_debug_data))
 
 return only_left, only_right, mismatch
 
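Mismatch entries returned by `diff` now carry extra debugging payload beyond `(prefix, type, type)`: the count and indices of differing elements, or the two shapes when the element-wise comparison itself raises. A small sketch of inspecting that output (the example dicts are illustrative):

```python
import torch

from megatron.core.dist_checkpointing.dict_utils import diff

left = {"weight": torch.tensor([1.0, 2.0, 3.0])}
right = {"weight": torch.tensor([1.0, 9.0, 3.0])}

only_left, only_right, mismatch = diff(left, right)
for entry in mismatch:
    # Each entry starts with (prefix, type(x1), type(x2)); the appended fields
    # describe how many elements differ and where they are located.
    print(entry)
```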
@@ -29,6 +29,9 @@ ShardedStateDict = Dict[str, Any]
  ReplicaId = Union[int, Tuple[int, ...]]


+ _logged_deprecations = {}
+
+
  class ShardedBase(ABC):
      """Base class for ShardedTensor and ShardedStateDict."""

@@ -135,17 +138,40 @@ class ShardedTensor(ShardedBase):
                  f"equal to global shape dimensions for {self}"
              )

-         for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
-             if sh != 0 and off % sh != 0:
-                 raise CheckpointingException(
-                     f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
-                 )
+         if self.axis_fragmentations is not None:
+             for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
+                 if sh != 0 and off % sh != 0:
+                     raise CheckpointingException(
+                         f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
+                     )

          if has_flattened_range and self.flattened_range.step is not None:
              raise CheckpointingException(
                  f"`step` argument in the flattened range of a ShardedTensor is not supported."
              )

+         if self.prepend_axis_num:
+             if not _logged_deprecations.get("prepend_axis_num", False):
+                 logger.warning(
+                     "ShardedTensor.prepend_axis_num greater than 0 is deprecated."
+                     " In Megatron-Core this can be prevented by setting sharded_state_dict"
+                     " metadata['singleton_local_shards'] to True."
+                 )
+                 _logged_deprecations["prepend_axis_num"] = True
+
+         if self.flattened_range is not None:
+             if not _logged_deprecations.get("flattened_range", False):
+                 logger.warning(
+                     "ShardedTensor.flattened_range is deprecated."
+                     " Use latest DistributedOptimizer formats."
+                 )
+                 _logged_deprecations["flattened_range"] = True
+
+     @property
+     def has_regular_grid(self):
+         """Alias for having a regular sharding grid."""
+         return self.axis_fragmentations is not None
+
      def global_slice(self) -> Tuple[Union[int, slice], ...]:
          """
          Returns a tuple of int and slice objects representing a slice of the
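The module-level `_logged_deprecations` dict is a simple warn-once guard, so constructing many `ShardedTensor`s does not repeat the same deprecation message. The pattern in isolation looks like this (a sketch; the names below are illustrative, not library API):

```python
import logging

logger = logging.getLogger(__name__)
_logged_deprecations = {}


def warn_once(key: str, message: str) -> None:
    # Log each deprecation only the first time its key is seen, mirroring the
    # guards around prepend_axis_num and flattened_range above.
    if not _logged_deprecations.get(key, False):
        logger.warning(message)
        _logged_deprecations[key] = True
```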
@@ -25,6 +25,12 @@ from .mapping import (
  )
  from .utils import extract_sharded_tensors_and_factories

+ KEEP_VARS_HINT = (
+     " Make sure state dict contains original torch.nn.Parameters (not pure torch.Tensors)"
+     " by passing `keep_vars=True` to `.state_dict()`. If any transformation of the original"
+     " parameter is needed, use a ShardedTensorFactory."
+ )
+

  def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]:
      """Generate mapping from optimizer param to optimizer state id."""
@@ -79,9 +79,24 @@ class AsyncRequest(NamedTuple):

          This logic is equivalent to what should happen in case of the async call.
          """
+         # preload tensors.
+         async_fn_args = list(self.async_fn_args)
+         if self.preload_fn:
+             assert len(async_fn_args) == 3, "Expected 3 args to be passed to async function"
+             # The async_fn is passed as a partial functool with pre-determined args
+             # In the async_fn_args we pass the remaining positional args required by the async_fn
+             # async_fn_args[1] refers to the write_buckets
+             # To ensure we stage the write_buckets to CPU memory for sync CP,
+             # we replace it with preload_fn callable that returns the CPU staged tensors
+             async_fn_args[1] = self.preload_fn()
+         # persist the state
          if self.async_fn is not None:
-             self.async_fn(*self.async_fn_args)
+             self.async_fn(*async_fn_args, **self.async_fn_kwargs)
+
+         # This utility implements a sync cp save. Hence the barrier.
          torch.distributed.barrier()
+
+         # Finalize the CP state
          for finalize_fn in self.finalize_fns:
              finalize_fn()

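The `preload_fn` handling in `execute_sync` swaps the second positional argument (the write buckets) for CPU-staged copies before running the writer synchronously. A standalone sketch of that substitution; `write_fn`, `gpu_buckets`, and `preload_fn` here are illustrative stand-ins, not library objects:

```python
import torch


def write_fn(rank, write_buckets, results_queue=None):
    # Stand-in for the checkpoint writer invoked with three positional args,
    # the second of which holds the buckets of tensors to persist.
    return sum(t.numel() for t in write_buckets)


gpu_buckets = [torch.randn(4), torch.randn(8)]  # would normally live on the GPU
preload_fn = lambda: [t.detach().cpu() for t in gpu_buckets]  # no-arg staging callable

args = [0, gpu_buckets, None]
args[1] = preload_fn()  # replace the buckets with CPU-staged tensors, as execute_sync does
written_elements = write_fn(*args)
```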
@@ -150,7 +165,7 @@ class AsyncCaller(ABC):
          return ten[0] == 0

      @abstractmethod
-     def close(self):
+     def close(self, abort=False):
          """Terminate the async caller at exit of an application or some termination conditions"""
          logger.info(f"AsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller")

@@ -237,15 +252,23 @@ class TemporalAsyncCaller(AsyncCaller):
              is_done = True
          return is_done

-     def close(self):
+     def close(self, abort=False):
          """For TemporalAsyncCaller, this method is called explictly in `is_current_async_calls_done`

          This method make sure the TemporalAsyncCaller terminated
          with all its assigned async request completed
+
+         Args:
+             abort (bool, optional): Default to False. Needs to be manually set to true when
+                 the checkpoint async process needs to be aborted.
          """
          if self.process:
              logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process")
-             self.process.join()
+             if abort:
+                 logger.warning(f"Temporal worker aborted in rank {torch.distributed.get_rank()}")
+                 self.process.kill()
+             else:
+                 self.process.join()
          self.process = None
          logger.debug(
              "TemporalAsyncCaller: Async process join finished "
@@ -388,18 +411,25 @@ class PersistentAsyncCaller(AsyncCaller):

          return is_done

-     def close(self):
+     def close(self, abort=False):
          """Wait on the left async requests and terminate the PersistentAsyncCaller

          Signals the PersistentAsyncCaller by sending a 'DONE' message to make it terminated
+         Args:
+             abort (bool, optional): Default to False. Needs to be manually set to true when
+                 the checkpoint async process needs to be aborted.
          """
          logger.info(
              f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller"
          )
          if self.process:
-             self.queue.put('DONE')
-             self.queue.join()
-             self.process.join()
+             if abort:
+                 logger.warning(f"Persistent worker aborted in rank {torch.distributed.get_rank()}")
+                 self.process.kill()
+             else:
+                 self.queue.put('DONE')
+                 self.queue.join()
+                 self.process.join()
          self.process = None

      def __del__(self):
@@ -528,6 +558,9 @@ class AsyncCallsQueue:
              blocking (bool, optional): if True, will wait until all active requests
                  are done. Otherwise, finalizes only the async request that already
                  finished. Defaults to False.
+
+             no_dist (bool, Optional): if True, training ranks simply check its
+                 asynchronous checkpoint writer without synchronization.
          Returns:
              List[int]: list of indices (as returned by `schedule_async_request`)
                  of async calls that have been successfully finalized.
@@ -545,8 +578,8 @@ class AsyncCallsQueue:
                  finalize_fn()
              ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device())
              torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX)
-             assert ten.item() == call_idx, 'Unmatched async calls. '
-             'That probably means not all ranks are participating in async finalization'
+             assert ten.item() == call_idx, "Unmatched async calls. "
+             "That probably means not all ranks are participating in async finalization"
              call_idx_finalized.append(call_idx)
          return call_idx_finalized

@@ -554,8 +587,13 @@ class AsyncCallsQueue:
          """Get the number of active async calls."""
          return len(self.async_calls)

-     def close(self):
-         """Finalize all calls upon closing."""
-         self.maybe_finalize_async_calls(blocking=True)
+     def close(self, abort=False):
+         """Finalize all calls upon closing.
+         Args:
+             abort (bool, optional): Default to False. Needs to be manually set to true when
+                 the checkpoint async process needs to be aborted.
+         """
+         if not abort:
+             self.maybe_finalize_async_calls(blocking=True)
          if self.persistent and self.persistent_caller:
-             self.persistent_caller.close()
+             self.persistent_caller.close(abort=abort)
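A sketch of how a training loop might use the new `abort` flag when tearing down the queue after a fatal error; the surrounding control flow (and the no-argument `AsyncCallsQueue()` construction) is illustrative:

```python
from megatron.core.dist_checkpointing.strategies.async_utils import AsyncCallsQueue

queue = AsyncCallsQueue()
try:
    # ... schedule async checkpoint requests and continue training ...
    queue.maybe_finalize_async_calls(blocking=False)
except Exception:
    # Fatal error: skip finalization and kill the checkpoint worker outright.
    queue.close(abort=True)
    raise
else:
    # Normal shutdown: wait for outstanding saves, then join the worker.
    queue.close()
```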
@@ -221,8 +221,4 @@ class AsyncSaveShardedStrategy(SaveShardedStrategy):
      def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
          """Each async strategy can be trivially used as a sync strategy."""
          async_request = self.async_save(sharded_state_dict, checkpoint_dir)
-         # multiprocessing routines may cause issue when called on parent process
-         # We keep this verbose call for now
-         global async_calls
-         async_calls.schedule_async_request(async_request)
-         async_calls.maybe_finalize_async_calls(blocking=True)
+         async_request.execute_sync()
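Because `execute_sync()` runs the writer on the calling rank and then issues a `torch.distributed.barrier()`, even the synchronous `save()` path assumes an initialized process group. A minimal single-rank sketch (the gloo group initialization is illustrative):

```python
import torch.distributed as dist


def save_blocking(strategy, sharded_state_dict, checkpoint_dir):
    # A process group must exist because execute_sync() calls barrier().
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
        )
    # Equivalent to: strategy.async_save(...).execute_sync()
    strategy.save(sharded_state_dict, checkpoint_dir)
```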