megatron-core 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (353)
  1. {megatron_core-0.15.0rc4/megatron_core.egg-info → megatron_core-0.15.0rc6}/PKG-INFO +1 -1
  2. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/indexed_dataset.py +10 -7
  3. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel.py +7 -12
  4. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/finalize_model_grads.py +10 -12
  5. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +19 -20
  6. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +3 -5
  7. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +2 -3
  8. megatron_core-0.15.0rc6/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +521 -0
  9. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +130 -28
  10. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +1 -1
  11. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +92 -56
  12. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +42 -30
  13. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/param_and_grad_buffer.py +5 -2
  14. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine.py +109 -8
  15. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fp8_utils.py +22 -17
  16. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/full_cuda_graph.py +6 -3
  17. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_softmax.py +109 -14
  18. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_context.py +33 -13
  19. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/dynamic_engine.py +60 -16
  20. megatron_core-0.15.0rc6/megatron/core/inference/inference_request.py +193 -0
  21. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +5 -3
  22. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/__init__.py +20 -2
  23. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/clip_grads.py +4 -4
  24. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/distrib_optimizer.py +6 -3
  25. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer.py +2 -1
  26. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer_config.py +5 -0
  27. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/package_info.py +1 -1
  28. megatron_core-0.15.0rc6/megatron/core/pipeline_parallel/bridge_communicator.py +399 -0
  29. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/safe_globals.py +3 -1
  30. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_layer.py +32 -21
  31. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/layers.py +16 -9
  32. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/cuda_graphs.py +102 -49
  33. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/dot_product_attention.py +13 -5
  34. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/enums.py +1 -0
  35. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/mlp.py +5 -2
  36. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/module.py +172 -0
  37. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/experts.py +32 -27
  38. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_utils.py +17 -8
  39. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/router.py +13 -1
  40. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/pipeline_parallel_layer_layout.py +10 -6
  41. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_config.py +9 -2
  42. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_layer.py +114 -172
  43. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/utils.py +34 -1
  44. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/utils.py +3 -0
  45. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6/megatron_core.egg-info}/PKG-INFO +1 -1
  46. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/SOURCES.txt +1 -0
  47. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/pyproject.toml +21 -7
  48. megatron_core-0.15.0rc4/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -387
  49. megatron_core-0.15.0rc4/megatron/core/inference/inference_request.py +0 -91
  50. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/LICENSE +0 -0
  51. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/MANIFEST.in +0 -0
  52. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/README.md +0 -0
  53. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/README.md +0 -0
  54. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/__init__.py +0 -0
  55. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/activations.py +0 -0
  56. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/config.py +0 -0
  57. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/config_logger.py +0 -0
  58. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/__init__.py +0 -0
  59. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/bert_dataset.py +0 -0
  60. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_dataset.py +0 -0
  61. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  62. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  63. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/gpt_dataset.py +0 -0
  64. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.cpp +0 -0
  65. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.py +0 -0
  66. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/masked_dataset.py +0 -0
  67. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_dataset.py +0 -0
  68. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  69. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/multimodal_dataset.py +0 -0
  70. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/object_storage_utils.py +0 -0
  71. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/__init__.py +0 -0
  72. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/__init__.py +0 -0
  73. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  74. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/config.py +0 -0
  75. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  76. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  77. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/__init__.py +0 -0
  78. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/build.py +0 -0
  79. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/dataset.py +0 -0
  80. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/utils.py +0 -0
  81. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/external_libs.py +0 -0
  82. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/__init__.py +0 -0
  83. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/build.py +0 -0
  84. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/factory.py +0 -0
  85. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/index.py +0 -0
  86. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  87. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  88. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  89. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/utils.py +0 -0
  90. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/validate.py +0 -0
  91. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/__init__.py +0 -0
  92. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  93. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  94. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/query.py +0 -0
  95. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  96. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/utils.py +0 -0
  97. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/utils.py +0 -0
  98. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/t5_dataset.py +0 -0
  99. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils.py +0 -0
  100. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_object_storage.py +0 -0
  101. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_s3.py +0 -0
  102. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/__init__.py +0 -0
  103. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/core.py +0 -0
  104. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  105. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  106. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/mapping.py +0 -0
  107. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  108. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/serialization.py +0 -0
  109. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  110. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  111. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  112. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  113. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  114. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  115. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  116. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  117. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  118. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  119. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  120. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  121. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  122. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  123. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  124. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  125. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/utils.py +0 -0
  126. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/validation.py +0 -0
  127. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/__init__.py +0 -0
  128. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/data_parallel_base.py +0 -0
  129. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  130. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/__init__.py +0 -0
  131. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  132. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  133. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  134. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  135. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/energy_monitor.py +0 -0
  136. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/enums.py +0 -0
  137. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/__init__.py +0 -0
  138. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/data_type.py +0 -0
  139. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/export_config.py +0 -0
  140. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/model_type.py +0 -0
  141. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/__init__.py +0 -0
  142. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  143. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  144. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  145. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  146. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  147. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  148. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  149. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  150. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  151. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  152. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  153. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  154. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/__init__.py +0 -0
  155. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/kitchen.py +0 -0
  156. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  157. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fp4_utils.py +0 -0
  158. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/__init__.py +0 -0
  159. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  160. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  161. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  162. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  163. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  164. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_indices_converter.py +0 -0
  165. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_layer_norm.py +0 -0
  166. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  167. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  168. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  169. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/hyper_comm_grid.py +0 -0
  170. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/__init__.py +0 -0
  171. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/async_stream.py +0 -0
  172. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/common_inference_params.py +0 -0
  173. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/communication_utils.py +0 -0
  174. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/__init__.py +0 -0
  175. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/base_context.py +0 -0
  176. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  177. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/static_context.py +0 -0
  178. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  179. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/__init__.py +0 -0
  180. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/abstract_engine.py +0 -0
  181. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/mcore_engine.py +0 -0
  182. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/static_engine.py +0 -0
  183. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/headers.py +0 -0
  184. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/inference_client.py +0 -0
  185. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  186. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  187. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  188. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  189. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  190. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  191. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  192. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  193. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/sampling_params.py +0 -0
  194. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/scheduler.py +0 -0
  195. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  196. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  197. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  198. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  199. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/utils.py +0 -0
  200. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference_params.py +0 -0
  201. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/jit.py +0 -0
  202. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/model_parallel_config.py +0 -0
  203. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/__init__.py +0 -0
  204. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_model.py +0 -0
  205. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_spec.py +0 -0
  206. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/__init__.py +0 -0
  207. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/backends.py +0 -0
  208. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/__init__.py +0 -0
  209. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  210. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_lm_head.py +0 -0
  211. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_model.py +0 -0
  212. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/pooler.py +0 -0
  213. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/__init__.py +0 -0
  214. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/__init__.py +0 -0
  215. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  216. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  217. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  218. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  219. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  220. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/__init__.py +0 -0
  221. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/language_module.py +0 -0
  222. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  223. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/__init__.py +0 -0
  224. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  225. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/__init__.py +0 -0
  226. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  227. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  228. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_model.py +0 -0
  229. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  230. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  231. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/__init__.py +0 -0
  232. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/clip_model.py +0 -0
  233. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/module.py +0 -0
  234. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/qwen_model.py +0 -0
  235. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/__init__.py +0 -0
  236. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  237. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_model.py +0 -0
  238. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/__init__.py +0 -0
  239. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/__init__.py +0 -0
  240. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/base_configs.py +0 -0
  241. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/__init__.py +0 -0
  242. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/base.py +0 -0
  243. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/audio.py +0 -0
  244. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/base.py +0 -0
  245. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/vision.py +0 -0
  246. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/__init__.py +0 -0
  247. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/context_parallel.py +0 -0
  248. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_model.py +0 -0
  249. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_spec.py +0 -0
  250. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/__init__.py +0 -0
  251. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/base_attention.py +0 -0
  252. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/config.py +0 -0
  253. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_attention.py +0 -0
  254. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_spec.py +0 -0
  255. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_attention.py +0 -0
  256. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_spec.py +0 -0
  257. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/model.py +0 -0
  258. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/utils.py +0 -0
  259. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/__init__.py +0 -0
  260. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/clip_vit_model.py +0 -0
  261. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/multimodal_projector.py +0 -0
  262. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/radio.py +0 -0
  263. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  264. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/msc_utils.py +0 -0
  265. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/nccl_allocator.py +0 -0
  266. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/num_microbatches_calculator.py +0 -0
  267. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  268. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  269. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/grad_scaler.py +0 -0
  270. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer_param_scheduler.py +0 -0
  271. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/packed_seq_params.py +0 -0
  272. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/parallel_state.py +0 -0
  273. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/__init__.py +0 -0
  274. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  275. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  276. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/schedules.py +0 -0
  277. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/utils.py +0 -0
  278. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/__init__.py +0 -0
  279. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/__init__.py +0 -0
  280. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  281. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  282. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  283. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/layers.py +0 -0
  284. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  285. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  286. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/process_groups_config.py +0 -0
  287. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/__init__.py +0 -0
  288. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/quant_config.py +0 -0
  289. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/utils.py +0 -0
  290. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/requirements.txt +0 -0
  291. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/rerun_state_machine.py +0 -0
  292. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/__init__.py +0 -0
  293. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_block.py +0 -0
  294. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  295. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  296. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_mixer.py +0 -0
  297. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mlp_layer.py +0 -0
  298. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/triton_cache_manager.py +0 -0
  299. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/__init__.py +0 -0
  300. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  301. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/data.py +0 -0
  302. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/mappings.py +0 -0
  303. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/random.py +0 -0
  304. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/utils.py +0 -0
  305. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/timers.py +0 -0
  306. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/__init__.py +0 -0
  307. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  308. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  309. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/__init__.py +0 -0
  310. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  311. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  312. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  313. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  314. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  315. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  316. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  317. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  318. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  319. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  320. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  321. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  322. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  323. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  324. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  325. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  326. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  327. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  328. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/__init__.py +0 -0
  329. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/attention.py +0 -0
  330. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  331. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  332. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  333. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  334. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  335. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/identity_op.py +0 -0
  336. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/__init__.py +0 -0
  337. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  338. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  339. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_layer.py +0 -0
  340. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/shared_experts.py +0 -0
  341. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  342. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  343. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_latent_attention.py +0 -0
  344. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_token_prediction.py +0 -0
  345. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/spec_utils.py +0 -0
  346. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_layer_norm.py +0 -0
  347. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_norm.py +0 -0
  348. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_block.py +0 -0
  349. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/dependency_links.txt +0 -0
  350. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/requires.txt +0 -0
  351. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/top_level.txt +0 -0
  352. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/setup.cfg +0 -0
  353. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.15.0rc4
3
+ Version: 0.15.0rc6
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -12,6 +12,7 @@ import shutil
12
12
  import struct
13
13
  import time
14
14
  from abc import ABC, abstractmethod
15
+ from collections.abc import Iterable
15
16
  from enum import Enum
16
17
  from functools import lru_cache
17
18
  from itertools import accumulate
@@ -172,9 +173,9 @@ class _IndexWriter(object):
172
173
 
173
174
  def write(
174
175
  self,
175
- sequence_lengths: List[int],
176
- sequence_modes: Optional[List[int]],
177
- document_indices: List[int],
176
+ sequence_lengths: Iterable[Union[int, numpy.integer]],
177
+ sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
178
+ document_indices: Iterable[Union[int, numpy.integer]],
178
179
  ) -> None:
179
180
  """Write the index (.idx) file
180
181
 
@@ -208,7 +209,9 @@ class _IndexWriter(object):
208
209
  if sequence_modes is not None:
209
210
  self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
210
211
 
211
- def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
212
+ def _sequence_pointers(
213
+ self, sequence_lengths: Iterable[Union[int, numpy.integer]]
214
+ ) -> List[int]:
212
215
  """Build the sequence pointers per the sequence lengths and dtype size
213
216
 
214
217
  Args:
@@ -217,11 +220,11 @@ class _IndexWriter(object):
217
220
  Returns:
218
221
  List[int]: The pointer to the beginning of each sequence
219
222
  """
220
- itemsize = DType.size(self.dtype)
221
- curr_ptr = 0
223
+ itemsize = numpy.int64(DType.size(self.dtype))
224
+ curr_ptr = numpy.int64(0)
222
225
  list_ptr = []
223
226
  for length in sequence_lengths:
224
- list_ptr.append(curr_ptr)
227
+ list_ptr.append(curr_ptr.item())
225
228
  curr_ptr += length * itemsize
226
229
  return list_ptr
227
230
 
@@ -519,8 +519,11 @@ class DistributedDataParallel(_BaseDataParallel):
519
519
  param_slice = bucket.param_data.view(-1)[param_start:param_end]
520
520
  param.data.copy_(param_slice.view(param.data.shape))
521
521
  # All-gathered params are not needed after being copied to param.data.
522
- # Zero out the grad buffer (shared with param buffer) for gradient accumulation.
523
- bucket.grad_data.zero_()
522
+ # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
523
+ # We cannot zero out the entire grad buffer because one grad buffer may
524
+ # correspond to multiple param buffers. If we zero out the entire grad buffer,
525
+ # it would clear the data of those param buffers that have not yet completed AG.
526
+ bucket.param_data.zero_()
524
527
 
525
528
  def start_grad_sync(self, *unused):
526
529
  """
@@ -562,16 +565,8 @@ class DistributedDataParallel(_BaseDataParallel):
562
565
  # to True, and there will be a double-GA.
563
566
  for param in self.params_with_grad:
564
567
  param.grad_added_to_main_grad = False
565
- # In the case of "reuse_grad_buf_for_mxfp8_param_ag=True & overlap_param_gather=True",
566
- # the grad buffer is not reset here because the grad buffer is shared with the param buffer.
567
- # The grad buffer is zeroed by "bucket.grad_data.zero_()" in the "finish_param_sync" stage
568
- # after the param all-gather.
569
- if not (
570
- self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
571
- and self.ddp_config.overlap_param_gather
572
- ):
573
- for buffer in self.buffers + self.expert_parallel_buffers:
574
- buffer.reset()
568
+ for buffer in self.buffers + self.expert_parallel_buffers:
569
+ buffer.reset()
575
570
  for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
576
571
  bucket_group.reset()
577
572
 
@@ -267,13 +267,18 @@ def _allreduce_position_embedding_grads(
267
267
  )
268
268
 
269
269
 
270
- def _reset_global_aux_loss_tracker(model: List[torch.nn.Module]):
270
+ def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.nn.Module]):
271
271
  """
272
- Reset the global aux loss tracker.
272
+ Reset the temporary tensors of the model.
273
273
  """
274
274
  for model_chunk in model:
275
275
  for module in get_attr_wrapped_model(model_chunk, 'modules')():
276
- if hasattr(module, 'reset_global_aux_loss_tracker'):
276
+ if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
277
+ module.local_tokens_per_expert.zero_()
278
+ if (
279
+ config.moe_router_load_balancing_type == "global_aux_loss"
280
+ or "global_aux_loss" in config.moe_router_load_balancing_type
281
+ ) and hasattr(module, 'reset_global_aux_loss_tracker'):
277
282
  module.reset_global_aux_loss_tracker()
278
283
 
279
284
 
@@ -298,10 +303,7 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
298
303
  stacked_tokens_per_expert, stacked_expert_bias, config.moe_router_bias_update_rate
299
304
  )
300
305
 
301
- for tokens_per_expert, expert_bias, updated_expert_bias in zip(
302
- tokens_per_expert_list, expert_bias_list, stacked_updated_expert_bias
303
- ):
304
- tokens_per_expert.zero_()
306
+ for expert_bias, updated_expert_bias in zip(expert_bias_list, stacked_updated_expert_bias):
305
307
  expert_bias.copy_(updated_expert_bias)
306
308
 
307
309
 
@@ -465,11 +467,7 @@ def finalize_model_grads(
465
467
  if config.moe_router_enable_expert_bias:
466
468
  _update_router_expert_bias(model, config)
467
469
 
468
- if (
469
- config.moe_router_load_balancing_type == "global_aux_loss"
470
- or "global_aux_loss" in config.moe_router_load_balancing_type
471
- ):
472
- _reset_global_aux_loss_tracker(model)
470
+ reset_model_temporary_tensors(config, model)
473
471
 
474
472
  # normalize gradients for per-token loss normalization.
475
473
  # if we are using by the number of tokens, then we use that as a divisor. this number
@@ -158,7 +158,7 @@ class FullyShardedDataParallel(_BaseDataParallel):
158
158
  dp_cp_group = parallel_state.get_data_parallel_group(
159
159
  with_context_parallel=True, partial_data_parallel=True
160
160
  )
161
- inter_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
161
+ outer_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
162
162
  hybrid_fsdp_group = parallel_state.get_data_parallel_group(
163
163
  with_context_parallel=True, partial_data_parallel=False
164
164
  )
@@ -166,17 +166,17 @@ class FullyShardedDataParallel(_BaseDataParallel):
166
166
  dp_cp_group = parallel_state.get_data_parallel_group(
167
167
  with_context_parallel=True, partial_data_parallel=False
168
168
  )
169
- inter_fsdp_group = None
169
+ outer_fsdp_group = None
170
170
  hybrid_fsdp_group = None
171
171
  else:
172
172
  tp_group = getattr(pg_collection, 'tp', None)
173
173
  if enable_hsdp:
174
174
  dp_cp_group = pg_collection.intra_dp_cp
175
- inter_fsdp_group = pg_collection.inter_dist_opt
175
+ outer_fsdp_group = pg_collection.inter_dist_opt
176
176
  hybrid_fsdp_group = pg_collection.dp_cp
177
177
  else:
178
178
  dp_cp_group = pg_collection.dp_cp
179
- inter_fsdp_group = None
179
+ outer_fsdp_group = None
180
180
  hybrid_fsdp_group = None
181
181
 
182
182
  if tp_group is None:
@@ -184,17 +184,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
184
184
  tp_group = single_rank_group
185
185
 
186
186
  if enable_hsdp:
187
- mesh = _get_hsdp_tp_mesh(inter_fsdp_group, dp_cp_group, tp_group)
187
+ mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
188
188
  dist_index = FSDPDistributedIndex(
189
- use_hybrid_fsdp=True,
190
189
  hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
191
190
  device_mesh=DeviceMesh.from_group(
192
- [inter_fsdp_group, dp_cp_group, tp_group],
191
+ [outer_fsdp_group, dp_cp_group, tp_group],
193
192
  device_type="cuda",
194
193
  mesh=mesh.tolist(),
195
- mesh_dim_names=["inter_fsdp_dp", "dp_cp", "tp"],
194
+ mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"],
196
195
  ),
197
- dp_inter_dim="inter_fsdp_dp",
196
+ dp_outer_dim="outer_fsdp_dp", # Use Hybrid FSDP!
198
197
  dp_shard_dim="dp_cp",
199
198
  tp_dim="tp",
200
199
  hybrid_fsdp_group=hybrid_fsdp_group,
@@ -222,20 +221,20 @@ class FullyShardedDataParallel(_BaseDataParallel):
222
221
  self.module.synchronize_param_gather()
223
222
 
224
223
 
225
- def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
224
+ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
226
225
  assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
227
226
  world_size = dist.get_world_size()
228
227
 
229
228
  mesh = einops.rearrange(
230
229
  torch.arange(world_size),
231
- "(inter_fsdp_dp fsdp tp) -> inter_fsdp_dp fsdp tp",
232
- inter_fsdp_dp=inter_fsdp_dp_group.size(),
230
+ "(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp",
231
+ outer_fsdp_dp=outer_fsdp_dp_group.size(),
233
232
  tp=tp_group.size(),
234
233
  )
235
234
 
236
235
  mesh_fsdp_ranks = einops.rearrange(
237
236
  mesh,
238
- 'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp tp) fsdp',
237
+ 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp',
239
238
  tp=tp_group.size(),
240
239
  fsdp=dp_cp_group.size(),
241
240
  )
@@ -247,7 +246,7 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
247
246
 
248
247
  mesh_tp_ranks = einops.rearrange(
249
248
  mesh,
250
- 'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp fsdp) tp',
249
+ 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp',
251
250
  tp=tp_group.size(),
252
251
  fsdp=dp_cp_group.size(),
253
252
  )
@@ -257,18 +256,18 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
257
256
  f"do not match the ranks in the TP group {tp_group_ranks}."
258
257
  )
259
258
 
260
- mesh_inter_fsdp_dp_ranks = einops.rearrange(
259
+ mesh_outer_fsdp_dp_ranks = einops.rearrange(
261
260
  mesh,
262
- 'inter_fsdp_dp fsdp tp -> (fsdp tp) inter_fsdp_dp',
261
+ 'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp',
263
262
  tp=tp_group.size(),
264
263
  fsdp=dp_cp_group.size(),
265
264
  )
266
- inter_fsdp_dp_group_ranks = dist.get_process_group_ranks(inter_fsdp_dp_group)
265
+ outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group)
267
266
  assert _check_mesh_ranks_and_group_ranks_are_consistent(
268
- mesh_inter_fsdp_dp_ranks, inter_fsdp_dp_group_ranks
267
+ mesh_outer_fsdp_dp_ranks, outer_fsdp_dp_group_ranks
269
268
  ), (
270
- f"[Megatron-FSDP] Inter FSDP Data Parallel ranks in the mesh {mesh_inter_fsdp_dp_ranks} "
271
- f"do not match the ranks in the Inter FSDP DP group {inter_fsdp_dp_group_ranks}."
269
+ f"[Megatron-FSDP] Outer FSDP Data Parallel ranks in the mesh {mesh_outer_fsdp_dp_ranks} "
270
+ f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}."
272
271
  )
273
272
 
274
273
  return mesh
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from .distributed_data_parallel_config import DistributedDataParallelConfig
16
+ from .fully_shard import fully_shard, fully_shard_model, fully_shard_optimizer
16
17
  from .megatron_fsdp import MegatronFSDP
17
18
  from .package_info import (
18
19
  __contact_emails__,
@@ -29,16 +30,13 @@ from .package_info import (
29
30
  )
30
31
  from .utils import FSDPDistributedIndex
31
32
 
32
- try:
33
- from .fully_shard import fully_shard
34
- except ImportError as e:
35
- print(f"Failed to import fully_shard: {e}")
36
-
37
33
  __all__ = [
38
34
  "DistributedDataParallelConfig",
39
35
  "MegatronFSDP",
40
36
  "FSDPDistributedIndex",
41
37
  "fully_shard",
38
+ "fully_shard_model",
39
+ "fully_shard_optimizer",
42
40
  "__contact_emails__",
43
41
  "__contact_names__",
44
42
  "__description__",
@@ -117,13 +117,12 @@ class DistributedDataParallelConfig:
117
117
  This option will cause additional memory overhead, however, it is necessary for
118
118
  to register user buffer (nccl_ub=True) for the Megatron FSDP.
119
119
  This option will be automatically set to True when nccl_ub=True.
120
- """
120
+ """
121
121
 
122
122
  outer_dp_sharding_strategy: str = 'no_shard'
123
123
  """
124
124
  Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
125
- Valid values are 'no_shard', 'optim', 'optim_grads', 'optim_grads_params'.
126
- This option is only effective when Hybrid FSDP is enabled.
125
+ Valid values are 'no_shard', 'optim'. This option is only effective when Hybrid FSDP is enabled.
127
126
  """
128
127
 
129
128
  disable_symmetric_registration: bool = False