megatron-core 0.15.0rc4__tar.gz → 0.15.0rc5__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (352)
  1. {megatron_core-0.15.0rc4/megatron_core.egg-info → megatron_core-0.15.0rc5}/PKG-INFO +1 -1
  2. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/indexed_dataset.py +10 -7
  3. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +19 -20
  4. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +3 -5
  5. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +2 -3
  6. megatron_core-0.15.0rc5/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +521 -0
  7. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +117 -26
  8. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +1 -1
  9. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +92 -56
  10. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +42 -30
  11. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine.py +13 -2
  12. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/full_cuda_graph.py +6 -3
  13. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_softmax.py +109 -14
  14. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_context.py +33 -13
  15. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/dynamic_engine.py +60 -16
  16. megatron_core-0.15.0rc5/megatron/core/inference/inference_request.py +193 -0
  17. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +5 -3
  18. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/clip_grads.py +4 -4
  19. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer.py +2 -1
  20. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/package_info.py +1 -1
  21. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/safe_globals.py +3 -1
  22. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/layers.py +4 -0
  23. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/cuda_graphs.py +10 -0
  24. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/dot_product_attention.py +13 -5
  25. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/enums.py +1 -0
  26. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/pipeline_parallel_layer_layout.py +10 -6
  27. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_config.py +5 -0
  28. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/utils.py +34 -1
  29. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/utils.py +3 -0
  30. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5/megatron_core.egg-info}/PKG-INFO +1 -1
  31. megatron_core-0.15.0rc4/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -387
  32. megatron_core-0.15.0rc4/megatron/core/inference/inference_request.py +0 -91
  33. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/LICENSE +0 -0
  34. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/MANIFEST.in +0 -0
  35. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/README.md +0 -0
  36. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/README.md +0 -0
  37. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/__init__.py +0 -0
  38. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/activations.py +0 -0
  39. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/config.py +0 -0
  40. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/config_logger.py +0 -0
  41. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/__init__.py +0 -0
  42. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/bert_dataset.py +0 -0
  43. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_dataset.py +0 -0
  44. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  45. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  46. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/gpt_dataset.py +0 -0
  47. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.cpp +0 -0
  48. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.py +0 -0
  49. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/masked_dataset.py +0 -0
  50. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_dataset.py +0 -0
  51. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  52. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/multimodal_dataset.py +0 -0
  53. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/object_storage_utils.py +0 -0
  54. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/__init__.py +0 -0
  55. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/__init__.py +0 -0
  56. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  57. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/config.py +0 -0
  58. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  59. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  60. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/__init__.py +0 -0
  61. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/build.py +0 -0
  62. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/dataset.py +0 -0
  63. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/utils.py +0 -0
  64. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/external_libs.py +0 -0
  65. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/__init__.py +0 -0
  66. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/build.py +0 -0
  67. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/factory.py +0 -0
  68. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/index.py +0 -0
  69. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  70. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  71. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  72. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/utils.py +0 -0
  73. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/validate.py +0 -0
  74. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/__init__.py +0 -0
  75. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  76. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  77. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/query.py +0 -0
  78. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  79. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/utils.py +0 -0
  80. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/utils.py +0 -0
  81. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/t5_dataset.py +0 -0
  82. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils.py +0 -0
  83. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_object_storage.py +0 -0
  84. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_s3.py +0 -0
  85. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/__init__.py +0 -0
  86. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/core.py +0 -0
  87. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  88. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  89. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/mapping.py +0 -0
  90. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  91. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/serialization.py +0 -0
  92. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  93. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  94. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  95. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  96. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  97. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  98. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  99. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  100. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  101. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  102. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  103. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  104. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  105. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  106. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  107. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  108. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/utils.py +0 -0
  109. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/validation.py +0 -0
  110. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/__init__.py +0 -0
  111. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/data_parallel_base.py +0 -0
  112. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  113. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  114. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/finalize_model_grads.py +0 -0
  115. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/__init__.py +0 -0
  116. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  117. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  118. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  119. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  120. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  121. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/energy_monitor.py +0 -0
  122. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/enums.py +0 -0
  123. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/__init__.py +0 -0
  124. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/data_type.py +0 -0
  125. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/export_config.py +0 -0
  126. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/model_type.py +0 -0
  127. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/__init__.py +0 -0
  128. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  129. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  130. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  131. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  132. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  133. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  134. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  135. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  136. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  137. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  138. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  139. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  140. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/extensions/__init__.py +0 -0
  141. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/extensions/kitchen.py +0 -0
  142. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  143. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fp4_utils.py +0 -0
  144. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fp8_utils.py +0 -0
  145. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/__init__.py +0 -0
  146. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  147. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  148. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  149. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  150. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  151. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_indices_converter.py +0 -0
  152. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_layer_norm.py +0 -0
  153. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  154. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  155. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  156. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/hyper_comm_grid.py +0 -0
  157. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/__init__.py +0 -0
  158. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/async_stream.py +0 -0
  159. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/common_inference_params.py +0 -0
  160. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/communication_utils.py +0 -0
  161. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/__init__.py +0 -0
  162. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/base_context.py +0 -0
  163. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  164. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/static_context.py +0 -0
  165. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  166. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/__init__.py +0 -0
  167. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/abstract_engine.py +0 -0
  168. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/mcore_engine.py +0 -0
  169. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/static_engine.py +0 -0
  170. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/headers.py +0 -0
  171. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/inference_client.py +0 -0
  172. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  173. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  174. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  175. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  176. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  177. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  178. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  179. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  180. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/sampling_params.py +0 -0
  181. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/scheduler.py +0 -0
  182. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  183. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  184. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  185. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  186. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference/utils.py +0 -0
  187. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/inference_params.py +0 -0
  188. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/jit.py +0 -0
  189. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/model_parallel_config.py +0 -0
  190. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/T5/__init__.py +0 -0
  191. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_model.py +0 -0
  192. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_spec.py +0 -0
  193. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/__init__.py +0 -0
  194. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/backends.py +0 -0
  195. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/bert/__init__.py +0 -0
  196. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  197. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_lm_head.py +0 -0
  198. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_model.py +0 -0
  199. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/bert/pooler.py +0 -0
  200. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/__init__.py +0 -0
  201. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/__init__.py +0 -0
  202. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  203. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  204. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  205. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  206. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  207. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/language_module/__init__.py +0 -0
  208. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/language_module/language_module.py +0 -0
  209. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  210. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/vision_module/__init__.py +0 -0
  211. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  212. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/__init__.py +0 -0
  213. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  214. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  215. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_model.py +0 -0
  216. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  217. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  218. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/__init__.py +0 -0
  219. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/clip_model.py +0 -0
  220. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/module.py +0 -0
  221. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/qwen_model.py +0 -0
  222. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/__init__.py +0 -0
  223. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  224. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_model.py +0 -0
  225. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/__init__.py +0 -0
  226. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/__init__.py +0 -0
  227. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/base_configs.py +0 -0
  228. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/__init__.py +0 -0
  229. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/base.py +0 -0
  230. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/audio.py +0 -0
  231. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/base.py +0 -0
  232. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/vision.py +0 -0
  233. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/__init__.py +0 -0
  234. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/context_parallel.py +0 -0
  235. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_model.py +0 -0
  236. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_spec.py +0 -0
  237. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/__init__.py +0 -0
  238. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/base_attention.py +0 -0
  239. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/config.py +0 -0
  240. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_attention.py +0 -0
  241. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_spec.py +0 -0
  242. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_attention.py +0 -0
  243. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_spec.py +0 -0
  244. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/model.py +0 -0
  245. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/retro/utils.py +0 -0
  246. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/vision/__init__.py +0 -0
  247. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/vision/clip_vit_model.py +0 -0
  248. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/vision/multimodal_projector.py +0 -0
  249. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/vision/radio.py +0 -0
  250. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  251. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/msc_utils.py +0 -0
  252. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/nccl_allocator.py +0 -0
  253. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/num_microbatches_calculator.py +0 -0
  254. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/__init__.py +0 -0
  255. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  256. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  257. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  258. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/grad_scaler.py +0 -0
  259. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer_config.py +0 -0
  260. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/optimizer_param_scheduler.py +0 -0
  261. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/packed_seq_params.py +0 -0
  262. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/parallel_state.py +0 -0
  263. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/__init__.py +0 -0
  264. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  265. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  266. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/schedules.py +0 -0
  267. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/utils.py +0 -0
  268. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/__init__.py +0 -0
  269. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/__init__.py +0 -0
  270. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  271. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  272. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  273. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/layers.py +0 -0
  274. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  275. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  276. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/process_groups_config.py +0 -0
  277. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/quantization/__init__.py +0 -0
  278. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/quantization/quant_config.py +0 -0
  279. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/quantization/utils.py +0 -0
  280. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/requirements.txt +0 -0
  281. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/rerun_state_machine.py +0 -0
  282. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/__init__.py +0 -0
  283. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_block.py +0 -0
  284. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  285. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  286. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_layer.py +0 -0
  287. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_mixer.py +0 -0
  288. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/mlp_layer.py +0 -0
  289. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/ssm/triton_cache_manager.py +0 -0
  290. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/__init__.py +0 -0
  291. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  292. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/data.py +0 -0
  293. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/mappings.py +0 -0
  294. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/random.py +0 -0
  295. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/utils.py +0 -0
  296. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/timers.py +0 -0
  297. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/__init__.py +0 -0
  298. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  299. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  300. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/__init__.py +0 -0
  301. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  302. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  303. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  304. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  305. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  306. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  307. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  308. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  309. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  310. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  311. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  312. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  313. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  314. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  315. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  316. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  317. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  318. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  319. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/__init__.py +0 -0
  320. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/attention.py +0 -0
  321. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  322. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  323. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  324. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  325. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  326. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/identity_op.py +0 -0
  327. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/mlp.py +0 -0
  328. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/module.py +0 -0
  329. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/__init__.py +0 -0
  330. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/experts.py +0 -0
  331. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  332. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  333. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_layer.py +0 -0
  334. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_utils.py +0 -0
  335. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/router.py +0 -0
  336. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/shared_experts.py +0 -0
  337. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  338. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  339. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_latent_attention.py +0 -0
  340. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_token_prediction.py +0 -0
  341. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/spec_utils.py +0 -0
  342. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_layer_norm.py +0 -0
  343. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_norm.py +0 -0
  344. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_block.py +0 -0
  345. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_layer.py +0 -0
  346. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron_core.egg-info/SOURCES.txt +0 -0
  347. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron_core.egg-info/dependency_links.txt +0 -0
  348. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron_core.egg-info/requires.txt +0 -0
  349. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/megatron_core.egg-info/top_level.txt +0 -0
  350. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/pyproject.toml +0 -0
  351. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/setup.cfg +0 -0
  352. {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc5}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.15.0rc4
3
+ Version: 0.15.0rc5
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -12,6 +12,7 @@ import shutil
12
12
  import struct
13
13
  import time
14
14
  from abc import ABC, abstractmethod
15
+ from collections.abc import Iterable
15
16
  from enum import Enum
16
17
  from functools import lru_cache
17
18
  from itertools import accumulate
@@ -172,9 +173,9 @@ class _IndexWriter(object):
172
173
 
173
174
  def write(
174
175
  self,
175
- sequence_lengths: List[int],
176
- sequence_modes: Optional[List[int]],
177
- document_indices: List[int],
176
+ sequence_lengths: Iterable[Union[int, numpy.integer]],
177
+ sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
178
+ document_indices: Iterable[Union[int, numpy.integer]],
178
179
  ) -> None:
179
180
  """Write the index (.idx) file
180
181
 
@@ -208,7 +209,9 @@ class _IndexWriter(object):
208
209
  if sequence_modes is not None:
209
210
  self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
210
211
 
211
- def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
212
+ def _sequence_pointers(
213
+ self, sequence_lengths: Iterable[Union[int, numpy.integer]]
214
+ ) -> List[int]:
212
215
  """Build the sequence pointers per the sequence lengths and dtype size
213
216
 
214
217
  Args:
@@ -217,11 +220,11 @@ class _IndexWriter(object):
217
220
  Returns:
218
221
  List[int]: The pointer to the beginning of each sequence
219
222
  """
220
- itemsize = DType.size(self.dtype)
221
- curr_ptr = 0
223
+ itemsize = numpy.int64(DType.size(self.dtype))
224
+ curr_ptr = numpy.int64(0)
222
225
  list_ptr = []
223
226
  for length in sequence_lengths:
224
- list_ptr.append(curr_ptr)
227
+ list_ptr.append(curr_ptr.item())
225
228
  curr_ptr += length * itemsize
226
229
  return list_ptr
227
230
 
@@ -158,7 +158,7 @@ class FullyShardedDataParallel(_BaseDataParallel):
158
158
  dp_cp_group = parallel_state.get_data_parallel_group(
159
159
  with_context_parallel=True, partial_data_parallel=True
160
160
  )
161
- inter_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
161
+ outer_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
162
162
  hybrid_fsdp_group = parallel_state.get_data_parallel_group(
163
163
  with_context_parallel=True, partial_data_parallel=False
164
164
  )
@@ -166,17 +166,17 @@ class FullyShardedDataParallel(_BaseDataParallel):
166
166
  dp_cp_group = parallel_state.get_data_parallel_group(
167
167
  with_context_parallel=True, partial_data_parallel=False
168
168
  )
169
- inter_fsdp_group = None
169
+ outer_fsdp_group = None
170
170
  hybrid_fsdp_group = None
171
171
  else:
172
172
  tp_group = getattr(pg_collection, 'tp', None)
173
173
  if enable_hsdp:
174
174
  dp_cp_group = pg_collection.intra_dp_cp
175
- inter_fsdp_group = pg_collection.inter_dist_opt
175
+ outer_fsdp_group = pg_collection.inter_dist_opt
176
176
  hybrid_fsdp_group = pg_collection.dp_cp
177
177
  else:
178
178
  dp_cp_group = pg_collection.dp_cp
179
- inter_fsdp_group = None
179
+ outer_fsdp_group = None
180
180
  hybrid_fsdp_group = None
181
181
 
182
182
  if tp_group is None:
@@ -184,17 +184,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
184
184
  tp_group = single_rank_group
185
185
 
186
186
  if enable_hsdp:
187
- mesh = _get_hsdp_tp_mesh(inter_fsdp_group, dp_cp_group, tp_group)
187
+ mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
188
188
  dist_index = FSDPDistributedIndex(
189
- use_hybrid_fsdp=True,
190
189
  hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
191
190
  device_mesh=DeviceMesh.from_group(
192
- [inter_fsdp_group, dp_cp_group, tp_group],
191
+ [outer_fsdp_group, dp_cp_group, tp_group],
193
192
  device_type="cuda",
194
193
  mesh=mesh.tolist(),
195
- mesh_dim_names=["inter_fsdp_dp", "dp_cp", "tp"],
194
+ mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"],
196
195
  ),
197
- dp_inter_dim="inter_fsdp_dp",
196
+ dp_outer_dim="outer_fsdp_dp", # Use Hybrid FSDP!
198
197
  dp_shard_dim="dp_cp",
199
198
  tp_dim="tp",
200
199
  hybrid_fsdp_group=hybrid_fsdp_group,
@@ -222,20 +221,20 @@ class FullyShardedDataParallel(_BaseDataParallel):
222
221
  self.module.synchronize_param_gather()
223
222
 
224
223
 
225
- def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
224
+ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
226
225
  assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
227
226
  world_size = dist.get_world_size()
228
227
 
229
228
  mesh = einops.rearrange(
230
229
  torch.arange(world_size),
231
- "(inter_fsdp_dp fsdp tp) -> inter_fsdp_dp fsdp tp",
232
- inter_fsdp_dp=inter_fsdp_dp_group.size(),
230
+ "(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp",
231
+ outer_fsdp_dp=outer_fsdp_dp_group.size(),
233
232
  tp=tp_group.size(),
234
233
  )
235
234
 
236
235
  mesh_fsdp_ranks = einops.rearrange(
237
236
  mesh,
238
- 'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp tp) fsdp',
237
+ 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp',
239
238
  tp=tp_group.size(),
240
239
  fsdp=dp_cp_group.size(),
241
240
  )
@@ -247,7 +246,7 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
247
246
 
248
247
  mesh_tp_ranks = einops.rearrange(
249
248
  mesh,
250
- 'inter_fsdp_dp fsdp tp -> (inter_fsdp_dp fsdp) tp',
249
+ 'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp',
251
250
  tp=tp_group.size(),
252
251
  fsdp=dp_cp_group.size(),
253
252
  )
@@ -257,18 +256,18 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
257
256
  f"do not match the ranks in the TP group {tp_group_ranks}."
258
257
  )
259
258
 
260
- mesh_inter_fsdp_dp_ranks = einops.rearrange(
259
+ mesh_outer_fsdp_dp_ranks = einops.rearrange(
261
260
  mesh,
262
- 'inter_fsdp_dp fsdp tp -> (fsdp tp) inter_fsdp_dp',
261
+ 'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp',
263
262
  tp=tp_group.size(),
264
263
  fsdp=dp_cp_group.size(),
265
264
  )
266
- inter_fsdp_dp_group_ranks = dist.get_process_group_ranks(inter_fsdp_dp_group)
265
+ outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group)
267
266
  assert _check_mesh_ranks_and_group_ranks_are_consistent(
268
- mesh_inter_fsdp_dp_ranks, inter_fsdp_dp_group_ranks
267
+ mesh_outer_fsdp_dp_ranks, outer_fsdp_dp_group_ranks
269
268
  ), (
270
- f"[Megatron-FSDP] Inter FSDP Data Parallel ranks in the mesh {mesh_inter_fsdp_dp_ranks} "
271
- f"do not match the ranks in the Inter FSDP DP group {inter_fsdp_dp_group_ranks}."
269
+ f"[Megatron-FSDP] Outer FSDP Data Parallel ranks in the mesh {mesh_outer_fsdp_dp_ranks} "
270
+ f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}."
272
271
  )
273
272
 
274
273
  return mesh
@@ -13,6 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from .distributed_data_parallel_config import DistributedDataParallelConfig
16
+ from .fully_shard import fully_shard, fully_shard_model, fully_shard_optimizer
16
17
  from .megatron_fsdp import MegatronFSDP
17
18
  from .package_info import (
18
19
  __contact_emails__,
@@ -29,16 +30,13 @@ from .package_info import (
29
30
  )
30
31
  from .utils import FSDPDistributedIndex
31
32
 
32
- try:
33
- from .fully_shard import fully_shard
34
- except ImportError as e:
35
- print(f"Failed to import fully_shard: {e}")
36
-
37
33
  __all__ = [
38
34
  "DistributedDataParallelConfig",
39
35
  "MegatronFSDP",
40
36
  "FSDPDistributedIndex",
41
37
  "fully_shard",
38
+ "fully_shard_model",
39
+ "fully_shard_optimizer",
42
40
  "__contact_emails__",
43
41
  "__contact_names__",
44
42
  "__description__",
@@ -117,13 +117,12 @@ class DistributedDataParallelConfig:
117
117
  This option will cause additional memory overhead, however, it is necessary for
118
118
  to register user buffer (nccl_ub=True) for the Megatron FSDP.
119
119
  This option will be automatically set to True when nccl_ub=True.
120
- """
120
+ """
121
121
 
122
122
  outer_dp_sharding_strategy: str = 'no_shard'
123
123
  """
124
124
  Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
125
- Valid values are 'no_shard', 'optim', 'optim_grads', 'optim_grads_params'.
126
- This option is only effective when Hybrid FSDP is enabled.
125
+ Valid values are 'no_shard', 'optim'. This option is only effective when Hybrid FSDP is enabled.
127
126
  """
128
127
 
129
128
  disable_symmetric_registration: bool = False