megatron-core 0.14.0rc0__tar.gz → 0.14.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.

Files changed (307)
  1. {megatron_core-0.14.0rc0/megatron_core.egg-info → megatron_core-0.14.0rc1}/PKG-INFO +1 -1
  2. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/indexed_dataset.py +5 -0
  3. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
  4. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/transformer_engine.py +10 -9
  5. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/dynamic_engine.py +61 -32
  6. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/inference_request.py +1 -0
  7. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +55 -2
  8. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/package_info.py +1 -1
  9. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/layers.py +9 -7
  10. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/attention.py +2 -1
  11. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/cuda_graphs.py +5 -1
  12. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/utils.py +3 -0
  13. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1/megatron_core.egg-info}/PKG-INFO +1 -1
  14. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/LICENSE +0 -0
  15. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/MANIFEST.in +0 -0
  16. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/README.md +0 -0
  17. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/README.md +0 -0
  18. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/__init__.py +0 -0
  19. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/config.py +0 -0
  20. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/config_logger.py +0 -0
  21. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/__init__.py +0 -0
  22. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/bert_dataset.py +0 -0
  23. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_dataset.py +0 -0
  24. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  25. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  26. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/gpt_dataset.py +0 -0
  27. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/helpers.cpp +0 -0
  28. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/helpers.py +0 -0
  29. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/masked_dataset.py +0 -0
  30. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/megatron_dataset.py +0 -0
  31. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  32. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/multimodal_dataset.py +0 -0
  33. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/object_storage_utils.py +0 -0
  34. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/__init__.py +0 -0
  35. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/__init__.py +0 -0
  36. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  37. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/config.py +0 -0
  38. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  39. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  40. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/__init__.py +0 -0
  41. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/build.py +0 -0
  42. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/dataset.py +0 -0
  43. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/db/utils.py +0 -0
  44. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/external_libs.py +0 -0
  45. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/__init__.py +0 -0
  46. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/build.py +0 -0
  47. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/factory.py +0 -0
  48. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/index.py +0 -0
  49. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  50. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  51. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  52. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/utils.py +0 -0
  53. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/index/validate.py +0 -0
  54. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/__init__.py +0 -0
  55. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  56. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  57. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/query.py +0 -0
  58. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  59. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/query/utils.py +0 -0
  60. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/retro/utils.py +0 -0
  61. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/t5_dataset.py +0 -0
  62. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils.py +0 -0
  63. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils_object_storage.py +0 -0
  64. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/datasets/utils_s3.py +0 -0
  65. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/__init__.py +0 -0
  66. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/core.py +0 -0
  67. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  68. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  69. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/mapping.py +0 -0
  70. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  71. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/serialization.py +0 -0
  72. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  73. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  74. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  75. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  76. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  77. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  78. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  79. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  80. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  81. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  82. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  83. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  84. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  85. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  86. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  87. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/utils.py +0 -0
  88. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/dist_checkpointing/validation.py +0 -0
  89. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/__init__.py +0 -0
  90. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  91. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
  92. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
  93. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/data_parallel_base.py +0 -0
  94. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  95. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/finalize_model_grads.py +0 -0
  96. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  97. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  98. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  99. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/energy_monitor.py +0 -0
  100. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/enums.py +0 -0
  101. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/__init__.py +0 -0
  102. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/data_type.py +0 -0
  103. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/export_config.py +0 -0
  104. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/model_type.py +0 -0
  105. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/__init__.py +0 -0
  106. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  107. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  108. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  109. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  110. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  111. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  112. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  113. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  114. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  115. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  116. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  117. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  118. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/__init__.py +0 -0
  119. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/kitchen.py +0 -0
  120. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  121. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fp8_utils.py +0 -0
  122. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/__init__.py +0 -0
  123. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  124. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  125. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  126. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  127. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  128. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_indices_converter.py +0 -0
  129. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_layer_norm.py +0 -0
  130. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  131. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  132. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/fusions/fused_softmax.py +0 -0
  133. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/hyper_comm_grid.py +0 -0
  134. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/__init__.py +0 -0
  135. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/async_stream.py +0 -0
  136. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/common_inference_params.py +0 -0
  137. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/communication_utils.py +0 -0
  138. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/__init__.py +0 -0
  139. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/base_context.py +0 -0
  140. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  141. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/dynamic_context.py +0 -0
  142. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/contexts/static_context.py +0 -0
  143. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/__init__.py +0 -0
  144. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/abstract_engine.py +0 -0
  145. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/mcore_engine.py +0 -0
  146. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/engines/static_engine.py +0 -0
  147. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  148. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  149. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  150. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  151. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  152. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  153. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  154. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  155. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/sampling_params.py +0 -0
  156. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/scheduler.py +0 -0
  157. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  158. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  159. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  160. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  161. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference/utils.py +0 -0
  162. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/inference_params.py +0 -0
  163. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/jit.py +0 -0
  164. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/model_parallel_config.py +0 -0
  165. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/__init__.py +0 -0
  166. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/t5_model.py +0 -0
  167. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/T5/t5_spec.py +0 -0
  168. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/__init__.py +0 -0
  169. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/backends.py +0 -0
  170. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/__init__.py +0 -0
  171. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  172. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_lm_head.py +0 -0
  173. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/bert_model.py +0 -0
  174. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/bert/pooler.py +0 -0
  175. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/__init__.py +0 -0
  176. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/__init__.py +0 -0
  177. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  178. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  179. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  180. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  181. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  182. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/language_module/__init__.py +0 -0
  183. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/language_module/language_module.py +0 -0
  184. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/vision_module/__init__.py +0 -0
  185. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  186. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/__init__.py +0 -0
  187. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  188. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  189. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/gpt_model.py +0 -0
  190. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  191. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  192. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/__init__.py +0 -0
  193. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/clip_model.py +0 -0
  194. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/module.py +0 -0
  195. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/huggingface/qwen_model.py +0 -0
  196. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/__init__.py +0 -0
  197. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  198. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mamba/mamba_model.py +0 -0
  199. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/__init__.py +0 -0
  200. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/config/__init__.py +0 -0
  201. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/config/base_configs.py +0 -0
  202. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/model/__init__.py +0 -0
  203. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/model/base.py +0 -0
  204. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/audio.py +0 -0
  205. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/base.py +0 -0
  206. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/mimo/submodules/vision.py +0 -0
  207. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/__init__.py +0 -0
  208. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/context_parallel.py +0 -0
  209. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/llava_model.py +0 -0
  210. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/multimodal/llava_spec.py +0 -0
  211. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/__init__.py +0 -0
  212. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/base_attention.py +0 -0
  213. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/config.py +0 -0
  214. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/decoder_attention.py +0 -0
  215. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/decoder_spec.py +0 -0
  216. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/encoder_attention.py +0 -0
  217. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/encoder_spec.py +0 -0
  218. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/model.py +0 -0
  219. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/retro/utils.py +0 -0
  220. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/__init__.py +0 -0
  221. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/clip_vit_model.py +0 -0
  222. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/multimodal_projector.py +0 -0
  223. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/radio.py +0 -0
  224. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  225. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/msc_utils.py +0 -0
  226. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/num_microbatches_calculator.py +0 -0
  227. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/__init__.py +0 -0
  228. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/clip_grads.py +0 -0
  229. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  230. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  231. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  232. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/grad_scaler.py +0 -0
  233. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/optimizer.py +0 -0
  234. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer/optimizer_config.py +0 -0
  235. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/optimizer_param_scheduler.py +0 -0
  236. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/packed_seq_params.py +0 -0
  237. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/parallel_state.py +0 -0
  238. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/__init__.py +0 -0
  239. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  240. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/schedules.py +0 -0
  241. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/pipeline_parallel/utils.py +0 -0
  242. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/__init__.py +0 -0
  243. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/__init__.py +0 -0
  244. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  245. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  246. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  247. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/layers.py +0 -0
  248. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  249. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  250. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/process_groups_config.py +0 -0
  251. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/__init__.py +0 -0
  252. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/quant_config.py +0 -0
  253. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/quantization/utils.py +0 -0
  254. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/requirements.txt +0 -0
  255. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/rerun_state_machine.py +0 -0
  256. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/__init__.py +0 -0
  257. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_block.py +0 -0
  258. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  259. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  260. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_layer.py +0 -0
  261. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mamba_mixer.py +0 -0
  262. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/mlp_layer.py +0 -0
  263. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/ssm/triton_cache_manager.py +0 -0
  264. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/__init__.py +0 -0
  265. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  266. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/data.py +0 -0
  267. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/mappings.py +0 -0
  268. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/random.py +0 -0
  269. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/tensor_parallel/utils.py +0 -0
  270. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/timers.py +0 -0
  271. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/__init__.py +0 -0
  272. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  273. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  274. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/dot_product_attention.py +0 -0
  275. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/enums.py +0 -0
  276. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  277. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  278. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/identity_op.py +0 -0
  279. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/mlp.py +0 -0
  280. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/module.py +0 -0
  281. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/__init__.py +0 -0
  282. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/experts.py +0 -0
  283. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  284. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  285. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/moe_layer.py +0 -0
  286. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/moe_utils.py +0 -0
  287. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/router.py +0 -0
  288. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/shared_experts.py +0 -0
  289. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  290. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  291. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/multi_latent_attention.py +0 -0
  292. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/multi_token_prediction.py +0 -0
  293. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  294. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/spec_utils.py +0 -0
  295. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/torch_layer_norm.py +0 -0
  296. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/torch_norm.py +0 -0
  297. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_block.py +0 -0
  298. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_config.py +0 -0
  299. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/transformer_layer.py +0 -0
  300. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron/core/transformer/utils.py +0 -0
  301. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/SOURCES.txt +0 -0
  302. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/dependency_links.txt +0 -0
  303. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/requires.txt +0 -0
  304. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/megatron_core.egg-info/top_level.txt +0 -0
  305. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/pyproject.toml +0 -0
  306. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/setup.cfg +0 -0
  307. {megatron_core-0.14.0rc0 → megatron_core-0.14.0rc1}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc0
+Version: 0.14.0rc1
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
megatron/core/datasets/indexed_dataset.py
@@ -5,6 +5,7 @@
 
 # Essentially re-written in entirety
 
+import gc
 import logging
 import os
 import shutil
@@ -906,6 +907,10 @@ class IndexedDatasetBuilder(object):
         assert index.sequence_modes is not None, "sequence_modes cannot not be None"
         self.sequence_modes.extend(index.sequence_modes)
 
+        # Free up memory to make space for new indices
+        del index
+        gc.collect()
+
         # Concatenate data
         with self._open(get_bin_path(path_prefix), "rb") as f:
             shutil.copyfileobj(f, self.data_file)
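The added lines free a large, short-lived index object before the next one is loaded, rather than waiting for Python's garbage collector. A minimal, self-contained sketch of the same pattern (the IndexShard class below is a hypothetical stand-in for a loaded index):

    import gc

    class IndexShard:
        """Stand-in for a loaded index: holds a large block of metadata."""

        def __init__(self, num_sequences):
            self.sequence_lengths = list(range(num_sequences))

    def merge_shards(shard_sizes):
        merged = []
        for size in shard_sizes:
            shard = IndexShard(size)               # transient, potentially huge
            merged.extend(shard.sequence_lengths)  # keep only what the builder needs
            del shard                              # drop the last reference now...
            gc.collect()                           # ...so memory is reclaimed before the next load
        return merged

    print(len(merge_shards([1000, 2000, 3000])))   # 6000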
megatron/core/distributed/distributed_data_parallel_config.py
@@ -113,6 +113,15 @@ class DistributedDataParallelConfig:
     """
 
     def __post_init__(self):
+        import os
+
         """Check the validity of the config."""
         if self.reuse_grad_buf_for_mxfp8_param_ag:
             assert self.fp8_param_gather, "Reuse grad buffer only when keeping params in MXFP8."
+
+        if self.nccl_ub:
+            if 'expandable_segments:True' in os.getenv('PYTORCH_CUDA_ALLOC_CONF', '').split(','):
+                raise ValueError(
+                    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True is currently not supported "
+                    "with nccl_ub due to compatibility issue with torch.cuda.MemPool API."
+                )
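The guard works because PYTORCH_CUDA_ALLOC_CONF is a comma-separated list of key:value pairs, so membership in the split list is an exact match on one option. A standalone sketch of the same check:

    import os

    def expandable_segments_enabled():
        # PYTORCH_CUDA_ALLOC_CONF looks like:
        #   "expandable_segments:True,max_split_size_mb:128"
        conf = os.getenv('PYTORCH_CUDA_ALLOC_CONF', '')
        return 'expandable_segments:True' in conf.split(',')

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'
    assert expandable_segments_enabled()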
megatron/core/extensions/transformer_engine.py
@@ -39,6 +39,8 @@ from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 from megatron.core.utils import (
+    get_pg_rank,
+    get_pg_size,
     get_te_version,
     get_tensor_model_parallel_group_if_none,
     is_te_min_version,
@@ -228,8 +230,7 @@ class TELinear(te.pytorch.Linear):
             assert tp_group is None, "duplicated linear should not have tp_group set"
             tp_size = 1
         else:
-            assert tp_group is not None, "Parallel linear should always have tp_group set"
-            tp_size = tp_group.size()
+            tp_size = get_pg_size(tp_group)
 
         self.expert_parallel = self.config.expert_model_parallel_size > 1
         if is_expert:
@@ -374,8 +375,8 @@ class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
         self.is_first_microbatch = True
         self.disable_parameter_transpose_cache = self.config.disable_parameter_transpose_cache
         extra_kwargs = _get_extra_te_kwargs(config)
-        self.tp_size = tp_group.size()
-        self.tp_rank = tp_group.rank()
+        self.tp_size = get_pg_size(tp_group)
+        self.tp_rank = get_pg_rank(tp_group)
 
         if self.config.delay_wgrad_compute:
             if is_te_min_version("2.3.0"):
@@ -542,8 +543,8 @@ class TEColumnParallelLinear(TELinear):
         if gather_output:
             raise ValueError("Transformer Engine linear layers do not support gather_output = True")
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-        world_size = tp_group.size()
-        rank = tp_group.rank()
+        world_size = get_pg_size(tp_group)
+        rank = get_pg_rank(tp_group)
 
         super().__init__(
             input_size=input_size,
@@ -657,8 +658,8 @@ class TERowParallelLinear(TELinear):
             tp_group=tp_group,
         )
         if config.use_cpu_initialization:
-            world_size = tp_group.size()
-            rank = tp_group.rank()
+            world_size = get_pg_size(tp_group)
+            rank = get_pg_rank(tp_group)
             input_size_per_partition = divide(input_size, world_size)
             self.master_weight = _initialize_affine_weight_cpu(
                 self.weight,
@@ -1003,7 +1004,7 @@ if HAVE_TE and is_te_min_version("1.9.0.dev0"):
             # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
             # So we disable comms by making TE agnostic of model parallel.
             tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
-            tp_size = tp_group.size()
+            tp_size = get_pg_size(tp_group)
 
             self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
 
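Every call site in this file swaps direct tp_group.size() / tp_group.rank() calls for get_pg_size / get_pg_rank from megatron.core.utils, and the removed assertion means tp_group may now legitimately be None. A sketch of what such helpers plausibly look like, assuming (not confirmed by this diff) that a missing group is treated as a single-rank world:

    def get_pg_size(group=None):
        # Sketch: world size of a process group, treating None as "no parallelism".
        return 1 if group is None else group.size()

    def get_pg_rank(group=None):
        # Sketch: rank within a process group, treating None as rank 0.
        return 0 if group is None else group.rank()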
megatron/core/inference/engines/dynamic_engine.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
 import asyncio
-import time
 from collections import deque
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -70,6 +69,8 @@ class DynamicInferenceEngine(AbstractEngine):
         self.request_counter = Counter()
         self.requests: Dict[int, DynamicInferenceRequest] = {}
         self.request_completion_futures: Dict[int, asyncio.Future] = {}
+        self.step_start_event = torch.cuda.Event(enable_timing=True)
+        self.step_end_event = torch.cuda.Event(enable_timing=True)
 
         # Initialize the asyncio loop if it has not already been initialized.
         # TODO: Start the engine loop here.
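The engine now times each step with CUDA events instead of time.time() (removed above); events are recorded on the GPU stream, so the measurement covers kernel execution rather than host-side wall time. A minimal sketch of the pattern (requires a CUDA device):

    import torch

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()                    # enqueue the start marker on the current stream
    a = torch.randn(1024, 1024, device='cuda')
    out = a @ a                       # the work being timed
    end.record()                      # enqueue the end marker
    end.synchronize()                 # wait until the end marker has executed

    step_time = start.elapsed_time(end) / 1e3   # elapsed_time returns milliseconds
    print(f"step took {step_time:.6f} s")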
@@ -176,19 +177,25 @@ class DynamicInferenceEngine(AbstractEngine):
         return self.request_completion_futures[request_id]
 
     def post_process_requests(
-        self, request_ids: torch.Tensor, finished_request_ids: torch.Tensor, sample: torch.Tensor
-    ) -> List[DynamicInferenceRequest]:
+        self,
+        request_ids: torch.Tensor,
+        finished_request_ids: torch.Tensor,
+        step_time: float,
+        sample: torch.Tensor,
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest]]:
         """
         Handles post-processing for requests after a step.
 
         Args:
             request_ids (torch.Tensor): A list of request_ids
             finished_request_ids (torch.Tensor): A list of finished request ids
+            step_time (float): The latency of the last step
             sample: (torch.Tensor): The newly generated tokens for each request
 
         Returns:
-            A list of completed requests as `DynamicInferenceRequest` objects
+            A list of active requests and completed requests as `DynamicInferenceRequest` objects
         """
+        active_requests: List[DynamicInferenceRequest] = []
         finished_requests: List[DynamicInferenceRequest] = []
         finished_request_ids = set(finished_request_ids.tolist())
         self.finished_request_count += len(finished_request_ids)
@@ -196,6 +203,9 @@ class DynamicInferenceEngine(AbstractEngine):
         for request_id, token in zip(request_ids.tolist(), sample.tolist()):
             request: DynamicInferenceRequest = self.requests[request_id]
             request.generated_tokens.append(token)
+            if request.tpot is None:
+                request.tpot = []
+            request.tpot.append(step_time)
 
             if request_id in finished_request_ids:
                 request.generated_length = len(request.generated_tokens)
@@ -207,50 +217,67 @@ class DynamicInferenceEngine(AbstractEngine):
                     finished_request.generated_tokens
                 )
                 self.request_completion_futures[request_id].set_result(finished_request)
-
-        return finished_requests
+            else:
+                active_requests.append(request)
+
+        return active_requests, finished_requests
+
+    def schedule_waiting_requests(self):
+        """Tries to schedule any requests in the waiting pool."""
+        for waiting_request_id in self.waiting_request_ids.copy():
+            waiting_request: DynamicInferenceRequest = self.requests[waiting_request_id]
+            try:
+                self.context.add_request(
+                    waiting_request_id,
+                    waiting_request.prompt_tokens,
+                    waiting_request.sampling_params.num_tokens_to_generate,
+                )
+                self.waiting_request_ids.popleft()
+            except Exception as e:
+                break
 
     async def async_step(
         self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
-    ) -> Tuple[List[DynamicInferenceRequest], float]:
-        """Wrapper for controller.generate_output_tokens_dynamic_batch(), to
-        match vLLM API.
-
-        Uses `asyncio` for continuous generation which allows this
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
+        """
+        Wrapper for controller.generate_output_tokens_dynamic_batch(), to
+        match vLLM API. Uses `asyncio` for continuous generation which allows this
         method to sleep and wake up when new requests are available.
+
+        Args:
+            sampling_params (SamplingParams): The sampling parameters.
+            verbose (bool): Whether to run in verbose mode.
+
+        Returns:
+            A tuple comprised of:
+                1. Requests that ran in the last step and are still active.
+                2. Requests that ran in the last step and have now finished.
+                3. The step time in seconds.
         """
 
         # Generate tokens.
-        t = time.time()
         is_decode_only = self.context.is_decode_only()
+        self.step_start_event.record()
         result = self.controller.generate_output_tokens_dynamic_batch(
             sampling_params, self.termination_id
         )
-        step_time = time.time() - t
-
-        finished_requests: List[DynamicInferenceRequest] = []
+        self.step_end_event.record()
+        self.step_end_event.synchronize()
+        step_time = self.step_start_event.elapsed_time(self.step_end_event) / 1e3
 
         if result is not None:
             request_ids, finished_request_ids, sample = result
 
             # TODO: Move this to a background thread?
-            finished_requests.extend(
-                self.post_process_requests(request_ids, finished_request_ids, sample)
+            (active_requests, finished_requests) = self.post_process_requests(
                request_ids, finished_request_ids, step_time, sample
             )
 
-            # Schedule waiting requests
             # TODO: Move this to a background thread?
-            for waiting_request_id in self.waiting_request_ids.copy():
-                waiting_request: DynamicInferenceRequest = self.requests[waiting_request_id]
-                try:
-                    self.context.add_request(
-                        waiting_request_id,
-                        waiting_request.prompt_tokens,
-                        waiting_request.sampling_params.num_tokens_to_generate,
-                    )
-                    self.waiting_request_ids.popleft()
-                except Exception as e:
-                    break
+            self.schedule_waiting_requests()
+        else:
+            active_requests: List[DynamicInferenceRequest] = []
+            finished_requests: List[DynamicInferenceRequest] = []
 
         # Print context state.
         if verbose:
@@ -278,9 +305,11 @@ class DynamicInferenceEngine(AbstractEngine):
                 )
             )
 
-        return finished_requests, step_time
+        return active_requests, finished_requests, step_time
 
-    def step(self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False):
+    def step(
+        self, sampling_params: SamplingParams, *, verbose: Optional[bool] = False
+    ) -> Tuple[List[DynamicInferenceRequest], List[DynamicInferenceRequest], float]:
         """Synchronous wrapper for `self.async_step`."""
         return self._loop.run_until_complete(
             self.async_step(sampling_params=sampling_params, verbose=verbose)
@@ -297,7 +326,7 @@ class DynamicInferenceEngine(AbstractEngine):
 
         finished_requests_list = []
        while self.has_unfinished_requests():
-            finished_requests, step_time = self.step(sampling_params)
+            active_requests, finished_requests, step_time = self.step(sampling_params)
             finished_requests_list.extend(finished_requests)
 
         return finished_requests_list
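Callers of step() and async_step() now unpack a 3-tuple; the loop above keeps only finished requests, but the active list lets a driver track in-flight work. A hypothetical caller-side sketch, assuming an already-constructed DynamicInferenceEngine named engine:

    finished = []
    while engine.has_unfinished_requests():
        active_requests, finished_requests, step_time = engine.step(sampling_params)
        finished.extend(finished_requests)
        print(f"{len(active_requests)} active, {len(finished_requests)} finished, "
              f"last step {step_time:.3f}s")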
megatron/core/inference/inference_request.py
@@ -46,6 +46,7 @@ class InferenceRequest:
     prompt_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_top_n_logprobs: Optional[List[Dict[str, float]]] = None
     generated_length: Optional[int] = None
+    tpot: Optional[List[int]] = None
 
     def __post_init__(self):
         if self.sampling_params is None and self.inference_parameters is not None:
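The new tpot field (time per output token) holds one latency measurement per generated token; the values recorded elsewhere in this release are float step times in seconds, despite the List[int] annotation. With a populated request, aggregate metrics follow directly, for example:

    # `request` is a finished InferenceRequest whose tpot list has been filled in.
    if request.tpot:
        mean_tpot = sum(request.tpot) / len(request.tpot)   # average decode latency per token
        decode_throughput = 1.0 / mean_tpot                 # tokens per second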
megatron/core/inference/text_generation_controllers/text_generation_controller.py
@@ -34,6 +34,8 @@ try:
 
 except ImportError:
     HAVE_TE = False
+    Fp8Padding = None
+    Fp8Unpadding = None
 
 
 class TextGenerationController:
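Setting the names to None in the except ImportError branch keeps later references from raising NameError when Transformer Engine is absent; callers can then test the sentinel before use. The general optional-import pattern, sketched with a placeholder module name:

    try:
        from some_optional_dependency import Fp8Padding, Fp8Unpadding  # placeholder import path
        HAVE_TE = True
    except ImportError:
        HAVE_TE = False
        Fp8Padding = None    # sentinels so the module-level names always exist
        Fp8Unpadding = None

    if HAVE_TE and Fp8Padding is not None:
        pass  # safe to use the optional ops here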
@@ -312,6 +314,7 @@ class TextGenerationController:
         current_context_end_position: int,
         is_generation_done_tensor: torch.Tensor,
         generated_sequence_lengths: torch.Tensor,
+        termination_id: Optional[int] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Checks which prompts have reached an end condition
 
@@ -337,10 +340,12 @@ class TextGenerationController:
             Tuple[torch.Tensor, torch.Tensor]: Returns the boolean
                 is_generation_done_tensor and the generated_sequence_lengths after updating it
         """
+        if termination_id is None:
+            termination_id = self.tokenizer.eod
         latest_samples = updated_prompts_tokens[:, current_context_end_position]
         # Make sure we are checking eod criterion only for prompts that have started generating
         # (i.e) We only look at the generated tokenns and not the input tokens.
-        reached_eod = (latest_samples == self.tokenizer.eod) & generation_started
+        reached_eod = (latest_samples == termination_id) & generation_started
         is_generation_done_tensor = is_generation_done_tensor | reached_eod
         # We increment generated sequence lengths when that prompt has not hit the
         # EOD and generation has started
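The end-of-generation test is pure tensor masking: a prompt is done once its newest token equals termination_id and generation has actually started for it. A small self-contained example of the same masking logic with made-up values:

    import torch

    termination_id = 2
    latest_samples = torch.tensor([2, 5, 2, 7])                   # newest token per prompt
    generation_started = torch.tensor([True, True, False, True])  # prompt 2 is still in prefill

    reached_eod = (latest_samples == termination_id) & generation_started
    is_generation_done = torch.zeros(4, dtype=torch.bool) | reached_eod
    print(is_generation_done)   # tensor([ True, False, False, False])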
@@ -543,7 +548,7 @@ class TextGenerationController:
         active_requests: OrderedDict[str, InferenceRequest],
         active_streams: Optional[OrderedDict[str, AsyncStream]] = None,
     ) -> OrderedDict[str, InferenceRequest]:
-        """Utility to generate the all the output tokens and probabilities for the prompts .
+        """Utility to generate all the output tokens and probabilities for the prompts.
 
         This utility generates the output tokens for a static batch. It runs the forward steps till
         all prompts complete generation, updates the status of these requests to completed, adds
@@ -654,6 +659,10 @@ class TextGenerationController:
         # to nearest power of 2
         vocab_size = self.inference_wrapped_model.inference_wrapper_config.padded_vocab_size
 
+        # Check whether early termination is enabled
+        no_early_termination = getattr(sampling_params, "no_early_termination", False)
+        termination_id = -1 if no_early_termination else self.tokenizer.eod
+
         streaming_enabled = active_streams is not None and len(active_streams) > 0
         if streaming_enabled:
             # Start a separate thread for streaming tokens to avoid blocking the
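Using -1 as the termination id effectively disables early stopping, since no vocabulary token id is negative, so generation only ends at the length limits. Because the flag is read with getattr, sampling-parameter objects without the attribute keep the normal EOD behavior. A sketch of opting in, assuming the attribute can simply be set on the params object:

    # Hypothetical: disable early termination for a fixed-length benchmark run.
    sampling_params.no_early_termination = True   # termination_id becomes -1; no token matches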
@@ -671,6 +680,11 @@ class TextGenerationController:
             streaming_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
             stream_tokens = functools.partial(self.stream_tokens, sampling_params)
 
+        for request in active_requests.values():
+            # Initialize to a list to store a latency measurement for each generated token.
+            request.tpot = []
+        timing_events = []
+
         with torch.inference_mode():
             self.inference_wrapped_model.prep_model_for_inference()
 
@@ -694,7 +708,18 @@ class TextGenerationController:
             context_start_position = 0
             context_end_position = min_prompt_length_in_batch
 
+            # The initial iteration of this loop runs the prefill phase up to the shortest
+            # prompt length in the batch. Then every subsequent iterations runs a decode step.
+            # At least one new token will be generated in each iteration. The generated token
+            # will be ignored for requests which have prompt length > the current generated
+            # sequence length. Similarly, the generated token is ignored for requests which
+            # have maximum total sequence length < the current generated sequence length.
             while True:
+                # Add a timing event at the start of each iteration. The token generation
+                # time will be the elapsed time between consective timing events.
+                timing_events.append(torch.cuda.Event(enable_timing=True))
+                timing_events[-1].record()
+
                 # Pick the context window that we need to pass through the network.
                 inference_input_for_context_window: Dict[str, Any] = (
                     self.inference_wrapped_model.get_batch_for_context_window(
@@ -817,6 +842,7 @@ class TextGenerationController:
                         current_context_end_position=context_end_position,
                         is_generation_done_tensor=is_generation_done_tensor,
                         generated_sequence_lengths=generated_sequence_lengths,
+                        termination_id=termination_id,
                     )
                 )
 
@@ -852,6 +878,10 @@ class TextGenerationController:
                 if context_end_position >= max_sequence_length:
                     break
 
+            # Add a final timing event to compute the latency of every loop iteration
+            timing_events.append(torch.cuda.Event(enable_timing=True))
+            timing_events[-1].record()
+
             # Close all streams
             if streaming_enabled:
                 streaming_executor.shutdown()
@@ -870,6 +900,15 @@ class TextGenerationController:
            generated_sequence_lengths > sampling_params.num_tokens_to_generate
        ] = sampling_params.num_tokens_to_generate
 
+        timing_events[-1].synchronize()
+        tpot = torch.tensor(
+            [
+                timing_events[i].elapsed_time(timing_events[i + 1]) / 1e3
+                for i in range(len(timing_events) - 1)
+            ],
+            dtype=torch.float32,
+        )
+
         for idx, request in enumerate(active_requests.values()):
             input_prompt_length = int(prompt_lengths_in_batch[idx])
             # Shorter prompts might have generated more than required tokens. So we trim them down
@@ -885,6 +924,20 @@ class TextGenerationController:
             request.generated_length = required_sequence_length
             request.generated_tokens = required_result_tokens
 
+            # Record the decode latencies for only the generated tokens
+            request_tpot = tpot.clone()
+            # Sum up the latencies of the first prompt tokens if the
+            # request prompt length > minimum prompt length
+            spill_length = input_prompt_length - min_prompt_length_in_batch
+            if spill_length > 0:
+                spill_latency = request_tpot[:spill_length].sum()
+                request_tpot = torch.cat((spill_latency.unsqueeze(0), request_tpot[spill_length:]))
+
+            # Remove the extraneous latencies if the
+            # request sequence length < maximum sequence length
+            request_tpot = request_tpot[:required_sequence_length]
+            request.tpot = request_tpot.tolist()
+
             if output_log_probs is not None:
                 request.prompt_log_probs = output_log_probs[idx, : input_prompt_length - 1].tolist()
                 request.generated_log_probs = output_log_probs[
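A worked example of the trimming above, with made-up latencies: assume the batch's minimum prompt length is 4, this request's prompt is 6 tokens, and it generated 4 tokens. Its first two "decode" latencies actually covered prompt tokens, so they are folded into the first real token's latency, and any latencies past its own generated length are dropped:

    import torch

    tpot = torch.tensor([0.30, 0.02, 0.02, 0.02, 0.02, 0.02])  # prefill + 5 decode steps
    min_prompt_length_in_batch = 4
    input_prompt_length = 6          # this request's prompt is 2 tokens longer
    required_sequence_length = 4     # tokens this request actually generated

    request_tpot = tpot.clone()
    spill_length = input_prompt_length - min_prompt_length_in_batch   # 2
    if spill_length > 0:
        spill_latency = request_tpot[:spill_length].sum()             # 0.30 + 0.02
        request_tpot = torch.cat((spill_latency.unsqueeze(0), request_tpot[spill_length:]))

    request_tpot = request_tpot[:required_sequence_length]
    print(request_tpot)   # tensor([0.3200, 0.0200, 0.0200, 0.0200])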
megatron/core/package_info.py
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 14
 PATCH = 0
-PRE_RELEASE = 'rc0'
+PRE_RELEASE = 'rc1'
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
megatron/core/tensor_parallel/layers.py
@@ -20,6 +20,8 @@ from megatron.core.parallel_state import (
 )
 from megatron.core.utils import (
     divide,
+    get_pg_rank,
+    get_pg_size,
     get_tensor_model_parallel_group_if_none,
     is_torch_min_version,
     make_tp_sharded_tensor_for_checkpoint,
@@ -219,7 +221,7 @@ class VocabParallelEmbedding(torch.nn.Module):
 
         (self.vocab_start_index, self.vocab_end_index) = (
             VocabUtility.vocab_range_from_global_vocab_size(
-                self.num_embeddings, self.tp_group.rank(), self.tp_group.size()
+                self.num_embeddings, get_pg_rank(self.tp_group), get_pg_size(self.tp_group)
             )
         )
         self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index
@@ -241,8 +243,8 @@ class VocabParallelEmbedding(torch.nn.Module):
                 0,
                 init_method,
                 params_dtype=config.params_dtype,
-                rank=self.tp_group.rank(),
-                world_size=self.tp_group.size(),
+                rank=get_pg_rank(self.tp_group),
+                world_size=get_pg_size(self.tp_group),
             )
         else:
             self.weight = Parameter(
@@ -808,8 +810,8 @@ class ColumnParallelLinear(torch.nn.Module):
         self.tp_group = get_tensor_model_parallel_group_if_none(
             self.tp_group, is_expert=self.is_expert
         )
-        world_size = self.tp_group.size()
-        rank = self.tp_group.rank()
+        world_size = get_pg_size(self.tp_group)
+        rank = get_pg_rank(self.tp_group)
         self.explicit_expert_comm = self.is_expert and (world_size > 1 or self.expert_parallel)
         self.output_size_per_partition = divide(output_size, world_size)
 
@@ -1120,8 +1122,8 @@ class RowParallelLinear(torch.nn.Module):
             self.tp_group, is_expert=self.is_expert
         )
 
-        world_size = self.tp_group.size()
-        rank = self.tp_group.rank()
+        world_size = get_pg_size(self.tp_group)
+        rank = get_pg_rank(self.tp_group)
         self.explicit_expert_comm = self.is_expert and (world_size > 1 or self.expert_parallel)
 
         self.input_size_per_partition = divide(input_size, world_size)
megatron/core/transformer/attention.py
@@ -28,6 +28,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.utils import (
     deprecate_inference_params,
     divide,
+    get_pg_size,
     is_fa_min_version,
     nvtx_range_pop,
     nvtx_range_push,
@@ -135,7 +136,7 @@ class Attention(MegatronModule, ABC):
         self.model_comm_pgs = model_comm_pgs
 
         # Per attention head and per partition values
-        world_size = self.model_comm_pgs.tp.size()
+        world_size = get_pg_size(self.model_comm_pgs.tp)
         self.hidden_size_per_attention_head = divide(
             self.query_projection_size, self.config.num_attention_heads
         )
megatron/core/transformer/cuda_graphs.py
@@ -977,9 +977,13 @@ class CudaGraphManager(torch.nn.Module):
             runner = self.get_cudagraph_runner(megatron_module)
             runner.eval()
             out = runner.record_graph_capture(args, kwargs)
-        elif self.training and torch.is_grad_enabled():
+        elif self.training:
             # Training mode
             runner = self.get_cudagraph_runner(megatron_module)
+            # check if a layer is frozen during training.
+            if not torch.is_grad_enabled():
+                # If the layer is frozen, we need to set the runner to eval mode.
+                runner.eval()
             out = runner.record_graph_capture(args, kwargs)
         else:
             # No cudagraphs were found in training mode with grad disabled, so fallback to
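The branch change means a module that is in .training mode but running under torch.no_grad() (e.g. a frozen layer during fine-tuning) now still gets a CUDA-graph runner, just switched to eval mode, instead of falling through to the no-cudagraph path. The situation it handles, sketched:

    import torch

    layer = torch.nn.Linear(8, 8)
    layer.train()                      # module is in training mode...
    for p in layer.parameters():
        p.requires_grad_(False)        # ...but its weights are frozen

    with torch.no_grad():              # frozen layers often run under no_grad
        assert layer.training and not torch.is_grad_enabled()
        # Previously: `self.training and torch.is_grad_enabled()` was False -> no graph.
        # Now: the runner is fetched and put in eval mode for capture.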
megatron/core/utils.py
@@ -401,6 +401,9 @@ def deprecate_inference_params(inference_context, inference_params):
 def get_tensor_model_parallel_group_if_none(tp_group, is_expert=False, check_initialized=True):
     """Issue a deprecation warning if tp_group is None and return the default tp group."""
     # TODO(zijiey): remove this function later.
+    if not torch.distributed.is_initialized():
+        return None
+
     if tp_group is None:
         if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
             warnings.warn(
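With this early return, single-process callers (unit tests, pure inference scripts) get tp_group=None instead of an error from an uninitialized torch.distributed; combined with the get_pg_size/get_pg_rank helpers sketched earlier, a None group then behaves like a world of size 1. Roughly:

    import torch

    if not torch.distributed.is_initialized():
        tp_group = None   # what get_tensor_model_parallel_group_if_none now returns
        # get_pg_size(tp_group) -> 1 and get_pg_rank(tp_group) -> 0 (assumed semantics),
        # so tensor-parallel layers degrade gracefully to a single partition.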
megatron_core.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc0
+Version: 0.14.0rc1
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>