megatron-core 0.13.0rc3__tar.gz → 0.13.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (306)
  1. {megatron_core-0.13.0rc3/megatron_core.egg-info → megatron_core-0.13.1}/PKG-INFO +1 -1
  2. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/T5/t5_spec.py +2 -0
  3. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/bert/bert_layer_specs.py +2 -0
  4. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +1 -1
  5. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/gpt_layer_specs.py +4 -0
  6. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +2 -0
  7. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/moe_module_specs.py +2 -0
  8. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/decoder_spec.py +2 -0
  9. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/encoder_spec.py +2 -0
  10. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/package_info.py +2 -2
  11. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/cuda_graphs.py +1 -0
  12. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/heterogeneous/linear_replacements.py +4 -0
  13. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/experts.py +1 -0
  14. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/moe_layer.py +2 -0
  15. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/moe_utils.py +2 -0
  16. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/multi_token_prediction.py +2 -0
  17. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/transformer_block.py +22 -11
  18. {megatron_core-0.13.0rc3 → megatron_core-0.13.1/megatron_core.egg-info}/PKG-INFO +1 -1
  19. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/pyproject.toml +2 -1
  20. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/LICENSE +0 -0
  21. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/MANIFEST.in +0 -0
  22. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/README.md +0 -0
  23. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/README.md +0 -0
  24. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/__init__.py +0 -0
  25. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/config.py +0 -0
  26. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/config_logger.py +0 -0
  27. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/__init__.py +0 -0
  28. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/bert_dataset.py +0 -0
  29. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/blended_dataset.py +0 -0
  30. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  31. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  32. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/gpt_dataset.py +0 -0
  33. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/helpers.cpp +0 -0
  34. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/helpers.py +0 -0
  35. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/indexed_dataset.py +0 -0
  36. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/masked_dataset.py +0 -0
  37. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/megatron_dataset.py +0 -0
  38. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  39. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/multimodal_dataset.py +0 -0
  40. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/object_storage_utils.py +0 -0
  41. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/__init__.py +0 -0
  42. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/config/__init__.py +0 -0
  43. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  44. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/config/config.py +0 -0
  45. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  46. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  47. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/db/__init__.py +0 -0
  48. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/db/build.py +0 -0
  49. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/db/dataset.py +0 -0
  50. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/db/utils.py +0 -0
  51. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/external_libs.py +0 -0
  52. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/__init__.py +0 -0
  53. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/build.py +0 -0
  54. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/factory.py +0 -0
  55. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/index.py +0 -0
  56. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  57. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  58. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  59. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/utils.py +0 -0
  60. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/index/validate.py +0 -0
  61. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/__init__.py +0 -0
  62. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  63. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  64. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/query.py +0 -0
  65. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  66. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/query/utils.py +0 -0
  67. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/retro/utils.py +0 -0
  68. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/t5_dataset.py +0 -0
  69. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/utils.py +0 -0
  70. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/utils_object_storage.py +0 -0
  71. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/datasets/utils_s3.py +0 -0
  72. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/__init__.py +0 -0
  73. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/core.py +0 -0
  74. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  75. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  76. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/mapping.py +0 -0
  77. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  78. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/serialization.py +0 -0
  79. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  80. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  81. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  82. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  83. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  84. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  85. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  86. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  87. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  88. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  89. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  90. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  91. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  92. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  93. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  94. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/utils.py +0 -0
  95. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/dist_checkpointing/validation.py +0 -0
  96. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/__init__.py +0 -0
  97. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  98. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +0 -0
  99. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +0 -0
  100. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/data_parallel_base.py +0 -0
  101. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  102. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  103. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/finalize_model_grads.py +0 -0
  104. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  105. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  106. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  107. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/energy_monitor.py +0 -0
  108. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/enums.py +0 -0
  109. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/__init__.py +0 -0
  110. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/data_type.py +0 -0
  111. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/export_config.py +0 -0
  112. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/model_type.py +0 -0
  113. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/__init__.py +0 -0
  114. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  115. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  116. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  117. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  118. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  119. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  120. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  121. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  122. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  123. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  124. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  125. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  126. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/extensions/__init__.py +0 -0
  127. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/extensions/kitchen.py +0 -0
  128. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/extensions/transformer_engine.py +0 -0
  129. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  130. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fp8_utils.py +0 -0
  131. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/__init__.py +0 -0
  132. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  133. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  134. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  135. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  136. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  137. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_indices_converter.py +0 -0
  138. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_layer_norm.py +0 -0
  139. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  140. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  141. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/fusions/fused_softmax.py +0 -0
  142. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/__init__.py +0 -0
  143. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/async_stream.py +0 -0
  144. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/common_inference_params.py +0 -0
  145. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/communication_utils.py +0 -0
  146. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/contexts/__init__.py +0 -0
  147. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/contexts/base_context.py +0 -0
  148. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  149. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/contexts/dynamic_context.py +0 -0
  150. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/contexts/static_context.py +0 -0
  151. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/engines/__init__.py +0 -0
  152. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/engines/abstract_engine.py +0 -0
  153. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/engines/dynamic_engine.py +0 -0
  154. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/engines/mcore_engine.py +0 -0
  155. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/engines/static_engine.py +0 -0
  156. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/inference_request.py +0 -0
  157. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  158. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  159. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  160. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  161. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  162. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  163. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  164. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  165. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/sampling_params.py +0 -0
  166. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/scheduler.py +0 -0
  167. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  168. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  169. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  170. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
  171. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  172. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference/utils.py +0 -0
  173. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/inference_params.py +0 -0
  174. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/jit.py +0 -0
  175. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/model_parallel_config.py +0 -0
  176. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/T5/__init__.py +0 -0
  177. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/T5/t5_model.py +0 -0
  178. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/__init__.py +0 -0
  179. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/backends.py +0 -0
  180. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/bert/__init__.py +0 -0
  181. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/bert/bert_lm_head.py +0 -0
  182. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/bert/bert_model.py +0 -0
  183. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/bert/pooler.py +0 -0
  184. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/__init__.py +0 -0
  185. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/__init__.py +0 -0
  186. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  187. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  188. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  189. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  190. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/language_module/__init__.py +0 -0
  191. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/language_module/language_module.py +0 -0
  192. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/vision_module/__init__.py +0 -0
  193. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  194. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/__init__.py +0 -0
  195. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  196. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/gpt/gpt_model.py +0 -0
  197. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/huggingface/__init__.py +0 -0
  198. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/huggingface/clip_model.py +0 -0
  199. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/huggingface/module.py +0 -0
  200. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/huggingface/qwen_model.py +0 -0
  201. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mamba/__init__.py +0 -0
  202. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  203. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mamba/mamba_model.py +0 -0
  204. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/__init__.py +0 -0
  205. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/config/__init__.py +0 -0
  206. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/config/base_configs.py +0 -0
  207. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/model/__init__.py +0 -0
  208. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/model/base.py +0 -0
  209. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/submodules/audio.py +0 -0
  210. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/submodules/base.py +0 -0
  211. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/mimo/submodules/vision.py +0 -0
  212. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/multimodal/__init__.py +0 -0
  213. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/multimodal/context_parallel.py +0 -0
  214. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/multimodal/llava_model.py +0 -0
  215. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/multimodal/llava_spec.py +0 -0
  216. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/__init__.py +0 -0
  217. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/base_attention.py +0 -0
  218. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/config.py +0 -0
  219. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/decoder_attention.py +0 -0
  220. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/encoder_attention.py +0 -0
  221. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/model.py +0 -0
  222. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/retro/utils.py +0 -0
  223. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/vision/__init__.py +0 -0
  224. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/vision/clip_vit_model.py +0 -0
  225. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/vision/multimodal_projector.py +0 -0
  226. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/vision/radio.py +0 -0
  227. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  228. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/msc_utils.py +0 -0
  229. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/num_microbatches_calculator.py +0 -0
  230. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/__init__.py +0 -0
  231. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/clip_grads.py +0 -0
  232. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  233. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  234. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  235. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/grad_scaler.py +0 -0
  236. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/optimizer.py +0 -0
  237. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer/optimizer_config.py +0 -0
  238. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/optimizer_param_scheduler.py +0 -0
  239. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/packed_seq_params.py +0 -0
  240. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/parallel_state.py +0 -0
  241. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/pipeline_parallel/__init__.py +0 -0
  242. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  243. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/pipeline_parallel/schedules.py +0 -0
  244. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/pipeline_parallel/utils.py +0 -0
  245. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/__init__.py +0 -0
  246. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/__init__.py +0 -0
  247. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  248. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  249. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  250. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/layers.py +0 -0
  251. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  252. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  253. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/process_groups_config.py +0 -0
  254. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/quantization/__init__.py +0 -0
  255. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/quantization/quant_config.py +0 -0
  256. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/quantization/utils.py +0 -0
  257. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/requirements.txt +0 -0
  258. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/rerun_state_machine.py +0 -0
  259. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/__init__.py +0 -0
  260. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mamba_block.py +0 -0
  261. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  262. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  263. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mamba_layer.py +0 -0
  264. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mamba_mixer.py +0 -0
  265. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/mlp_layer.py +0 -0
  266. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/ssm/triton_cache_manager.py +0 -0
  267. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/__init__.py +0 -0
  268. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  269. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/data.py +0 -0
  270. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/layers.py +0 -0
  271. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/mappings.py +0 -0
  272. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/random.py +0 -0
  273. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/tensor_parallel/utils.py +0 -0
  274. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/timers.py +0 -0
  275. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/__init__.py +0 -0
  276. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/attention.py +0 -0
  277. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  278. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  279. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/dot_product_attention.py +0 -0
  280. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/enums.py +0 -0
  281. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  282. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/identity_op.py +0 -0
  283. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/mlp.py +0 -0
  284. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/module.py +0 -0
  285. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/__init__.py +0 -0
  286. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  287. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  288. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/router.py +0 -0
  289. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/shared_experts.py +0 -0
  290. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  291. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  292. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/multi_latent_attention.py +0 -0
  293. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  294. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/spec_utils.py +0 -0
  295. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/torch_layer_norm.py +0 -0
  296. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/torch_norm.py +0 -0
  297. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/transformer_config.py +0 -0
  298. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/transformer_layer.py +0 -0
  299. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/transformer/utils.py +0 -0
  300. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron/core/utils.py +0 -0
  301. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron_core.egg-info/SOURCES.txt +0 -0
  302. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron_core.egg-info/dependency_links.txt +0 -0
  303. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron_core.egg-info/requires.txt +0 -0
  304. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/megatron_core.egg-info/top_level.txt +0 -0
  305. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/setup.cfg +0 -0
  306. {megatron_core-0.13.0rc3 → megatron_core-0.13.1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.13.0rc3
3
+ Version: 0.13.1
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -16,6 +16,8 @@ from megatron.core.transformer.transformer_block import TransformerBlockSubmodul
16
16
  from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
17
17
 
18
18
  try:
19
+ import transformer_engine as te # pylint: disable=unused-import
20
+
19
21
  from megatron.core.extensions.transformer_engine import (
20
22
  TEColumnParallelLinear,
21
23
  TEDotProductAttention,
@@ -12,6 +12,8 @@ from megatron.core.transformer.spec_utils import ModuleSpec
12
12
  from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
13
13
 
14
14
  try:
15
+ import transformer_engine as te # pylint: disable=unused-import
16
+
15
17
  from megatron.core.extensions.transformer_engine import (
16
18
  TEDotProductAttention,
17
19
  TELayerNormColumnParallelLinear,
@@ -315,5 +315,5 @@ class MultimodalRotaryEmbedding(nn.Module):
315
315
  if parallel_state.get_context_parallel_world_size() > 1:
316
316
  # slice rotary_pos_emb along sequence dimension and select the partition of the current
317
317
  # CP rank
318
- emb = get_pos_emb_on_this_cp_rank(emb, 1)
318
+ emb = get_pos_emb_on_this_cp_rank(emb, 0, parallel_state.get_context_parallel_group())
319
319
  return emb
@@ -34,6 +34,8 @@ from megatron.core.transformer.transformer_layer import (
34
34
  )
35
35
 
36
36
  try:
37
+ import transformer_engine as te # pylint: disable=unused-import
38
+
37
39
  from megatron.core.extensions.transformer_engine import TEFusedMLP, TENorm
38
40
  from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
39
41
 
@@ -42,6 +44,8 @@ except ImportError:
42
44
  HAVE_TE = False
43
45
 
44
46
  try:
47
+ import nvidia_kitchen # pylint: disable=unused-import
48
+
45
49
  from megatron.core.extensions.kitchen import KitchenSpecProvider
46
50
 
47
51
  HAVE_KITCHEN = True
@@ -29,6 +29,8 @@ from megatron.core.transformer.transformer_layer import (
29
29
  from megatron.core.utils import is_te_min_version
30
30
 
31
31
  try:
32
+ import transformer_engine as te # pylint: disable=unused-import
33
+
32
34
  from megatron.core.extensions.transformer_engine import (
33
35
  TEDotProductAttention,
34
36
  TELayerNormColumnParallelLinear,
@@ -9,6 +9,8 @@ from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
9
9
  from megatron.core.transformer.spec_utils import ModuleSpec
10
10
 
11
11
  try:
12
+ import transformer_engine as te # pylint: disable=unused-import
13
+
12
14
  from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
13
15
 
14
16
  HAVE_TE = True
@@ -40,6 +40,8 @@ except ImportError:
40
40
  HAVE_APEX = False
41
41
 
42
42
  try:
43
+ import transformer_engine as te # pylint: disable=unused-import
44
+
43
45
  from megatron.core.extensions.transformer_engine import (
44
46
  TEColumnParallelLinear,
45
47
  TEDotProductAttention,
@@ -21,6 +21,8 @@ from megatron.core.transformer.mlp import MLP, MLPSubmodules
21
21
  from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
22
22
 
23
23
  try:
24
+ import transformer_engine as te # pylint: disable=unused-import
25
+
24
26
  from megatron.core.extensions.transformer_engine import (
25
27
  TEColumnParallelLinear,
26
28
  TEDotProductAttention,
@@ -3,8 +3,8 @@
3
3
 
4
4
  MAJOR = 0
5
5
  MINOR = 13
6
- PATCH = 0
7
- PRE_RELEASE = 'rc3'
6
+ PATCH = 1
7
+ PRE_RELEASE = ''
8
8
 
9
9
  # Use the following formatting: (major, minor, patch, pre-release)
10
10
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -23,6 +23,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig
23
23
  from megatron.core.utils import is_te_min_version
24
24
 
25
25
  try:
26
+ import transformer_engine as te # pylint: disable=unused-import
26
27
  from transformer_engine.pytorch.fp8 import FP8GlobalStateManager, fp8_autocast
27
28
  from transformer_engine.pytorch.graph import restore_fp8_tensors, save_fp8_tensors
28
29
  from transformer_engine.pytorch.graph import set_capture_end as te_set_capture_end
@@ -16,6 +16,8 @@ from megatron.core.transformer.transformer_config import TransformerConfig
16
16
  from megatron.core.utils import divide
17
17
 
18
18
  try:
19
+ import transformer_engine as te # pylint: disable=unused-import
20
+
19
21
  from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear
20
22
 
21
23
  HAVE_TE = True
@@ -67,6 +69,7 @@ if HAVE_TE:
67
69
  )
68
70
 
69
71
  def forward(self, x, **kwargs):
72
+ """Forward of TELayerNormColumnParallelLinearGathered"""
70
73
  out, bias = super().forward(x)
71
74
  assert bias is None, "bias should be None since we set skip_bias_add=False"
72
75
 
@@ -100,6 +103,7 @@ class ColumnParallelLinearGathered(ColumnParallelLinear):
100
103
  runtime_gather_output: bool | None = None,
101
104
  **kwargs,
102
105
  ):
106
+ """Forward of ColumnParallelLinearGathered"""
103
107
  out, bias = super().forward(input_, weight, runtime_gather_output)
104
108
  assert bias is None, "bias should be None since we set skip_bias_add=False"
105
109
 
@@ -40,6 +40,7 @@ from megatron.core.transformer.utils import (
40
40
  )
41
41
 
42
42
  try:
43
+ import transformer_engine as te # pylint: disable=unused-import
43
44
 
44
45
  from megatron.core.extensions.transformer_engine import Fp8Padding, Fp8Unpadding
45
46
 
@@ -21,6 +21,8 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module
21
21
  from megatron.core.transformer.transformer_config import TransformerConfig
22
22
 
23
23
  try:
24
+ import transformer_engine as te # pylint: disable=unused-import
25
+
24
26
  from megatron.core.extensions.transformer_engine import te_checkpoint
25
27
 
26
28
  HAVE_TE = True
@@ -10,6 +10,8 @@ from megatron.core.process_groups_config import ModelCommProcessGroups
10
10
  from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
11
11
 
12
12
  try:
13
+ import transformer_engine as te # pylint: disable=unused-import
14
+
13
15
  from megatron.core.extensions.transformer_engine import (
14
16
  fused_permute,
15
17
  fused_permute_with_probs,
@@ -32,6 +32,8 @@ SUPPORTED_ATTN_MASK = [
32
32
  ]
33
33
 
34
34
  try:
35
+ import transformer_engine as te # pylint: disable=unused-import
36
+
35
37
  from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
36
38
 
37
39
  HAVE_TE = True
@@ -28,27 +28,38 @@ from megatron.core.transformer.utils import sharded_state_dict_default
28
28
  from megatron.core.utils import WrappedTensor, deprecate_inference_params, make_viewless_tensor
29
29
 
30
30
  try:
31
+ import transformer_engine.pytorch as te # pylint: disable=unused-import
32
+
33
+ HAVE_TE = True
34
+ except ImportError:
35
+ HAVE_TE = False
36
+
37
+ try:
38
+ import apex # pylint: disable=unused-import
39
+
40
+ HAVE_APEX = True
41
+ except ImportError:
42
+ HAVE_APEX = False
43
+
44
+ get_cpu_offload_context = None
45
+ te_checkpoint = None
46
+
47
+ if HAVE_TE:
31
48
  from megatron.core.extensions.transformer_engine import (
32
49
  TENorm,
33
50
  get_cpu_offload_context,
34
51
  te_checkpoint,
35
52
  )
36
53
 
37
- HAVE_TE = True
38
54
  LayerNormImpl = TENorm
39
- except ImportError:
40
- HAVE_TE = False
41
- get_cpu_offload_context = None
42
-
43
- try:
44
- import apex # pylint: disable=unused-import
45
55
 
46
- LayerNormImpl = FusedLayerNorm
56
+ elif HAVE_APEX:
57
+ LayerNormImpl = FusedLayerNorm
47
58
 
48
- except ImportError:
49
- from megatron.core.transformer.torch_norm import WrappedTorchNorm
59
+ else:
60
+ from megatron.core.transformer.torch_norm import WrappedTorchNorm
50
61
 
51
- LayerNormImpl = WrappedTorchNorm
62
+ LayerNormImpl = WrappedTorchNorm
52
63
 
53
64
 
54
65
  def get_num_layers_to_build(config: TransformerConfig, vp_stage: Optional[int] = None) -> int:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.13.0rc3
3
+ Version: 0.13.1
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -171,7 +171,8 @@ source = ["/opt/megatron-lm/"]
171
171
  [tool.ruff.lint]
172
172
  # Enable all `pydocstyle` rules, limiting to those that adhere to the
173
173
  # Google convention via `convention = "google"`, below.
174
- select = ["D", "F"]
174
+ # select = ["D", "F"]
175
+ select = ["S506"]
175
176
 
176
177
  # - On top of the Google convention, disable `D417`, which requires
177
178
  # documentation for every function parameter.