megatron-core 0.15.0rc4__tar.gz → 0.15.0rc6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core has been flagged as potentially problematic; consult the registry's advisory page for details.
- {megatron_core-0.15.0rc4/megatron_core.egg-info → megatron_core-0.15.0rc6}/PKG-INFO +1 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/indexed_dataset.py +10 -7
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel.py +7 -12
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/finalize_model_grads.py +10 -12
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +19 -20
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +3 -5
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +2 -3
- megatron_core-0.15.0rc6/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +521 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +130 -28
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +1 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +92 -56
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +42 -30
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/param_and_grad_buffer.py +5 -2
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine.py +109 -8
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fp8_utils.py +22 -17
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/full_cuda_graph.py +6 -3
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_softmax.py +109 -14
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_context.py +33 -13
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/dynamic_engine.py +60 -16
- megatron_core-0.15.0rc6/megatron/core/inference/inference_request.py +193 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +5 -3
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/__init__.py +20 -2
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/clip_grads.py +4 -4
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/distrib_optimizer.py +6 -3
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer.py +2 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/optimizer_config.py +5 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/package_info.py +1 -1
- megatron_core-0.15.0rc6/megatron/core/pipeline_parallel/bridge_communicator.py +399 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/safe_globals.py +3 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_layer.py +32 -21
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/layers.py +16 -9
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/cuda_graphs.py +102 -49
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/dot_product_attention.py +13 -5
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/enums.py +1 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/mlp.py +5 -2
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/module.py +172 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/experts.py +32 -27
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_utils.py +17 -8
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/router.py +13 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/pipeline_parallel_layer_layout.py +10 -6
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_config.py +9 -2
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_layer.py +114 -172
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/utils.py +34 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/utils.py +3 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6/megatron_core.egg-info}/PKG-INFO +1 -1
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/SOURCES.txt +1 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/pyproject.toml +21 -7
- megatron_core-0.15.0rc4/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -387
- megatron_core-0.15.0rc4/megatron/core/inference/inference_request.py +0 -91
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/LICENSE +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/MANIFEST.in +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/README.md +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/README.md +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/activations.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/mapping.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/enums.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fp4_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/jit.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_attention.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/nccl_allocator.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/parallel_state.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mamba_mixer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/timers.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/base_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_latent_attention.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/requires.txt +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/setup.cfg +0 -0
- {megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: megatron-core
|
|
3
|
-
Version: 0.15.
|
|
3
|
+
Version: 0.15.0rc6
|
|
4
4
|
Summary: Megatron Core - a library for efficient and scalable training of transformer based models
|
|
5
5
|
Author-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
6
6
|
Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
|
{megatron_core-0.15.0rc4 → megatron_core-0.15.0rc6}/megatron/core/datasets/indexed_dataset.py
RENAMED
|
@@ -12,6 +12,7 @@ import shutil
|
|
|
12
12
|
import struct
|
|
13
13
|
import time
|
|
14
14
|
from abc import ABC, abstractmethod
|
|
15
|
+
from collections.abc import Iterable
|
|
15
16
|
from enum import Enum
|
|
16
17
|
from functools import lru_cache
|
|
17
18
|
from itertools import accumulate
|
|
@@ -172,9 +173,9 @@ class _IndexWriter(object):
|
|
|
172
173
|
|
|
173
174
|
def write(
|
|
174
175
|
self,
|
|
175
|
-
sequence_lengths:
|
|
176
|
-
sequence_modes: Optional[
|
|
177
|
-
document_indices:
|
|
176
|
+
sequence_lengths: Iterable[Union[int, numpy.integer]],
|
|
177
|
+
sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
|
|
178
|
+
document_indices: Iterable[Union[int, numpy.integer]],
|
|
178
179
|
) -> None:
|
|
179
180
|
"""Write the index (.idx) file
|
|
180
181
|
|
|
@@ -208,7 +209,9 @@ class _IndexWriter(object):
|
|
|
208
209
|
if sequence_modes is not None:
|
|
209
210
|
self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
|
|
210
211
|
|
|
211
|
-
def _sequence_pointers(
|
|
212
|
+
def _sequence_pointers(
|
|
213
|
+
self, sequence_lengths: Iterable[Union[int, numpy.integer]]
|
|
214
|
+
) -> List[int]:
|
|
212
215
|
"""Build the sequence pointers per the sequence lengths and dtype size
|
|
213
216
|
|
|
214
217
|
Args:
|
|
@@ -217,11 +220,11 @@ class _IndexWriter(object):
|
|
|
217
220
|
Returns:
|
|
218
221
|
List[int]: The pointer to the beginning of each sequence
|
|
219
222
|
"""
|
|
220
|
-
itemsize = DType.size(self.dtype)
|
|
221
|
-
curr_ptr = 0
|
|
223
|
+
itemsize = numpy.int64(DType.size(self.dtype))
|
|
224
|
+
curr_ptr = numpy.int64(0)
|
|
222
225
|
list_ptr = []
|
|
223
226
|
for length in sequence_lengths:
|
|
224
|
-
list_ptr.append(curr_ptr)
|
|
227
|
+
list_ptr.append(curr_ptr.item())
|
|
225
228
|
curr_ptr += length * itemsize
|
|
226
229
|
return list_ptr
|
|
227
230
|
|
|
@@ -519,8 +519,11 @@ class DistributedDataParallel(_BaseDataParallel):
|
|
|
519
519
|
param_slice = bucket.param_data.view(-1)[param_start:param_end]
|
|
520
520
|
param.data.copy_(param_slice.view(param.data.shape))
|
|
521
521
|
# All-gathered params are not needed after being copied to param.data.
|
|
522
|
-
# Zero out the
|
|
523
|
-
|
|
522
|
+
# Zero out the param buffer (shared with grad buffer) for gradient accumulation.
|
|
523
|
+
# We cannot zero out the entire grad buffer because one grad buffer may
|
|
524
|
+
# correspond to multiple param buffers. If we zero out the entire grad buffer,
|
|
525
|
+
# it would clear the data of those param buffers that have not yet completed AG.
|
|
526
|
+
bucket.param_data.zero_()
|
|
524
527
|
|
|
525
528
|
def start_grad_sync(self, *unused):
|
|
526
529
|
"""
|
|
@@ -562,16 +565,8 @@ class DistributedDataParallel(_BaseDataParallel):
|
|
|
562
565
|
# to True, and there will be a double-GA.
|
|
563
566
|
for param in self.params_with_grad:
|
|
564
567
|
param.grad_added_to_main_grad = False
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
# The grad buffer is zeroed by "bucket.grad_data.zero_()" in the "finish_param_sync" stage
|
|
568
|
-
# after the param all-gather.
|
|
569
|
-
if not (
|
|
570
|
-
self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
|
|
571
|
-
and self.ddp_config.overlap_param_gather
|
|
572
|
-
):
|
|
573
|
-
for buffer in self.buffers + self.expert_parallel_buffers:
|
|
574
|
-
buffer.reset()
|
|
568
|
+
for buffer in self.buffers + self.expert_parallel_buffers:
|
|
569
|
+
buffer.reset()
|
|
575
570
|
for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
|
|
576
571
|
bucket_group.reset()
|
|
577
572
|
|
|
@@ -267,13 +267,18 @@ def _allreduce_position_embedding_grads(
|
|
|
267
267
|
)
|
|
268
268
|
|
|
269
269
|
|
|
270
|
-
def
|
|
270
|
+
def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.nn.Module]):
|
|
271
271
|
"""
|
|
272
|
-
Reset the
|
|
272
|
+
Reset the temporary tensors of the model.
|
|
273
273
|
"""
|
|
274
274
|
for model_chunk in model:
|
|
275
275
|
for module in get_attr_wrapped_model(model_chunk, 'modules')():
|
|
276
|
-
if hasattr(module, '
|
|
276
|
+
if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
|
|
277
|
+
module.local_tokens_per_expert.zero_()
|
|
278
|
+
if (
|
|
279
|
+
config.moe_router_load_balancing_type == "global_aux_loss"
|
|
280
|
+
or "global_aux_loss" in config.moe_router_load_balancing_type
|
|
281
|
+
) and hasattr(module, 'reset_global_aux_loss_tracker'):
|
|
277
282
|
module.reset_global_aux_loss_tracker()
|
|
278
283
|
|
|
279
284
|
|
|
@@ -298,10 +303,7 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
|
|
|
298
303
|
stacked_tokens_per_expert, stacked_expert_bias, config.moe_router_bias_update_rate
|
|
299
304
|
)
|
|
300
305
|
|
|
301
|
-
for
|
|
302
|
-
tokens_per_expert_list, expert_bias_list, stacked_updated_expert_bias
|
|
303
|
-
):
|
|
304
|
-
tokens_per_expert.zero_()
|
|
306
|
+
for expert_bias, updated_expert_bias in zip(expert_bias_list, stacked_updated_expert_bias):
|
|
305
307
|
expert_bias.copy_(updated_expert_bias)
|
|
306
308
|
|
|
307
309
|
|
|
@@ -465,11 +467,7 @@ def finalize_model_grads(
|
|
|
465
467
|
if config.moe_router_enable_expert_bias:
|
|
466
468
|
_update_router_expert_bias(model, config)
|
|
467
469
|
|
|
468
|
-
|
|
469
|
-
config.moe_router_load_balancing_type == "global_aux_loss"
|
|
470
|
-
or "global_aux_loss" in config.moe_router_load_balancing_type
|
|
471
|
-
):
|
|
472
|
-
_reset_global_aux_loss_tracker(model)
|
|
470
|
+
reset_model_temporary_tensors(config, model)
|
|
473
471
|
|
|
474
472
|
# normalize gradients for per-token loss normalization.
|
|
475
473
|
# if we are using by the number of tokens, then we use that as a divisor. this number
|
|
@@ -158,7 +158,7 @@ class FullyShardedDataParallel(_BaseDataParallel):
|
|
|
158
158
|
dp_cp_group = parallel_state.get_data_parallel_group(
|
|
159
159
|
with_context_parallel=True, partial_data_parallel=True
|
|
160
160
|
)
|
|
161
|
-
|
|
161
|
+
outer_fsdp_group = parallel_state.get_inter_distributed_optimizer_instance_group()
|
|
162
162
|
hybrid_fsdp_group = parallel_state.get_data_parallel_group(
|
|
163
163
|
with_context_parallel=True, partial_data_parallel=False
|
|
164
164
|
)
|
|
@@ -166,17 +166,17 @@ class FullyShardedDataParallel(_BaseDataParallel):
|
|
|
166
166
|
dp_cp_group = parallel_state.get_data_parallel_group(
|
|
167
167
|
with_context_parallel=True, partial_data_parallel=False
|
|
168
168
|
)
|
|
169
|
-
|
|
169
|
+
outer_fsdp_group = None
|
|
170
170
|
hybrid_fsdp_group = None
|
|
171
171
|
else:
|
|
172
172
|
tp_group = getattr(pg_collection, 'tp', None)
|
|
173
173
|
if enable_hsdp:
|
|
174
174
|
dp_cp_group = pg_collection.intra_dp_cp
|
|
175
|
-
|
|
175
|
+
outer_fsdp_group = pg_collection.inter_dist_opt
|
|
176
176
|
hybrid_fsdp_group = pg_collection.dp_cp
|
|
177
177
|
else:
|
|
178
178
|
dp_cp_group = pg_collection.dp_cp
|
|
179
|
-
|
|
179
|
+
outer_fsdp_group = None
|
|
180
180
|
hybrid_fsdp_group = None
|
|
181
181
|
|
|
182
182
|
if tp_group is None:
|
|
@@ -184,17 +184,16 @@ class FullyShardedDataParallel(_BaseDataParallel):
|
|
|
184
184
|
tp_group = single_rank_group
|
|
185
185
|
|
|
186
186
|
if enable_hsdp:
|
|
187
|
-
mesh = _get_hsdp_tp_mesh(
|
|
187
|
+
mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
|
|
188
188
|
dist_index = FSDPDistributedIndex(
|
|
189
|
-
use_hybrid_fsdp=True,
|
|
190
189
|
hsdp_outer_dp_shard=self.ddp_config.outer_dp_sharding_strategy != "no_shard",
|
|
191
190
|
device_mesh=DeviceMesh.from_group(
|
|
192
|
-
[
|
|
191
|
+
[outer_fsdp_group, dp_cp_group, tp_group],
|
|
193
192
|
device_type="cuda",
|
|
194
193
|
mesh=mesh.tolist(),
|
|
195
|
-
mesh_dim_names=["
|
|
194
|
+
mesh_dim_names=["outer_fsdp_dp", "dp_cp", "tp"],
|
|
196
195
|
),
|
|
197
|
-
|
|
196
|
+
dp_outer_dim="outer_fsdp_dp", # Use Hybrid FSDP!
|
|
198
197
|
dp_shard_dim="dp_cp",
|
|
199
198
|
tp_dim="tp",
|
|
200
199
|
hybrid_fsdp_group=hybrid_fsdp_group,
|
|
@@ -222,20 +221,20 @@ class FullyShardedDataParallel(_BaseDataParallel):
|
|
|
222
221
|
self.module.synchronize_param_gather()
|
|
223
222
|
|
|
224
223
|
|
|
225
|
-
def _get_hsdp_tp_mesh(
|
|
224
|
+
def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
|
|
226
225
|
assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
|
|
227
226
|
world_size = dist.get_world_size()
|
|
228
227
|
|
|
229
228
|
mesh = einops.rearrange(
|
|
230
229
|
torch.arange(world_size),
|
|
231
|
-
"(
|
|
232
|
-
|
|
230
|
+
"(outer_fsdp_dp fsdp tp) -> outer_fsdp_dp fsdp tp",
|
|
231
|
+
outer_fsdp_dp=outer_fsdp_dp_group.size(),
|
|
233
232
|
tp=tp_group.size(),
|
|
234
233
|
)
|
|
235
234
|
|
|
236
235
|
mesh_fsdp_ranks = einops.rearrange(
|
|
237
236
|
mesh,
|
|
238
|
-
'
|
|
237
|
+
'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp tp) fsdp',
|
|
239
238
|
tp=tp_group.size(),
|
|
240
239
|
fsdp=dp_cp_group.size(),
|
|
241
240
|
)
|
|
@@ -247,7 +246,7 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
|
|
|
247
246
|
|
|
248
247
|
mesh_tp_ranks = einops.rearrange(
|
|
249
248
|
mesh,
|
|
250
|
-
'
|
|
249
|
+
'outer_fsdp_dp fsdp tp -> (outer_fsdp_dp fsdp) tp',
|
|
251
250
|
tp=tp_group.size(),
|
|
252
251
|
fsdp=dp_cp_group.size(),
|
|
253
252
|
)
|
|
@@ -257,18 +256,18 @@ def _get_hsdp_tp_mesh(inter_fsdp_dp_group, dp_cp_group, tp_group):
|
|
|
257
256
|
f"do not match the ranks in the TP group {tp_group_ranks}."
|
|
258
257
|
)
|
|
259
258
|
|
|
260
|
-
|
|
259
|
+
mesh_outer_fsdp_dp_ranks = einops.rearrange(
|
|
261
260
|
mesh,
|
|
262
|
-
'
|
|
261
|
+
'outer_fsdp_dp fsdp tp -> (fsdp tp) outer_fsdp_dp',
|
|
263
262
|
tp=tp_group.size(),
|
|
264
263
|
fsdp=dp_cp_group.size(),
|
|
265
264
|
)
|
|
266
|
-
|
|
265
|
+
outer_fsdp_dp_group_ranks = dist.get_process_group_ranks(outer_fsdp_dp_group)
|
|
267
266
|
assert _check_mesh_ranks_and_group_ranks_are_consistent(
|
|
268
|
-
|
|
267
|
+
mesh_outer_fsdp_dp_ranks, outer_fsdp_dp_group_ranks
|
|
269
268
|
), (
|
|
270
|
-
f"[Megatron-FSDP]
|
|
271
|
-
f"do not match the ranks in the
|
|
269
|
+
f"[Megatron-FSDP] Outer FSDP Data Parallel ranks in the mesh {mesh_outer_fsdp_dp_ranks} "
|
|
270
|
+
f"do not match the ranks in the Outer FSDP DP group {outer_fsdp_dp_group_ranks}."
|
|
272
271
|
)
|
|
273
272
|
|
|
274
273
|
return mesh
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from .distributed_data_parallel_config import DistributedDataParallelConfig
|
|
16
|
+
from .fully_shard import fully_shard, fully_shard_model, fully_shard_optimizer
|
|
16
17
|
from .megatron_fsdp import MegatronFSDP
|
|
17
18
|
from .package_info import (
|
|
18
19
|
__contact_emails__,
|
|
@@ -29,16 +30,13 @@ from .package_info import (
|
|
|
29
30
|
)
|
|
30
31
|
from .utils import FSDPDistributedIndex
|
|
31
32
|
|
|
32
|
-
try:
|
|
33
|
-
from .fully_shard import fully_shard
|
|
34
|
-
except ImportError as e:
|
|
35
|
-
print(f"Failed to import fully_shard: {e}")
|
|
36
|
-
|
|
37
33
|
__all__ = [
|
|
38
34
|
"DistributedDataParallelConfig",
|
|
39
35
|
"MegatronFSDP",
|
|
40
36
|
"FSDPDistributedIndex",
|
|
41
37
|
"fully_shard",
|
|
38
|
+
"fully_shard_model",
|
|
39
|
+
"fully_shard_optimizer",
|
|
42
40
|
"__contact_emails__",
|
|
43
41
|
"__contact_names__",
|
|
44
42
|
"__description__",
|
|
@@ -117,13 +117,12 @@ class DistributedDataParallelConfig:
|
|
|
117
117
|
This option will cause additional memory overhead, however, it is necessary for
|
|
118
118
|
to register user buffer (nccl_ub=True) for the Megatron FSDP.
|
|
119
119
|
This option will be automatically set to True when nccl_ub=True.
|
|
120
|
-
|
|
120
|
+
"""
|
|
121
121
|
|
|
122
122
|
outer_dp_sharding_strategy: str = 'no_shard'
|
|
123
123
|
"""
|
|
124
124
|
Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode.
|
|
125
|
-
Valid values are 'no_shard', 'optim'
|
|
126
|
-
This option is only effective when Hybrid FSDP is enabled.
|
|
125
|
+
Valid values are 'no_shard', 'optim'. This option is only effective when Hybrid FSDP is enabled.
|
|
127
126
|
"""
|
|
128
127
|
|
|
129
128
|
disable_symmetric_registration: bool = False
|