megatron-core 0.14.0rc7__tar.gz → 0.15.0rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of megatron-core might be problematic.
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/LICENSE +1 -1
- {megatron_core-0.14.0rc7/megatron_core.egg-info → megatron_core-0.15.0rc4}/PKG-INFO +24 -7
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/README.md +17 -2
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/__init__.py +11 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_builder.py +2 -8
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/gpt_dataset.py +4 -4
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/helpers.cpp +3 -1
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/megatron_tokenizer.py +1 -1
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/tokenizers.py +3 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/dict_utils.py +13 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/mapping.py +31 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/optimizer.py +6 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/async_utils.py +52 -14
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/base.py +1 -5
- megatron_core-0.15.0rc4/megatron/core/dist_checkpointing/strategies/checkpointable.py +196 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/torch.py +42 -14
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/zarr.py +6 -1
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/validation.py +13 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/distributed_data_parallel.py +49 -90
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/finalize_model_grads.py +36 -20
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +12 -16
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +31 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +6 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +6 -1
- megatron_core-0.15.0rc4/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +27 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +97 -61
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +6 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/param_and_grad_buffer.py +26 -6
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/enums.py +6 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +47 -24
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/kitchen.py +4 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/transformer_engine.py +259 -207
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/transformer_engine_spec_provider.py +5 -0
- megatron_core-0.15.0rc4/megatron/core/fp4_utils.py +136 -0
- megatron_core-0.15.0rc4/megatron/core/fusions/fused_bias_geglu.py +442 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_softmax.py +51 -7
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/dynamic_context.py +167 -80
- megatron_core-0.15.0rc4/megatron/core/inference/data_parallel_inference_coordinator.py +322 -0
- megatron_core-0.15.0rc4/megatron/core/inference/engines/dynamic_engine.py +828 -0
- megatron_core-0.15.0rc4/megatron/core/inference/headers.py +17 -0
- megatron_core-0.15.0rc4/megatron/core/inference/inference_client.py +190 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/inference_request.py +11 -1
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +11 -10
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/sampling_params.py +11 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +43 -9
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/model_parallel_config.py +6 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/t5_model.py +8 -8
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/backends.py +9 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +23 -21
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/language_module/language_module.py +13 -12
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/model_chunk_schedule_plan.py +115 -109
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/fine_grained_callables.py +117 -7
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/gpt_layer_specs.py +23 -9
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/gpt_model.py +55 -17
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +11 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/moe_module_specs.py +7 -1
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/mamba_model.py +8 -8
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/context_parallel.py +25 -13
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/llava_model.py +17 -12
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/base_attention.py +4 -4
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/decoder_attention.py +5 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/decoder_spec.py +8 -2
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/clip_vit_model.py +5 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/multimodal_projector.py +35 -30
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/radio.py +30 -4
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/nccl_allocator.py +39 -8
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/__init__.py +16 -122
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/distrib_optimizer.py +432 -130
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/optimizer.py +61 -9
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/optimizer_config.py +0 -6
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/package_info.py +4 -6
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/parallel_state.py +9 -7
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/combined_1f1b.py +179 -66
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/schedules.py +334 -232
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/utils.py +0 -16
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +1 -0
- megatron_core-0.15.0rc4/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
- megatron_core-0.15.0rc4/megatron/core/process_groups_config.py +489 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_block.py +8 -8
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_layer.py +4 -4
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_mixer.py +9 -9
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mlp_layer.py +3 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/layers.py +3 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/random.py +5 -2
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/timers.py +14 -1
- megatron_core-0.15.0rc4/megatron/core/tokenizers/__init__.py +4 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/base_tokenizer.py +48 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/megatron_tokenizer.py +171 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/__init__.py +3 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/__init__.py +8 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +147 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +164 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/chat_template.py +71 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +335 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +179 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/null_tokenizer.py +79 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +411 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +303 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/__init__.py +8 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/bert_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/default_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/gpt_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/mamba_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/retro_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/models/t5_tokenizer.py +12 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/text_tokenizer.py +254 -0
- megatron_core-0.15.0rc4/megatron/core/tokenizers/text/utils/build_tokenizer.py +58 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/attention.py +51 -25
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/cuda_graphs.py +183 -61
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/dot_product_attention.py +44 -13
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/mlp.py +44 -6
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/module.py +32 -3
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/experts.py +60 -27
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/moe_layer.py +47 -20
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/moe_utils.py +20 -16
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/router.py +89 -12
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/shared_experts.py +36 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/token_dispatcher.py +20 -19
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/multi_latent_attention.py +42 -17
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/multi_token_prediction.py +241 -211
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/pipeline_parallel_layer_layout.py +46 -11
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_block.py +126 -63
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_config.py +129 -19
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/transformer_layer.py +77 -46
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/utils.py +117 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/utils.py +28 -5
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4/megatron_core.egg-info}/PKG-INFO +24 -7
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/SOURCES.txt +28 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/requires.txt +6 -4
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/pyproject.toml +11 -7
- megatron_core-0.14.0rc7/megatron/core/fusions/fused_bias_geglu.py +0 -85
- megatron_core-0.14.0rc7/megatron/core/inference/engines/dynamic_engine.py +0 -423
- megatron_core-0.14.0rc7/megatron/core/process_groups_config.py +0 -233
- megatron_core-0.14.0rc7/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/MANIFEST.in +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/README.md +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/activations.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/full_cuda_graph.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/jit.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/post_training → megatron_core-0.15.0rc4/megatron/core/models}/__init__.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models → megatron_core-0.15.0rc4/megatron/core/models/bert}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models/bert → megatron_core-0.15.0rc4/megatron/core/models/common}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models/common → megatron_core-0.15.0rc4/megatron/core/models/common/language_module}/__init__.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models/common/language_module → megatron_core-0.15.0rc4/megatron/core/models/common/vision_module}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models/common/vision_module → megatron_core-0.15.0rc4/megatron/core/models/vision}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/post_training/modelopt/mamba → megatron_core-0.15.0rc4/megatron/core/post_training}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/safe_globals.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/models/vision → megatron_core-0.15.0rc4/megatron/core/transformer/custom_layers}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.14.0rc7/megatron/core/transformer/custom_layers → megatron_core-0.15.0rc4/megatron/core/transformer/moe}/__init__.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/setup.cfg +0 -0
- {megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/setup.py +0 -0
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/LICENSE
RENAMED
@@ -37,7 +37,7 @@ Below are licenses used in those files, as indicated.
 
 
 --------------------------------------------------------------------------------------
--- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --
+-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, TinyZero and vLLM code --
 
 
                                  Apache License
{megatron_core-0.14.0rc7/megatron_core.egg-info → megatron_core-0.15.0rc4}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.14.0rc7
+Version: 0.15.0rc4
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -31,29 +31,30 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
 Requires-Dist: numpy<2.0.0
-Requires-Dist: packaging
+Requires-Dist: packaging>=24.2
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
 Requires-Dist: sentencepiece; extra == "mlm"
 Requires-Dist: tiktoken; extra == "mlm"
 Requires-Dist: wandb; extra == "mlm"
+Requires-Dist: transformers; extra == "mlm"
 Provides-Extra: dev
 Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops~=0.8; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
 Requires-Dist: nvtx~=0.2; extra == "dev"
-Requires-Dist:
-Requires-Dist: multi-storage-client~=0.20; extra == "dev"
+Requires-Dist: multi-storage-client~=0.27; extra == "dev"
 Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
 Requires-Dist: mamba-ssm~=2.2; extra == "dev"
 Requires-Dist: causal-conv1d~=1.5; extra == "dev"
 Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
-Requires-Dist: transformer-engine[pytorch]<2.
+Requires-Dist: transformer-engine[pytorch]<2.8.0,>=2.6.0a0; extra == "dev"
 Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
 Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
 Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
 Requires-Dist: flashinfer-python; extra == "dev"
+Requires-Dist: wget; extra == "dev"
 Requires-Dist: onnxscript; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
@@ -63,6 +64,7 @@ Requires-Dist: nvtx; extra == "lts"
 Requires-Dist: transformers; extra == "lts"
 Requires-Dist: zarr; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
+Requires-Dist: wget; extra == "lts"
 Dynamic: license-file
 
 <div align="center">
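A quick way to inspect these Requires-Dist changes against an installed build, as a sketch using only the standard library (assumes megatron-core is installed in the current environment):

```python
from importlib.metadata import requires

# Print the Requires-Dist entries of the installed megatron-core
# distribution; extras appear with markers like: extra == "dev".
for req in requires("megatron-core") or []:
    print(req)
```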
@@ -93,7 +95,10 @@ cd Megatron-LM
 
 # Latest News
 
--
+- 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+- 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+- 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
 - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
 <details>
@@ -143,6 +148,7 @@ cd Megatron-LM
 **Resources**
 - [Examples](./examples/) - Training scripts and tutorials
 - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
 - [Community & Support](#-community--support) - Get help and contribute
   - [Getting Help](#getting-help)
   - [Contributing](#contributing)
@@ -217,10 +223,12 @@ Megatron-LM/
 
 **Libraries using Megatron Core:**
 
+- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
 - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
 - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
-**Compatible with:** [
+**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
 # Installation
 
@@ -510,6 +518,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 --use-distributed-optimizer
 ```
 
+# Roadmaps
+
+Stay up-to-date with our development roadmaps and planned features:
+
+- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+*More roadmap trackers will be added soon.*
+
 # Community & Support
 
 ## Getting Help
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/README.md
RENAMED
@@ -26,7 +26,10 @@ cd Megatron-LM
 
 # Latest News
 
--
+- 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
+- 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
+- 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
+- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
 - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
 <details>
@@ -76,6 +79,7 @@ cd Megatron-LM
 **Resources**
 - [Examples](./examples/) - Training scripts and tutorials
 - [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
+- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
 - [Community & Support](#-community--support) - Get help and contribute
   - [Getting Help](#getting-help)
   - [Contributing](#contributing)
@@ -150,10 +154,12 @@ Megatron-LM/
 
 **Libraries using Megatron Core:**
 
+- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
+- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
 - **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
 - **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
 
-**Compatible with:** [
+**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
 
 # Installation
 
@@ -443,6 +449,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
 --use-distributed-optimizer
 ```
 
+# Roadmaps
+
+Stay up-to-date with our development roadmaps and planned features:
+
+- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
+
+*More roadmap trackers will be added soon.*
+
 # Community & Support
 
 ## Getting Help
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/__init__.py
RENAMED
@@ -33,6 +33,17 @@ __all__ = [
     "InferenceParams",
     "ModelParallelConfig",
     "Timers",
+    "__contact_emails__",
+    "__contact_names__",
+    "__description__",
+    "__download_url__",
+    "__homepage__",
+    "__keywords__",
+    "__license__",
+    "__package_name__",
+    "__repository_url__",
+    "__shortversion__",
+    "__version__",
 ]
 
 from .safe_globals import register_safe_globals
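The new `__all__` entries make the package metadata part of the public API, so downstream code can import it from the top level (a sketch, assuming these names are re-exported from `package_info` as the hunk suggests):

```python
# The freshly exported metadata names are importable directly:
from megatron.core import __package_name__, __version__

print(__package_name__, __version__)  # e.g. prints the name and "0.15.0rc4"
```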
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_builder.py
RENAMED
@@ -35,7 +35,8 @@ class BlendedMegatronDatasetBuilder(object):
 
         is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
         the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
-        global rank, local group rank, and virtual rank may inform its return value.
+        global rank, local group rank, and virtual rank may inform its return value. Should
+        return true for exactly one process on global rank 0.
 
         config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
     """
@@ -72,13 +73,6 @@ class BlendedMegatronDatasetBuilder(object):
                 for {split.name} split
                 This can occur with multiple validation sets if datasets have weights"""
 
-        if torch.distributed.is_initialized():
-            gb_rank = torch.distributed.get_rank()
-            if gb_rank == 0:
-                assert (
-                    self.is_built_on_rank()
-                ), "is_built_on_rank must return True when global rank = 0"
-
     def build(self) -> List[Optional[TopLevelDataset]]:
         """Build all dataset splits according to the provided blend(s)
 
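The docstring change tightens the `is_built_on_rank` contract (it must return True on global rank 0), while the removed block shows the builder no longer asserts this at runtime, so the caller is now responsible. A hypothetical conforming callable, as a minimal sketch:

```python
import torch

def is_built_on_rank() -> bool:
    """Hypothetical sketch: build datasets only on global rank 0,
    which always satisfies the tightened contract above."""
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0
```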
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/blended_megatron_dataset_config.py
RENAMED
@@ -6,8 +6,8 @@ import re
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 
-from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from megatron.core.datasets.utils import Split, log_single_rank, normalize
+from megatron.core.tokenizers import MegatronTokenizerBase
 
 logger = logging.getLogger(__name__)
 
@@ -66,8 +66,8 @@ class BlendedMegatronDatasetConfig:
     constructor.
     """
 
-    tokenizer: Optional[
-    """The
+    tokenizer: Optional[MegatronTokenizerBase] = None
+    """The MegatronTokenizerBase instance. Required for datasets that do online tokenization."""
 
     mid_level_dataset_surplus: float = 0.005
     """The sample surplus to build for the mid-level datasets(s). Defaults arbitrarily to 0.005.
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/gpt_dataset.py
RENAMED
@@ -12,9 +12,9 @@ import torch
 from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
 from megatron.core.datasets.indexed_dataset import IndexedDataset
 from megatron.core.datasets.megatron_dataset import MegatronDataset
-from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
 from megatron.core.datasets.object_storage_utils import ObjectStorageConfig, is_object_storage_path
 from megatron.core.datasets.utils import Split
+from megatron.core.tokenizers import MegatronTokenizerBase
 from megatron.core.utils import log_single_rank
 
 logger = logging.getLogger(__name__)
@@ -701,8 +701,8 @@ class MockGPTLowLevelDataset:
     we add the end of document token to each element indexed in __getitem__
 
     Args:
-        tokenizer (
-
+        tokenizer (MegatronTokenizerBase): The tokenizer the special token information of which
+        we use to augment the mock data.
     """
 
     seed: int = 0
@@ -714,7 +714,7 @@ class MockGPTLowLevelDataset:
     max_sequence_length: int = 4096
     """The hard-coded max sequence length to generate"""
 
-    def __init__(self, tokenizer:
+    def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
         self.tokenizer = tokenizer
         rng = numpy.random.default_rng(seed=self.seed)
         self.sequence_lengths = rng.integers(
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/helpers.cpp
RENAMED
@@ -3,6 +3,7 @@
 /* Helper methods for fast index mapping builds */
 
 #include <algorithm>
+#include <cassert>
 #include <iostream>
 #include <limits>
 #include <math.h>
@@ -46,7 +47,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
   while (dataset_unspent_indices.size() > 0) {
     double index_sample_double = std::max(static_cast<double>(index_sample), 1.0);
 
-    int64_t error_argmax;
+    int64_t error_argmax = -1;
     double error_max = std::numeric_limits<double>::lowest();
 
     for (int32_t index_dataset : dataset_unspent_indices) {
@@ -56,6 +57,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
         error_max = error;
       }
     }
+    assert(error_argmax >= 0);
 
     // Populate the indices.
     dataset_index_ptr[index_sample] = static_cast<int16_t>(error_argmax);
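The helpers.cpp change is the standard sentinel-plus-assert hardening for an argmax loop: initialize the index to an impossible value, then verify after the loop that a candidate was actually selected. The same pattern sketched in Python for clarity:

```python
def pick_argmax(errors: dict) -> int:
    """Sketch of the sentinel + assert pattern from the C++ hunk above."""
    error_argmax = -1                    # sentinel: nothing selected yet
    error_max = float("-inf")
    for index_dataset, error in errors.items():
        if error > error_max:
            error_argmax = index_dataset
            error_max = error
    assert error_argmax >= 0             # the loop must have selected something
    return error_argmax

print(pick_argmax({0: 0.25, 1: 0.75}))  # 1
```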
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/datasets/retro/config/tokenizers.py
RENAMED
@@ -4,12 +4,12 @@
 
 from dataclasses import dataclass
 
-from megatron.core.
+from megatron.core.tokenizers import MegatronTokenizerBase
 
 
 @dataclass
 class RetroTokenizers:
     """Container class for GPT and Bert tokenizers."""
 
-    gpt:
-    bert:
+    gpt: MegatronTokenizerBase = None
+    bert: MegatronTokenizerBase = None
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/dict_utils.py
RENAMED
@@ -103,11 +103,19 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
     else:
         only_left = []
         only_right = []
+        mismatch_debug_data = [prefix, type(x1), type(x2)]
         if isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor):
-
-
-
-
+            try:
+                if x1.device != x2.device:
+                    _is_mismatch = not torch.all(x1.cpu() == x2.cpu())
+                else:
+                    _is_mismatch = not torch.all(x1 == x2)
+                mismatch_debug_data.extend(
+                    [(x1 != x2).sum(), (x1 != x2).shape, (x1 != x2).nonzero().tolist()]
+                )
+            except (RuntimeError, TypeError, ValueError):
+                _is_mismatch = True
+                mismatch_debug_data.extend([x1.shape, x2.shape])
         # TODO: change with concrete type that has both replica_id and data attrs
         elif hasattr(x1, "replica_id") and hasattr(x2, "replica_id"):
             assert type(x1) == type(x2)
@@ -122,7 +130,7 @@ def diff(x1: Any, x2: Any, prefix: Tuple = ()) -> Tuple[list, list, list]:
             _is_mismatch = True
 
         if _is_mismatch:
-            mismatch.append((
+            mismatch.append(tuple(mismatch_debug_data))
 
     return only_left, only_right, mismatch
 
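The dict_utils.py hunk makes the checkpoint diff helper robust to tensors on different devices and to comparisons that raise (e.g. incompatible shapes), recording debug data for mismatches. The comparison logic in isolation, as a self-contained sketch:

```python
import torch

def tensors_mismatch(x1: torch.Tensor, x2: torch.Tensor) -> bool:
    """Device-safe mismatch check mirroring the pattern in the hunk above."""
    try:
        if x1.device != x2.device:
            # Stage both on CPU so the elementwise comparison is legal.
            return not bool(torch.all(x1.cpu() == x2.cpu()))
        return not bool(torch.all(x1 == x2))
    except (RuntimeError, TypeError, ValueError):
        # Incomparable tensors (e.g. shape mismatch) count as a mismatch.
        return True

print(tensors_mismatch(torch.zeros(3), torch.zeros(3)))  # False
print(tensors_mismatch(torch.zeros(2), torch.zeros(3)))  # True (broadcast error caught)
```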
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/mapping.py
RENAMED
|
@@ -29,6 +29,9 @@ ShardedStateDict = Dict[str, Any]
|
|
|
29
29
|
ReplicaId = Union[int, Tuple[int, ...]]
|
|
30
30
|
|
|
31
31
|
|
|
32
|
+
_logged_deprecations = {}
|
|
33
|
+
|
|
34
|
+
|
|
32
35
|
class ShardedBase(ABC):
|
|
33
36
|
"""Base class for ShardedTensor and ShardedStateDict."""
|
|
34
37
|
|
|
@@ -135,17 +138,40 @@ class ShardedTensor(ShardedBase):
                 f"equal to global shape dimensions for {self}"
             )
 
-        for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
-            if sh != 0 and off % sh != 0:
-                raise CheckpointingException(
-                    f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
-                )
+        if self.axis_fragmentations is not None:
+            for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
+                if sh != 0 and off % sh != 0:
+                    raise CheckpointingException(
+                        f"Global offset ({off}) must be divisible by local shape ({sh}) for {self}."
+                    )
 
         if has_flattened_range and self.flattened_range.step is not None:
             raise CheckpointingException(
                 f"`step` argument in the flattened range of a ShardedTensor is not supported."
             )
 
+        if self.prepend_axis_num:
+            if not _logged_deprecations.get("prepend_axis_num", False):
+                logger.warning(
+                    "ShardedTensor.prepend_axis_num greater than 0 is deprecated."
+                    " In Megatron-Core this can be prevented by setting sharded_state_dict"
+                    " metadata['singleton_local_shards'] to True."
+                )
+                _logged_deprecations["prepend_axis_num"] = True
+
+        if self.flattened_range is not None:
+            if not _logged_deprecations.get("flattened_range", False):
+                logger.warning(
+                    "ShardedTensor.flattened_range is deprecated."
+                    " Use latest DistributedOptimizer formats."
+                )
+                _logged_deprecations["flattened_range"] = True
+
+    @property
+    def has_regular_grid(self):
+        """Alias for having a regular sharding grid."""
+        return self.axis_fragmentations is not None
+
     def global_slice(self) -> Tuple[Union[int, slice], ...]:
         """
         Returns a tuple of int and slice objects representing a slice of the
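The new `has_regular_grid` property simply reports whether `axis_fragmentations` is populated, which it is for tensors built through the regular sharding factories. A hedged sketch using the existing `ShardedTensor.from_rank_offsets` constructor (the key and shapes here are made up):

```python
import torch
from megatron.core.dist_checkpointing import ShardedTensor

local = torch.zeros(4, 8)
# Axis 0 is split into 2 fragments; this rank holds fragment 0 of an (8, 8) global tensor.
sh_ten = ShardedTensor.from_rank_offsets("decoder.weight", local, (0, 0, 2))
print(sh_ten.has_regular_grid)  # True: axis_fragmentations is set
```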
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/optimizer.py
RENAMED

@@ -25,6 +25,12 @@ from .mapping import (
 )
 from .utils import extract_sharded_tensors_and_factories
 
+KEEP_VARS_HINT = (
+    " Make sure state dict contains original torch.nn.Parameters (not pure torch.Tensors)"
+    " by passing `keep_vars=True` to `.state_dict()`. If any transformation of the original"
+    " parameter is needed, use a ShardedTensorFactory."
+)
+
 
 def get_optim_param_to_id_map(optim_params_iter: Iterable[torch.nn.Parameter]) -> Dict[int, int]:
     """Generate mapping from optimizer param to optimizer state id."""
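`KEEP_VARS_HINT` targets a common pitfall: `Module.state_dict()` detaches parameters into plain tensors unless `keep_vars=True` is passed, which breaks any identity-based mapping from parameters to optimizer state. A standalone illustration in plain PyTorch:

```python
import torch

model = torch.nn.Linear(4, 4)

detached = model.state_dict()            # values are detached, plain tensors
kept = model.state_dict(keep_vars=True)  # values are the live Parameter objects

assert not isinstance(detached["weight"], torch.nn.Parameter)
assert kept["weight"] is model.weight    # identity preserved for param-to-id maps
```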
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/async_utils.py
RENAMED

@@ -79,9 +79,24 @@ class AsyncRequest(NamedTuple):
 
         This logic is equivalent to what should happen in case of the async call.
         """
+        # preload tensors.
+        async_fn_args = list(self.async_fn_args)
+        if self.preload_fn:
+            assert len(async_fn_args) == 3, "Expected 3 args to be passed to async function"
+            # The async_fn is passed as a partial functool with pre-determined args
+            # In the async_fn_args we pass the remaining positional args required by the async_fn
+            # async_fn_args[1] refers to the write_buckets
+            # To ensure we stage the write_buckets to CPU memory for sync CP,
+            # we replace it with preload_fn callable that returns the CPU staged tensors
+            async_fn_args[1] = self.preload_fn()
+        # persist the state
         if self.async_fn is not None:
-            self.async_fn(*self.async_fn_args, **self.async_fn_kwargs)
+            self.async_fn(*async_fn_args, **self.async_fn_kwargs)
+
+        # This utility implements a sync cp save. Hence the barrier.
         torch.distributed.barrier()
+
+        # Finalize the CP state
         for finalize_fn in self.finalize_fns:
             finalize_fn()
 
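The new `preload_fn` path replaces the second positional argument (the write buckets) with CPU-staged copies before running the write synchronously. A toy mirror of that substitution (function and variable names are illustrative, not the real writer API):

```python
import torch

def write_fn(checkpoint_dir, write_buckets, rank):
    # Stand-in for the real async write function: just report where tensors live.
    print(checkpoint_dir, rank, [t.device.type for t in write_buckets])

buckets = [torch.ones(2), torch.ones(3)]             # pretend these live on GPU
preload_fn = lambda: [t.to("cpu") for t in buckets]  # stage to host memory

async_fn_args = ["ckpt_dir", buckets, 0]
async_fn_args[1] = preload_fn()  # the same swap execute_sync performs
write_fn(*async_fn_args)
```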
@@ -150,7 +165,7 @@ class AsyncCaller(ABC):
         return ten[0] == 0
 
     @abstractmethod
-    def close(self):
+    def close(self, abort=False):
         """Terminate the async caller at exit of an application or some termination conditions"""
         logger.info(f"AsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller")
 
@@ -237,15 +252,23 @@ class TemporalAsyncCaller(AsyncCaller):
             is_done = True
         return is_done
 
-    def close(self):
+    def close(self, abort=False):
         """For TemporalAsyncCaller, this method is called explictly in `is_current_async_calls_done`
 
         This method make sure the TemporalAsyncCaller terminated
         with all its assigned async request completed
+
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
         """
         if self.process:
             logger.debug(f"rank: {torch.distributed.get_rank()}, joining self.process")
-            self.process.join()
+            if abort:
+                logger.warning(f"Temporal worker aborted in rank {torch.distributed.get_rank()}")
+                self.process.kill()
+            else:
+                self.process.join()
             self.process = None
             logger.debug(
                 "TemporalAsyncCaller: Async process join finished "
@@ -388,18 +411,25 @@ class PersistentAsyncCaller(AsyncCaller):
 
         return is_done
 
-    def close(self):
+    def close(self, abort=False):
         """Wait on the left async requests and terminate the PersistentAsyncCaller
 
         Signals the PersistentAsyncCaller by sending a 'DONE' message to make it terminated
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
         """
         logger.info(
             f"PersistentAsyncCaller: {torch.distributed.get_rank()}, Destroying Async Caller"
         )
         if self.process:
-            self.queue.put('DONE')
-            self.queue.join()
-            self.process.join()
+            if abort:
+                logger.warning(f"Persistent worker aborted in rank {torch.distributed.get_rank()}")
+                self.process.kill()
+            else:
+                self.queue.put('DONE')
+                self.queue.join()
+                self.process.join()
             self.process = None
 
     def __del__(self):
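Both callers now separate a graceful shutdown (drain the queue, join the worker) from an abort (`kill` the worker outright). A runnable toy mirror of the persistent variant, reusing the 'DONE' sentinel idea (names are illustrative, not the real caller):

```python
import multiprocessing as mp

def _worker(queue):
    # Consume checkpoint work items until the 'DONE' sentinel arrives.
    while True:
        item = queue.get()
        queue.task_done()
        if item == "DONE":
            break

def close(process, queue, abort=False):
    if abort:
        process.kill()     # drop pending work immediately
    else:
        queue.put("DONE")  # signal, drain, then join
        queue.join()
        process.join()

if __name__ == "__main__":
    queue = mp.JoinableQueue()
    process = mp.Process(target=_worker, args=(queue,))
    process.start()
    close(process, queue, abort=False)
```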
@@ -528,6 +558,9 @@ class AsyncCallsQueue:
             blocking (bool, optional): if True, will wait until all active requests
                 are done. Otherwise, finalizes only the async request that already
                 finished. Defaults to False.
+
+            no_dist (bool, Optional): if True, training ranks simply check its
+                asynchronous checkpoint writer without synchronization.
         Returns:
             List[int]: list of indices (as returned by `schedule_async_request`)
                 of async calls that have been successfully finalized.
@@ -545,8 +578,8 @@ class AsyncCallsQueue:
                 finalize_fn()
             ten = torch.tensor([call_idx], dtype=torch.int, device=torch.cuda.current_device())
             torch.distributed.all_reduce(ten, op=torch.distributed.ReduceOp.MAX)
-            assert ten.item() == call_idx,
-
+            assert ten.item() == call_idx, "Unmatched async calls. "
+            "That probably means not all ranks are participating in async finalization"
             call_idx_finalized.append(call_idx)
         return call_idx_finalized
 
@@ -554,8 +587,13 @@ class AsyncCallsQueue:
         """Get the number of active async calls."""
         return len(self.async_calls)
 
-    def close(self):
-        """Finalize all calls upon closing."""
-        self.maybe_finalize_async_calls(blocking=True)
+    def close(self, abort=False):
+        """Finalize all calls upon closing.
+        Args:
+            abort (bool, optional): Default to False. Needs to be manually set to true when
+                the checkpoint async process needs to be aborted.
+        """
+        if not abort:
+            self.maybe_finalize_async_calls(blocking=True)
         if self.persistent and self.persistent_caller:
-            self.persistent_caller.close()
+            self.persistent_caller.close(abort=abort)
{megatron_core-0.14.0rc7 → megatron_core-0.15.0rc4}/megatron/core/dist_checkpointing/strategies/base.py
RENAMED

@@ -221,8 +221,4 @@ class AsyncSaveShardedStrategy(SaveShardedStrategy):
     def save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union[str, Path]):
         """Each async strategy can be trivially used as a sync strategy."""
         async_request = self.async_save(sharded_state_dict, checkpoint_dir)
-
-        # We keep this verbose call for now
-        global async_calls
-        async_calls.schedule_async_request(async_request)
-        async_calls.maybe_finalize_async_calls(blocking=True)
+        async_request.execute_sync()
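With this change, using an async strategy synchronously no longer routes through the module-level queue; the request simply runs in place. A hedged caller-side sketch (strategy construction and state dict elided):

```python
# `strategy` is any AsyncSaveShardedStrategy instance; these two lines are
# exactly what save() now does internally.
async_request = strategy.async_save(sharded_state_dict, checkpoint_dir)
async_request.execute_sync()  # run async_fn inline, barrier, then finalize_fns
```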