megatron-core 0.15.0rc0__tar.gz → 0.15.0rc5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic; see the registry's release page for more details.
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/LICENSE +1 -1
- {megatron_core-0.15.0rc0/megatron_core.egg-info → megatron_core-0.15.0rc5}/PKG-INFO +23 -6
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/README.md +17 -2
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/__init__.py +17 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_builder.py +2 -8
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/gpt_dataset.py +4 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.cpp +3 -1
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/indexed_dataset.py +10 -7
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_tokenizer.py +1 -1
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/tokenizers.py +3 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/mapping.py +20 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/common.py +6 -6
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/torch.py +10 -5
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel.py +49 -90
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/distributed_data_parallel_config.py +9 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/finalize_model_grads.py +36 -20
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +30 -35
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +33 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +8 -3
- megatron_core-0.15.0rc5/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +521 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +123 -27
- megatron_core-0.15.0rc5/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +27 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +188 -107
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +45 -27
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/param_and_grad_buffer.py +27 -6
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/enums.py +6 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +47 -24
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine.py +214 -209
- megatron_core-0.15.0rc5/megatron/core/fp4_utils.py +136 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/full_cuda_graph.py +6 -3
- megatron_core-0.15.0rc5/megatron/core/fusions/fused_bias_geglu.py +442 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_softmax.py +149 -10
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_context.py +194 -87
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/dynamic_engine.py +213 -81
- megatron_core-0.15.0rc5/megatron/core/inference/inference_request.py +193 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +11 -10
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +30 -12
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/model_parallel_config.py +4 -1
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_model.py +8 -8
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/language_module/language_module.py +13 -12
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/model_chunk_schedule_plan.py +115 -109
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/fine_grained_callables.py +117 -7
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_layer_specs.py +11 -9
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/gpt_model.py +55 -17
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +11 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_model.py +8 -8
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_model.py +12 -12
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/base_attention.py +4 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_attention.py +5 -5
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/decoder_spec.py +8 -2
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/clip_vit_model.py +5 -5
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/radio.py +4 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/nccl_allocator.py +39 -8
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/__init__.py +16 -122
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/clip_grads.py +4 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/distrib_optimizer.py +31 -11
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer.py +62 -12
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/optimizer_config.py +0 -6
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/package_info.py +3 -5
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/parallel_state.py +15 -10
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/combined_1f1b.py +179 -66
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/schedules.py +334 -232
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/utils.py +0 -16
- megatron_core-0.15.0rc5/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
- megatron_core-0.15.0rc5/megatron/core/process_groups_config.py +489 -0
- megatron_core-0.15.0rc5/megatron/core/safe_globals.py +35 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_block.py +8 -8
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_layer.py +4 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_mixer.py +9 -9
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mlp_layer.py +3 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/layers.py +7 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/timers.py +14 -1
- megatron_core-0.15.0rc5/megatron/core/tokenizers/__init__.py +4 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/base_tokenizer.py +48 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/megatron_tokenizer.py +171 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/__init__.py +3 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/__init__.py +8 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +147 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +164 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/chat_template.py +71 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +335 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +179 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/null_tokenizer.py +79 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +411 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +303 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/__init__.py +8 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/bert_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/default_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/gpt_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/mamba_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/retro_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/models/t5_tokenizer.py +12 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/text_tokenizer.py +254 -0
- megatron_core-0.15.0rc5/megatron/core/tokenizers/text/utils/build_tokenizer.py +58 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/attention.py +21 -23
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/cuda_graphs.py +485 -53
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/dot_product_attention.py +56 -17
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/enums.py +1 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/mlp.py +24 -4
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/module.py +32 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/experts.py +36 -21
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_layer.py +19 -19
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/moe_utils.py +20 -16
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/router.py +89 -12
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/shared_experts.py +3 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/token_dispatcher.py +20 -19
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_latent_attention.py +14 -14
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/multi_token_prediction.py +241 -211
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/pipeline_parallel_layer_layout.py +56 -17
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_block.py +126 -63
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_config.py +84 -15
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/transformer_layer.py +66 -45
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/utils.py +151 -1
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/utils.py +31 -5
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5/megatron_core.egg-info}/PKG-INFO +23 -6
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/SOURCES.txt +25 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/requires.txt +5 -3
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/pyproject.toml +9 -5
- megatron_core-0.15.0rc0/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -387
- megatron_core-0.15.0rc0/megatron/core/fusions/fused_bias_geglu.py +0 -85
- megatron_core-0.15.0rc0/megatron/core/inference/inference_request.py +0 -91
- megatron_core-0.15.0rc0/megatron/core/process_groups_config.py +0 -233
- megatron_core-0.15.0rc0/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/MANIFEST.in +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/README.md +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/activations.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/serialization.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/kitchen.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fp8_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_indices_converter.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/hyper_comm_grid.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/engines/static_engine.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/headers.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/inference_client.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/jit.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/post_training → megatron_core-0.15.0rc5/megatron/core/models}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/backends.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models → megatron_core-0.15.0rc5/megatron/core/models/bert}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models/bert → megatron_core-0.15.0rc5/megatron/core/models/common}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models/common → megatron_core-0.15.0rc5/megatron/core/models/common/language_module}/__init__.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models/common/language_module → megatron_core-0.15.0rc5/megatron/core/models/common/vision_module}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/clip_model.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/module.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/huggingface/qwen_model.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models/common/vision_module → megatron_core-0.15.0rc5/megatron/core/models/vision}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/post_training/modelopt/mamba → megatron_core-0.15.0rc5/megatron/core/post_training}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/layers.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/quant_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/rerun_state_machine.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_context_parallel.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/models/vision → megatron_core-0.15.0rc5/megatron/core/transformer/custom_layers}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.15.0rc0/megatron/core/transformer/custom_layers → megatron_core-0.15.0rc5/megatron/core/transformer/moe}/__init__.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/setup.cfg +0 -0
- {megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/setup.py +0 -0
|
@@ -37,7 +37,7 @@ Below are licenses used in those files, as indicated.
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
--------------------------------------------------------------------------------------
|
|
40
|
-
-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --
|
|
40
|
+
-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, TinyZero and vLLM code --
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
Apache License
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: megatron-core
|
|
3
|
-
Version: 0.15.
|
|
3
|
+
Version: 0.15.0rc5
|
|
4
4
|
Summary: Megatron Core - a library for efficient and scalable training of transformer based models
|
|
5
5
|
Author-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
6
6
|
Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
|
|
@@ -37,23 +37,24 @@ Requires-Dist: flask-restful; extra == "mlm"
|
|
|
37
37
|
Requires-Dist: sentencepiece; extra == "mlm"
|
|
38
38
|
Requires-Dist: tiktoken; extra == "mlm"
|
|
39
39
|
Requires-Dist: wandb; extra == "mlm"
|
|
40
|
+
Requires-Dist: transformers; extra == "mlm"
|
|
40
41
|
Provides-Extra: dev
|
|
41
42
|
Requires-Dist: tqdm; extra == "dev"
|
|
42
43
|
Requires-Dist: einops~=0.8; extra == "dev"
|
|
43
44
|
Requires-Dist: tensorstore!=0.1.46,!=0.1.72,~=0.1; extra == "dev"
|
|
44
45
|
Requires-Dist: nvtx~=0.2; extra == "dev"
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist: multi-storage-client<0.26,~=0.25; extra == "dev"
|
|
46
|
+
Requires-Dist: multi-storage-client~=0.27; extra == "dev"
|
|
47
47
|
Requires-Dist: opentelemetry-api~=1.33.1; extra == "dev"
|
|
48
48
|
Requires-Dist: setuptools<80.0.0; extra == "dev"
|
|
49
49
|
Requires-Dist: mamba-ssm~=2.2; extra == "dev"
|
|
50
50
|
Requires-Dist: causal-conv1d~=1.5; extra == "dev"
|
|
51
51
|
Requires-Dist: nv-grouped-gemm~=1.1; extra == "dev"
|
|
52
|
-
Requires-Dist: transformer-engine[pytorch]<2.
|
|
52
|
+
Requires-Dist: transformer-engine[pytorch]<2.8.0,>=2.6.0a0; extra == "dev"
|
|
53
53
|
Requires-Dist: nvidia-resiliency-ext<0.5.0,>=0.4.0a0; extra == "dev"
|
|
54
54
|
Requires-Dist: nvidia-modelopt[torch]<0.34.0,>=0.33.0a0; sys_platform != "darwin" and extra == "dev"
|
|
55
55
|
Requires-Dist: megatron-energon[av_decode]~=6.0; extra == "dev"
|
|
56
56
|
Requires-Dist: flashinfer-python; extra == "dev"
|
|
57
|
+
Requires-Dist: wget; extra == "dev"
|
|
57
58
|
Requires-Dist: onnxscript; extra == "dev"
|
|
58
59
|
Provides-Extra: lts
|
|
59
60
|
Requires-Dist: tqdm; extra == "lts"
|
|
@@ -63,6 +64,7 @@ Requires-Dist: nvtx; extra == "lts"
|
|
|
63
64
|
Requires-Dist: transformers; extra == "lts"
|
|
64
65
|
Requires-Dist: zarr; extra == "lts"
|
|
65
66
|
Requires-Dist: setuptools<80.0.0; extra == "lts"
|
|
67
|
+
Requires-Dist: wget; extra == "lts"
|
|
66
68
|
Dynamic: license-file
|
|
67
69
|
|
|
68
70
|
<div align="center">
|
|
@@ -93,7 +95,10 @@ cd Megatron-LM
|
|
|
93
95
|
|
|
94
96
|
# Latest News
|
|
95
97
|
|
|
96
|
-
-
|
|
98
|
+
- 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
|
|
99
|
+
- 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
|
|
100
|
+
- 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
|
|
101
|
+
- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
|
|
97
102
|
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
98
103
|
|
|
99
104
|
<details>
|
|
@@ -143,6 +148,7 @@ cd Megatron-LM
|
|
|
143
148
|
**Resources**
|
|
144
149
|
- [Examples](./examples/) - Training scripts and tutorials
|
|
145
150
|
- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
|
|
151
|
+
- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
|
|
146
152
|
- [Community & Support](#-community--support) - Get help and contribute
|
|
147
153
|
- [Getting Help](#getting-help)
|
|
148
154
|
- [Contributing](#contributing)
|
|
@@ -217,10 +223,12 @@ Megatron-LM/
|
|
|
217
223
|
|
|
218
224
|
**Libraries using Megatron Core:**
|
|
219
225
|
|
|
226
|
+
- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
|
|
227
|
+
- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
|
|
220
228
|
- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
|
|
221
229
|
- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
|
|
222
230
|
|
|
223
|
-
**Compatible with:** [
|
|
231
|
+
**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
|
224
232
|
|
|
225
233
|
# Installation
|
|
226
234
|
|
|
@@ -510,6 +518,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
|
|
|
510
518
|
--use-distributed-optimizer
|
|
511
519
|
```
|
|
512
520
|
|
|
521
|
+
# Roadmaps
|
|
522
|
+
|
|
523
|
+
Stay up-to-date with our development roadmaps and planned features:
|
|
524
|
+
|
|
525
|
+
- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
|
|
526
|
+
- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
|
|
527
|
+
|
|
528
|
+
*More roadmap trackers will be added soon.*
|
|
529
|
+
|
|
513
530
|
# Community & Support
|
|
514
531
|
|
|
515
532
|
## Getting Help
|
|
@@ -26,7 +26,10 @@ cd Megatron-LM
|
|
|
26
26
|
|
|
27
27
|
# Latest News
|
|
28
28
|
|
|
29
|
-
-
|
|
29
|
+
- 🔄 NEW! **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Bidirectional converter for interoperability between Hugging Face and Megatron checkpoints, featuring production-ready recipes for popular models.
|
|
30
|
+
- 🗺️ **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive roadmap for MoE features including DeepSeek-V3, Qwen3, advanced parallelism strategies, FP8 optimizations, and Blackwell performance enhancements.
|
|
31
|
+
- 🚀 **[GPT-OSS Implementation](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions are being integrated into Megatron Core.
|
|
32
|
+
- **[2025/06]** **[Megatron MoE Model Zoo](https://github.com/yanring/Megatron-MoE-ModelZoo)** - Best practices and optimized configurations for training DeepSeek-V3, Mixtral, and Qwen3 MoE models with performance benchmarking and checkpoint conversion tools.
|
|
30
33
|
- **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
|
|
31
34
|
|
|
32
35
|
<details>
|
|
@@ -76,6 +79,7 @@ cd Megatron-LM
|
|
|
76
79
|
**Resources**
|
|
77
80
|
- [Examples](./examples/) - Training scripts and tutorials
|
|
78
81
|
- [Documentation](https://docs.nvidia.com/Megatron-Core/) - Official docs
|
|
82
|
+
- [Roadmaps](#roadmaps) - Development roadmaps and feature tracking
|
|
79
83
|
- [Community & Support](#-community--support) - Get help and contribute
|
|
80
84
|
- [Getting Help](#getting-help)
|
|
81
85
|
- [Contributing](#contributing)
|
|
@@ -150,10 +154,12 @@ Megatron-LM/
|
|
|
150
154
|
|
|
151
155
|
**Libraries using Megatron Core:**
|
|
152
156
|
|
|
157
|
+
- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
|
|
158
|
+
- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
|
|
153
159
|
- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
|
|
154
160
|
- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
|
|
155
161
|
|
|
156
|
-
**Compatible with:** [
|
|
162
|
+
**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
|
157
163
|
|
|
158
164
|
# Installation
|
|
159
165
|
|
|
@@ -443,6 +449,15 @@ Based on [NVIDIA NeMo production configurations](https://github.com/NVIDIA/NeMo/
|
|
|
443
449
|
--use-distributed-optimizer
|
|
444
450
|
```
|
|
445
451
|
|
|
452
|
+
# Roadmaps
|
|
453
|
+
|
|
454
|
+
Stay up-to-date with our development roadmaps and planned features:
|
|
455
|
+
|
|
456
|
+
- **[MoE Q3-Q4 2025 Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - Comprehensive MoE feature development including DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
|
|
457
|
+
- **[GPT-OSS Implementation Tracker](https://github.com/NVIDIA/Megatron-LM/issues/1739)** - Advanced features including YaRN RoPE scaling, attention sinks, and custom activation functions
|
|
458
|
+
|
|
459
|
+
*More roadmap trackers will be added soon.*
|
|
460
|
+
|
|
446
461
|
# Community & Support
|
|
447
462
|
|
|
448
463
|
## Getting Help
|
|
@@ -20,6 +20,7 @@ from megatron.core.package_info import (
|
|
|
20
20
|
__version__,
|
|
21
21
|
)
|
|
22
22
|
from megatron.core.timers import Timers
|
|
23
|
+
from megatron.core.utils import is_torch_min_version
|
|
23
24
|
|
|
24
25
|
# Alias parallel_state as mpu, its legacy name
|
|
25
26
|
mpu = parallel_state
|
|
@@ -32,4 +33,20 @@ __all__ = [
|
|
|
32
33
|
"InferenceParams",
|
|
33
34
|
"ModelParallelConfig",
|
|
34
35
|
"Timers",
|
|
36
|
+
"__contact_emails__",
|
|
37
|
+
"__contact_names__",
|
|
38
|
+
"__description__",
|
|
39
|
+
"__download_url__",
|
|
40
|
+
"__homepage__",
|
|
41
|
+
"__keywords__",
|
|
42
|
+
"__license__",
|
|
43
|
+
"__package_name__",
|
|
44
|
+
"__repository_url__",
|
|
45
|
+
"__shortversion__",
|
|
46
|
+
"__version__",
|
|
35
47
|
]
|
|
48
|
+
|
|
49
|
+
from .safe_globals import register_safe_globals
|
|
50
|
+
|
|
51
|
+
if is_torch_min_version("2.6a0"):
|
|
52
|
+
register_safe_globals()
|
|
@@ -35,7 +35,8 @@ class BlendedMegatronDatasetBuilder(object):
|
|
|
35
35
|
|
|
36
36
|
is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
|
|
37
37
|
the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
|
|
38
|
-
global rank, local group rank, and virtual rank may inform its return value.
|
|
38
|
+
global rank, local group rank, and virtual rank may inform its return value. Should
|
|
39
|
+
return true for exactly one process on global rank 0.
|
|
39
40
|
|
|
40
41
|
config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
|
|
41
42
|
"""
|
|
@@ -72,13 +73,6 @@ class BlendedMegatronDatasetBuilder(object):
|
|
|
72
73
|
for {split.name} split
|
|
73
74
|
This can occur with multiple validation sets if datasets have weights"""
|
|
74
75
|
|
|
75
|
-
if torch.distributed.is_initialized():
|
|
76
|
-
gb_rank = torch.distributed.get_rank()
|
|
77
|
-
if gb_rank == 0:
|
|
78
|
-
assert (
|
|
79
|
-
self.is_built_on_rank()
|
|
80
|
-
), "is_built_on_rank must return True when global rank = 0"
|
|
81
|
-
|
|
82
76
|
def build(self) -> List[Optional[TopLevelDataset]]:
|
|
83
77
|
"""Build all dataset splits according to the provided blend(s)
|
|
84
78
|
|
|
@@ -6,8 +6,8 @@ import re
|
|
|
6
6
|
from dataclasses import dataclass, field
|
|
7
7
|
from typing import List, Optional, Tuple
|
|
8
8
|
|
|
9
|
-
from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
|
|
10
9
|
from megatron.core.datasets.utils import Split, log_single_rank, normalize
|
|
10
|
+
from megatron.core.tokenizers import MegatronTokenizerBase
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
@@ -66,8 +66,8 @@ class BlendedMegatronDatasetConfig:
|
|
|
66
66
|
constructor.
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
|
-
tokenizer: Optional[
|
|
70
|
-
"""The
|
|
69
|
+
tokenizer: Optional[MegatronTokenizerBase] = None
|
|
70
|
+
"""The MegatronTokenizerBase instance. Required for datasets that do online tokenization."""
|
|
71
71
|
|
|
72
72
|
mid_level_dataset_surplus: float = 0.005
|
|
73
73
|
"""The sample surplus to build for the mid-level datasets(s). Defaults arbitrarily to 0.005.
|
|
@@ -12,9 +12,9 @@ import torch
|
|
|
12
12
|
from megatron.core.datasets.blended_megatron_dataset_config import BlendedMegatronDatasetConfig
|
|
13
13
|
from megatron.core.datasets.indexed_dataset import IndexedDataset
|
|
14
14
|
from megatron.core.datasets.megatron_dataset import MegatronDataset
|
|
15
|
-
from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
|
|
16
15
|
from megatron.core.datasets.object_storage_utils import ObjectStorageConfig, is_object_storage_path
|
|
17
16
|
from megatron.core.datasets.utils import Split
|
|
17
|
+
from megatron.core.tokenizers import MegatronTokenizerBase
|
|
18
18
|
from megatron.core.utils import log_single_rank
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
@@ -701,8 +701,8 @@ class MockGPTLowLevelDataset:
|
|
|
701
701
|
we add the end of document token to each element indexed in __getitem__
|
|
702
702
|
|
|
703
703
|
Args:
|
|
704
|
-
tokenizer (
|
|
705
|
-
|
|
704
|
+
tokenizer (MegatronTokenizerBase): The tokenizer the special token information of which
|
|
705
|
+
we use to augment the mock data.
|
|
706
706
|
"""
|
|
707
707
|
|
|
708
708
|
seed: int = 0
|
|
@@ -714,7 +714,7 @@ class MockGPTLowLevelDataset:
|
|
|
714
714
|
max_sequence_length: int = 4096
|
|
715
715
|
"""The hard-coded max sequence length to generate"""
|
|
716
716
|
|
|
717
|
-
def __init__(self, tokenizer:
|
|
717
|
+
def __init__(self, tokenizer: MegatronTokenizerBase) -> None:
|
|
718
718
|
self.tokenizer = tokenizer
|
|
719
719
|
rng = numpy.random.default_rng(seed=self.seed)
|
|
720
720
|
self.sequence_lengths = rng.integers(
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
/* Helper methods for fast index mapping builds */
|
|
4
4
|
|
|
5
5
|
#include <algorithm>
|
|
6
|
+
#include <cassert>
|
|
6
7
|
#include <iostream>
|
|
7
8
|
#include <limits>
|
|
8
9
|
#include <math.h>
|
|
@@ -46,7 +47,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
|
|
|
46
47
|
while (dataset_unspent_indices.size() > 0) {
|
|
47
48
|
double index_sample_double = std::max(static_cast<double>(index_sample), 1.0);
|
|
48
49
|
|
|
49
|
-
int64_t error_argmax;
|
|
50
|
+
int64_t error_argmax = -1;
|
|
50
51
|
double error_max = std::numeric_limits<double>::lowest();
|
|
51
52
|
|
|
52
53
|
for (int32_t index_dataset : dataset_unspent_indices) {
|
|
@@ -56,6 +57,7 @@ void build_exhaustive_blending_indices(py::array_t<int16_t> &dataset_index, py::
|
|
|
56
57
|
error_max = error;
|
|
57
58
|
}
|
|
58
59
|
}
|
|
60
|
+
assert(error_argmax >= 0);
|
|
59
61
|
|
|
60
62
|
// Populate the indices.
|
|
61
63
|
dataset_index_ptr[index_sample] = static_cast<int16_t>(error_argmax);
|
{megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/datasets/indexed_dataset.py
RENAMED
|
@@ -12,6 +12,7 @@ import shutil
|
|
|
12
12
|
import struct
|
|
13
13
|
import time
|
|
14
14
|
from abc import ABC, abstractmethod
|
|
15
|
+
from collections.abc import Iterable
|
|
15
16
|
from enum import Enum
|
|
16
17
|
from functools import lru_cache
|
|
17
18
|
from itertools import accumulate
|
|
@@ -172,9 +173,9 @@ class _IndexWriter(object):
|
|
|
172
173
|
|
|
173
174
|
def write(
|
|
174
175
|
self,
|
|
175
|
-
sequence_lengths:
|
|
176
|
-
sequence_modes: Optional[
|
|
177
|
-
document_indices:
|
|
176
|
+
sequence_lengths: Iterable[Union[int, numpy.integer]],
|
|
177
|
+
sequence_modes: Optional[Iterable[Union[int, numpy.integer]]],
|
|
178
|
+
document_indices: Iterable[Union[int, numpy.integer]],
|
|
178
179
|
) -> None:
|
|
179
180
|
"""Write the index (.idx) file
|
|
180
181
|
|
|
@@ -208,7 +209,9 @@ class _IndexWriter(object):
|
|
|
208
209
|
if sequence_modes is not None:
|
|
209
210
|
self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
|
|
210
211
|
|
|
211
|
-
def _sequence_pointers(
|
|
212
|
+
def _sequence_pointers(
|
|
213
|
+
self, sequence_lengths: Iterable[Union[int, numpy.integer]]
|
|
214
|
+
) -> List[int]:
|
|
212
215
|
"""Build the sequence pointers per the sequence lengths and dtype size
|
|
213
216
|
|
|
214
217
|
Args:
|
|
@@ -217,11 +220,11 @@ class _IndexWriter(object):
|
|
|
217
220
|
Returns:
|
|
218
221
|
List[int]: The pointer to the beginning of each sequence
|
|
219
222
|
"""
|
|
220
|
-
itemsize = DType.size(self.dtype)
|
|
221
|
-
curr_ptr = 0
|
|
223
|
+
itemsize = numpy.int64(DType.size(self.dtype))
|
|
224
|
+
curr_ptr = numpy.int64(0)
|
|
222
225
|
list_ptr = []
|
|
223
226
|
for length in sequence_lengths:
|
|
224
|
-
list_ptr.append(curr_ptr)
|
|
227
|
+
list_ptr.append(curr_ptr.item())
|
|
225
228
|
curr_ptr += length * itemsize
|
|
226
229
|
return list_ptr
|
|
227
230
|
|
|
@@ -4,12 +4,12 @@
|
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
|
|
7
|
-
from megatron.core.
|
|
7
|
+
from megatron.core.tokenizers import MegatronTokenizerBase
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
@dataclass
|
|
11
11
|
class RetroTokenizers:
|
|
12
12
|
"""Container class for GPT and Bert tokenizers."""
|
|
13
13
|
|
|
14
|
-
gpt:
|
|
15
|
-
bert:
|
|
14
|
+
gpt: MegatronTokenizerBase = None
|
|
15
|
+
bert: MegatronTokenizerBase = None
|
{megatron_core-0.15.0rc0 → megatron_core-0.15.0rc5}/megatron/core/dist_checkpointing/mapping.py
RENAMED
|
@@ -29,6 +29,9 @@ ShardedStateDict = Dict[str, Any]
|
|
|
29
29
|
ReplicaId = Union[int, Tuple[int, ...]]
|
|
30
30
|
|
|
31
31
|
|
|
32
|
+
_logged_deprecations = {}
|
|
33
|
+
|
|
34
|
+
|
|
32
35
|
class ShardedBase(ABC):
|
|
33
36
|
"""Base class for ShardedTensor and ShardedStateDict."""
|
|
34
37
|
|
|
@@ -147,6 +150,23 @@ class ShardedTensor(ShardedBase):
|
|
|
147
150
|
f"`step` argument in the flattened range of a ShardedTensor is not supported."
|
|
148
151
|
)
|
|
149
152
|
|
|
153
|
+
if self.prepend_axis_num:
|
|
154
|
+
if not _logged_deprecations.get("prepend_axis_num", False):
|
|
155
|
+
logger.warning(
|
|
156
|
+
"ShardedTensor.prepend_axis_num greater than 0 is deprecated."
|
|
157
|
+
" In Megatron-Core this can be prevented by setting sharded_state_dict"
|
|
158
|
+
" metadata['singleton_local_shards'] to True."
|
|
159
|
+
)
|
|
160
|
+
_logged_deprecations["prepend_axis_num"] = True
|
|
161
|
+
|
|
162
|
+
if self.flattened_range is not None:
|
|
163
|
+
if not _logged_deprecations.get("flattened_range", False):
|
|
164
|
+
logger.warning(
|
|
165
|
+
"ShardedTensor.flattened_range is deprecated."
|
|
166
|
+
" Use latest DistributedOptimizer formats."
|
|
167
|
+
)
|
|
168
|
+
_logged_deprecations["flattened_range"] = True
|
|
169
|
+
|
|
150
170
|
@property
|
|
151
171
|
def has_regular_grid(self):
|
|
152
172
|
"""Alias for having a regular sharding grid."""
|
|
@@ -84,9 +84,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
|
|
|
84
84
|
try:
|
|
85
85
|
if MultiStorageClientFeature.is_enabled():
|
|
86
86
|
msc = MultiStorageClientFeature.import_package()
|
|
87
|
-
return msc.torch.load(load_path, map_location='cpu'
|
|
87
|
+
return msc.torch.load(load_path, map_location='cpu')
|
|
88
88
|
else:
|
|
89
|
-
return torch.load(load_path, map_location='cpu'
|
|
89
|
+
return torch.load(load_path, map_location='cpu')
|
|
90
90
|
except FileNotFoundError as e:
|
|
91
91
|
err_msg = f'Common file {load_path} does not exist'
|
|
92
92
|
if MultiStorageClientFeature.is_enabled():
|
|
@@ -118,9 +118,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
|
|
|
118
118
|
try:
|
|
119
119
|
if MultiStorageClientFeature.is_enabled():
|
|
120
120
|
msc = MultiStorageClientFeature.import_package()
|
|
121
|
-
loaded_obj = msc.torch.load(load_path
|
|
121
|
+
loaded_obj = msc.torch.load(load_path)
|
|
122
122
|
else:
|
|
123
|
-
loaded_obj = torch.load(load_path
|
|
123
|
+
loaded_obj = torch.load(load_path)
|
|
124
124
|
except FileNotFoundError as e:
|
|
125
125
|
# Backward compatible logic: previously the save format was incorrect
|
|
126
126
|
base, _ = os.path.splitext(sh_obj.unique_key)
|
|
@@ -128,9 +128,9 @@ class TorchCommonLoadStrategy(LoadCommonStrategy):
|
|
|
128
128
|
try:
|
|
129
129
|
if MultiStorageClientFeature.is_enabled():
|
|
130
130
|
msc = MultiStorageClientFeature.import_package()
|
|
131
|
-
loaded_obj = msc.torch.load(old_load_path
|
|
131
|
+
loaded_obj = msc.torch.load(old_load_path)
|
|
132
132
|
else:
|
|
133
|
-
loaded_obj = torch.load(old_load_path
|
|
133
|
+
loaded_obj = torch.load(old_load_path)
|
|
134
134
|
except FileNotFoundError:
|
|
135
135
|
err_msg = f'Object shard {load_path} not found'
|
|
136
136
|
obj_subdir = os.path.join(checkpoint_dir, sh_obj.key)
|
|
@@ -340,11 +340,12 @@ def mcore_to_pyt_state_dict(
|
|
|
340
340
|
if sh_ten.allow_shape_mismatch and is_loading:
|
|
341
341
|
sh_ten.data.zero_()
|
|
342
342
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
343
|
+
is_pre_mcore_014_sh_ten = (
|
|
344
|
+
sh_tens[0].prepend_axis_num or sh_tens[0].flattened_range is not None
|
|
345
|
+
)
|
|
346
|
+
if (
|
|
347
|
+
not is_pre_mcore_014_sh_ten or not sh_tens[0].has_regular_grid
|
|
348
|
+
) and is_torch_min_version("2.6a0"):
|
|
348
349
|
assert sh_tens[0].flattened_range is None
|
|
349
350
|
if len(sh_tens) > 1:
|
|
350
351
|
return LocalShardsContainer(
|
|
@@ -353,6 +354,10 @@ def mcore_to_pyt_state_dict(
|
|
|
353
354
|
else:
|
|
354
355
|
return CheckpointableShardedTensor.from_sh_ten(sh_tens[0])
|
|
355
356
|
else:
|
|
357
|
+
if not sh_tens[0].has_regular_grid and not is_torch_min_version("2.6a0"):
|
|
358
|
+
raise CheckpointingException(
|
|
359
|
+
f"Uneven sharding not supported for PyTorch version {get_torch_version()}"
|
|
360
|
+
)
|
|
356
361
|
torch_sh_ten = sharded_tensor_to_torch_sharded_tensor(
|
|
357
362
|
sh_tens, rank, load_legacy_1d_flatten_tensors
|
|
358
363
|
)
|