megatron-core 0.10.0__tar.gz → 0.12.0__tar.gz
This diff shows the changes between these two package versions as published to their public registries, and is provided for informational purposes only.
- {megatron_core-0.10.0 → megatron_core-0.12.0}/LICENSE +5 -4
- {megatron_core-0.10.0/megatron_core.egg-info → megatron_core-0.12.0}/PKG-INFO +75 -13
- {megatron_core-0.10.0 → megatron_core-0.12.0}/README.md +62 -5
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/__init__.py +1 -0
- megatron_core-0.12.0/megatron/core/config.py +3 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/blended_dataset.py +3 -3
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/blended_megatron_dataset_builder.py +99 -48
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/blended_megatron_dataset_config.py +3 -8
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/__init__.py +1 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/exchange_utils.py +110 -79
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/mapping.py +7 -5
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/serialization.py +5 -4
- megatron_core-0.12.0/megatron/core/dist_checkpointing/state_dict_utils.py +112 -0
- megatron_core-0.12.0/megatron/core/dist_checkpointing/strategies/async_utils.py +561 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/base.py +2 -1
- megatron_core-0.12.0/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +38 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/common.py +3 -3
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +113 -49
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +172 -96
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/resharding.py +15 -12
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +90 -5
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/torch.py +170 -53
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/two_stage.py +20 -6
- megatron_core-0.12.0/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +347 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/utils.py +101 -1
- megatron_core-0.12.0/megatron/core/distributed/custom_fsdp/__init__.py +3 -0
- megatron_core-0.12.0/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +749 -0
- megatron_core-0.12.0/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +2055 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/data_parallel_base.py +2 -2
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/distributed_data_parallel.py +64 -22
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/distributed_data_parallel_config.py +34 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/finalize_model_grads.py +55 -8
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/param_and_grad_buffer.py +85 -41
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +37 -11
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/enums.py +10 -0
- megatron_core-0.12.0/megatron/core/export/model_type.py +8 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +6 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +14 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trt_model_config.py +1 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trt_model_type.py +1 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trtllm_helper.py +20 -2
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trtllm_layers.py +13 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +10 -4
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +17 -5
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/extensions/transformer_engine.py +240 -93
- megatron_core-0.12.0/megatron/core/fp8_utils.py +456 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_cross_entropy.py +28 -23
- megatron_core-0.12.0/megatron/core/inference/async_stream.py +67 -0
- megatron_core-0.12.0/megatron/core/inference/common_inference_params.py +4 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/communication_utils.py +4 -0
- megatron_core-0.12.0/megatron/core/inference/contexts/__init__.py +11 -0
- megatron_core-0.12.0/megatron/core/inference/contexts/base_context.py +20 -0
- megatron_core-0.12.0/megatron/core/inference/contexts/dynamic_context.py +1022 -0
- megatron_core-0.12.0/megatron/core/inference/contexts/static_context.py +133 -0
- megatron_core-0.12.0/megatron/core/inference/engines/__init__.py +5 -0
- megatron_core-0.12.0/megatron/core/inference/engines/dynamic_engine.py +182 -0
- megatron_core-0.12.0/megatron/core/inference/engines/mcore_engine.py +5 -0
- megatron_core-0.12.0/megatron/core/inference/engines/static_engine.py +245 -0
- megatron_core-0.12.0/megatron/core/inference/inference_request.py +64 -0
- megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +364 -0
- megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +135 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
- megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +218 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +72 -58
- megatron_core-0.12.0/megatron/core/inference/modelopt_support/__init__.py +10 -0
- megatron_core-0.12.0/megatron/core/inference/modelopt_support/gpt/__init__.py +8 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/modelopt_support/gpt/model_specs.py +9 -4
- megatron_core-0.12.0/megatron/core/inference/modelopt_support/mamba/__init__.py +1 -0
- megatron_core-0.12.0/megatron/core/inference/modelopt_support/mamba/model_specs.py +89 -0
- megatron_core-0.12.0/megatron/core/inference/sampling_params.py +38 -0
- megatron_core-0.12.0/megatron/core/inference/scheduler.py +193 -0
- megatron_core-0.12.0/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +38 -0
- megatron_core-0.12.0/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +5 -0
- megatron_core-0.12.0/megatron/core/inference/text_generation_controllers/text_generation_controller.py +819 -0
- megatron_core-0.12.0/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +40 -0
- megatron_core-0.12.0/megatron/core/inference_params.py +5 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/model_parallel_config.py +10 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/T5/t5_model.py +84 -11
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/bert/bert_model.py +29 -13
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/embeddings/__init__.py +1 -1
- megatron_core-0.12.0/megatron/core/models/common/embeddings/relative_pos_embedding.py +179 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/embeddings/rope_utils.py +12 -2
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +107 -10
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/language_module/language_module.py +81 -8
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/gpt/gpt_layer_specs.py +177 -73
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/gpt/gpt_model.py +191 -17
- megatron_core-0.12.0/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +209 -0
- megatron_core-0.12.0/megatron/core/models/gpt/moe_module_specs.py +81 -0
- megatron_core-0.12.0/megatron/core/models/huggingface/__init__.py +2 -0
- megatron_core-0.12.0/megatron/core/models/huggingface/clip_model.py +26 -0
- megatron_core-0.12.0/megatron/core/models/huggingface/module.py +63 -0
- megatron_core-0.12.0/megatron/core/models/huggingface/qwen_model.py +42 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/mamba/mamba_layer_specs.py +2 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/mamba/mamba_model.py +32 -14
- megatron_core-0.12.0/megatron/core/models/multimodal/context_parallel.py +99 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/multimodal/llava_model.py +297 -216
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/multimodal/llava_spec.py +8 -6
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/config.py +3 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/decoder_attention.py +18 -9
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/encoder_attention.py +8 -3
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/model.py +13 -5
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/vision/clip_vit_model.py +30 -7
- megatron_core-0.12.0/megatron/core/models/vision/radio.py +325 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/vision/vit_layer_specs.py +1 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer/__init__.py +169 -36
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer/clip_grads.py +21 -9
- megatron_core-0.12.0/megatron/core/optimizer/cpu_offloading/__init__.py +2 -0
- megatron_core-0.12.0/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +465 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer/distrib_optimizer.py +515 -224
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer/optimizer.py +253 -121
- megatron_core-0.12.0/megatron/core/optimizer/optimizer_config.py +212 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/package_info.py +1 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/parallel_state.py +282 -58
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/pipeline_parallel/p2p_communication.py +20 -3
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/pipeline_parallel/schedules.py +186 -70
- megatron_core-0.12.0/megatron/core/post_training/__init__.py +1 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/__init__.py +10 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/gpt/model_specs.py +253 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +133 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/layers.py +246 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
- megatron_core-0.12.0/megatron/core/post_training/modelopt/mamba/model_specs.py +90 -0
- megatron_core-0.12.0/megatron/core/process_groups_config.py +113 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/rerun_state_machine.py +241 -65
- megatron_core-0.12.0/megatron/core/ssm/__init__.py +1 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/ssm/mamba_block.py +120 -50
- megatron_core-0.12.0/megatron/core/ssm/mamba_config.py +22 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/ssm/mamba_layer.py +64 -11
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/ssm/mamba_mixer.py +41 -14
- megatron_core-0.12.0/megatron/core/ssm/mlp_layer.py +25 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/__init__.py +2 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/layers.py +16 -0
- megatron_core-0.12.0/megatron/core/tensor_parallel/random.py +575 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/timers.py +35 -7
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/attention.py +294 -93
- megatron_core-0.12.0/megatron/core/transformer/cuda_graphs.py +916 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/enums.py +20 -0
- megatron_core-0.12.0/megatron/core/transformer/heterogeneous/heterogeneous_config.py +267 -0
- megatron_core-0.12.0/megatron/core/transformer/heterogeneous/linear_replacements.py +111 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/mlp.py +10 -2
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/experts.py +157 -55
- megatron_core-0.12.0/megatron/core/transformer/moe/fused_a2a.py +202 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py +9 -6
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/moe_layer.py +23 -26
- megatron_core-0.12.0/megatron/core/transformer/moe/moe_utils.py +722 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/router.py +167 -33
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/shared_experts.py +4 -5
- megatron_core-0.12.0/megatron/core/transformer/moe/token_dispatcher.py +997 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/multi_latent_attention.py +213 -88
- megatron_core-0.12.0/megatron/core/transformer/multi_token_prediction.py +737 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/torch_norm.py +49 -1
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/transformer_block.py +148 -142
- megatron_core-0.12.0/megatron/core/transformer/transformer_config.py +1114 -0
- megatron_core-0.12.0/megatron/core/transformer/transformer_layer.py +786 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/utils.py +9 -2
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/utils.py +349 -31
- {megatron_core-0.10.0 → megatron_core-0.12.0/megatron_core.egg-info}/PKG-INFO +75 -13
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron_core.egg-info/SOURCES.txt +51 -6
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron_core.egg-info/requires.txt +5 -2
- {megatron_core-0.10.0/requirements/pytorch:24.01 → megatron_core-0.12.0/requirements/pytorch_24.01}/requirements.txt +3 -2
- {megatron_core-0.10.0/requirements/pytorch:24.07 → megatron_core-0.12.0/requirements/pytorch_24.07}/requirements.txt +3 -1
- megatron_core-0.12.0/requirements/pytorch_24.10/requirements.txt +6 -0
- megatron_core-0.12.0/requirements/pytorch_25.03/requirements.txt +15 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/setup.py +7 -2
- megatron_core-0.10.0/megatron/core/dist_checkpointing/state_dict_transformation.py +0 -270
- megatron_core-0.10.0/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -224
- megatron_core-0.10.0/megatron/core/export/model_type.py +0 -7
- megatron_core-0.10.0/megatron/core/inference/ammo_support/__init__.py +0 -8
- megatron_core-0.10.0/megatron/core/inference/ammo_support/gpt/model_specs.py +0 -2
- megatron_core-0.10.0/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py +0 -5
- megatron_core-0.10.0/megatron/core/inference/common_inference_params.py +0 -29
- megatron_core-0.10.0/megatron/core/inference/engines/mcore_engine.py +0 -113
- megatron_core-0.10.0/megatron/core/inference/inference_request.py +0 -39
- megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -238
- megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -90
- megatron_core-0.10.0/megatron/core/inference/modelopt_support/__init__.py +0 -8
- megatron_core-0.10.0/megatron/core/inference/scheduler.py +0 -127
- megatron_core-0.10.0/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -35
- megatron_core-0.10.0/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -402
- megatron_core-0.10.0/megatron/core/inference_params.py +0 -31
- megatron_core-0.10.0/megatron/core/models/multimodal/__init__.py +0 -1
- megatron_core-0.10.0/megatron/core/optimizer/optimizer_config.py +0 -116
- megatron_core-0.10.0/megatron/core/tensor_parallel/random.py +0 -314
- megatron_core-0.10.0/megatron/core/transformer/cuda_graphs.py +0 -313
- megatron_core-0.10.0/megatron/core/transformer/moe/__init__.py +0 -0
- megatron_core-0.10.0/megatron/core/transformer/moe/moe_utils.py +0 -407
- megatron_core-0.10.0/megatron/core/transformer/moe/token_dispatcher.py +0 -594
- megatron_core-0.10.0/megatron/core/transformer/transformer_config.py +0 -637
- megatron_core-0.10.0/megatron/core/transformer/transformer_layer.py +0 -397
- {megatron_core-0.10.0 → megatron_core-0.12.0}/MANIFEST.in +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/README.md +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/engines → megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers}/__init__.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers → megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers/gpt}/__init__.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/gpt → megatron_core-0.12.0/megatron/core/inference/model_inference_wrappers/t5}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/model_inference_wrappers/t5 → megatron_core-0.12.0/megatron/core/inference/text_generation_controllers}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/jit.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/modelopt_support/gpt → megatron_core-0.12.0/megatron/core/models/multimodal}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.10.0/megatron/core/inference/text_generation_controllers → megatron_core-0.12.0/megatron/core/post_training/modelopt/gpt}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.10.0/megatron/core/ssm → megatron_core-0.12.0/megatron/core/transformer/custom_layers}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.10.0/megatron/core/transformer/custom_layers → megatron_core-0.12.0/megatron/core/transformer/moe}/__init__.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/pyproject.toml +0 -0
- {megatron_core-0.10.0 → megatron_core-0.12.0}/setup.cfg +0 -0
--- megatron_core-0.10.0/LICENSE
+++ megatron_core-0.12.0/LICENSE
@@ -36,8 +36,8 @@ OpenAI). Files from these organizations have notices at the top of each file.
 Below are licenses used in those files, as indicated.


-
--- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and
+--------------------------------------------------------------------------------------
+-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --


     Apache License
@@ -247,8 +247,9 @@ LICENSE FOR
 Facebook, Inc. and its affiliates,
 Meta Platforms, Inc. and its affiliates,
 Microsoft Corporation,
-OpenGVLab/InternVL,
-Triton language and compiler
+OpenGVLab/InternVL,
+Triton language and compiler,
+and DeepSeek.

 MIT License

--- megatron_core-0.10.0/megatron_core.egg-info/PKG-INFO
+++ megatron_core-0.12.0/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.10.0
+Version: 0.12.0
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Home-page: https://github.com/NVIDIA/Megatron-LM/megatron/core
 Download-URL: https://github.com/NVIDIA/Megatron-LM/releases
@@ -46,8 +46,8 @@ License: The following applies to all files unless otherwise noted:
 Below are licenses used in those files, as indicated.


-
--- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, and
+--------------------------------------------------------------------------------------
+-- LICENSE FOR Facebook, huggingface, Google Research, LLaVA, Mamba, and vLLM code --


     Apache License
@@ -257,8 +257,9 @@ License: The following applies to all files unless otherwise noted:
 Facebook, Inc. and its affiliates,
 Meta Platforms, Inc. and its affiliates,
 Microsoft Corporation,
-OpenGVLab/InternVL,
-Triton language and compiler
+OpenGVLab/InternVL,
+Triton language and compiler,
+and DeepSeek.

 MIT License

@@ -316,11 +317,15 @@ Requires-Dist: tiktoken
 Requires-Dist: wrapt
 Requires-Dist: zarr
 Requires-Dist: wandb
-Requires-Dist: tensorstore
-Requires-Dist:
+Requires-Dist: tensorstore!=0.1.46,!=0.1.72
+Requires-Dist: torch
+Requires-Dist: nvidia-modelopt[torch]>=0.23.2; sys_platform != "darwin"
+Requires-Dist: torch
+Requires-Dist: packaging
 Dynamic: author
 Dynamic: download-url
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: requires-dist

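The tightened `tensorstore` pin above excludes two specific releases while allowing everything else. As a quick standalone illustration (not part of the package), the exclusion specifier behaves as follows under the `packaging` library that 0.12.0 now also depends on:

```
from packaging.specifiers import SpecifierSet

# The new tensorstore requirement excludes exactly two releases.
spec = SpecifierSet("!=0.1.46,!=0.1.72")

print("0.1.46" in spec)  # False: excluded
print("0.1.72" in spec)  # False: excluded
print("0.1.71" in spec)  # True: any other version satisfies the pin
```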
@@ -342,9 +347,8 @@ Megatron-LM & Megatron-Core
 - **[2024/6]** Megatron-Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
 - **[2024/1 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron-Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron-Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron-Core intro](#megatron-core) for more details.

-
-
 # Table of Contents
+
 - [Megatron-LM \& Megatron-Core](#megatron-lm--megatron-core)
 - [Latest News](#latest-news)
 - [Table of Contents](#table-of-contents)
@@ -368,7 +372,6 @@ Megatron-LM & Megatron-Core
 - [Retro and InstructRetro](#retro-and-instructretro)
 - [Mamba-based Language Models](#mamba-based-language-models)
 - [Mixture of Experts](#mixture-of-experts)
-  - [Key Features of MoE](#key-features-of-moe)
 - [Evaluation and Tasks](#evaluation-and-tasks)
 - [GPT Text Generation](#gpt-text-generation)
 - [Detoxify GPT via Self-generation](#detoxify-gpt-via-self-generation)
@@ -385,7 +388,10 @@ Megatron-LM & Megatron-Core
 - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
 - [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
 - [Reproducibility](#reproducibility)
-
+- [Checkpoint conversion](#checkpoint-conversion)
+  - [Model class conversion](#model-class-conversion)
+  - [Checkpoint format conversion](#checkpoint-format-conversion)
+- [Projects Using Megatron](#projects-using-megatron)

 # Megatron Overview
 This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a research-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
@@ -915,7 +921,63 @@ There are currently three known Megatron optimizations that break reproducibilit

 In addition, determinisim has only been verified in NGC PyTorch containers up to and newer than 23.12. If you observe nondeterminism in Megatron training under other circumstances please open an issue.

-
+# Checkpoint conversion
+
+We support two forms of model conversion:
+
+1. Model class conversion (i.e., the `GPTModel` in `model.legacy` vs. `model.core`)
+2. Checkpoint format conversion (i.e., distributed vs. non-distributed checkpoint)
+
+## Model class conversion
+
+Megatron supports converting between different model classes, including internal model classes (we currently have the older `legacy` models, and the newer `core` models) and external model classes (such as Meta, Huggingface, Mistral, and Mixtral models). Additionally, during this conversion, one can update the parallel state of the model (i.e., changing tensor and pipeline model parallelism).
+
+We provide the tool `tools/checkpoint/convert.py` to convert between model classes. Some important arguments include:
+
+- `--model-type`: `GPT` or `BERT`
+- `--loader`: format of the existing checkpoint. Supported formats include:
+  - `legacy`: our older model classes (under `megatron.legacy.model`)
+  - `core`: our newer model classes (under `megatron.core.models`)
+  - `llama_mistral`: for loading Llama and Mistral models (supports Meta and Huggingface formats)
+  - `mixtral_hf`: for loading Mixtral models (Huggingface only)
+- `--load-dir`: directory for loading the existing checkpoint
+- `--saver`: `legacy` or `core` (see descriptions under `--loader`)
+- `--save-dir`: directory for saving the new checkpoint
+- `--target-tensor-parallel-size`: new tensor model parallel size
+- `--target-pipeline-parallel-size`: new pipeline model parallel size
+
+For more argument details, please see the main script (`convert.py`), loader scripts (`loader_core.py`, `loader_legacy.py`, `loader_llama_mistral.py`, `loader_mixtral_hf.py`), or saver scripts (`saver_core.py`, `saver_legacy.py`).
+
+An example command for converting a GPT model from the old format (`legacy`) to the new format (`core`) would look as follows:
+
+```
+python tools/checkpoint/convert.py \
+>   --model-type GPT \
+>   --loader legacy \
+>   --load-dir ${LEGACY_FORMAT_DIR} \
+>   --saver core \
+>   --save-dir ${CORE_FORMAT_DIR} \
+>   --target-tensor-parallel-size ${TP} \
+>   --target-pipeline-parallel-size ${PP} \
+```
+
+For examples of converting Llama/Mistral models into Megatron, please see [here](docs/llama_mistral.md).
+
+## Checkpoint format conversion
+
+Megatron offers multiple checkpoint formats, including:
+
+- `torch`: Basic checkpoint format with sequential read & writes, and is tied to a specific tensor/pipeline model parallel state (TP/PP states, respectively). (While a specific checkpoint is tied to a specific TP/PP state, a checkpoint can still be manually converted via the model class converter described above).
+- `torch_dist`: Distributed checkpoint format, for fast parallel reads & writes, and also is parallel state agnostic (i.e., one can load the same checkpoint to different TP/PP setups).
+
+Generally speaking, `torch_dist` is the more modern and recommended checkpoint format due to its speed. However, depending on the use case, it may be desirable to convert between these two formats. To do so, launch your *training* script (e.g., via `pretrain_gpt.py`) as you normally would, but with two additional arguments:
+
+- `--ckpt-convert-format ${FORMAT}`: `${FORMAT}` can be one of `torch` or `torch_dist`, as described above.
+- `--ckpt-convert-save ${PATH_TO_SAVE_NEW_FORMAT}`: this path should be different than your existing `--load`/`--save` paths, to avoid overwriting the existing checkpoint. After converting, use this new path for your `--load`/`--save` paths.
+
+The general idea of this checkpoint format converter is that it launches the model just as one normally would for training, but before running any training iterations, it saves to the new checkpoint format, and then exits. It is important to note that all other launch args should remain the same, in order for the system to understand the previous checkpoint format.
+
+# Projects Using Megatron
 Below are some of the projects where we have directly used Megatron:
 * [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
 * [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf)
--- megatron_core-0.10.0/README.md
+++ megatron_core-0.12.0/README.md
@@ -16,9 +16,8 @@ Megatron-LM & Megatron-Core
 - **[2024/6]** Megatron-Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
 - **[2024/1 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron-Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron-Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs. Explore the [Megatron-Core intro](#megatron-core) for more details.

-
-
 # Table of Contents
+
 - [Megatron-LM \& Megatron-Core](#megatron-lm--megatron-core)
 - [Latest News](#latest-news)
 - [Table of Contents](#table-of-contents)
@@ -42,7 +41,6 @@ Megatron-LM & Megatron-Core
 - [Retro and InstructRetro](#retro-and-instructretro)
 - [Mamba-based Language Models](#mamba-based-language-models)
 - [Mixture of Experts](#mixture-of-experts)
-  - [Key Features of MoE](#key-features-of-moe)
 - [Evaluation and Tasks](#evaluation-and-tasks)
 - [GPT Text Generation](#gpt-text-generation)
 - [Detoxify GPT via Self-generation](#detoxify-gpt-via-self-generation)
@@ -59,7 +57,10 @@ Megatron-LM & Megatron-Core
 - [Collecting Wikipedia Training Data](#collecting-wikipedia-training-data)
 - [Collecting GPT Webtext Data](#collecting-gpt-webtext-data)
 - [Reproducibility](#reproducibility)
-
+- [Checkpoint conversion](#checkpoint-conversion)
+  - [Model class conversion](#model-class-conversion)
+  - [Checkpoint format conversion](#checkpoint-format-conversion)
+- [Projects Using Megatron](#projects-using-megatron)

 # Megatron Overview
 This repository comprises two essential components: **Megatron-LM** and **Megatron-Core**. Megatron-LM serves as a research-oriented framework leveraging Megatron-Core for large language model (LLM) training. Megatron-Core, on the other hand, is a library of GPU optimized training techniques that comes with formal product support including versioned APIs and regular releases. You can use Megatron-Core alongside Megatron-LM or [Nvidia NeMo Framework](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/nemo_megatron/mcore_customization.html) for an end-to-end and cloud-native solution. Alternatively, you can integrate Megatron-Core's building blocks into your preferred training framework.
@@ -589,7 +590,63 @@ There are currently three known Megatron optimizations that break reproducibilit

 In addition, determinisim has only been verified in NGC PyTorch containers up to and newer than 23.12. If you observe nondeterminism in Megatron training under other circumstances please open an issue.

-
+# Checkpoint conversion
+
+We support two forms of model conversion:
+
+1. Model class conversion (i.e., the `GPTModel` in `model.legacy` vs. `model.core`)
+2. Checkpoint format conversion (i.e., distributed vs. non-distributed checkpoint)
+
+## Model class conversion
+
+Megatron supports converting between different model classes, including internal model classes (we currently have the older `legacy` models, and the newer `core` models) and external model classes (such as Meta, Huggingface, Mistral, and Mixtral models). Additionally, during this conversion, one can update the parallel state of the model (i.e., changing tensor and pipeline model parallelism).
+
+We provide the tool `tools/checkpoint/convert.py` to convert between model classes. Some important arguments include:
+
+- `--model-type`: `GPT` or `BERT`
+- `--loader`: format of the existing checkpoint. Supported formats include:
+  - `legacy`: our older model classes (under `megatron.legacy.model`)
+  - `core`: our newer model classes (under `megatron.core.models`)
+  - `llama_mistral`: for loading Llama and Mistral models (supports Meta and Huggingface formats)
+  - `mixtral_hf`: for loading Mixtral models (Huggingface only)
+- `--load-dir`: directory for loading the existing checkpoint
+- `--saver`: `legacy` or `core` (see descriptions under `--loader`)
+- `--save-dir`: directory for saving the new checkpoint
+- `--target-tensor-parallel-size`: new tensor model parallel size
+- `--target-pipeline-parallel-size`: new pipeline model parallel size
+
+For more argument details, please see the main script (`convert.py`), loader scripts (`loader_core.py`, `loader_legacy.py`, `loader_llama_mistral.py`, `loader_mixtral_hf.py`), or saver scripts (`saver_core.py`, `saver_legacy.py`).
+
+An example command for converting a GPT model from the old format (`legacy`) to the new format (`core`) would look as follows:
+
+```
+python tools/checkpoint/convert.py \
+>   --model-type GPT \
+>   --loader legacy \
+>   --load-dir ${LEGACY_FORMAT_DIR} \
+>   --saver core \
+>   --save-dir ${CORE_FORMAT_DIR} \
+>   --target-tensor-parallel-size ${TP} \
+>   --target-pipeline-parallel-size ${PP} \
+```
+
+For examples of converting Llama/Mistral models into Megatron, please see [here](docs/llama_mistral.md).
+
+## Checkpoint format conversion
+
+Megatron offers multiple checkpoint formats, including:
+
+- `torch`: Basic checkpoint format with sequential read & writes, and is tied to a specific tensor/pipeline model parallel state (TP/PP states, respectively). (While a specific checkpoint is tied to a specific TP/PP state, a checkpoint can still be manually converted via the model class converter described above).
+- `torch_dist`: Distributed checkpoint format, for fast parallel reads & writes, and also is parallel state agnostic (i.e., one can load the same checkpoint to different TP/PP setups).
+
+Generally speaking, `torch_dist` is the more modern and recommended checkpoint format due to its speed. However, depending on the use case, it may be desirable to convert between these two formats. To do so, launch your *training* script (e.g., via `pretrain_gpt.py`) as you normally would, but with two additional arguments:
+
+- `--ckpt-convert-format ${FORMAT}`: `${FORMAT}` can be one of `torch` or `torch_dist`, as described above.
+- `--ckpt-convert-save ${PATH_TO_SAVE_NEW_FORMAT}`: this path should be different than your existing `--load`/`--save` paths, to avoid overwriting the existing checkpoint. After converting, use this new path for your `--load`/`--save` paths.
+
+The general idea of this checkpoint format converter is that it launches the model just as one normally would for training, but before running any training iterations, it saves to the new checkpoint format, and then exits. It is important to note that all other launch args should remain the same, in order for the system to understand the previous checkpoint format.
+
+# Projects Using Megatron
 Below are some of the projects where we have directly used Megatron:
 * [BERT and GPT Studies Using Megatron](https://arxiv.org/pdf/1909.08053.pdf)
 * [BioMegatron: Larger Biomedical Domain Language Model](https://www.aclweb.org/anthology/2020.emnlp-main.379.pdf)
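The new "Checkpoint format conversion" section above describes relaunching the training entry point with two extra flags. A minimal sketch of such a launch, with a hypothetical 8-GPU run and placeholder paths (every other argument must match the run that produced the original checkpoint):

```
import subprocess

# Relaunch the usual training entry point with the two conversion flags
# described in the README section above. The torchrun world size and all
# paths here are hypothetical placeholders; append the exact model and
# training args used to produce the original checkpoint.
args = [
    "torchrun", "--nproc_per_node=8", "pretrain_gpt.py",
    "--load", "/checkpoints/gpt_torch",              # existing `torch` checkpoint
    "--ckpt-convert-format", "torch_dist",           # target format
    "--ckpt-convert-save", "/checkpoints/gpt_dist",  # must differ from --load/--save
]
subprocess.run(args, check=True)  # saves the new format before training, then exits
```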
--- megatron_core-0.10.0/megatron/core/datasets/blended_dataset.py
+++ megatron_core-0.12.0/megatron/core/datasets/blended_dataset.py
@@ -29,7 +29,8 @@ class BlendedDataset(torch.utils.data.Dataset):
 
         weights (List[Union[int, float]]): The weights that determine the dataset blend ratios
 
-        size (Optional[int]): The number of samples to draw from the blend. If None, for each
+        size (Optional[int]): The number of samples to draw from the blend. If None, for each
+            dataset index idx draw exactly weights[idx] samples from datasets[idx].
 
         config (BlendedMegatronDatasetConfig): The config
 
@@ -74,7 +75,6 @@ class BlendedDataset(torch.utils.data.Dataset):
         unique_identifiers["split"] = self.split.name
         unique_identifiers["weights"] = self.weights
         unique_identifiers["size"] = self.size
-        unique_identifiers["renormalize_blend_weights"] = self.config.renormalize_blend_weights
 
         self.unique_description = json.dumps(
             unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
@@ -168,7 +168,7 @@ class BlendedDataset(torch.utils.data.Dataset):
             log_single_rank(
                 logger,
                 logging.WARNING,
-                f"
+                f"Cannot save the {type(self).__name__} indexes because path_to_cache is None",
             )
 
         t_end = time.time()
--- megatron_core-0.10.0/megatron/core/datasets/blended_megatron_dataset_builder.py
+++ megatron_core-0.12.0/megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -34,7 +34,9 @@ class BlendedMegatronDatasetBuilder(object):
 
         sizes (List[Optional[int]]): The minimum total number of samples to draw, or None, per split
 
-        is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
+        is_built_on_rank (Callable): A callable which returns True if the dataset should be built on
+            the current rank and False otherwise. It should be Megatron Core parallelism aware i.e.
+            global rank, local group rank, and virtual rank may inform its return value.
 
         config (BlendedMegatronDatasetConfig): The config object which informs dataset creation
     """
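The expanded `is_built_on_rank` docstring above asks for a parallelism-aware predicate. A hypothetical example in that spirit, building datasets only on the first rank of each tensor- and pipeline-model-parallel group (the `parallel_state` helpers exist in `megatron.core`, but this particular predicate is an illustration, not a prescribed default):

```
from megatron.core import parallel_state

def is_built_on_rank() -> bool:
    # Build on one rank per model-parallel group; the other ranks in the
    # group (and data-parallel replicas) reuse the cached indices instead
    # of rebuilding them.
    return (
        parallel_state.get_tensor_model_parallel_rank() == 0
        and parallel_state.get_pipeline_model_parallel_rank() == 0
    )
```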
@@ -54,7 +56,7 @@ class BlendedMegatronDatasetBuilder(object):
         log_single_rank(
             logger,
             logging.INFO,
-            f"Building
+            f"Building {cls.__name__} splits with sizes={self.sizes} and config={self.config}",
         )
 
         if not self.config.mock:
@@ -96,7 +98,8 @@ class BlendedMegatronDatasetBuilder(object):
         (2) The split has one contributing dataset, and...
 
             (a) 'size' is not None
-                - Build a mid-level dataset with low-level dataset sampling in proportion to the
+                - Build a mid-level dataset with low-level dataset sampling in proportion to the
+                  size
 
             (b) 'size' is None
                 - Build mid-level datasets with no excess low-level dataset sampling
@@ -104,24 +107,27 @@ class BlendedMegatronDatasetBuilder(object):
         (3) The split has multiple contributing datasets, and...
 
             (a) 'weights' is not None and 'size' is not None
-                - Build mid-level datasets with low-level dataset sampling in proportion to their
-
+                - Build mid-level datasets with low-level dataset sampling in proportion to their
+                  weights and the size
+                - Build a top-level dataset of length marginally greater than 'size' with mid-level
+                  dataset sampling in proportion to their weights and the size
 
             (b) 'weights' is not None and 'size' is None
                 - Error
 
             (c) 'weights' is None and 'size' is not None
                 - Build mid-level datasets with no excess low-level dataset sampling
-                - Build a top-level dataset of length 'size'
-
-
+                - Build a top-level dataset of length 'size' (capped at the sum of the mid-level
+                  dataset lengths) with mid-level dataset sampling in proportion to their lengths
+                  and the size
 
             (d) 'weights' is None and 'size' is None
                 - Build mid-level datasets with no excess low-level dataset sampling
                 - Build a top-level dataset with no excess mid-level dataset sampling
 
         Returns:
-            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+                split
         """
         datasets = self._build_blended_dataset_splits()
 
@@ -134,24 +140,35 @@ class BlendedMegatronDatasetBuilder(object):
                     log_single_rank(
                         logger,
                         logging.INFO,
-
+                        (
+                            f"Verifying NumPy indices for {type(dataset).__name__} "
+                            f"{dataset.split.name} split"
+                        ),
                     )
                 else:
                     log_single_rank(
                         logger,
                         logging.INFO,
-
+                        (
+                            f"NumPy indices for {type(dataset).__name__} {dataset.split.name} "
+                            f"split are fully cached, skipping verification"
+                        ),
                     )
                     continue
                 # Check blend size
                 assert dataset.size is None or dataset.size == dataset.dataset_index.shape[0]
                 # Check blend access of mid-level datasets
-
-
-
+                dataset_indices, dataset_sizes = numpy.unique(
+                    dataset.dataset_index, return_counts=True
+                )
+                for i, (index, size) in enumerate(zip(dataset_indices, dataset_sizes)):
+                    if len(dataset.datasets[index]) < size:
                         raise IndexError(
-                            f"The {dataset.split.name} blend oversamples
-                            f"
+                            f"The {dataset.split.name} blend oversamples the contributing "
+                            f"datasets and, e.g., requests {size} samples from "
+                            f"{type(dataset.datasets[index]).__name__} {i} with size "
+                            f"{len(dataset.datasets[index])}. This is unexpected. "
+                            f"Please file an issue."
                         )
 
         return datasets
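The rewritten verification above leans on `numpy.unique(..., return_counts=True)`: for each contributing dataset index it yields how many samples the blend requests, which is then compared against that dataset's length. A standalone illustration with hypothetical indices:

```
import numpy

dataset_index = numpy.array([0, 0, 1, 0, 2, 1, 0])  # hypothetical blend indices
indices, counts = numpy.unique(dataset_index, return_counts=True)
print(indices)  # [0 1 2]
print(counts)   # [4 2 1] -> the blend draws 4, 2, and 1 samples respectively

dataset_lengths = [4, 2, 1]  # hypothetical mid-level dataset lengths
for index, count in zip(indices, counts):
    assert dataset_lengths[index] >= count, "blend oversamples a dataset"
```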
@@ -162,7 +179,8 @@ class BlendedMegatronDatasetBuilder(object):
         See the BlendedMegatronDatasetBuilder.build alias for more information.
 
         Returns:
-            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+            List[Optional[TopLevelDataset]]: A list containing a dataset instance (or None) per
+                split
         """
         ##
         # Return fake "mock" datasets
@@ -192,13 +210,19 @@ class BlendedMegatronDatasetBuilder(object):
 
         # Build the mid-level datasets
         if weights is None:
-
+            # Build only one "epoch"
+            sizes_per_dataset_buffer = [[None for split in Split] for prefix in prefixes]
         else:
-
+            # The number of samples we plan to use per dataset
+            sizes_per_dataset_target = _get_size_per_split_per_dataset(weights, self.sizes)
+            # The number of samples we plan to build per dataset
+            sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
+                weights, self.sizes, margin=0.5
+            )
 
-        #
+        # Build each dataset in parallel
         megatron_datasets = self._build_megatron_datasets_parallel(
-            prefixes, split,
+            prefixes, split, sizes_per_dataset_buffer
         )
 
         # Build the top-level datasets
@@ -207,11 +231,11 @@ class BlendedMegatronDatasetBuilder(object):
             if split[i] is not None:
                 weights_i = weights
                 if weights_i is not None and self.sizes[i] is not None:
-
+                    # Blend according to client-specified weights and client-specified size
+                    size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
                     size_i = sum(size_per_dataset)
-                    if self.config.renormalize_blend_weights:
-                        weights_i = list(map(lambda _size: _size / size_i, size_per_dataset))
                 elif weights_i is None:
+                    # Blend according to dataset sizes as-is and (maybe) client-specified size
                     try:
                         weights_i = [
                             len(megatron_dataset) for megatron_dataset in megatron_datasets[i]
@@ -221,9 +245,12 @@ class BlendedMegatronDatasetBuilder(object):
                     if self.sizes[i] is not None:
                         size_i = min(self.sizes[i], sum(weights_i))
                     else:
-
+                        # Build exhaustive indices
+                        size_i = None
                 else:
-                    raise
+                    raise ValueError(
+                        "Using client-specified weights requires client-specified size"
+                    )
                 blended_datasets[i] = self.build_generic_dataset(
                     BlendedDataset,
                     self.is_built_on_rank,
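When no client weights are given, the builder falls back to weighting by mid-level dataset lengths and caps any requested size at their sum (case (c)); with no size either, `size_i = None` asks for exhaustive indices (case (d)). A compact sketch of that fallback, with toy lengths in place of real datasets:

```python
from typing import List, Optional, Tuple

def resolve_blend(
    dataset_lengths: List[int], requested_size: Optional[int]
) -> Tuple[List[int], Optional[int]]:
    # Weights default to the raw dataset lengths; normalization happens later.
    weights = list(dataset_lengths)
    if requested_size is not None:
        # Case (c): cap at the total number of available samples.
        return weights, min(requested_size, sum(weights))
    # Case (d): None means "build exhaustive indices".
    return weights, None

print(resolve_blend([300, 700], 2000))  # -> ([300, 700], 1000)
print(resolve_blend([300, 700], None))  # -> ([300, 700], None)
```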
@@ -263,22 +290,31 @@ class BlendedMegatronDatasetBuilder(object):
 
             # Build mid-level datasets
             if weights is None:
-
+                sizes_per_dataset_buffer = [
+                    [None for split in Split] for prefix in prefixes
+                ]
             else:
-
+                # The number of samples we plan to use per dataset
+                sizes_per_dataset_target = _get_size_per_split_per_dataset(
+                    weights, sizes_spoof
+                )
+                # The number of samples we plan to build per dataset
+                sizes_per_dataset_buffer = _get_size_per_split_per_dataset(
+                    weights, sizes_spoof, margin=0.5
+                )
 
-            #
+            # Build each dataset in parallel
             megatron_datasets = self._build_megatron_datasets_parallel(
-                prefixes, split_spoof,
+                prefixes, split_spoof, sizes_per_dataset_buffer
             )[i]
 
             # Build top-level dataset
             if weights is not None and self.sizes[i] is not None:
-
+                # Blend according to client-specified weights and client-specified size
+                size_per_dataset = list(zip(*sizes_per_dataset_target))[i]
                 size = sum(size_per_dataset)
-                if self.config.renormalize_blend_weights:
-                    weights = list(map(lambda _size: _size / size, size_per_dataset))
             elif weights is None:
+                # Blend according to dataset sizes as-is and (maybe) client-specified size
                 try:
                     weights = [
                         len(megatron_dataset) for megatron_dataset in megatron_datasets
@@ -288,7 +324,8 @@ class BlendedMegatronDatasetBuilder(object):
                 if self.sizes[i] is not None:
                     size = min(self.sizes[i], sum(weights))
                 else:
-
+                    # Build exhaustive indices
+                    size = None
             else:
                 raise RuntimeError
             blended_datasets[i] = self.build_generic_dataset(
@@ -395,13 +432,15 @@ class BlendedMegatronDatasetBuilder(object):
         """Build each MidLevelDataset split from a single LowLevelDataset
 
         Args:
-            dataset_path (Optional[str]): The path on disk which defines the underlying
+            dataset_path (Optional[str]): The path on disk which defines the underlying
+                LowLevelDataset, or None for mock dataset classes
 
             split (List[Tuple[float, float]]): The dataset split matrix
 
             sizes (List[int]): The number of total samples to draw from each split
 
-            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+                behavior. Set to False when we enforce this behavior at higher level.
 
         Returns:
             List[Optional[MidLevelDataset]]: The MidLevelDataset (or None) per split
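The `synchronize_ranks` flag gates the rank-0 / barrier / other-ranks protocol named in the docstring: rank 0 builds first (writing any cache files), every rank waits at a barrier, then the remaining ranks build from the warm cache. A minimal sketch of that protocol; `build_fn` is a placeholder, not the library's API:

```python
import torch

def build_rank0_first(build_fn, synchronize_ranks: bool = True):
    # Rank 0 materializes the dataset, the barrier fences the cache write,
    # and the other ranks then build by reading the cached artifacts.
    if synchronize_ranks and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
        dataset = build_fn() if rank == 0 else None
        torch.distributed.barrier()
        if rank != 0:
            dataset = build_fn()
        return dataset
    return build_fn()
```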
@@ -462,17 +501,22 @@ class BlendedMegatronDatasetBuilder(object):
         and torch.distributed is initialized.
 
         Args:
-            cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
+            cls (Union[Type[DistributedDataset], Callable]): The DistributedDataset class to be
+                built. In special cases, e.g. when we are building the low level dataset for a
+                RawMegatronDataset instance, we can accept a Callable which returns an Iterable.
 
-            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+            synchronize_ranks (bool): Whether to call barrier for rank-0 / barrier / other-ranks
+                behavior. Set to False when we enforce this behavior at higher level.
 
-            args (Tuple[Any]): The positional arguments used to build the provided
+            args (Tuple[Any]): The positional arguments used to build the provided
+                DistributedDataset class
 
         Raises:
             Exception: When the dataset constructor raises an OSError
 
         Returns:
-            Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the
+            Optional[Union[DistributedDataset, Iterable]]: The DistributedDataset instantion, the
+                Iterable instantiation, or None
         """
         if torch.distributed.is_initialized():
             rank = torch.distributed.get_rank()
@@ -485,10 +529,10 @@ class BlendedMegatronDatasetBuilder(object):
             dataset = cls(*args)
         except OSError as err:
             log = (
-                f"Failed to write dataset materials to the data cache directory. "
-
-
-
+                f"Failed to write dataset materials to the data cache directory. Please "
+                f"supply a directory to which you have write access via the path_to_cache "
+                f"attribute in BlendedMegatronDatasetConfig and retry. Refer to the "
+                f"preserved traceback above for more information."
             )
             raise Exception(log) from err
 
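The expanded message above pairs a remediation hint with `raise ... from err`, so the original `OSError` traceback stays attached beneath the friendlier explanation. The same pattern in isolation:

```python
def build_with_cache_hint(factory, *args):
    # Wrap a failing constructor in a message that names the likely fix while
    # chaining the original OSError so the full traceback is preserved.
    try:
        return factory(*args)
    except OSError as err:
        raise Exception(
            "Failed to write dataset materials to the data cache directory. "
            "Supply a writable directory via the path_to_cache attribute in "
            "BlendedMegatronDatasetConfig and retry."
        ) from err
```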
@@ -505,23 +549,30 @@ class BlendedMegatronDatasetBuilder(object):
 
 
 def _get_size_per_split_per_dataset(
-    normalized_weights: List[float], target_size_per_split: List[int]
+    normalized_weights: List[float], target_size_per_split: List[int], margin: float = 0.0
 ) -> List[List[int]]:
     """Determine the contribution of the MegatronDataset splits to the BlendedDataset splits
 
     Args:
         normalized_weights (List[float]): e.g. [0.3, 0.7]
 
-        target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
+        target_size_per_split (List[int]): The number of samples to target for each BlendedDataset
+            split
+
+        margin (float): The relative quantity of extra samples to build per per split per dataset,
+            as a percentage
 
     Returns:
         List[List[int]]: The number of samples to request per MegatronDataset per split
     """
     assert numpy.isclose(sum(normalized_weights), 1.0)
 
-    # Use
+    # Use margin as buffer to ensure we satiate the request
     sizes_per_dataset = [
-        [
+        [
+            int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))
+            for target_size in target_size_per_split
+        ]
         for weight in normalized_weights
     ]
 
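With `margin=0.5` (i.e. 0.5 percent), each per-dataset request is rounded up twice: once when apportioning the split size by weight, and once when padding by the margin. A worked example of that double ceiling for weights [0.3, 0.7] and a 1000-sample split, re-implementing the expression above outside the library:

```python
import math

def size_with_margin(target_size: int, weight: float, margin: float = 0.0) -> int:
    # Same double-ceil as _get_size_per_split_per_dataset: apportion by weight,
    # then pad by margin percent.
    return int(math.ceil(math.ceil(target_size * weight) * (1 + margin / 100)))

# Target sizes: exact apportionment (margin defaults to 0.0).
print([size_with_margin(1000, w) for w in (0.3, 0.7)])       # [300, 700]
# Buffer sizes: 0.5% headroom so rounding during blending cannot under-fill.
print([size_with_margin(1000, w, 0.5) for w in (0.3, 0.7)])  # [302, 704]
```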
megatron/core/datasets/blended_megatron_dataset_config.py

@@ -34,12 +34,6 @@ class BlendedMegatronDatasetConfig:
     'blend'. Defauls to None.
     """
 
-    renormalize_blend_weights: bool = False
-    """Renormalize the blend weights to account for mid-level dataset oversampling done to ensure
-    fulfillmenet of the of the requested number of samples. Defaults to False for backward
-    comparability in the data sample order.
-    """
-
     split: Optional[str] = None
     """The split string, a comma separated weighting for the dataset splits when drawing samples
     from a single distribution. Not to be used with 'blend_per_split'. Defaults to None.
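For reference, the `split` string is a comma separated weighting such as "949,50,1" across train/validation/test. A sketch of the normalization it implies (an assumed helper for illustration, not the library's parser):

```python
def parse_split(split: str) -> list:
    # "949,50,1" -> per-split weights summing to 1.0.
    parts = [float(part) for part in split.split(",")]
    total = sum(parts)
    return [part / total for part in parts]

print(parse_split("949,50,1"))  # approximately [0.949, 0.05, 0.001]
```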
@@ -67,7 +61,7 @@ class BlendedMegatronDatasetConfig:
     """
 
     tokenizer: Optional[MegatronTokenizer] = None
-    """The MegatronTokenizer instance
+    """The MegatronTokenizer instance. Required for datasets that do online tokenization."""
 
     def __post_init__(self) -> None:
         """Do asserts and set fields post init"""
@@ -149,7 +143,8 @@ def convert_split_vector_to_split_matrix(
     Args:
         vector_a (List[float]): The primary split vector
 
-        vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
+        vector_b (Optional[List[float]]): An optional secondary split vector which constrains the
+            primary split vector. Defaults to None.
 
     Returns:
         List[Tuple[float, float]]: The split matrix consisting of book-ends of each split in order
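The "book-ends" in the return description are cumulative interval edges over [0, 1]: each split owns the range between its left and right edge. A minimal sketch of the primary-vector case (assumed to mirror the documented behavior; the `vector_b` constraint is ignored here):

```python
import itertools
from typing import List, Tuple

def vector_to_bookends(vector: List[float]) -> List[Tuple[float, float]]:
    # Cumulative sums give each split's right edge; pair them with left edges.
    edges = [0.0] + list(itertools.accumulate(vector))
    return list(zip(edges[:-1], edges[1:]))

print(vector_to_bookends([0.5, 0.25, 0.25]))
# -> [(0.0, 0.5), (0.5, 0.75), (0.75, 1.0)]
```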