megatron-core 0.11.0__tar.gz → 0.12.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic. Click here for more details.
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/LICENSE +3 -2
- {megatron_core-0.11.0/megatron_core.egg-info → megatron_core-0.12.0rc3}/PKG-INFO +7 -7
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/__init__.py +1 -0
- megatron_core-0.12.0rc3/megatron/core/config.py +3 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/exchange_utils.py +8 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/mapping.py +7 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/serialization.py +4 -3
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/state_dict_utils.py +28 -1
- megatron_core-0.12.0rc3/megatron/core/dist_checkpointing/strategies/async_utils.py +543 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/base.py +1 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +64 -38
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/torch.py +55 -9
- megatron_core-0.12.0rc3/megatron/core/distributed/custom_fsdp/__init__.py +3 -0
- megatron_core-0.12.0rc3/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +691 -0
- megatron_core-0.12.0rc3/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1966 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/data_parallel_base.py +2 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/distributed_data_parallel.py +43 -6
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/distributed_data_parallel_config.py +30 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/finalize_model_grads.py +22 -8
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/param_and_grad_buffer.py +59 -20
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +30 -12
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/enums.py +10 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trtllm_layers.py +4 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/extensions/transformer_engine.py +104 -42
- megatron_core-0.12.0rc3/megatron/core/fp8_utils.py +449 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_cross_entropy.py +28 -23
- megatron_core-0.12.0rc3/megatron/core/inference/contexts/__init__.py +11 -0
- megatron_core-0.12.0rc3/megatron/core/inference/contexts/base_context.py +20 -0
- megatron_core-0.12.0rc3/megatron/core/inference/contexts/dynamic_context.py +1022 -0
- megatron_core-0.11.0/megatron/core/inference_params.py → megatron_core-0.12.0rc3/megatron/core/inference/contexts/static_context.py +52 -8
- megatron_core-0.12.0rc3/megatron/core/inference/engines/__init__.py +5 -0
- megatron_core-0.12.0rc3/megatron/core/inference/engines/dynamic_engine.py +182 -0
- megatron_core-0.12.0rc3/megatron/core/inference/engines/mcore_engine.py +5 -0
- megatron_core-0.11.0/megatron/core/inference/engines/mcore_engine.py → megatron_core-0.12.0rc3/megatron/core/inference/engines/static_engine.py +45 -10
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/inference_request.py +13 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +80 -27
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +51 -18
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +6 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +28 -22
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +7 -3
- megatron_core-0.12.0rc3/megatron/core/inference/modelopt_support/gpt/__init__.py +8 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/sampling_params.py +3 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/scheduler.py +25 -7
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +206 -52
- megatron_core-0.12.0rc3/megatron/core/inference_params.py +5 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/model_parallel_config.py +10 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/T5/t5_model.py +15 -9
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/bert/bert_model.py +12 -3
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/__init__.py +1 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/relative_pos_embedding.py +12 -6
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/rope_utils.py +12 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +102 -7
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/language_module/language_module.py +41 -8
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/gpt/gpt_layer_specs.py +59 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/gpt/gpt_model.py +171 -19
- megatron_core-0.12.0rc3/megatron/core/models/huggingface/__init__.py +2 -0
- megatron_core-0.12.0rc3/megatron/core/models/huggingface/clip_model.py +26 -0
- megatron_core-0.12.0rc3/megatron/core/models/huggingface/module.py +63 -0
- megatron_core-0.12.0rc3/megatron/core/models/huggingface/qwen_model.py +42 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/mamba/mamba_layer_specs.py +2 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/mamba/mamba_model.py +32 -14
- megatron_core-0.12.0rc3/megatron/core/models/multimodal/context_parallel.py +99 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/multimodal/llava_model.py +189 -100
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/decoder_attention.py +18 -9
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/encoder_attention.py +8 -3
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/model.py +13 -5
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/vision/clip_vit_model.py +28 -7
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/__init__.py +103 -9
- megatron_core-0.12.0rc3/megatron/core/optimizer/cpu_offloading/__init__.py +2 -0
- megatron_core-0.12.0rc3/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +465 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/distrib_optimizer.py +324 -141
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/optimizer.py +9 -4
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/optimizer_config.py +32 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/package_info.py +2 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/parallel_state.py +137 -25
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/pipeline_parallel/p2p_communication.py +20 -3
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/pipeline_parallel/schedules.py +176 -70
- megatron_core-0.12.0rc3/megatron/core/post_training/__init__.py +1 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/__init__.py +10 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/gpt/model_specs.py +245 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +133 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/layers.py +246 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/mamba/__init__.py +1 -0
- megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/mamba/model_specs.py +90 -0
- megatron_core-0.12.0rc3/megatron/core/process_groups_config.py +113 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/rerun_state_machine.py +237 -61
- megatron_core-0.12.0rc3/megatron/core/ssm/__init__.py +1 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/ssm/mamba_block.py +120 -50
- megatron_core-0.12.0rc3/megatron/core/ssm/mamba_config.py +22 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/ssm/mamba_layer.py +33 -9
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/ssm/mamba_mixer.py +41 -14
- megatron_core-0.12.0rc3/megatron/core/ssm/mlp_layer.py +25 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/random.py +27 -9
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/attention.py +277 -93
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/cuda_graphs.py +44 -21
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/mlp.py +13 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/experts.py +47 -7
- megatron_core-0.12.0rc3/megatron/core/transformer/moe/fused_a2a.py +202 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/moe_layer.py +7 -3
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/moe_utils.py +40 -6
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/router.py +87 -12
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/token_dispatcher.py +420 -67
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/multi_latent_attention.py +67 -21
- megatron_core-0.12.0rc3/megatron/core/transformer/multi_token_prediction.py +737 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/transformer_block.py +70 -127
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/transformer_config.py +167 -36
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/transformer_layer.py +279 -41
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/utils.py +9 -2
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/utils.py +314 -34
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3/megatron_core.egg-info}/PKG-INFO +7 -7
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron_core.egg-info/SOURCES.txt +33 -1
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron_core.egg-info/requires.txt +1 -5
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/requirements/pytorch_24.10/requirements.txt +0 -11
- megatron_core-0.12.0rc3/requirements/pytorch_25.03/requirements.txt +15 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/setup.py +1 -1
- megatron_core-0.11.0/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -228
- megatron_core-0.11.0/megatron/core/models/multimodal/__init__.py +0 -1
- megatron_core-0.11.0/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/MANIFEST.in +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/README.md +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/README.md +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/bert_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/blended_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/gpt_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/helpers.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/indexed_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/masked_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/megatron_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/megatron_tokenizer.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/db/build.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/db/dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/db/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/external_libs.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/build.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/index.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/index/validate.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/query.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/retro/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/t5_dataset.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/validation.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/distributed/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trt_model_config.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/engines → megatron_core-0.12.0rc3/megatron/core/inference/model_inference_wrappers}/__init__.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers → megatron_core-0.12.0rc3/megatron/core/inference/model_inference_wrappers/gpt}/__init__.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers/gpt → megatron_core-0.12.0rc3/megatron/core/inference/model_inference_wrappers/t5}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/modelopt_support/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/modelopt_support/gpt/model_specs.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/modelopt_support/mamba/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/modelopt_support/mamba/model_specs.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/model_inference_wrappers/t5 → megatron_core-0.12.0rc3/megatron/core/inference/text_generation_controllers}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/jit.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/T5/t5_spec.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/bert/bert_layer_specs.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/modelopt_support/gpt → megatron_core-0.12.0rc3/megatron/core/models/multimodal}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/multimodal/llava_spec.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/decoder_spec.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/encoder_spec.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/vision/radio.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/models/vision/vit_layer_specs.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.11.0/megatron/core/inference/text_generation_controllers → megatron_core-0.12.0rc3/megatron/core/post_training/modelopt/gpt}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/layers.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/mappings.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/tensor_parallel/utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/timers.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.11.0/megatron/core/ssm → megatron_core-0.12.0rc3/megatron/core/transformer/custom_layers}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.11.0/megatron/core/transformer/custom_layers → megatron_core-0.12.0rc3/megatron/core/transformer/moe}/__init__.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/legacy_a2a_token_dispatcher.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/pyproject.toml +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/requirements/pytorch_24.01/requirements.txt +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/requirements/pytorch_24.07/requirements.txt +0 -0
- {megatron_core-0.11.0 → megatron_core-0.12.0rc3}/setup.cfg +0 -0
|
@@ -247,8 +247,9 @@ LICENSE FOR
|
|
|
247
247
|
Facebook, Inc. and its affiliates,
|
|
248
248
|
Meta Platforms, Inc. and its affiliates,
|
|
249
249
|
Microsoft Corporation,
|
|
250
|
-
OpenGVLab/InternVL,
|
|
251
|
-
Triton language and compiler
|
|
250
|
+
OpenGVLab/InternVL,
|
|
251
|
+
Triton language and compiler,
|
|
252
|
+
and DeepSeek.
|
|
252
253
|
|
|
253
254
|
MIT License
|
|
254
255
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: megatron-core
|
|
3
|
-
Version: 0.11.0
|
|
3
|
+
Version: 0.12.0rc3
|
|
4
4
|
Summary: Megatron Core - a library for efficient and scalable training of transformer based models
|
|
5
5
|
Home-page: https://github.com/NVIDIA/Megatron-LM/megatron/core
|
|
6
6
|
Download-URL: https://github.com/NVIDIA/Megatron-LM/releases
|
|
@@ -257,8 +257,9 @@ License: The following applies to all files unless otherwise noted:
|
|
|
257
257
|
Facebook, Inc. and its affiliates,
|
|
258
258
|
Meta Platforms, Inc. and its affiliates,
|
|
259
259
|
Microsoft Corporation,
|
|
260
|
-
OpenGVLab/InternVL,
|
|
261
|
-
Triton language and compiler
|
|
260
|
+
OpenGVLab/InternVL,
|
|
261
|
+
Triton language and compiler,
|
|
262
|
+
and DeepSeek.
|
|
262
263
|
|
|
263
264
|
MIT License
|
|
264
265
|
|
|
@@ -308,7 +309,6 @@ Requires-Dist: einops
|
|
|
308
309
|
Requires-Dist: flask-restful
|
|
309
310
|
Requires-Dist: nltk
|
|
310
311
|
Requires-Dist: pytest
|
|
311
|
-
Requires-Dist: pytest_asyncio
|
|
312
312
|
Requires-Dist: pytest-cov
|
|
313
313
|
Requires-Dist: pytest_mock
|
|
314
314
|
Requires-Dist: pytest-random-order
|
|
@@ -319,13 +319,13 @@ Requires-Dist: zarr
|
|
|
319
319
|
Requires-Dist: wandb
|
|
320
320
|
Requires-Dist: tensorstore!=0.1.46,!=0.1.72
|
|
321
321
|
Requires-Dist: torch
|
|
322
|
-
Requires-Dist: nvidia-modelopt[torch]>=0.
|
|
323
|
-
Requires-Dist: nvidia-resiliency-ext; platform_machine == "x86_64"
|
|
322
|
+
Requires-Dist: nvidia-modelopt[torch]>=0.23.2; sys_platform != "darwin"
|
|
324
323
|
Requires-Dist: torch
|
|
325
324
|
Requires-Dist: packaging
|
|
326
325
|
Dynamic: author
|
|
327
326
|
Dynamic: download-url
|
|
328
327
|
Dynamic: home-page
|
|
328
|
+
Dynamic: license-file
|
|
329
329
|
Dynamic: maintainer
|
|
330
330
|
Dynamic: requires-dist
|
|
331
331
|
|
{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/exchange_utils.py
RENAMED
|
@@ -325,7 +325,10 @@ def exchange_loaded_tensors_gather_rounds(
|
|
|
325
325
|
# this during state dict load.
|
|
326
326
|
# TODO: remove it once the bug is fixed
|
|
327
327
|
if is_float8tensor(local_ten):
|
|
328
|
-
|
|
328
|
+
try:
|
|
329
|
+
local_ten = local_ten.from_float8()
|
|
330
|
+
except Exception as e:
|
|
331
|
+
local_ten = local_ten.dequantize()
|
|
329
332
|
all_loaded_tensors[shard_id] = local_ten
|
|
330
333
|
|
|
331
334
|
round_tensors.append(local_ten)
|
|
@@ -483,7 +486,10 @@ def exchange_loaded_tensors_broadcast(
|
|
|
483
486
|
# this during state dict load.
|
|
484
487
|
# TODO: remove it once the bug is fixed
|
|
485
488
|
if is_float8tensor(local_ten):
|
|
486
|
-
|
|
489
|
+
try:
|
|
490
|
+
local_ten = local_ten.from_float8()
|
|
491
|
+
except Exception as e:
|
|
492
|
+
local_ten = local_ten.dequantize()
|
|
487
493
|
all_loaded_tensors[shard_id] = local_ten
|
|
488
494
|
|
|
489
495
|
global_src_rank = (
|
{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/mapping.py
RENAMED
|
@@ -136,7 +136,13 @@ class ShardedTensor(ShardedBase):
|
|
|
136
136
|
)
|
|
137
137
|
|
|
138
138
|
for off, sh in zip(self.global_offset[self.prepend_axis_num :], self.local_shape):
|
|
139
|
-
|
|
139
|
+
# NOTE: In custom FSDP, we have a case where a new parameter shard is created locally.
|
|
140
|
+
# For example, consider parameters [p0, p1, p2] sharded across GPU0 and GPU1.
|
|
141
|
+
# GPU0 receives p0 and a portion of p1, while GPU1 receives the
|
|
142
|
+
# remaining portion of p1 and p2.
|
|
143
|
+
# As a result, there is no parameter shard of p2 on GPU0, and
|
|
144
|
+
# the shape of p2 on GPU0 is zero.
|
|
145
|
+
if sh != 0 and off % sh != 0:
|
|
140
146
|
raise CheckpointingException(
|
|
141
147
|
f'Global offset ({off}) must be divisible by local shape ({sh}) for {self}.'
|
|
142
148
|
)
|
{megatron_core-0.11.0 → megatron_core-0.12.0rc3}/megatron/core/dist_checkpointing/serialization.py
RENAMED
|
@@ -351,9 +351,10 @@ def save(
|
|
|
351
351
|
)
|
|
352
352
|
|
|
353
353
|
if next(checkpoint_dir.iterdir(), None) is not None:
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
)
|
|
354
|
+
# Don't throw exception here since this could cause a cascade of failures
|
|
355
|
+
# without human intervention in cases where multiple jobs are queued up.
|
|
356
|
+
if torch.distributed.get_rank() == 0:
|
|
357
|
+
logger.warning("Overwriting old incomplete / corrupted checkpoint...")
|
|
357
358
|
|
|
358
359
|
if common_strategy is not None:
|
|
359
360
|
raise NotImplementedError('The only supported common strategy is torch')
|
|
@@ -2,12 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
""" Utilities for transforming state_dict."""
|
|
4
4
|
|
|
5
|
-
from typing import Callable
|
|
5
|
+
from typing import Callable, Union
|
|
6
6
|
|
|
7
7
|
from .dict_utils import dict_list_map_inplace, extract_matching_values
|
|
8
8
|
from .mapping import (
|
|
9
9
|
CommonStateDict,
|
|
10
10
|
ShardedStateDict,
|
|
11
|
+
ShardedTensor,
|
|
11
12
|
ShardedTensorFactory,
|
|
12
13
|
StateDict,
|
|
13
14
|
apply_factories,
|
|
@@ -39,6 +40,7 @@ def save_preprocess(
|
|
|
39
40
|
apply_factories(sharded_state_dict)
|
|
40
41
|
_, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
|
|
41
42
|
sharded_part, common_state_dict = extract_sharded_base(sharded_state_dict)
|
|
43
|
+
sharded_part = filter_out_empty_flatten_tensor(sharded_part)
|
|
42
44
|
if validate_access_integrity:
|
|
43
45
|
preprocessed_common_state_dict = common_state_dict
|
|
44
46
|
if preprocess_common_before_consistancy_check:
|
|
@@ -69,6 +71,7 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
|
|
|
69
71
|
# Create a copy of sharded_state_dict as the passed in state dict may have
|
|
70
72
|
# references that prevent tensors from being deallocated
|
|
71
73
|
sharded_state_dict, _ = extract_matching_values(sharded_state_dict, lambda x: True)
|
|
74
|
+
sharded_state_dict = filter_out_empty_flatten_tensor(sharded_state_dict)
|
|
72
75
|
|
|
73
76
|
sh_ten_factories, _ = extract_matching_values(
|
|
74
77
|
sharded_state_dict,
|
|
@@ -83,3 +86,27 @@ def load_preprocess(sharded_state_dict: ShardedStateDict):
|
|
|
83
86
|
nonpersistent_state_dict, sharded_state_dict = extract_nonpersistent(sharded_state_dict)
|
|
84
87
|
dict_list_map_inplace(lambda o: o.unwrap(), nonpersistent_state_dict)
|
|
85
88
|
return sharded_state_dict, nonpersistent_state_dict, sh_ten_factories
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def filter_out_empty_flatten_tensor(sharded_state_dict: Union[dict, list]):
|
|
92
|
+
"""
|
|
93
|
+
Filter out ShardedTensors with empty flatten_range.
|
|
94
|
+
These tensors can cause the PyTorch check in `TorchShardedTensor._init_from_local_shards_and_global_metadata` to fail.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
sharded_state_dict: state dict possibly containing ShardedTensor objects
|
|
98
|
+
"""
|
|
99
|
+
# Filter out ShardedTensors with empty flatten_range.
|
|
100
|
+
# These tensors can cause the PyTorch check in
|
|
101
|
+
# `TorchShardedTensor._init_from_local_shards_and_global_metadata` to fail.
|
|
102
|
+
# This situation may occur in custom Fully Sharded Data Parallel (FSDP) cases.
|
|
103
|
+
sharded_state_dict, _ = extract_matching_values(
|
|
104
|
+
sharded_state_dict,
|
|
105
|
+
lambda v: not (
|
|
106
|
+
isinstance(v, ShardedTensor)
|
|
107
|
+
and v.flattened_range
|
|
108
|
+
and v.flattened_range.start == v.flattened_range.stop
|
|
109
|
+
),
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
return sharded_state_dict
|