megatron-core 0.13.0rc2__tar.gz → 0.14.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megatron-core might be problematic.
- {megatron_core-0.13.0rc2/megatron_core.egg-info → megatron_core-0.14.0rc0}/PKG-INFO +4 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/bert_dataset.py +5 -7
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_dataset.py +4 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_builder.py +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/gpt_dataset.py +6 -4
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.py +3 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/indexed_dataset.py +8 -8
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/masked_dataset.py +1 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_dataset.py +1 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_tokenizer.py +0 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/bert_embedders.py +3 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/build.py +40 -24
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/dataset.py +12 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/utils.py +42 -11
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/external_libs.py +1 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/build.py +31 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/index.py +26 -9
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +34 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +54 -9
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/validate.py +15 -12
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +14 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/query.py +71 -15
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/retro_dataset.py +21 -8
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/utils.py +21 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/t5_dataset.py +2 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils.py +8 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/dict_utils.py +14 -14
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/exchange_utils.py +35 -32
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/mapping.py +54 -52
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/serialization.py +13 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +36 -27
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/resharding.py +9 -7
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +26 -12
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/zarr.py +62 -47
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +38 -21
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/utils.py +14 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/validation.py +38 -37
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/__init__.py +4 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/param_and_grad_buffer.py +25 -20
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +37 -25
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_config.py +10 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_helper.py +60 -49
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +15 -4
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +28 -18
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/extensions/kitchen.py +106 -44
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine.py +170 -129
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fp8_utils.py +24 -12
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_indices_converter.py +16 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +16 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_pad_routing_map.py +23 -4
- megatron_core-0.14.0rc0/megatron/core/hyper_comm_grid.py +239 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_context.py +29 -8
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/dynamic_engine.py +98 -25
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/static_engine.py +17 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/inference_request.py +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +74 -21
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/jit.py +10 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_spec.py +6 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/backends.py +4 -4
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_layer_specs.py +5 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rope_utils.py +3 -9
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_layer_specs.py +15 -14
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +6 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/clip_model.py +19 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/module.py +20 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/qwen_model.py +20 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_spec.py +2 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_attention.py +4 -7
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_spec.py +2 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_spec.py +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/vision/radio.py +18 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/vision/vit_layer_specs.py +6 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/distrib_optimizer.py +50 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer.py +59 -14
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer_config.py +5 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/package_info.py +2 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/parallel_state.py +148 -139
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/layers.py +23 -11
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/quantization/quant_config.py +22 -15
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/rerun_state_machine.py +82 -83
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_context_parallel.py +8 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +5 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_mixer.py +55 -40
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/layers.py +47 -41
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/mappings.py +9 -5
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/utils.py +6 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/timers.py +6 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/cuda_graphs.py +53 -34
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/mlp.py +6 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_latent_attention.py +2 -1
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_config.py +96 -69
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/utils.py +91 -27
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0/megatron_core.egg-info}/PKG-INFO +4 -3
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron_core.egg-info/SOURCES.txt +1 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron_core.egg-info/requires.txt +5 -2
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/pyproject.toml +7 -6
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/LICENSE +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/MANIFEST.in +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/README.md +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/README.md +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/config_logger.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.cpp +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/multimodal_dataset.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/object_storage_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/factory.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_object_storage.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_s3.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/core.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/optimizer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/data_parallel_base.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/finalize_model_grads.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/energy_monitor.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/enums.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/data_type.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/export_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/model_type.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_type.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/extensions/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_dropout.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_cross_entropy.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_layer_norm.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_softmax.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/async_stream.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/common_inference_params.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/communication_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/base_context.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/static_context.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/abstract_engine.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/mcore_engine.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/sampling_params.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/scheduler.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/inference_params.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/model_parallel_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/T5/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/bert/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_lm_head.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/bert/pooler.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/language_module.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/moe_module_specs.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/base_configs.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/base.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/audio.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/base.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/vision.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/context_parallel.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/base_attention.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_attention.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/retro/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/vision/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/vision/clip_vit_model.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/models/vision/multimodal_projector.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/msc_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/num_microbatches_calculator.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/clip_grads.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer/grad_scaler.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/optimizer_param_scheduler.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/packed_seq_params.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/schedules.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/process_groups_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/quantization/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/quantization/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/requirements.txt +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_block.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_layer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/mlp_layer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/ssm/triton_cache_manager.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/data.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/random.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/attention.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/dot_product_attention.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/enums.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/identity_op.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/module.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/__init__.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/experts.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/fused_a2a.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_layer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/router.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/shared_experts.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_token_prediction.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/spec_utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_layer_norm.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_norm.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_block.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_layer.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/transformer/utils.py +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron_core.egg-info/dependency_links.txt +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron_core.egg-info/top_level.txt +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/setup.cfg +0 -0
- {megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/setup.py +0 -0
{megatron_core-0.13.0rc2/megatron_core.egg-info → megatron_core-0.14.0rc0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.13.0rc2
+Version: 0.14.0rc0
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -30,6 +30,7 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: torch
+Requires-Dist: numpy<2.0.0
 Provides-Extra: mlm
 Requires-Dist: flask-restful; extra == "mlm"
 Requires-Dist: sentencepiece; extra == "mlm"
@@ -40,16 +41,16 @@ Requires-Dist: tqdm; extra == "dev"
 Requires-Dist: einops; extra == "dev"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "dev"
 Requires-Dist: nvtx; extra == "dev"
-Requires-Dist: numpy<2.0.0; extra == "dev"
 Requires-Dist: transformers; extra == "dev"
 Requires-Dist: multi-storage-client; extra == "dev"
 Requires-Dist: setuptools<80.0.0; extra == "dev"
+Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+Requires-Dist: megatron-energon[av_decode]<7; extra == "dev"
 Provides-Extra: lts
 Requires-Dist: tqdm; extra == "lts"
 Requires-Dist: einops; extra == "lts"
 Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
 Requires-Dist: nvtx; extra == "lts"
-Requires-Dist: numpy<2.0.0; extra == "lts"
 Requires-Dist: transformers; extra == "lts"
 Requires-Dist: zarr; extra == "lts"
 Requires-Dist: setuptools<80.0.0; extra == "lts"
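The substantive metadata change here is that numpy<2.0.0 moves from the dev/lts extras into the unconditional requirements, alongside torch. A quick way to check an environment against the new pin (illustrative snippet, not part of the package):

    import numpy

    # megatron-core 0.14.0rc0 declares "numpy<2.0.0" as a core requirement,
    # so a NumPy 2.x install no longer satisfies the package metadata.
    major = int(numpy.__version__.split(".")[0])
    assert major < 2, f"numpy {numpy.__version__} does not satisfy numpy<2.0.0"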
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/bert_dataset.py
RENAMED
@@ -31,16 +31,13 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
     """The BERT dataset that assumes WordPiece tokenization
 
     Args:
-        indexed_dataset (IndexedDataset): The IndexedDataset around which
-
+        indexed_dataset (IndexedDataset): The IndexedDataset around which
+            to build the MegatronDataset
         dataset_path (str): The real path on disk to the dataset, for bookkeeping
-
         indexed_indices (numpy.ndarray): The set of the documents indices to expose
-
-
-
+        num_samples (Optional[int]): The number of samples to draw from the indexed dataset.
+            When None, build as many samples as correspond to one epoch.
         index_split (Split): The indexed_indices Split
-
         config (BERTMaskedWordPieceDatasetConfig): The config
     """
 
@@ -83,6 +80,7 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
         Returns:
             Dict[str, Union[int, numpy.ndarray]]: The
         """
+
         idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
         sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
         numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)
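The `__getitem__` context above shows the per-sample RNG convention these datasets rely on: a fresh numpy RandomState seeded from the configured seed plus the sample index, so masking decisions are reproducible per sample and independent of iteration order. A standalone sketch of the pattern (the helper name is illustrative, not from the package):

    import numpy

    def make_sample_rng(base_seed: int, idx: int) -> numpy.random.RandomState:
        # RandomState seeds must fit in 32 bits, hence the modulo.
        return numpy.random.RandomState(seed=(base_seed + idx) % 2**32)

    # The same (seed, idx) pair always reproduces the same draws.
    assert make_sample_rng(1234, 7).randint(100) == make_sample_rng(1234, 7).randint(100)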
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_dataset.py
RENAMED
@@ -80,7 +80,7 @@ class BlendedDataset(torch.utils.data.Dataset):
             unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
         )
         self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
+            self.unique_description.encode("utf-8"), usedforsecurity=False
         ).hexdigest()
 
         self.dataset_index, self.dataset_sample_index = self._build_indices()
@@ -103,6 +103,7 @@ class BlendedDataset(torch.utils.data.Dataset):
         Returns:
             Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
         """
+
         path_to_cache = self.config.path_to_cache
 
         if path_to_cache:
@@ -192,7 +193,7 @@ class BlendedDataset(torch.utils.data.Dataset):
                 logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
             )
             t_beg = time.time()
-            dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode=
+            dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode="r")
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
 
@@ -203,7 +204,7 @@ class BlendedDataset(torch.utils.data.Dataset):
             )
             t_beg = time.time()
             dataset_sample_index = numpy.load(
-                path_to_dataset_sample_index, allow_pickle=True, mmap_mode=
+                path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r"
             )
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
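`BlendedDataset` (and `MegatronDataset` below) now pass `usedforsecurity=False` to `hashlib.md5`. The flag, available since Python 3.9, declares the digest a non-cryptographic fingerprint, which keeps cache-key hashing working on FIPS-restricted builds where a plain `hashlib.md5()` call can raise. A minimal sketch of the pattern:

    import hashlib
    import json

    description = json.dumps({"class": "BlendedDataset", "split": "train"}, indent=4)
    # The digest is only a cache key, so opt out of FIPS enforcement.
    cache_key = hashlib.md5(description.encode("utf-8"), usedforsecurity=False).hexdigest()
    print(cache_key)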
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_builder.py
RENAMED
@@ -529,6 +529,7 @@ def _get_size_per_split_per_dataset(
     Returns:
         List[List[int]]: The number of samples to request per MegatronDataset per split
     """
+
     assert numpy.isclose(sum(normalized_weights), 1.0)
 
     # Use margin as buffer to ensure we satiate the request
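Note the tolerance-based check in the surrounding function: blend weights arrive as floats, so `numpy.isclose` is used rather than exact equality against 1.0. For example:

    import numpy

    normalized_weights = [0.1, 0.2, 0.7]
    # Exact comparison is brittle: the float sum may differ from 1.0 by an ulp.
    assert numpy.isclose(sum(normalized_weights), 1.0)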
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/gpt_dataset.py
RENAMED
@@ -19,6 +19,7 @@ from megatron.core.utils import log_single_rank
 
 logger = logging.getLogger(__name__)
 
+
 _PAD_TOKEN_ID = -1
 
 
@@ -356,7 +357,6 @@ class GPTDataset(MegatronDataset):
             not cache_hit
             and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0)
         ):
-
             log_single_rank(
                 logger,
                 logging.INFO,
@@ -494,7 +494,7 @@ class GPTDataset(MegatronDataset):
                 f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
             )
             t_beg = time.time()
-            document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode=
+            document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode="r")
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
 
@@ -504,7 +504,7 @@ class GPTDataset(MegatronDataset):
                 f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
             )
             t_beg = time.time()
-            sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode=
+            sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r")
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
 
@@ -514,7 +514,7 @@ class GPTDataset(MegatronDataset):
                 f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
             )
             t_beg = time.time()
-            shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode=
+            shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode="r")
             t_end = time.time()
             log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
 
@@ -575,6 +575,7 @@ def _build_document_index(
     Returns:
         numpy.ndarray: The document index
     """
+
     if not separate_final_epoch or num_epochs == 1:
         document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1]
         document_index[:] = documents
@@ -604,6 +605,7 @@ def _build_shuffle_index(
     Returns:
         numpy.ndarray: The shuffle index
     """
+
     dtype_ = numpy.uint32
     if total_size >= (numpy.iinfo(numpy.uint32).max - 1):
         dtype_ = numpy.int64
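All three cache loads in `GPTDataset` now spell out `mmap_mode="r"`, so the document, sample, and shuffle indices are memory-mapped read-only rather than materialized in RAM. A small illustration of the behavior (file path is arbitrary):

    import numpy

    numpy.save("/tmp/sample_index.npy", numpy.arange(1_000_000, dtype=numpy.int64))

    # Returns a numpy.memmap: pages are faulted in lazily as slices are touched,
    # which matters when many ranks open the same multi-gigabyte index file.
    index = numpy.load("/tmp/sample_index.npy", mmap_mode="r")
    print(type(index).__name__, index[:4])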
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.py
RENAMED
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
-import numpy
 
 # Implicit imports for backwards compatibility
 # Explicit imports for readability
+import numpy
+
 from megatron.core.datasets.helpers_cpp import *
 from megatron.core.datasets.helpers_cpp import build_sample_idx_int32, build_sample_idx_int64
 
@@ -39,6 +40,7 @@ def build_sample_idx(
     Returns:
         numpy.ndarray: The 2-D sample index
     """
+
     sample_idx_max = max(document_indices.shape[0], sizes.max())
     if sample_idx_max <= numpy.iinfo(numpy.int32).max:
         sample_idx = build_sample_idx_int32(
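`build_sample_idx` picks the 32-bit or 64-bit compiled builder based on the largest value the index must store, halving index memory in the common case. A pure-Python sketch of the dispatch (the `build_sample_idx_int32/int64` kernels are compiled C++ helpers, so only the dtype choice is shown; the function name is illustrative):

    import numpy

    def choose_index_dtype(num_documents: int, max_size: int):
        # Fall back to int64 only when a value could overflow signed 32 bits.
        if max(num_documents, max_size) <= numpy.iinfo(numpy.int32).max:
            return numpy.int32
        return numpy.int64

    assert choose_index_dtype(10_000, 2_048) is numpy.int32
    assert choose_index_dtype(10_000, 3_000_000_000) is numpy.int64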
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/indexed_dataset.py
RENAMED
@@ -17,12 +17,13 @@ from itertools import accumulate
 from types import TracebackType
 from typing import List, Optional, Tuple, Type, Union
 
+import numpy
+
 try:
     import boto3
 except ModuleNotFoundError:
     pass
 
-import numpy
 import torch
 
 from megatron.core.datasets.object_storage_utils import S3Config  # pylint: disable=unused-import
@@ -204,7 +205,7 @@ class _IndexWriter(object):
 
         # the mode per sequence
         if sequence_modes is not None:
-            self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order=
+            self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))
 
     def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
         """Build the sequence pointers per the sequence lengths and dtype size
@@ -234,7 +235,6 @@ class _IndexReader(object):
     """
 
     def __init__(self, idx_path: str, multimodal: bool) -> None:
-
        log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")
 
        with open(idx_path, "rb") as stream:
@@ -435,11 +435,11 @@ class _FileBinReader(_BinReader):
         sequence = numpy.empty(count, dtype=dtype)
         if MultiStorageClientFeature.is_enabled():
             msc = MultiStorageClientFeature.import_package()
-            with msc.open(self._bin_path, mode=
+            with msc.open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
                 bin_buffer_file.seek(offset)
                 bin_buffer_file.readinto(sequence)
         else:
-            with open(self._bin_path, mode=
+            with open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
                 bin_buffer_file.seek(offset)
                 bin_buffer_file.readinto(sequence)
         return sequence
@@ -520,8 +520,8 @@ class _S3BinReader(_BinReader):
             Bucket=self._s3_bucket,
             Key=self._s3_key,
             # Subtract 1, because the end of Range is inclusive.
-            Range=f
-        )[
+            Range=f"bytes={bytes_start}-{bytes_end - 1}",
+        )["Body"].read()
         self._cache_bytes_start = bytes_start
         self._cache_bytes_end = bytes_end
         return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype)
@@ -551,7 +551,7 @@ class _MultiStorageClientBinReader(_BinReader):
 
 
 # Map of object storage access to the corresponding bin reader
-OBJECT_STORAGE_BIN_READERS = {
+OBJECT_STORAGE_BIN_READERS = {"s3": _S3BinReader, "msc": _MultiStorageClientBinReader}
 
 
 class IndexedDataset(torch.utils.data.Dataset):
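`_S3BinReader` reads token data by byte range instead of downloading whole `.bin` shards; the `bytes_end - 1` reflects that the HTTP Range header is inclusive on both ends, unlike the half-open offsets used internally. A hedged boto3 sketch with placeholder bucket and key:

    import boto3

    client = boto3.client("s3")
    bytes_start, bytes_end = 4096, 8192  # half-open [start, end)
    response = client.get_object(
        Bucket="my-bucket",  # placeholder
        Key="dataset/shard-000.bin",  # placeholder
        # Subtract 1 because the end of an HTTP Range is inclusive.
        Range=f"bytes={bytes_start}-{bytes_end - 1}",
    )
    payload = response["Body"].read()
    assert len(payload) == bytes_end - bytes_start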
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/masked_dataset.py
RENAMED
@@ -355,7 +355,6 @@ class MaskedWordPieceDataset(MegatronDataset):
         numpy_random_state.shuffle(candidate_ngrams)
 
         if self.config.masking_do_permutation:
-
             n_swappings = n_maskings
 
             permuted_indices = set()
@@ -417,7 +416,7 @@ class MaskedWordPieceDataset(MegatronDataset):
 
         masked_spans = sorted(masked_spans, key=lambda x: x[0][0])
 
-        return masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans
+        return (masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans)
 
     @abstractmethod
     def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_dataset.py
RENAMED
@@ -63,7 +63,7 @@ class MegatronDataset(ABC, torch.utils.data.Dataset):
             self.unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
         )
         self.unique_description_hash = hashlib.md5(
-            self.unique_description.encode("utf-8")
+            self.unique_description.encode("utf-8"), usedforsecurity=False
         ).hexdigest()
 
     @staticmethod
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_tokenizer.py
RENAMED
@@ -20,7 +20,6 @@ class MegatronTokenizer(ABC):
     """
 
     def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
-
         self.unique_identifiers = OrderedDict()
         self.unique_identifiers["class"] = type(self).__name__
         self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/bert_embedders.py
RENAMED
@@ -4,7 +4,6 @@
 
 import abc
 from dataclasses import dataclass
-from typing import Any
 
 import numpy as np
 import torch
@@ -22,7 +21,9 @@ class Embedder(abc.ABC):
     """Embed a text dataset.
 
     Args:
-        text_dataset (torch.utils.data.Dataset): Text dataset to embed.
+        text_dataset (torch.utils.data.Dataset): Text dataset to embed.
+            Each sample of the text dataset should output a dict with a key 'text'
+            and a string value.
 
     Returns:
         A 2D ndarray with shape (len(text_dataset), dimension(embedder)).
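The expanded `Embedder` docstring pins down the input contract: every sample must be a dict carrying a string under the key 'text'. A minimal dataset satisfying that contract could look like this (class name is illustrative, not from the package):

    import torch

    class ListTextDataset(torch.utils.data.Dataset):
        """Wraps a list of strings in the dict-with-'text'-key contract."""

        def __init__(self, texts):
            self.texts = list(texts)

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            return {"text": self.texts[idx]}

    dataset = ListTextDataset(["retrieval chunk one", "retrieval chunk two"])
    assert dataset[0]["text"] == "retrieval chunk one"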
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/build.py
RENAMED
@@ -11,7 +11,6 @@ Building a chunk database consists of.
 - Save chunk offsets to disk for each indexed dataset.
 """
 
-import glob
 import os
 import types
 from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -19,11 +18,9 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import torch
-from tqdm import tqdm
 
 from megatron.core.datasets.indexed_dataset import IndexedDataset
 from megatron.core.datasets.retro.config import RetroPreprocessingConfig
-from megatron.core.datasets.retro.external_libs import h5py
 from megatron.core.datasets.retro.utils import (
     extract_data_config,
     get_blocks_by_rank,
@@ -40,10 +37,23 @@ from .utils import (
     get_individual_doc_offsets,
     get_merged_db_path_map,
     init_indexed_dataset_infos,
-    load_indexed_datasets,
     save_indexed_dataset_infos,
 )
 
+try:
+    from tqdm import tqdm
+
+    HAVE_TQDM = True
+except ImportError:
+    HAVE_TQDM = False
+
+try:
+    import h5py
+
+    HAVE_H5PY = True
+except ImportError:
+    HAVE_H5PY = False
+
 
 def build_partial_db(
     config: types.SimpleNamespace,
@@ -64,7 +74,8 @@ def build_partial_db(
     from each document.
 
     Args:
-        config (types.SimpleNamespace): Subset of Retro config, containing
+        config (types.SimpleNamespace): Subset of Retro config, containing
+            'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
         dataset_idx (int): Index of this dataset out of all blended datasets.
         n_datasets (int): Total number of blended datasets.
         indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
@@ -83,6 +94,9 @@ def build_partial_db(
         - Dict mapping document ID to number of valid chunks.
     """
 
+    if not HAVE_TQDM:
+        raise ImportError("tqdm is required to use the RetroDataset. Please install tqdm.")
+
     # Document start/end indexes.
     doc_range = block["range"]
     n_docs = doc_range[1] - doc_range[0]
@@ -111,7 +125,6 @@ def build_partial_db(
     chunk_db_invalid: List[Tuple] = []
     doc_size_map = {}
     for doc_id in pbar:
-
         # Progress description.
         try:
             pbar.set_description(
@@ -142,7 +155,6 @@ def build_partial_db(
         # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid').
         doc_size_map[doc_id] = 0
         for i, chunk_start_idx in enumerate(chunk_start_idxs):
-
             # Re-tokenize.
             chunk_end_idx = chunk_end_idxs[i]
             gpt_token_ids = indexed_dataset.get(
@@ -176,12 +188,13 @@ def build_block_db(
     """Split each document within block into consecutive retro_gpt_chunk_length size chunks.
 
     Args:
-        config (RetroPreprocessingConfig): For DB building, we make use of attributes
+        config (RetroPreprocessingConfig): For DB building, we make use of attributes
+            'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
         dataset_idx (int): Index of this dataset out of all blended datasets.
         n_datasets (int): Total number of blended datasets.
         indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
         n_procs (int): Total number of parallel processes.
-
+        executor (ProcessPoolExecutor): Executor for launching parallel processes.
         n_missing_blocks (int): Total number of blocks to be processed.
         block_idx (int): Block index out of all blocks to be processed.
         block (dict): Range information such as start/end points for chunking idnexed dataset.
@@ -195,7 +208,7 @@ def build_block_db(
     """
 
     # Build partial dbs.
-    log_retro_rank_0(
+    log_retro_rank_0(" > build partial dbs.")
     futures = []
     for proc_id in range(n_procs):  # not true process id
         futures.append(
@@ -232,7 +245,7 @@ def build_block_db(
     ]
 
     # Convert to numpy.
-    log_retro_rank_0(
+    log_retro_rank_0(" > converting chunk db to numpy.")
     chunk_db_valid = np.array(chunk_db_valid, dtype="uint32")
     chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32")
 
@@ -261,6 +274,9 @@ def save_block_db(
         chunk_db_invalid (np.ndarray): Array of invalid chunk indexes.
         doc_offsets (np.ndarray): Array of document offsets by chunks.
     """
+    if not HAVE_H5PY:
+        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
     log_retro_rank_0(" > saving individual db.")
     with h5py.File(block["path"], "w") as f:
         dset = f.create_dataset("chunks_valid", data=chunk_db_valid)
@@ -277,7 +293,8 @@ def build_individual_db(
         config (RetroPreprocessingConfig): Retro preprocessing config.
         dataset_idx (int): Dataset index within blended dataset.
         n_datasets (int): Total number of datasets within blended dataset.
-        dataset_info (dict): Metadata for dataset
+        dataset_info (dict): Metadata for dataset
+            (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
     """
 
     # Make directory.
@@ -323,9 +340,7 @@ def build_individual_db(
     # Process documents in parallel.
     with ProcessPoolExecutor(max_workers=n_procs) as executor:
         for block_idx, block in enumerate(active_blocks):
-
             if block is not None:
-
                 # Build block DB.
                 chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db(
                     config=config,
@@ -349,7 +364,6 @@ def build_individual_db(
                 )
 
             else:
-
                 # Load existing block DB.
                 with h5py.File(block["path"]) as f:
                     existing_chunks_valid = np.copy(f["chunks_valid"])
@@ -382,7 +396,6 @@ def build_individual_dbs(
     # Build individual DBs.
     log_retro_rank_0(" > build individual chunk dbs.")
    for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-
         # Progress.
         log_retro_rank_0(
             " > building individual db, dataset %d / %d ... '%s'."
@@ -400,7 +413,8 @@ def update_chunk_counts(
 
     Args:
         config (RetroPreprocessingConfig): Retro preprocessing config.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
     """
 
     if torch.distributed.get_rank() != 0:
@@ -416,7 +430,6 @@ def update_chunk_counts(
     # Set n_chunks (including n_chunks_sampled for unambiguity).
     log_retro_rank_0(" > compute n_chunks.")
     for ds_index, ds_info in enumerate(indexed_dataset_infos):
-
         db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"])
 
         # Update counts.
@@ -457,10 +470,14 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)
 
     Args:
         project_dir (str): Retro project dir.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
         db_type (str): DB type (e.g., 'sampled', 'train', or 'valid').
     """
 
+    if not HAVE_H5PY:
+        raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
     if torch.distributed.get_rank() != 0:
         return
 
@@ -489,9 +506,7 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)
 
     # Delete existing chunk db if incorrect size.
     if os.path.exists(db_path):
-
         try:
-
             f = h5py.File(db_path)
             n_alloc = len(f["chunks"])  # total allocated
             n_written = f["n_written"][0].item()  # total written
@@ -511,7 +526,6 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)
 
     # Build merged chunk db.
     if not os.path.exists(db_path):
-
         os.makedirs(os.path.dirname(db_path), exist_ok=True)
         f = h5py.File(db_path, "w")
 
@@ -589,7 +603,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non
 
     Args:
         project_dir (str): Retro project dir.
-        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+        indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+            (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
     """
     merge_dbs(project_dir, indexed_dataset_infos, "sampled")
     merge_dbs(project_dir, indexed_dataset_infos, "train")
@@ -599,7 +614,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non
 def build_db(config: RetroPreprocessingConfig) -> None:
     """Extract token chunks from each indexed dataset.
 
-    Iterate each document of each indexed dataset, extract that document's chunks,
+    Iterate each document of each indexed dataset, extract that document's chunks,
+    and save to a 'DB' (hdf5 file).
 
     Args:
         config (RetroPreprocessingConfig): Retro preprocessing config.
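The recurring refactor across the retro modules (here and in `dataset.py` below) swaps hard `tqdm`/`h5py` imports, and the old `external_libs` indirection, for guarded imports plus a `HAVE_*` flag, deferring the ImportError to the call site that actually needs the library. The pattern in isolation, with an illustrative function name:

    try:
        import h5py

        HAVE_H5PY = True
    except ImportError:
        HAVE_H5PY = False

    def save_chunks(path, chunks):
        # Importing this module stays cheap; users pay for h5py only
        # when they actually write a chunk database.
        if not HAVE_H5PY:
            raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
        with h5py.File(path, "w") as f:
            f.create_dataset("chunks", data=chunks)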
{megatron_core-0.13.0rc2 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/dataset.py
RENAMED
@@ -10,10 +10,16 @@ from typing import List
 
 import numpy as np
 import torch
-from tqdm import tqdm
 
 from megatron.core.datasets.indexed_dataset import IndexedDataset
 
+try:
+    from tqdm import tqdm
+
+    HAVE_TQDM = True
+except ImportError:
+    HAVE_TQDM = False
+
 
 class DBDataset(torch.utils.data.Dataset):
     """Dataset for iterating chunks.
@@ -21,7 +27,8 @@ class DBDataset(torch.utils.data.Dataset):
     Args:
         db_path (str): Path of HDF5-format chunk database.
         indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
-        chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets.
+        chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets.
+            Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
         chunk_length (int): Max GPT chunk length (e.g., 64).
         eod_token_id (int): EOD token ID.
     """
@@ -34,7 +41,6 @@ class DBDataset(torch.utils.data.Dataset):
         chunk_length: int,
         eod_token_id: int,
     ):
-
         assert chunks.shape[1] == 5, (
             "expected 5 columns (dataset_idx, "
             "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); "
@@ -93,6 +99,9 @@ class DBDataset(torch.utils.data.Dataset):
         Load the dataset id & document id of each chunk in the database, to
         be used for causality filtering during querying.
         """
+        if not HAVE_TQDM:
+            raise ImportError("tqdm is required to use the DBDataset. Please install tqdm.")
+
         self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32")
         block_size = int(1e6)
         for start_idx in tqdm(