megatron-core 0.13.0rc1__tar.gz → 0.14.0rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (307)
  1. {megatron_core-0.13.0rc1/megatron_core.egg-info → megatron_core-0.14.0rc0}/PKG-INFO +4 -3
  2. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/bert_dataset.py +5 -7
  3. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_dataset.py +4 -3
  4. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_builder.py +1 -0
  5. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/gpt_dataset.py +6 -4
  6. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.py +3 -1
  7. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/indexed_dataset.py +8 -8
  8. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/masked_dataset.py +1 -2
  9. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_dataset.py +1 -1
  10. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/megatron_tokenizer.py +0 -1
  11. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/bert_embedders.py +3 -2
  12. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/build.py +40 -24
  13. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/dataset.py +12 -3
  14. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/utils.py +42 -11
  15. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/external_libs.py +1 -3
  16. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/build.py +31 -5
  17. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/index.py +26 -9
  18. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_base.py +34 -5
  19. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +54 -9
  20. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/validate.py +15 -12
  21. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +14 -6
  22. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/query.py +71 -15
  23. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/retro_dataset.py +21 -8
  24. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/utils.py +21 -5
  25. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/t5_dataset.py +2 -2
  26. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils.py +8 -3
  27. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/dict_utils.py +14 -14
  28. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/exchange_utils.py +35 -32
  29. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/mapping.py +54 -52
  30. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/serialization.py +13 -2
  31. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +36 -27
  32. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/resharding.py +9 -7
  33. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/tensorstore.py +26 -12
  34. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/zarr.py +62 -47
  35. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +38 -21
  36. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/utils.py +14 -1
  37. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/validation.py +38 -37
  38. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/__init__.py +4 -1
  39. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/fully_sharded_data_parallel.py +1 -0
  40. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/param_and_grad_buffer.py +1 -1
  41. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/param_and_grad_buffer.py +25 -20
  42. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +37 -25
  43. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_config.py +10 -1
  44. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_helper.py +60 -49
  45. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +15 -4
  46. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +28 -18
  47. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/kitchen.py +106 -44
  48. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine.py +170 -129
  49. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fp8_utils.py +24 -12
  50. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_indices_converter.py +16 -6
  51. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +16 -6
  52. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_pad_routing_map.py +23 -4
  53. megatron_core-0.14.0rc0/megatron/core/hyper_comm_grid.py +239 -0
  54. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_context.py +29 -8
  55. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/dynamic_engine.py +98 -25
  56. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/static_engine.py +17 -1
  57. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/inference_request.py +1 -0
  58. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +1 -0
  59. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +74 -21
  60. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/jit.py +10 -2
  61. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_spec.py +6 -5
  62. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/backends.py +4 -4
  63. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_layer_specs.py +5 -5
  64. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rope_utils.py +3 -9
  65. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_layer_specs.py +15 -14
  66. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +6 -5
  67. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/clip_model.py +19 -3
  68. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/module.py +20 -1
  69. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/qwen_model.py +20 -3
  70. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_spec.py +2 -1
  71. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_attention.py +4 -7
  72. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/decoder_spec.py +2 -1
  73. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_spec.py +1 -0
  74. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/radio.py +18 -6
  75. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/vit_layer_specs.py +6 -5
  76. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/distrib_optimizer.py +50 -1
  77. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer.py +59 -14
  78. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/optimizer_config.py +5 -1
  79. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/package_info.py +2 -2
  80. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/parallel_state.py +148 -139
  81. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/layers.py +23 -11
  82. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/quant_config.py +22 -15
  83. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/rerun_state_machine.py +82 -83
  84. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_context_parallel.py +8 -3
  85. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +5 -5
  86. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_mixer.py +55 -40
  87. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/layers.py +47 -41
  88. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/mappings.py +9 -5
  89. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/utils.py +6 -3
  90. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/timers.py +6 -3
  91. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/cuda_graphs.py +53 -34
  92. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/mlp.py +6 -6
  93. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_latent_attention.py +2 -1
  94. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_config.py +96 -69
  95. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/utils.py +91 -27
  96. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0/megatron_core.egg-info}/PKG-INFO +4 -3
  97. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/SOURCES.txt +1 -0
  98. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/requires.txt +5 -2
  99. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/pyproject.toml +7 -6
  100. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/LICENSE +0 -0
  101. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/MANIFEST.in +0 -0
  102. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/README.md +0 -0
  103. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/README.md +0 -0
  104. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/__init__.py +0 -0
  105. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/config.py +0 -0
  106. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/config_logger.py +0 -0
  107. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/__init__.py +0 -0
  108. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  109. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/helpers.cpp +0 -0
  110. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/multimodal_dataset.py +0 -0
  111. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/object_storage_utils.py +0 -0
  112. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/__init__.py +0 -0
  113. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/__init__.py +0 -0
  114. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/config.py +0 -0
  115. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  116. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  117. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/db/__init__.py +0 -0
  118. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/__init__.py +0 -0
  119. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/factory.py +0 -0
  120. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  121. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/index/utils.py +0 -0
  122. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/__init__.py +0 -0
  123. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  124. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/retro/query/utils.py +0 -0
  125. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_object_storage.py +0 -0
  126. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/datasets/utils_s3.py +0 -0
  127. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/__init__.py +0 -0
  128. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/core.py +0 -0
  129. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  130. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  131. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  132. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  133. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  134. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  135. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  136. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  137. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  138. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  139. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  140. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/custom_fsdp/__init__.py +0 -0
  141. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/data_parallel_base.py +0 -0
  142. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  143. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  144. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/finalize_model_grads.py +0 -0
  145. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  146. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  147. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/energy_monitor.py +0 -0
  148. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/enums.py +0 -0
  149. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/__init__.py +0 -0
  150. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/data_type.py +0 -0
  151. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/export_config.py +0 -0
  152. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/model_type.py +0 -0
  153. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/__init__.py +0 -0
  154. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  155. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  156. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  157. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  158. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  159. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  160. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  161. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/__init__.py +0 -0
  162. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  163. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/__init__.py +0 -0
  164. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  165. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  166. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  167. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  168. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  169. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_layer_norm.py +0 -0
  170. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/fusions/fused_softmax.py +0 -0
  171. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/__init__.py +0 -0
  172. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/async_stream.py +0 -0
  173. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/common_inference_params.py +0 -0
  174. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/communication_utils.py +0 -0
  175. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/__init__.py +0 -0
  176. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/base_context.py +0 -0
  177. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/dynamic_chunk_allocator.py +0 -0
  178. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/contexts/static_context.py +0 -0
  179. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/__init__.py +0 -0
  180. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/abstract_engine.py +0 -0
  181. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/engines/mcore_engine.py +0 -0
  182. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  183. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  184. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  185. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  186. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  187. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  188. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  189. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/sampling_params.py +0 -0
  190. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/scheduler.py +0 -0
  191. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  192. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  193. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  194. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  195. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference/utils.py +0 -0
  196. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/inference_params.py +0 -0
  197. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/model_parallel_config.py +0 -0
  198. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/__init__.py +0 -0
  199. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/T5/t5_model.py +0 -0
  200. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/__init__.py +0 -0
  201. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/__init__.py +0 -0
  202. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_lm_head.py +0 -0
  203. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/bert_model.py +0 -0
  204. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/bert/pooler.py +0 -0
  205. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/__init__.py +0 -0
  206. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/__init__.py +0 -0
  207. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  208. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  209. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  210. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  211. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/__init__.py +0 -0
  212. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/language_module/language_module.py +0 -0
  213. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/__init__.py +0 -0
  214. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  215. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/__init__.py +0 -0
  216. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  217. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/gpt_model.py +0 -0
  218. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  219. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/huggingface/__init__.py +0 -0
  220. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/__init__.py +0 -0
  221. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  222. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mamba/mamba_model.py +0 -0
  223. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/__init__.py +0 -0
  224. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/__init__.py +0 -0
  225. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/config/base_configs.py +0 -0
  226. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/__init__.py +0 -0
  227. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/model/base.py +0 -0
  228. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/audio.py +0 -0
  229. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/base.py +0 -0
  230. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/mimo/submodules/vision.py +0 -0
  231. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/__init__.py +0 -0
  232. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/context_parallel.py +0 -0
  233. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/multimodal/llava_model.py +0 -0
  234. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/__init__.py +0 -0
  235. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/base_attention.py +0 -0
  236. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/config.py +0 -0
  237. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/encoder_attention.py +0 -0
  238. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/model.py +0 -0
  239. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/retro/utils.py +0 -0
  240. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/__init__.py +0 -0
  241. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/clip_vit_model.py +0 -0
  242. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/models/vision/multimodal_projector.py +0 -0
  243. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/msc_utils.py +0 -0
  244. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/num_microbatches_calculator.py +0 -0
  245. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/__init__.py +0 -0
  246. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/clip_grads.py +0 -0
  247. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  248. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  249. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer/grad_scaler.py +0 -0
  250. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/optimizer_param_scheduler.py +0 -0
  251. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/packed_seq_params.py +0 -0
  252. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/__init__.py +0 -0
  253. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  254. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/schedules.py +0 -0
  255. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/pipeline_parallel/utils.py +0 -0
  256. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/__init__.py +0 -0
  257. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/__init__.py +0 -0
  258. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  259. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  260. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  261. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  262. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  263. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/process_groups_config.py +0 -0
  264. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/__init__.py +0 -0
  265. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/quantization/utils.py +0 -0
  266. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/requirements.txt +0 -0
  267. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/__init__.py +0 -0
  268. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_block.py +0 -0
  269. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mamba_layer.py +0 -0
  270. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/mlp_layer.py +0 -0
  271. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/ssm/triton_cache_manager.py +0 -0
  272. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/__init__.py +0 -0
  273. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  274. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/data.py +0 -0
  275. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/tensor_parallel/random.py +0 -0
  276. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/__init__.py +0 -0
  277. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/attention.py +0 -0
  278. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  279. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  280. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/dot_product_attention.py +0 -0
  281. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/enums.py +0 -0
  282. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  283. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  284. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/identity_op.py +0 -0
  285. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/module.py +0 -0
  286. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/__init__.py +0 -0
  287. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/experts.py +0 -0
  288. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  289. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  290. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_layer.py +0 -0
  291. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/moe_utils.py +0 -0
  292. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/router.py +0 -0
  293. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/shared_experts.py +0 -0
  294. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  295. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  296. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/multi_token_prediction.py +0 -0
  297. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  298. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/spec_utils.py +0 -0
  299. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_layer_norm.py +0 -0
  300. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/torch_norm.py +0 -0
  301. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_block.py +0 -0
  302. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/transformer_layer.py +0 -0
  303. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron/core/transformer/utils.py +0 -0
  304. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/dependency_links.txt +0 -0
  305. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/megatron_core.egg-info/top_level.txt +0 -0
  306. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/setup.cfg +0 -0
  307. {megatron_core-0.13.0rc1 → megatron_core-0.14.0rc0}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.13.0rc1
+ Version: 0.14.0rc0
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -30,6 +30,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: torch
+ Requires-Dist: numpy<2.0.0
  Provides-Extra: mlm
  Requires-Dist: flask-restful; extra == "mlm"
  Requires-Dist: sentencepiece; extra == "mlm"
@@ -40,16 +41,16 @@ Requires-Dist: tqdm; extra == "dev"
  Requires-Dist: einops; extra == "dev"
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "dev"
  Requires-Dist: nvtx; extra == "dev"
- Requires-Dist: numpy<2.0.0; extra == "dev"
  Requires-Dist: transformers; extra == "dev"
  Requires-Dist: multi-storage-client; extra == "dev"
  Requires-Dist: setuptools<80.0.0; extra == "dev"
+ Requires-Dist: nvidia-modelopt[torch]; sys_platform != "darwin" and extra == "dev"
+ Requires-Dist: megatron-energon[av_decode]<7; extra == "dev"
  Provides-Extra: lts
  Requires-Dist: tqdm; extra == "lts"
  Requires-Dist: einops; extra == "lts"
  Requires-Dist: tensorstore!=0.1.46,!=0.1.72; extra == "lts"
  Requires-Dist: nvtx; extra == "lts"
- Requires-Dist: numpy<2.0.0; extra == "lts"
  Requires-Dist: transformers; extra == "lts"
  Requires-Dist: zarr; extra == "lts"
  Requires-Dist: setuptools<80.0.0; extra == "lts"
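
The notable metadata change above is that numpy<2.0.0 moves from the "dev" and "lts" extras into the core requirements, so every install of 0.14.0rc0 pins NumPy below 2.0. A minimal sketch, not part of the package, for checking whether an existing environment already satisfies the new constraint before upgrading:

    # Hypothetical pre-upgrade check; assumes "packaging" and numpy are installed.
    from importlib.metadata import version
    from packaging.specifiers import SpecifierSet

    installed = version("numpy")
    print(installed, installed in SpecifierSet("<2.0.0"))  # True means compatible
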
@@ -31,16 +31,13 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
  """The BERT dataset that assumes WordPiece tokenization

  Args:
- indexed_dataset (IndexedDataset): The IndexedDataset around which to build the MegatronDataset
-
+ indexed_dataset (IndexedDataset): The IndexedDataset around which
+ to build the MegatronDataset
  dataset_path (str): The real path on disk to the dataset, for bookkeeping
-
  indexed_indices (numpy.ndarray): The set of the documents indices to expose
-
- num_samples (Optional[int]): The number of samples to draw from the indexed dataset. When None, build as many samples as correspond to one epoch.
-
+ num_samples (Optional[int]): The number of samples to draw from the indexed dataset.
+ When None, build as many samples as correspond to one epoch.
  index_split (Split): The indexed_indices Split
-
  config (BERTMaskedWordPieceDatasetConfig): The config
  """

@@ -83,6 +80,7 @@ class BERTMaskedWordPieceDataset(MaskedWordPieceDataset):
  Returns:
  Dict[str, Union[int, numpy.ndarray]]: The
  """
+
  idx_beg, idx_end, target_sequence_length = self.sample_index[idx]
  sample = [self.dataset[i] for i in range(idx_beg, idx_end)]
  numpy_random_state = numpy.random.RandomState(seed=(self.config.random_seed + idx) % 2**32)
@@ -80,7 +80,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
  )
  self.unique_description_hash = hashlib.md5(
- self.unique_description.encode("utf-8")
+ self.unique_description.encode("utf-8"), usedforsecurity=False
  ).hexdigest()

  self.dataset_index, self.dataset_sample_index = self._build_indices()
@@ -103,6 +103,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  Returns:
  Tuple[numpy.ndarray, numpy.ndarray]: The dataset index and the dataset sample index
  """
+
  path_to_cache = self.config.path_to_cache

  if path_to_cache:
@@ -192,7 +193,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  logger, logging.INFO, f"\tLoad the dataset index from {path_to_dataset_index}"
  )
  t_beg = time.time()
- dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode='r')
+ dataset_index = numpy.load(path_to_dataset_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -203,7 +204,7 @@ class BlendedDataset(torch.utils.data.Dataset):
  )
  t_beg = time.time()
  dataset_sample_index = numpy.load(
- path_to_dataset_sample_index, allow_pickle=True, mmap_mode='r'
+ path_to_dataset_sample_index, allow_pickle=True, mmap_mode="r"
  )
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")
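
The hashlib change above keeps the cache-key digest working on FIPS-restricted builds, where a plain hashlib.md5(...) call can raise. A minimal standalone sketch of the same pattern (the description string here is made up):

    import hashlib

    description = '{"class": "BlendedDataset", "split": "train"}'  # illustrative value only

    # usedforsecurity=False (Python 3.9+) declares the digest non-cryptographic,
    # so it is permitted even under FIPS-restricted OpenSSL builds.
    cache_key = hashlib.md5(description.encode("utf-8"), usedforsecurity=False).hexdigest()
    print(cache_key)
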
@@ -529,6 +529,7 @@ def _get_size_per_split_per_dataset(
  Returns:
  List[List[int]]: The number of samples to request per MegatronDataset per split
  """
+
  assert numpy.isclose(sum(normalized_weights), 1.0)

  # Use margin as buffer to ensure we satiate the request
@@ -19,6 +19,7 @@ from megatron.core.utils import log_single_rank

  logger = logging.getLogger(__name__)

+
  _PAD_TOKEN_ID = -1


@@ -356,7 +357,6 @@ class GPTDataset(MegatronDataset):
  not cache_hit
  and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0)
  ):
-
  log_single_rank(
  logger,
  logging.INFO,
@@ -494,7 +494,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the document index from {os.path.basename(path_to_document_index)}",
  )
  t_beg = time.time()
- document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode='r')
+ document_index = numpy.load(path_to_document_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -504,7 +504,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the sample index from {os.path.basename(path_to_sample_index)}",
  )
  t_beg = time.time()
- sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode='r')
+ sample_index = numpy.load(path_to_sample_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -514,7 +514,7 @@ class GPTDataset(MegatronDataset):
  f"\tLoad the shuffle index from {os.path.basename(path_to_shuffle_index)}",
  )
  t_beg = time.time()
- shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode='r')
+ shuffle_index = numpy.load(path_to_shuffle_index, allow_pickle=True, mmap_mode="r")
  t_end = time.time()
  log_single_rank(logger, logging.DEBUG, f"\t> time elapsed: {t_end - t_beg:4f} seconds")

@@ -575,6 +575,7 @@ def _build_document_index(
  Returns:
  numpy.ndarray: The document index
  """
+
  if not separate_final_epoch or num_epochs == 1:
  document_index = numpy.mgrid[0:num_epochs, 0 : len(documents)][1]
  document_index[:] = documents
@@ -604,6 +605,7 @@ def _build_shuffle_index(
  Returns:
  numpy.ndarray: The shuffle index
  """
+
  dtype_ = numpy.uint32
  if total_size >= (numpy.iinfo(numpy.uint32).max - 1):
  dtype_ = numpy.int64
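
These hunks only normalize quote style, but the indices they touch are all reopened with numpy.load(..., mmap_mode="r"). A small self-contained sketch of that memory-mapped load (paths and values are made up):

    import os
    import tempfile
    import numpy

    # Save a toy index, then reopen it memory-mapped the way the dataset indices above are.
    path = os.path.join(tempfile.mkdtemp(), "document_index.npy")
    numpy.save(path, numpy.arange(10, dtype=numpy.int64))

    document_index = numpy.load(path, allow_pickle=True, mmap_mode="r")
    print(type(document_index), document_index[:3])  # numpy.memmap; pages are read lazily
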
@@ -1,9 +1,10 @@
  # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

- import numpy

  # Implicit imports for backwards compatibility
  # Explicit imports for readability
+ import numpy
+
  from megatron.core.datasets.helpers_cpp import *
  from megatron.core.datasets.helpers_cpp import build_sample_idx_int32, build_sample_idx_int64

@@ -39,6 +40,7 @@ def build_sample_idx(
  Returns:
  numpy.ndarray: The 2-D sample index
  """
+
  sample_idx_max = max(document_indices.shape[0], sizes.max())
  if sample_idx_max <= numpy.iinfo(numpy.int32).max:
  sample_idx = build_sample_idx_int32(
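
build_sample_idx picks a 32-bit or 64-bit index builder depending on whether the largest index fits in int32. A rough standalone illustration of that dtype switch (the helper name is invented for the example):

    import numpy

    def choose_index_dtype(max_value: int):
        # Same idea as the int32/int64 branch above: use the narrow dtype when it fits.
        if max_value <= numpy.iinfo(numpy.int32).max:
            return numpy.int32
        return numpy.int64

    print(choose_index_dtype(2_000_000_000))  # int32
    print(choose_index_dtype(3_000_000_000))  # int64
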
@@ -17,12 +17,13 @@ from itertools import accumulate
  from types import TracebackType
  from typing import List, Optional, Tuple, Type, Union

+ import numpy
+
  try:
  import boto3
  except ModuleNotFoundError:
  pass

- import numpy
  import torch

  from megatron.core.datasets.object_storage_utils import S3Config # pylint: disable=unused-import
@@ -204,7 +205,7 @@ class _IndexWriter(object):

  # the mode per sequence
  if sequence_modes is not None:
- self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order='C'))
+ self.idx_writer.write(numpy.array(sequence_modes, dtype=numpy.int8).tobytes(order="C"))

  def _sequence_pointers(self, sequence_lengths: List[int]) -> List[int]:
  """Build the sequence pointers per the sequence lengths and dtype size
@@ -234,7 +235,6 @@ class _IndexReader(object):
  """

  def __init__(self, idx_path: str, multimodal: bool) -> None:
-
  log_single_rank(logger, logging.INFO, f"Load the {type(self).__name__} from {idx_path}")

  with open(idx_path, "rb") as stream:
@@ -435,11 +435,11 @@ class _FileBinReader(_BinReader):
  sequence = numpy.empty(count, dtype=dtype)
  if MultiStorageClientFeature.is_enabled():
  msc = MultiStorageClientFeature.import_package()
- with msc.open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file:
+ with msc.open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
  bin_buffer_file.seek(offset)
  bin_buffer_file.readinto(sequence)
  else:
- with open(self._bin_path, mode='rb', buffering=0) as bin_buffer_file:
+ with open(self._bin_path, mode="rb", buffering=0) as bin_buffer_file:
  bin_buffer_file.seek(offset)
  bin_buffer_file.readinto(sequence)
  return sequence
@@ -520,8 +520,8 @@ class _S3BinReader(_BinReader):
  Bucket=self._s3_bucket,
  Key=self._s3_key,
  # Subtract 1, because the end of Range is inclusive.
- Range=f'bytes={bytes_start}-{bytes_end-1}',
- )['Body'].read()
+ Range=f"bytes={bytes_start}-{bytes_end - 1}",
+ )["Body"].read()
  self._cache_bytes_start = bytes_start
  self._cache_bytes_end = bytes_end
  return numpy.frombuffer(self._extract_from_cache(offset, size), dtype=dtype)
@@ -551,7 +551,7 @@ class _MultiStorageClientBinReader(_BinReader):


  # Map of object storage access to the corresponding bin reader
- OBJECT_STORAGE_BIN_READERS = {'s3': _S3BinReader, 'msc': _MultiStorageClientBinReader}
+ OBJECT_STORAGE_BIN_READERS = {"s3": _S3BinReader, "msc": _MultiStorageClientBinReader}


  class IndexedDataset(torch.utils.data.Dataset):
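
The _S3BinReader hunk is again only a quote-style change, but it shows the ranged S3 read the reader relies on. A minimal standalone sketch of that access pattern (bucket, key, and helper name are hypothetical):

    import boto3  # optional dependency; only needed on the S3 reader path

    def read_byte_range(bucket: str, key: str, start: int, end: int) -> bytes:
        """Fetch bytes [start, end) of an S3 object; the Range header's end offset is inclusive."""
        client = boto3.client("s3")
        response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end - 1}")
        return response["Body"].read()
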
@@ -355,7 +355,6 @@ class MaskedWordPieceDataset(MegatronDataset):
  numpy_random_state.shuffle(candidate_ngrams)

  if self.config.masking_do_permutation:
-
  n_swappings = n_maskings

  permuted_indices = set()
@@ -417,7 +416,7 @@ class MaskedWordPieceDataset(MegatronDataset):

  masked_spans = sorted(masked_spans, key=lambda x: x[0][0])

- return masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans
+ return (masked_token_ids, masked_positions, masked_labels, boundaries, masked_spans)

  @abstractmethod
  def _get_token_mask(self, numpy_random_state: numpy.random.RandomState) -> Optional[int]:
@@ -63,7 +63,7 @@ class MegatronDataset(ABC, torch.utils.data.Dataset):
  self.unique_identifiers, indent=4, default=lambda obj: obj.unique_identifiers
  )
  self.unique_description_hash = hashlib.md5(
- self.unique_description.encode("utf-8")
+ self.unique_description.encode("utf-8"), usedforsecurity=False
  ).hexdigest()

  @staticmethod
@@ -20,7 +20,6 @@ class MegatronTokenizer(ABC):
  """

  def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
-
  self.unique_identifiers = OrderedDict()
  self.unique_identifiers["class"] = type(self).__name__
  self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
@@ -4,7 +4,6 @@

  import abc
  from dataclasses import dataclass
- from typing import Any

  import numpy as np
  import torch
@@ -22,7 +21,9 @@ class Embedder(abc.ABC):
  """Embed a text dataset.

  Args:
- text_dataset (torch.utils.data.Dataset): Text dataset to embed. Each sample of the text dataset should output a dict with a key 'text' and a string value.
+ text_dataset (torch.utils.data.Dataset): Text dataset to embed.
+ Each sample of the text dataset should output a dict with a key 'text'
+ and a string value.

  Returns:
  A 2D ndarray with shape (len(text_dataset), dimension(embedder)).
@@ -11,7 +11,6 @@ Building a chunk database consists of.
  - Save chunk offsets to disk for each indexed dataset.
  """

- import glob
  import os
  import types
  from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -19,11 +18,9 @@ from typing import Dict, List, Tuple

  import numpy as np
  import torch
- from tqdm import tqdm

  from megatron.core.datasets.indexed_dataset import IndexedDataset
  from megatron.core.datasets.retro.config import RetroPreprocessingConfig
- from megatron.core.datasets.retro.external_libs import h5py
  from megatron.core.datasets.retro.utils import (
  extract_data_config,
  get_blocks_by_rank,
@@ -40,10 +37,23 @@ from .utils import (
  get_individual_doc_offsets,
  get_merged_db_path_map,
  init_indexed_dataset_infos,
- load_indexed_datasets,
  save_indexed_dataset_infos,
  )

+ try:
+ from tqdm import tqdm
+
+ HAVE_TQDM = True
+ except ImportError:
+ HAVE_TQDM = False
+
+ try:
+ import h5py
+
+ HAVE_H5PY = True
+ except ImportError:
+ HAVE_H5PY = False
+


  def build_partial_db(
@@ -64,7 +74,8 @@ def build_partial_db(
  from each document.

  Args:
- config (types.SimpleNamespace): Subset of Retro config, containing 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
+ config (types.SimpleNamespace): Subset of Retro config, containing
+ 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
  dataset_idx (int): Index of this dataset out of all blended datasets.
  n_datasets (int): Total number of blended datasets.
  indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
@@ -83,6 +94,9 @@ def build_partial_db(
  - Dict mapping document ID to number of valid chunks.
  """

+ if not HAVE_TQDM:
+ raise ImportError("tqdm is required to use the RetroDataset. Please install tqdm.")
+
  # Document start/end indexes.
  doc_range = block["range"]
  n_docs = doc_range[1] - doc_range[0]
@@ -111,7 +125,6 @@ def build_partial_db(
  chunk_db_invalid: List[Tuple] = []
  doc_size_map = {}
  for doc_id in pbar:
-
  # Progress description.
  try:
  pbar.set_description(
@@ -142,7 +155,6 @@ def build_partial_db(
  # Re-tokenize each chunk to Bert/Wordpiece (empty bert -> 'invalid').
  doc_size_map[doc_id] = 0
  for i, chunk_start_idx in enumerate(chunk_start_idxs):
-
  # Re-tokenize.
  chunk_end_idx = chunk_end_idxs[i]
  gpt_token_ids = indexed_dataset.get(
@@ -176,12 +188,13 @@ def build_block_db(
  """Split each document within block into consecutive retro_gpt_chunk_length size chunks.

  Args:
- config (RetroPreprocessingConfig): For DB building, we make use of attributes 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
+ config (RetroPreprocessingConfig): For DB building, we make use of attributes
+ 'chunk_length', 'gpt_eod', 'gpt_detokenize', 'bert_tokenize', and 'task_validate'.
  dataset_idx (int): Index of this dataset out of all blended datasets.
  n_datasets (int): Total number of blended datasets.
  indexed_dataset (IndexedDataset): Indexed dataset to be chunked.
  n_procs (int): Total number of parallel processes.
- executor (ProcessPoolExecutor): Executor for launching parallel processes.
+ executor (ProcessPoolExecutor): Executor for launching parallel processes.
  n_missing_blocks (int): Total number of blocks to be processed.
  block_idx (int): Block index out of all blocks to be processed.
  block (dict): Range information such as start/end points for chunking idnexed dataset.
@@ -195,7 +208,7 @@ def build_block_db(
  """

  # Build partial dbs.
- log_retro_rank_0(' > build partial dbs.')
+ log_retro_rank_0(" > build partial dbs.")
  futures = []
  for proc_id in range(n_procs): # not true process id
  futures.append(
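
build_block_db fans chunking work out over a process pool and gathers the partial DBs back as the futures complete. A stripped-down standalone sketch of that fan-out/fan-in (the worker here is a toy stand-in, not the real build_partial_db):

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def chunk_worker(block_id):
        # Stand-in for the real per-process worker; returns a (block_id, result) pair.
        return block_id, block_id * block_id

    if __name__ == "__main__":
        with ProcessPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(chunk_worker, i) for i in range(8)]
            results = dict(f.result() for f in as_completed(futures))
        print(results)
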
@@ -232,7 +245,7 @@ def build_block_db(
  ]

  # Convert to numpy.
- log_retro_rank_0(' > converting chunk db to numpy.')
+ log_retro_rank_0(" > converting chunk db to numpy.")
  chunk_db_valid = np.array(chunk_db_valid, dtype="uint32")
  chunk_db_invalid = np.array(chunk_db_invalid, dtype="uint32")

@@ -261,6 +274,9 @@ def save_block_db(
  chunk_db_invalid (np.ndarray): Array of invalid chunk indexes.
  doc_offsets (np.ndarray): Array of document offsets by chunks.
  """
+ if not HAVE_H5PY:
+ raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
  log_retro_rank_0(" > saving individual db.")
  with h5py.File(block["path"], "w") as f:
  dset = f.create_dataset("chunks_valid", data=chunk_db_valid)
@@ -277,7 +293,8 @@ def build_individual_db(
  config (RetroPreprocessingConfig): Retro preprocessing config.
  dataset_idx (int): Dataset index within blended dataset.
  n_datasets (int): Total number of datasets within blended dataset.
- dataset_info (dict): Metadata for dataset (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
+ dataset_info (dict): Metadata for dataset
+ (see `save_indexed_dataset_infos()` in `utils.py` for more detail).
  """

  # Make directory.
@@ -323,9 +340,7 @@ def build_individual_db(
  # Process documents in parallel.
  with ProcessPoolExecutor(max_workers=n_procs) as executor:
  for block_idx, block in enumerate(active_blocks):
-
  if block is not None:
-
  # Build block DB.
  chunk_db_valid, chunk_db_invalid, doc_offsets = build_block_db(
  config=config,
@@ -349,7 +364,6 @@ def build_individual_db(
  )

  else:
-
  # Load existing block DB.
  with h5py.File(block["path"]) as f:
  existing_chunks_valid = np.copy(f["chunks_valid"])
@@ -382,7 +396,6 @@ def build_individual_dbs(
  # Build individual DBs.
  log_retro_rank_0(" > build individual chunk dbs.")
  for ds_idx, ds_info in enumerate(indexed_dataset_infos):
-
  # Progress.
  log_retro_rank_0(
  " > building individual db, dataset %d / %d ... '%s'."
@@ -400,7 +413,8 @@ def update_chunk_counts(

  Args:
  config (RetroPreprocessingConfig): Retro preprocessing config.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  """

  if torch.distributed.get_rank() != 0:
@@ -416,7 +430,6 @@ def update_chunk_counts(
  # Set n_chunks (including n_chunks_sampled for unambiguity).
  log_retro_rank_0(" > compute n_chunks.")
  for ds_index, ds_info in enumerate(indexed_dataset_infos):
-
  db_paths = get_individual_db_paths(config.retro_project_dir, ds_info["prefix"])

  # Update counts.
@@ -457,10 +470,14 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  Args:
  project_dir (str): Retro project dir.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  db_type (str): DB type (e.g., 'sampled', 'train', or 'valid').
  """

+ if not HAVE_H5PY:
+ raise ImportError("h5py is required to use the RetroDataset. Please install h5py.")
+
  if torch.distributed.get_rank() != 0:
  return

@@ -489,9 +506,7 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  # Delete existing chunk db if incorrect size.
  if os.path.exists(db_path):
-
  try:
-
  f = h5py.File(db_path)
  n_alloc = len(f["chunks"]) # total allocated
  n_written = f["n_written"][0].item() # total written
@@ -511,7 +526,6 @@ def merge_dbs(project_dir: str, indexed_dataset_infos: List[Dict], db_type: str)

  # Build merged chunk db.
  if not os.path.exists(db_path):
-
  os.makedirs(os.path.dirname(db_path), exist_ok=True)
  f = h5py.File(db_path, "w")

@@ -589,7 +603,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non

  Args:
  project_dir (str): Retro project dir.
- indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
+ indexed_dataset_infos (List[Dict]): Preprocessing metadata for each dataset
+ (i.e., 'prefix', 'ratio', 'n_chunks', etc.).
  """
  merge_dbs(project_dir, indexed_dataset_infos, "sampled")
  merge_dbs(project_dir, indexed_dataset_infos, "train")
@@ -599,7 +614,8 @@ def build_merged_dbs(project_dir: str, indexed_dataset_infos: List[Dict]) -> Non
  def build_db(config: RetroPreprocessingConfig) -> None:
  """Extract token chunks from each indexed dataset.

- Iterate each document of each indexed dataset, extract that document's chunks, and save to a 'DB' (hdf5 file).
+ Iterate each document of each indexed dataset, extract that document's chunks,
+ and save to a 'DB' (hdf5 file).

  Args:
  config (RetroPreprocessingConfig): Retro preprocessing config.
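
merge_dbs decides whether to rebuild a merged chunk DB by comparing a preallocated "chunks" dataset against an "n_written" counter stored alongside it. A toy sketch of writing and re-checking that layout with h5py (the path and sizes are invented for illustration):

    import h5py
    import numpy as np

    path = "/tmp/example_chunk_db.hdf5"  # hypothetical location

    # Preallocate the chunk table and record how many rows were actually written.
    with h5py.File(path, "w") as f:
        f.create_dataset("chunks", shape=(1000, 5), dtype="uint32")
        f.create_dataset("n_written", shape=(1,), dtype="uint64")
        f["chunks"][:10] = np.ones((10, 5), dtype="uint32")
        f["n_written"][0] = 10

    # The size check mirrors the one in merge_dbs: rows allocated vs. rows written.
    with h5py.File(path) as f:
        n_alloc = len(f["chunks"])
        n_written = f["n_written"][0].item()
    print(n_alloc, n_written)  # 1000 10
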
@@ -10,10 +10,16 @@ from typing import List

  import numpy as np
  import torch
- from tqdm import tqdm

  from megatron.core.datasets.indexed_dataset import IndexedDataset

+ try:
+ from tqdm import tqdm
+
+ HAVE_TQDM = True
+ except ImportError:
+ HAVE_TQDM = False
+

  class DBDataset(torch.utils.data.Dataset):
  """Dataset for iterating chunks.
@@ -21,7 +27,8 @@ class DBDataset(torch.utils.data.Dataset):
  Args:
  db_path (str): Path of HDF5-format chunk database.
  indexed_datasets (List[IndexedDataset]): Indexed datasets used to build database.
- chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets. Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
+ chunks (np.ndarray): Array of chunk indexes, for indexing into indexed datasets.
+ Format [dataset_idx, doc_id, start_idx, end_idx, bert_length].
  chunk_length (int): Max GPT chunk length (e.g., 64).
  eod_token_id (int): EOD token ID.
  """
@@ -34,7 +41,6 @@ class DBDataset(torch.utils.data.Dataset):
  chunk_length: int,
  eod_token_id: int,
  ):
-
  assert chunks.shape[1] == 5, (
  "expected 5 columns (dataset_idx, "
  "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); "
@@ -93,6 +99,9 @@ class DBDataset(torch.utils.data.Dataset):
  Load the dataset id & document id of each chunk in the database, to
  be used for causality filtering during querying.
  """
+ if not HAVE_TQDM:
+ raise ImportError("tqdm is required to use the DBDataset. Please install tqdm.")
+
  self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32")
  block_size = int(1e6)
  for start_idx in tqdm(