megatron-core 0.16.0rc0.dev100285__tar.gz → 0.16.0rc0.dev100785__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. Click here for more details.

Files changed (360):
  1. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/dynamic_context.py +80 -1
  3. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/engines/dynamic_engine.py +72 -0
  4. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/package_info.py +1 -1
  5. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/attention.py +14 -3
  6. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/cuda_graphs.py +5 -1
  7. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/pipeline_parallel_layer_layout.py +5 -2
  8. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron_core.egg-info/PKG-INFO +1 -1
  9. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/MANIFEST.in +0 -0
  10. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/README.md +0 -0
  11. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/README.md +0 -0
  12. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/__init__.py +0 -0
  13. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/activations.py +0 -0
  14. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/config.py +0 -0
  15. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/config_logger.py +0 -0
  16. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/__init__.py +0 -0
  17. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/bert_dataset.py +0 -0
  18. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/blended_dataset.py +0 -0
  19. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  20. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  21. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/gpt_dataset.py +0 -0
  22. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/helpers.cpp +0 -0
  23. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/helpers.py +0 -0
  24. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/indexed_dataset.py +0 -0
  25. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/masked_dataset.py +0 -0
  26. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/megatron_dataset.py +0 -0
  27. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  28. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/multimodal_dataset.py +0 -0
  29. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/object_storage_utils.py +0 -0
  30. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/__init__.py +0 -0
  31. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/config/__init__.py +0 -0
  32. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  33. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/config/config.py +0 -0
  34. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  35. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  36. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/db/__init__.py +0 -0
  37. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/db/build.py +0 -0
  38. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/db/dataset.py +0 -0
  39. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/db/utils.py +0 -0
  40. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/external_libs.py +0 -0
  41. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/__init__.py +0 -0
  42. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/build.py +0 -0
  43. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/factory.py +0 -0
  44. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/index.py +0 -0
  45. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  46. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  47. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  48. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/utils.py +0 -0
  49. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/index/validate.py +0 -0
  50. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/__init__.py +0 -0
  51. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  52. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  53. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/query.py +0 -0
  54. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  55. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/query/utils.py +0 -0
  56. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/retro/utils.py +0 -0
  57. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/t5_dataset.py +0 -0
  58. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/utils.py +0 -0
  59. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/datasets/utils_s3.py +0 -0
  60. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/__init__.py +0 -0
  61. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/core.py +0 -0
  62. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  63. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  64. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/mapping.py +0 -0
  65. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  66. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/serialization.py +0 -0
  67. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  68. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  69. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  70. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  71. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  72. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  73. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  74. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  75. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  76. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  77. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  78. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  79. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  80. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  81. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  82. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  83. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/utils.py +0 -0
  84. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/dist_checkpointing/validation.py +0 -0
  85. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/__init__.py +0 -0
  86. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/data_parallel_base.py +0 -0
  87. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  88. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  89. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/finalize_model_grads.py +0 -0
  90. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/__init__.py +0 -0
  91. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  92. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  93. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  94. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  95. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  96. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  97. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  98. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  99. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  100. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  101. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  102. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  103. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  104. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  105. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/energy_monitor.py +0 -0
  106. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/enums.py +0 -0
  107. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/__init__.py +0 -0
  108. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/data_type.py +0 -0
  109. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/export_config.py +0 -0
  110. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/model_type.py +0 -0
  111. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/__init__.py +0 -0
  112. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  113. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  114. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  115. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  116. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  117. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  118. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  119. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  120. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  121. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  122. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  123. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  124. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/extensions/__init__.py +0 -0
  125. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/extensions/kitchen.py +0 -0
  126. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/extensions/transformer_engine.py +0 -0
  127. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  128. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fp4_utils.py +0 -0
  129. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fp8_utils.py +0 -0
  130. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/full_cuda_graph.py +0 -0
  131. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/__init__.py +0 -0
  132. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  133. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  134. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  135. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  136. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  137. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_indices_converter.py +0 -0
  138. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_layer_norm.py +0 -0
  139. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  140. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  141. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_softmax.py +0 -0
  142. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  143. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/hyper_comm_grid.py +0 -0
  144. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/__init__.py +0 -0
  145. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/async_stream.py +0 -0
  146. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/common_inference_params.py +0 -0
  147. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/communication_utils.py +0 -0
  148. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/__init__.py +0 -0
  149. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
  150. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
  151. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/base_context.py +0 -0
  152. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  153. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
  154. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/contexts/static_context.py +0 -0
  155. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  156. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/engines/__init__.py +0 -0
  157. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/engines/abstract_engine.py +0 -0
  158. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/engines/mcore_engine.py +0 -0
  159. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/engines/static_engine.py +0 -0
  160. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/headers.py +0 -0
  161. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/inference_client.py +0 -0
  162. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/inference_request.py +0 -0
  163. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  164. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  165. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  166. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  167. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  168. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  169. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  170. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  171. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/sampling_params.py +0 -0
  172. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/scheduler.py +0 -0
  173. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  174. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  175. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  176. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
  177. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  178. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  179. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  180. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  181. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  182. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  183. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  184. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/unified_memory.py +0 -0
  185. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference/utils.py +0 -0
  186. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/inference_params.py +0 -0
  187. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/jit.py +0 -0
  188. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/model_parallel_config.py +0 -0
  189. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/T5/__init__.py +0 -0
  190. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/T5/t5_model.py +0 -0
  191. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/T5/t5_spec.py +0 -0
  192. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/__init__.py +0 -0
  193. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/backends.py +0 -0
  194. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/bert/__init__.py +0 -0
  195. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  196. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/bert/bert_lm_head.py +0 -0
  197. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/bert/bert_model.py +0 -0
  198. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/bert/pooler.py +0 -0
  199. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/__init__.py +0 -0
  200. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/__init__.py +0 -0
  201. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  202. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  203. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  204. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  205. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  206. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/language_module/__init__.py +0 -0
  207. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/language_module/language_module.py +0 -0
  208. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  209. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/vision_module/__init__.py +0 -0
  210. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  211. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/__init__.py +0 -0
  212. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  213. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  214. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/gpt_model.py +0 -0
  215. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  216. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  217. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/huggingface/__init__.py +0 -0
  218. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/huggingface/clip_model.py +0 -0
  219. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/huggingface/module.py +0 -0
  220. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/huggingface/qwen_model.py +0 -0
  221. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mamba/__init__.py +0 -0
  222. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  223. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mamba/mamba_model.py +0 -0
  224. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/__init__.py +0 -0
  225. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/config/__init__.py +0 -0
  226. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/config/base_configs.py +0 -0
  227. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/model/__init__.py +0 -0
  228. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/model/base.py +0 -0
  229. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/submodules/audio.py +0 -0
  230. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/submodules/base.py +0 -0
  231. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/mimo/submodules/vision.py +0 -0
  232. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/multimodal/__init__.py +0 -0
  233. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/multimodal/context_parallel.py +0 -0
  234. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/multimodal/llava_model.py +0 -0
  235. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/multimodal/llava_spec.py +0 -0
  236. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/__init__.py +0 -0
  237. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/base_attention.py +0 -0
  238. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/config.py +0 -0
  239. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/decoder_attention.py +0 -0
  240. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/decoder_spec.py +0 -0
  241. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/encoder_attention.py +0 -0
  242. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/encoder_spec.py +0 -0
  243. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/model.py +0 -0
  244. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/retro/utils.py +0 -0
  245. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/vision/__init__.py +0 -0
  246. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/vision/clip_vit_model.py +0 -0
  247. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/vision/multimodal_projector.py +0 -0
  248. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/vision/radio.py +0 -0
  249. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  250. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/msc_utils.py +0 -0
  251. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/nccl_allocator.py +0 -0
  252. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/num_microbatches_calculator.py +0 -0
  253. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/__init__.py +0 -0
  254. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/clip_grads.py +0 -0
  255. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  256. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  257. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  258. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/grad_scaler.py +0 -0
  259. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/optimizer.py +0 -0
  260. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer/optimizer_config.py +0 -0
  261. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/optimizer_param_scheduler.py +0 -0
  262. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/packed_seq_params.py +0 -0
  263. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/parallel_state.py +0 -0
  264. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/__init__.py +0 -0
  265. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  266. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  267. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  268. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/schedules.py +0 -0
  269. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/pipeline_parallel/utils.py +0 -0
  270. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/__init__.py +0 -0
  271. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/__init__.py +0 -0
  272. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  273. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  274. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  275. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/layers.py +0 -0
  276. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  277. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  278. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/process_groups_config.py +0 -0
  279. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/quantization/__init__.py +0 -0
  280. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/quantization/quant_config.py +0 -0
  281. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/quantization/utils.py +0 -0
  282. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/requirements.txt +0 -0
  283. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/rerun_state_machine.py +0 -0
  284. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/safe_globals.py +0 -0
  285. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/__init__.py +0 -0
  286. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mamba_block.py +0 -0
  287. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  288. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  289. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mamba_layer.py +0 -0
  290. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mamba_mixer.py +0 -0
  291. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/mlp_layer.py +0 -0
  292. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/ssm/triton_cache_manager.py +0 -0
  293. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/__init__.py +0 -0
  294. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  295. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/data.py +0 -0
  296. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/layers.py +0 -0
  297. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/mappings.py +0 -0
  298. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/random.py +0 -0
  299. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tensor_parallel/utils.py +0 -0
  300. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/timers.py +0 -0
  301. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/__init__.py +0 -0
  302. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  303. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  304. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/__init__.py +0 -0
  305. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  306. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  307. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  308. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  309. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  314. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  315. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  318. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  323. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/__init__.py +0 -0
  324. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  325. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  326. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/dot_product_attention.py +0 -0
  327. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/enums.py +0 -0
  328. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  329. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  330. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  331. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/identity_op.py +0 -0
  332. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/mlp.py +0 -0
  333. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/module.py +0 -0
  334. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/__init__.py +0 -0
  335. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/experts.py +0 -0
  336. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  337. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  338. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/moe_layer.py +0 -0
  339. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/moe_utils.py +0 -0
  340. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/router.py +0 -0
  341. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/shared_experts.py +0 -0
  342. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  343. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  344. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/multi_latent_attention.py +0 -0
  345. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/multi_token_prediction.py +0 -0
  346. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/spec_utils.py +0 -0
  347. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/torch_layer_norm.py +0 -0
  348. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/torch_norm.py +0 -0
  349. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/transformer_block.py +0 -0
  350. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/transformer_config.py +0 -0
  351. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/transformer_layer.py +0 -0
  352. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/transformer/utils.py +0 -0
  353. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron/core/utils.py +0 -0
  354. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron_core.egg-info/SOURCES.txt +0 -0
  355. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron_core.egg-info/dependency_links.txt +0 -0
  356. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron_core.egg-info/requires.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/megatron_core.egg-info/top_level.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/pyproject.toml +0 -0
  359. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/setup.cfg +0 -0
  360. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev100785}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev100285
3
+ Version: 0.16.0rc0.dev100785
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -4,7 +4,7 @@ import math
4
4
  import warnings
5
5
  from contextlib import nullcontext
6
6
  from enum import Enum
7
- from typing import List, Optional, Tuple
7
+ from typing import TYPE_CHECKING, List, Optional, Tuple
8
8
 
9
9
  import torch
10
10
  import torch.nn.functional as F
@@ -49,6 +49,17 @@ try:
49
49
  except ImportError:
50
50
  HAVE_FLASHINFER = False
51
51
 
52
+ try:
53
+ import wandb # pylint: disable=unused-import
54
+
55
+ HAVE_WANDB = True
56
+ except ImportError:
57
+ HAVE_WANDB = False
58
+ wandb = None
59
+
60
+ if TYPE_CHECKING:
61
+ import wandb as WandbModule
62
+
52
63
 
53
64
  class ContextOverflowError(Exception):
54
65
  """Base exception for when a new request does not fit.
@@ -226,6 +237,7 @@ class DynamicInferenceContext(BaseInferenceContext):
226
237
  levels will be included to control other tensors within the context.
227
238
  use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
228
239
  If None, defaults to using flash-infer if available.
240
+ metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
229
241
  """
230
242
 
231
243
  def __init__(
@@ -251,6 +263,7 @@ class DynamicInferenceContext(BaseInferenceContext):
251
263
  use_cuda_graphs_for_non_decode_steps: bool = True,
252
264
  use_flashinfer_fused_rope: bool = False,
253
265
  unified_memory_level: Optional[int] = 0,
266
+ metrics_writer: Optional['WandbModule'] = None,
254
267
  ):
255
268
  super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
256
269
 
@@ -260,6 +273,8 @@ class DynamicInferenceContext(BaseInferenceContext):
260
273
  block_size_tokens == 64
261
274
  ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
262
275
 
276
+ self.metrics_writer = metrics_writer
277
+
263
278
  # Per partition num heads and hidden size.
264
279
  projection_size = kv_channels * num_attention_heads
265
280
  if tensor_model_parallel_size is None:
@@ -1569,3 +1584,67 @@ class DynamicInferenceContext(BaseInferenceContext):
1569
1584
 
1570
1585
  # Convert each log prob tensor into a list
1571
1586
  return [lp.tolist() for lp in selected_log_probs_list]
1587
+
1588
+ def get_kvcache_utilization_stats(self) -> dict:
1589
+ """Compute KV cache buffer utilization stats for the current step.
1590
+
1591
+ Returns a dictionary with counts and percentages for both allocated block
1592
+ usage (overall buffer occupancy) and active usage (blocks referenced by
1593
+ currently active requests this step).
1594
+
1595
+ Return:
1596
+ {
1597
+ 'total_blocks': int,
1598
+ 'allocated_blocks': int,
1599
+ 'active_unique_blocks': int,
1600
+ 'allocated_utilization': float,
1601
+ 'active_utilization': float,
1602
+ 'active_request_count': int,
1603
+ 'paused_request_count': int,
1604
+ 'gtd_block_count': int,
1605
+ }
1606
+ """
1607
+ # Total usable blocks exclude the reserved dummy block.
1608
+ total_blocks = max(self.block_allocator.block_count_total - 1, 1)
1609
+ block_count_avail = int(self.block_allocator.block_count_avail)
1610
+
1611
+ # Overall allocated blocks in the buffer right now.
1612
+ allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail
1613
+ allocated_blocks = int(max(0, allocated_blocks))
1614
+
1615
+ # Active unique blocks referenced by current active requests only.
1616
+ active_start = self.paused_request_count
1617
+ active_end = self.total_request_count
1618
+ if active_end > active_start:
1619
+ active_rows = self.request_to_kv_block_ids[active_start:active_end]
1620
+ # Filter valid block ids (>= 0) and count unique ids.
1621
+ valid_ids = active_rows[active_rows >= 0]
1622
+ if valid_ids.numel() > 0:
1623
+ unique_ids = torch.unique(valid_ids)
1624
+ active_unique_blocks = int(unique_ids.numel())
1625
+ else:
1626
+ active_unique_blocks = 0
1627
+ else:
1628
+ active_unique_blocks = 0
1629
+
1630
+ allocated_utilization = float(allocated_blocks) / float(total_blocks)
1631
+ active_utilization = float(active_unique_blocks) / float(total_blocks)
1632
+
1633
+ # Diagnostic helpers
1634
+ num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count))
1635
+ total_request_count = int(self.total_request_count)
1636
+ return {
1637
+ 'total_blocks': int(total_blocks),
1638
+ 'allocated_blocks': int(allocated_blocks),
1639
+ 'active_unique_blocks': int(active_unique_blocks),
1640
+ 'allocated_utilization': allocated_utilization,
1641
+ 'active_utilization': active_utilization,
1642
+ 'active_request_count': int(self.get_active_request_count()),
1643
+ 'paused_request_count': int(self.paused_request_count),
1644
+ 'gtd_block_count': int(self.gtd_block_count),
1645
+ 'block_count_avail': int(block_count_avail),
1646
+ 'num_non_gtd_blocks': int(num_non_gtd_blocks),
1647
+ 'active_token_count': int(self.active_token_count),
1648
+ 'total_request_count': int(total_request_count),
1649
+ 'max_requests': int(self.max_requests),
1650
+ }
@@ -57,6 +57,14 @@ try:
57
57
  except:
58
58
  HAVE_MSGPACK = False
59
59
 
60
+ try:
61
+ import wandb
62
+
63
+ HAVE_WANDB = True
64
+ except ImportError:
65
+ HAVE_WANDB = False
66
+ wandb = None
67
+
60
68
 
61
69
  def format_mem_bytes(mem_bytes):
62
70
  """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes."""
@@ -89,6 +97,8 @@ class DynamicInferenceEngine(AbstractEngine):
89
97
  static_sampling (bool): If True, all requests are assumed to have the same
90
98
  sampling parameters. This avoids needing to loop through all requests and
91
99
  their sampling parameters every generation step, improving latency.
100
+ inference_logging_step_interval (int): The step interval at which to log
101
+ inference metrics to wandb. Defaults to 0, which means no logging.
92
102
  """
93
103
 
94
104
  def __init__(
@@ -101,6 +111,7 @@ class DynamicInferenceEngine(AbstractEngine):
101
111
  track_paused_request_events: bool = False,
102
112
  enable_chunked_prefill: bool = True,
103
113
  static_sampling: bool = False,
114
+ inference_logging_step_interval: int = 0,
104
115
  ):
105
116
 
106
117
  if enable_cuda_graph is not None:
@@ -137,6 +148,32 @@ class DynamicInferenceEngine(AbstractEngine):
137
148
  self.enable_chunked_prefill = enable_chunked_prefill
138
149
  self.static_sampling = static_sampling
139
150
 
151
+ self.inference_logging_step_interval = inference_logging_step_interval
152
+ # Configure wandb to use separate step counter for inference metrics (only once)
153
+ if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
154
+ logging.info(
155
+ f"\033[1;93m[INFERENCE]\033[0m "
156
+ f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m"
157
+ )
158
+ if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
159
+ # Make all inference/* metrics use inference_step as their x-axis
160
+ # This allows inference and training to have independent step counters
161
+ context.metrics_writer.define_metric(
162
+ "inference/*", step_metric="inference/inference_step"
163
+ )
164
+ # Initialize inference step offset by querying existing run history
165
+ self.inference_step_offset = 0
166
+ if wandb.run is not None:
167
+ api_run = wandb.Api().run(
168
+ f"{wandb.run.entity}/{wandb.run.project}/{wandb.run.id}"
169
+ )
170
+ max_step = 0
171
+ for row in api_run.scan_history(keys=["inference/inference_step"]):
172
+ val = row.get("inference/inference_step")
173
+ if isinstance(val, (int, float)) and int(val) > max_step:
174
+ max_step = int(val)
175
+ self.inference_step_offset = int(max_step)
176
+
140
177
  # Initialize the asyncio loop if it has not already been initialized.
141
178
  # TODO: Start the engine loop here.
142
179
  self._loop = get_asyncio_loop()
@@ -780,6 +817,41 @@ class DynamicInferenceEngine(AbstractEngine):
780
817
  self.request_completion_futures[failed_request_id].set_result(failed_request)
781
818
  self.failed_request_ids.clear()
782
819
 
820
+ # Log KV cache utilization stats to W&B
821
+ if (
822
+ self.inference_logging_step_interval > 0
823
+ and self.step_count > 0
824
+ and self.step_count % self.inference_logging_step_interval == 0
825
+ and self.context.metrics_writer is not None
826
+ ):
827
+
828
+ # Get KV cache utilization stats from dynamic context
829
+ kv_stats = self.context.get_kvcache_utilization_stats()
830
+
831
+ # Prepare metrics dictionary with all stats
832
+ # Use 'inference/' prefix for all metrics to separate from training metrics
833
+ metrics = {
834
+ 'inference/inference_step': int(self.inference_step_offset + int(self.step_count)),
835
+ 'inference/step_time_s': float(step_time),
836
+ 'inference/waiting_queue_len': int(len(self.waiting_request_ids)),
837
+ 'inference/total_requests_dict_size': int(len(self.requests)),
838
+ }
839
+ # Add KV stats with inference/ prefix
840
+ # Convert utilization metrics from 0-1 range to 0-100 percentage range for better visualization
841
+ for key, value in kv_stats.items():
842
+ if 'utilization' in key:
843
+ # Convert to percentage (0-100) and group under kvcache_utilization
844
+ metrics[f'inference/{key}'] = float(value * 100.0)
845
+ else:
846
+ metrics[f'inference/{key}'] = value
847
+
848
+ if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
849
+ self.context.metrics_writer.log(metrics, commit=True)
850
+ else:
851
+ raise ValueError(
852
+ f"Unsupported metrics writer type: {type(self.context.metrics_writer)}"
853
+ )
854
+
783
855
  # Print context state.
784
856
  if verbose:
785
857
  context = self.context
@@ -4,7 +4,7 @@
4
4
  MAJOR = 0
5
5
  MINOR = 16
6
6
  PATCH = 0
7
- PRE_RELEASE = 'rc0.dev100285'
7
+ PRE_RELEASE = 'rc0.dev100785'
8
8
 
9
9
  # Use the following formatting: (major, minor, patch, pre-release)
10
10
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -48,15 +48,26 @@ except ImportError:
48
48
  rearrange = None
49
49
 
50
50
  try:
51
- from flashattn_hopper.flash_attn_interface import _flash_attn_forward
52
- from flashattn_hopper.flash_attn_interface import (
51
+ from flash_attn_3.flash_attn_interface import _flash_attn_forward
52
+ from flash_attn_3.flash_attn_interface import (
53
53
  flash_attn_with_kvcache as flash_attn3_with_kvcache,
54
54
  )
55
55
 
56
56
  HAVE_FA3 = True
57
- except:
57
+ except ImportError as e:
58
58
  HAVE_FA3 = False
59
59
 
60
+ if not HAVE_FA3:
61
+ try:
62
+ from flashattn_hopper.flash_attn_interface import _flash_attn_forward
63
+ from flashattn_hopper.flash_attn_interface import (
64
+ flash_attn_with_kvcache as flash_attn3_with_kvcache,
65
+ )
66
+
67
+ HAVE_FA3 = True
68
+ except ImportError as e:
69
+ pass
70
+
60
71
  try:
61
72
  from flash_mla import flash_mla_with_kvcache, get_mla_metadata
62
73
 
@@ -1182,7 +1182,11 @@ class CudaGraphManager(torch.nn.Module):
1182
1182
 
1183
1183
  if runner is None:
1184
1184
  if _CudagraphGlobalRecord.cudagraph_created:
1185
- assert False
1185
+ assert False, (
1186
+ f"`cudagraph_created` is set to True but no matching cudagraph "
1187
+ f"runners were found. This module has {len(self.cudagraph_runners)} "
1188
+ f"existing runners. Use `get_mismatch_errors` to debug mismatches."
1189
+ )
1186
1190
  else:
1187
1191
  runner = _CudaGraphRunner(
1188
1192
  megatron_module,
@@ -15,8 +15,11 @@ logger = logging.getLogger(__name__)
15
15
  class PipelineParallelLayerLayout:
16
16
  """Configuration of custom pipeline parallel layer partitioning."""
17
17
 
18
- def __repr__(self):
19
- return self.input_data
18
+ def __repr__(self) -> str:
19
+ if isinstance(self.input_data, str):
20
+ return self.input_data
21
+ else:
22
+ return str(self.input_data)
20
23
 
21
24
  def __init__(self, layout: str | list, pipeline_model_parallel_size: int):
22
25
  """Initialize PipelineParallelLayerLayout from a list or a str.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev100285
3
+ Version: 0.16.0rc0.dev100785
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>