megatron-core 0.16.0rc0.dev129362__tar.gz → 0.16.0rc0.dev129924__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic; see the package's registry page for more details.

Files changed (361)
  1. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/attention_context/mha_metadata.py +12 -2
  3. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/dynamic_context.py +95 -8
  4. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/dynamic_engine.py +72 -0
  5. megatron_core-0.16.0rc0.dev129924/megatron/core/inference/unified_memory.py +127 -0
  6. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/gpt_model.py +1 -2
  7. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/package_info.py +1 -1
  8. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/attention.py +14 -3
  9. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/cuda_graphs.py +5 -1
  10. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/dot_product_attention.py +2 -0
  11. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/router.py +2 -0
  12. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/pipeline_parallel_layer_layout.py +5 -2
  13. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/PKG-INFO +1 -1
  14. megatron_core-0.16.0rc0.dev129362/megatron/core/inference/unified_memory.py +0 -89
  15. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/MANIFEST.in +0 -0
  16. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/README.md +0 -0
  17. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/README.md +0 -0
  18. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/__init__.py +0 -0
  19. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/activations.py +0 -0
  20. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/config.py +0 -0
  21. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/config_logger.py +0 -0
  22. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/__init__.py +0 -0
  23. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/bert_dataset.py +0 -0
  24. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_dataset.py +0 -0
  25. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  26. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  27. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/gpt_dataset.py +0 -0
  28. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/helpers.cpp +0 -0
  29. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/helpers.py +0 -0
  30. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/indexed_dataset.py +0 -0
  31. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/masked_dataset.py +0 -0
  32. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/megatron_dataset.py +0 -0
  33. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  34. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/multimodal_dataset.py +0 -0
  35. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/object_storage_utils.py +0 -0
  36. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/__init__.py +0 -0
  37. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/__init__.py +0 -0
  38. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  39. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/config.py +0 -0
  40. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  41. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  42. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/__init__.py +0 -0
  43. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/build.py +0 -0
  44. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/dataset.py +0 -0
  45. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/db/utils.py +0 -0
  46. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/external_libs.py +0 -0
  47. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/__init__.py +0 -0
  48. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/build.py +0 -0
  49. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/factory.py +0 -0
  50. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/index.py +0 -0
  51. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  52. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  53. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  54. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/utils.py +0 -0
  55. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/index/validate.py +0 -0
  56. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/__init__.py +0 -0
  57. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  58. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  59. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/query.py +0 -0
  60. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  61. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/query/utils.py +0 -0
  62. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/retro/utils.py +0 -0
  63. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/t5_dataset.py +0 -0
  64. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/utils.py +0 -0
  65. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/datasets/utils_s3.py +0 -0
  66. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/__init__.py +0 -0
  67. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/core.py +0 -0
  68. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  69. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  70. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/mapping.py +0 -0
  71. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  72. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/serialization.py +0 -0
  73. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  74. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  75. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  76. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  77. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  78. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  79. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  80. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  81. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  82. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  83. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  84. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  85. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  86. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  87. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  88. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  89. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/utils.py +0 -0
  90. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/dist_checkpointing/validation.py +0 -0
  91. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/__init__.py +0 -0
  92. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/data_parallel_base.py +0 -0
  93. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  94. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  95. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/finalize_model_grads.py +0 -0
  96. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/__init__.py +0 -0
  97. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  98. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  99. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  100. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  101. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  102. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  103. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  104. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  105. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  106. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  107. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  108. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  109. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  110. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  111. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/energy_monitor.py +0 -0
  112. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/enums.py +0 -0
  113. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/__init__.py +0 -0
  114. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/data_type.py +0 -0
  115. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/export_config.py +0 -0
  116. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/model_type.py +0 -0
  117. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/__init__.py +0 -0
  118. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  119. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  120. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  121. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  122. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  123. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  124. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  125. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  126. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  127. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  128. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  129. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  130. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/__init__.py +0 -0
  131. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/kitchen.py +0 -0
  132. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/transformer_engine.py +0 -0
  133. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  134. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fp4_utils.py +0 -0
  135. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fp8_utils.py +0 -0
  136. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/full_cuda_graph.py +0 -0
  137. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/__init__.py +0 -0
  138. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  139. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  140. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  141. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  142. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  143. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_indices_converter.py +0 -0
  144. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_layer_norm.py +0 -0
  145. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  146. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  147. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_softmax.py +0 -0
  148. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  149. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/hyper_comm_grid.py +0 -0
  150. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/__init__.py +0 -0
  151. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/async_stream.py +0 -0
  152. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/common_inference_params.py +0 -0
  153. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/communication_utils.py +0 -0
  154. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/__init__.py +0 -0
  155. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
  156. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/base_context.py +0 -0
  157. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  158. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
  159. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/contexts/static_context.py +0 -0
  160. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  161. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/__init__.py +0 -0
  162. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/abstract_engine.py +0 -0
  163. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/mcore_engine.py +0 -0
  164. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/engines/static_engine.py +0 -0
  165. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/headers.py +0 -0
  166. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/inference_client.py +0 -0
  167. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/inference_request.py +0 -0
  168. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  169. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  170. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  171. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  172. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  173. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  174. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  175. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  176. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/sampling_params.py +0 -0
  177. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/scheduler.py +0 -0
  178. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  179. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  180. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  181. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
  182. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  183. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  184. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  185. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  186. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  187. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  188. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  189. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference/utils.py +0 -0
  190. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/inference_params.py +0 -0
  191. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/jit.py +0 -0
  192. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/model_parallel_config.py +0 -0
  193. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/__init__.py +0 -0
  194. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/t5_model.py +0 -0
  195. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/T5/t5_spec.py +0 -0
  196. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/__init__.py +0 -0
  197. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/backends.py +0 -0
  198. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/__init__.py +0 -0
  199. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  200. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_lm_head.py +0 -0
  201. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/bert_model.py +0 -0
  202. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/bert/pooler.py +0 -0
  203. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/__init__.py +0 -0
  204. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/__init__.py +0 -0
  205. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  206. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  207. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  208. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  209. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  210. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/language_module/__init__.py +0 -0
  211. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/language_module/language_module.py +0 -0
  212. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  213. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/vision_module/__init__.py +0 -0
  214. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  215. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/__init__.py +0 -0
  216. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  217. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  218. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  219. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  220. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/__init__.py +0 -0
  221. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/clip_model.py +0 -0
  222. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/module.py +0 -0
  223. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/huggingface/qwen_model.py +0 -0
  224. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/__init__.py +0 -0
  225. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  226. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mamba/mamba_model.py +0 -0
  227. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/__init__.py +0 -0
  228. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/config/__init__.py +0 -0
  229. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/config/base_configs.py +0 -0
  230. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/model/__init__.py +0 -0
  231. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/model/base.py +0 -0
  232. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/audio.py +0 -0
  233. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/base.py +0 -0
  234. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/mimo/submodules/vision.py +0 -0
  235. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/__init__.py +0 -0
  236. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/context_parallel.py +0 -0
  237. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/llava_model.py +0 -0
  238. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/multimodal/llava_spec.py +0 -0
  239. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/__init__.py +0 -0
  240. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/base_attention.py +0 -0
  241. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/config.py +0 -0
  242. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/decoder_attention.py +0 -0
  243. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/decoder_spec.py +0 -0
  244. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/encoder_attention.py +0 -0
  245. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/encoder_spec.py +0 -0
  246. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/model.py +0 -0
  247. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/retro/utils.py +0 -0
  248. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/__init__.py +0 -0
  249. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/clip_vit_model.py +0 -0
  250. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/multimodal_projector.py +0 -0
  251. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/radio.py +0 -0
  252. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  253. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/msc_utils.py +0 -0
  254. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/nccl_allocator.py +0 -0
  255. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/num_microbatches_calculator.py +0 -0
  256. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/__init__.py +0 -0
  257. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/clip_grads.py +0 -0
  258. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  259. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  260. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  261. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/grad_scaler.py +0 -0
  262. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/optimizer.py +0 -0
  263. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer/optimizer_config.py +0 -0
  264. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/optimizer_param_scheduler.py +0 -0
  265. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/packed_seq_params.py +0 -0
  266. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/parallel_state.py +0 -0
  267. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/__init__.py +0 -0
  268. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  269. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  270. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  271. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/schedules.py +0 -0
  272. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/pipeline_parallel/utils.py +0 -0
  273. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/__init__.py +0 -0
  274. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/__init__.py +0 -0
  275. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  276. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  277. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  278. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/layers.py +0 -0
  279. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  280. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  281. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/process_groups_config.py +0 -0
  282. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/__init__.py +0 -0
  283. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/quant_config.py +0 -0
  284. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/quantization/utils.py +0 -0
  285. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/requirements.txt +0 -0
  286. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/rerun_state_machine.py +0 -0
  287. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/safe_globals.py +0 -0
  288. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/__init__.py +0 -0
  289. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_block.py +0 -0
  290. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  291. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  292. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_layer.py +0 -0
  293. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mamba_mixer.py +0 -0
  294. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/mlp_layer.py +0 -0
  295. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/ssm/triton_cache_manager.py +0 -0
  296. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/__init__.py +0 -0
  297. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  298. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/data.py +0 -0
  299. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/layers.py +0 -0
  300. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/mappings.py +0 -0
  301. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/random.py +0 -0
  302. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tensor_parallel/utils.py +0 -0
  303. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/timers.py +0 -0
  304. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/__init__.py +0 -0
  305. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  306. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  307. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/__init__.py +0 -0
  308. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  309. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  312. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  314. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  315. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  318. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  323. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  324. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  325. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  326. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/__init__.py +0 -0
  327. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  328. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  329. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/enums.py +0 -0
  330. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  331. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  332. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  333. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/identity_op.py +0 -0
  334. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/mlp.py +0 -0
  335. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/module.py +0 -0
  336. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/__init__.py +0 -0
  337. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/experts.py +0 -0
  338. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  339. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  340. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/moe_layer.py +0 -0
  341. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/moe_utils.py +0 -0
  342. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/shared_experts.py +0 -0
  343. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  344. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  345. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/multi_latent_attention.py +0 -0
  346. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/multi_token_prediction.py +0 -0
  347. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/spec_utils.py +0 -0
  348. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/torch_layer_norm.py +0 -0
  349. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/torch_norm.py +0 -0
  350. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_block.py +0 -0
  351. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_config.py +0 -0
  352. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/transformer_layer.py +0 -0
  353. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/transformer/utils.py +0 -0
  354. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron/core/utils.py +0 -0
  355. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/SOURCES.txt +0 -0
  356. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/dependency_links.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/requires.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/megatron_core.egg-info/top_level.txt +0 -0
  359. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/pyproject.toml +0 -0
  360. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/setup.cfg +0 -0
  361. {megatron_core-0.16.0rc0.dev129362 → megatron_core-0.16.0rc0.dev129924}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev129362
3
+ Version: 0.16.0rc0.dev129924
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -43,6 +43,7 @@ class MHAMetadata(MetadataBase):
43
43
  padded_active_token_count: int,
44
44
  real_batch_size: int,
45
45
  padded_active_request_count: Optional[int] = None,
46
+ decode_only: bool = False,
46
47
  ):
47
48
  """
48
49
  Args:
@@ -52,6 +53,7 @@ class MHAMetadata(MetadataBase):
52
53
  padded_active_token_count: int
53
54
  real_batch_size: int
54
55
  padded_active_request_count: Optional[int]
56
+ decode_only: bool
55
57
  """
56
58
  if padded_active_request_count is None:
57
59
  padded_active_request_count = real_batch_size
@@ -98,9 +100,11 @@ class MHAMetadata(MetadataBase):
98
100
  padded_active_request_count,
99
101
  is_cumulative_tensor=True,
100
102
  )
101
- self._max_seqlen_q = padded_active_token_count
102
- if torch.all(self._query_lengths_buf[:padded_active_request_count] <= 1):
103
+
104
+ if decode_only:
103
105
  self._max_seqlen_q = 1
106
+ else:
107
+ self._max_seqlen_q = max(2, padded_active_token_count)
104
108
  self._max_seqlen_k = self.max_seqlen
105
109
 
106
110
  self.state_data = {
@@ -148,6 +152,7 @@ class GraphedMHAMetadata(MHAMetadata):
148
152
  padded_active_token_count: int,
149
153
  real_batch_size: int,
150
154
  padded_active_request_count: Optional[int] = None,
155
+ decode_only: bool = False,
151
156
  ):
152
157
  """
153
158
  Args:
@@ -157,6 +162,7 @@ class GraphedMHAMetadata(MHAMetadata):
157
162
  padded_active_token_count: int
158
163
  real_batch_size: int
159
164
  padded_active_request_count: Optional[int]
165
+ decode_only: bool
160
166
  """
161
167
  super().update(
162
168
  request_query_lengths,
@@ -165,6 +171,7 @@ class GraphedMHAMetadata(MHAMetadata):
165
171
  padded_active_token_count,
166
172
  real_batch_size,
167
173
  padded_active_request_count,
174
+ decode_only,
168
175
  )
169
176
 
170
177
  def reset(self):
@@ -184,6 +191,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
184
191
  padded_active_token_count: int,
185
192
  real_batch_size: int,
186
193
  padded_active_request_count: Optional[int] = None,
194
+ decode_only: bool = False,
187
195
  ):
188
196
  """
189
197
  Args:
@@ -193,6 +201,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
193
201
  padded_active_token_count: int
194
202
  real_batch_size: int
195
203
  padded_active_request_count: Optional[int]
204
+ decode_only: bool
196
205
  """
197
206
  super().update(
198
207
  request_query_lengths,
@@ -201,6 +210,7 @@ class NonGraphedMHAMetadata(MHAMetadata):
201
210
  padded_active_token_count,
202
211
  real_batch_size,
203
212
  padded_active_request_count,
213
+ decode_only,
204
214
  )
205
215
  if len(self.state_data["query_lengths"]) > 0:
206
216
  self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()
@@ -4,7 +4,7 @@ import math
4
4
  import warnings
5
5
  from contextlib import nullcontext
6
6
  from enum import Enum
7
- from typing import List, Optional, Tuple
7
+ from typing import TYPE_CHECKING, List, Optional, Tuple
8
8
 
9
9
  import torch
10
10
  import torch.nn.functional as F
@@ -16,7 +16,10 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
16
16
  from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
17
17
  InferenceWrapperConfig,
18
18
  )
19
- from megatron.core.inference.unified_memory import create_unified_mempool, has_unified_memory
19
+ from megatron.core.inference.unified_memory import (
20
+ UnifiedMemoryUnsupportedError,
21
+ create_unified_mempool,
22
+ )
20
23
  from megatron.core.inference.utils import tensor_swap
21
24
  from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
22
25
  from megatron.core.package_info import __version__ as mcore_version
@@ -46,6 +49,17 @@ try:
46
49
  except ImportError:
47
50
  HAVE_FLASHINFER = False
48
51
 
52
+ try:
53
+ import wandb # pylint: disable=unused-import
54
+
55
+ HAVE_WANDB = True
56
+ except ImportError:
57
+ HAVE_WANDB = False
58
+ wandb = None
59
+
60
+ if TYPE_CHECKING:
61
+ import wandb as WandbModule
62
+
49
63
 
50
64
  class ContextOverflowError(Exception):
51
65
  """Base exception for when a new request does not fit.
@@ -223,6 +237,7 @@ class DynamicInferenceContext(BaseInferenceContext):
223
237
  levels will be included to control other tensors within the context.
224
238
  use_flashinfer_fused_rope (bool): If True, use flashinfer's fused rope implementation.
225
239
  If None, defaults to using flash-infer if available.
240
+ metrics_writer (Optional['WandbModule']): Wandb module for writing metrics.
226
241
  """
227
242
 
228
243
  def __init__(
@@ -248,6 +263,7 @@ class DynamicInferenceContext(BaseInferenceContext):
248
263
  use_cuda_graphs_for_non_decode_steps: bool = True,
249
264
  use_flashinfer_fused_rope: bool = False,
250
265
  unified_memory_level: Optional[int] = 0,
266
+ metrics_writer: Optional['WandbModule'] = None,
251
267
  ):
252
268
  super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)
253
269
 
@@ -257,6 +273,8 @@ class DynamicInferenceContext(BaseInferenceContext):
257
273
  block_size_tokens == 64
258
274
  ), "Flash MLA requires a block size of 64. Set --inference-dynamic-batching-block-size 64 to fix this assert"
259
275
 
276
+ self.metrics_writer = metrics_writer
277
+
260
278
  # Per partition num heads and hidden size.
261
279
  projection_size = kv_channels * num_attention_heads
262
280
  if tensor_model_parallel_size is None:
@@ -323,16 +341,20 @@ class DynamicInferenceContext(BaseInferenceContext):
323
341
  self.params_dtype = params_dtype
324
342
  self.num_layers = num_layers
325
343
  self.max_sequence_length = max_sequence_length
344
+
345
+ # Unified memory.
326
346
  self.unified_memory_level = unified_memory_level
327
347
  if unified_memory_level > 0:
328
- if not has_unified_memory and torch.distributed.get_rank() == 0:
329
- warnings.warn(
330
- "Unified memory requested but not available; defaulting to GPU memory."
331
- )
332
- self.unified_memory_level = 0
333
- else:
348
+ try:
334
349
  self.unified_memory_mempool = create_unified_mempool()
350
+ except UnifiedMemoryUnsupportedError:
351
+ if torch.distributed.get_rank() == 0:
352
+ warnings.warn(
353
+ "Unified memory requested but not available; defaulting to GPU memory."
354
+ )
355
+ self.unified_memory_level = 0
335
356
 
357
+ # Request and token counts.
336
358
  self.total_request_count = 0
337
359
  self.active_token_count = 0
338
360
  self.paused_request_count = 0
@@ -951,6 +973,7 @@ class DynamicInferenceContext(BaseInferenceContext):
951
973
  padded_active_token_count=self.padded_active_token_count,
952
974
  real_batch_size=real_req_batch_size,
953
975
  padded_active_request_count=self.padded_active_request_count,
976
+ decode_only=self.is_decode_only(),
954
977
  )
955
978
  # All attention metadata calculations are now handled by MHAMetadata.update()
956
979
 
@@ -1561,3 +1584,67 @@ class DynamicInferenceContext(BaseInferenceContext):
1561
1584
 
1562
1585
  # Convert each log prob tensor into a list
1563
1586
  return [lp.tolist() for lp in selected_log_probs_list]
1587
+
1588
+ def get_kvcache_utilization_stats(self) -> dict:
1589
+ """Compute KV cache buffer utilization stats for the current step.
1590
+
1591
+ Returns a dictionary with counts and percentages for both allocated block
1592
+ usage (overall buffer occupancy) and active usage (blocks referenced by
1593
+ currently active requests this step).
1594
+
1595
+ Return:
1596
+ {
1597
+ 'total_blocks': int,
1598
+ 'allocated_blocks': int,
1599
+ 'active_unique_blocks': int,
1600
+ 'allocated_utilization': float,
1601
+ 'active_utilization': float,
1602
+ 'active_request_count': int,
1603
+ 'paused_request_count': int,
1604
+ 'gtd_block_count': int,
1605
+ }
1606
+ """
1607
+ # Total usable blocks exclude the reserved dummy block.
1608
+ total_blocks = max(self.block_allocator.block_count_total - 1, 1)
1609
+ block_count_avail = int(self.block_allocator.block_count_avail)
1610
+
1611
+ # Overall allocated blocks in the buffer right now.
1612
+ allocated_blocks = (self.block_allocator.block_count_total - 1) - block_count_avail
1613
+ allocated_blocks = int(max(0, allocated_blocks))
1614
+
1615
+ # Active unique blocks referenced by current active requests only.
1616
+ active_start = self.paused_request_count
1617
+ active_end = self.total_request_count
1618
+ if active_end > active_start:
1619
+ active_rows = self.request_to_kv_block_ids[active_start:active_end]
1620
+ # Filter valid block ids (>= 0) and count unique ids.
1621
+ valid_ids = active_rows[active_rows >= 0]
1622
+ if valid_ids.numel() > 0:
1623
+ unique_ids = torch.unique(valid_ids)
1624
+ active_unique_blocks = int(unique_ids.numel())
1625
+ else:
1626
+ active_unique_blocks = 0
1627
+ else:
1628
+ active_unique_blocks = 0
1629
+
1630
+ allocated_utilization = float(allocated_blocks) / float(total_blocks)
1631
+ active_utilization = float(active_unique_blocks) / float(total_blocks)
1632
+
1633
+ # Diagnostic helpers
1634
+ num_non_gtd_blocks = max(0, block_count_avail - int(self.gtd_block_count))
1635
+ total_request_count = int(self.total_request_count)
1636
+ return {
1637
+ 'total_blocks': int(total_blocks),
1638
+ 'allocated_blocks': int(allocated_blocks),
1639
+ 'active_unique_blocks': int(active_unique_blocks),
1640
+ 'allocated_utilization': allocated_utilization,
1641
+ 'active_utilization': active_utilization,
1642
+ 'active_request_count': int(self.get_active_request_count()),
1643
+ 'paused_request_count': int(self.paused_request_count),
1644
+ 'gtd_block_count': int(self.gtd_block_count),
1645
+ 'block_count_avail': int(block_count_avail),
1646
+ 'num_non_gtd_blocks': int(num_non_gtd_blocks),
1647
+ 'active_token_count': int(self.active_token_count),
1648
+ 'total_request_count': int(total_request_count),
1649
+ 'max_requests': int(self.max_requests),
1650
+ }
@@ -57,6 +57,14 @@ try:
57
57
  except:
58
58
  HAVE_MSGPACK = False
59
59
 
60
+ try:
61
+ import wandb
62
+
63
+ HAVE_WANDB = True
64
+ except ImportError:
65
+ HAVE_WANDB = False
66
+ wandb = None
67
+
60
68
 
61
69
  def format_mem_bytes(mem_bytes):
62
70
  """Convert a byte count to a human-readable string in tb, gb, mb, kb, or bytes."""
@@ -89,6 +97,8 @@ class DynamicInferenceEngine(AbstractEngine):
89
97
  static_sampling (bool): If True, all requests are assumed to have the same
90
98
  sampling parameters. This avoids needing to loop through all requests and
91
99
  their sampling parameters every generation step, improving latency.
100
+ inference_logging_step_interval (int): The step interval at which to log
101
+ inference metrics to wandb. Defaults to 0, which means no logging.
92
102
  """
93
103
 
94
104
  def __init__(
@@ -101,6 +111,7 @@ class DynamicInferenceEngine(AbstractEngine):
101
111
  track_paused_request_events: bool = False,
102
112
  enable_chunked_prefill: bool = True,
103
113
  static_sampling: bool = False,
114
+ inference_logging_step_interval: int = 0,
104
115
  ):
105
116
 
106
117
  if enable_cuda_graph is not None:
@@ -137,6 +148,32 @@ class DynamicInferenceEngine(AbstractEngine):
137
148
  self.enable_chunked_prefill = enable_chunked_prefill
138
149
  self.static_sampling = static_sampling
139
150
 
151
+ self.inference_logging_step_interval = inference_logging_step_interval
152
+ # Configure wandb to use separate step counter for inference metrics (only once)
153
+ if self.inference_logging_step_interval > 0 and self.context.metrics_writer is not None:
154
+ logging.info(
155
+ f"\033[1;93m[INFERENCE]\033[0m "
156
+ f"\033[1;95mLogging inference metrics to wandb (rank {torch.distributed.get_rank()})\033[0m"
157
+ )
158
+ if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
159
+ # Make all inference/* metrics use inference_step as their x-axis
160
+ # This allows inference and training to have independent step counters
161
+ context.metrics_writer.define_metric(
162
+ "inference/*", step_metric="inference/inference_step"
163
+ )
164
+ # Initialize inference step offset by querying existing run history
165
+ self.inference_step_offset = 0
166
+ if wandb.run is not None:
167
+ api_run = wandb.Api().run(
168
+ f"{wandb.run.entity}/{wandb.run.project}/{wandb.run.id}"
169
+ )
170
+ max_step = 0
171
+ for row in api_run.scan_history(keys=["inference/inference_step"]):
172
+ val = row.get("inference/inference_step")
173
+ if isinstance(val, (int, float)) and int(val) > max_step:
174
+ max_step = int(val)
175
+ self.inference_step_offset = int(max_step)
176
+
140
177
  # Initialize the asyncio loop if it has not already been initialized.
141
178
  # TODO: Start the engine loop here.
142
179
  self._loop = get_asyncio_loop()
@@ -780,6 +817,41 @@ class DynamicInferenceEngine(AbstractEngine):
780
817
  self.request_completion_futures[failed_request_id].set_result(failed_request)
781
818
  self.failed_request_ids.clear()
782
819
 
820
+ # Log KV cache utilization stats to W&B
821
+ if (
822
+ self.inference_logging_step_interval > 0
823
+ and self.step_count > 0
824
+ and self.step_count % self.inference_logging_step_interval == 0
825
+ and self.context.metrics_writer is not None
826
+ ):
827
+
828
+ # Get KV cache utilization stats from dynamic context
829
+ kv_stats = self.context.get_kvcache_utilization_stats()
830
+
831
+ # Prepare metrics dictionary with all stats
832
+ # Use 'inference/' prefix for all metrics to separate from training metrics
833
+ metrics = {
834
+ 'inference/inference_step': int(self.inference_step_offset + int(self.step_count)),
835
+ 'inference/step_time_s': float(step_time),
836
+ 'inference/waiting_queue_len': int(len(self.waiting_request_ids)),
837
+ 'inference/total_requests_dict_size': int(len(self.requests)),
838
+ }
839
+ # Add KV stats with inference/ prefix
840
+ # Convert utilization metrics from 0-1 range to 0-100 percentage range for better visualization
841
+ for key, value in kv_stats.items():
842
+ if 'utilization' in key:
843
+ # Convert to percentage (0-100) and group under kvcache_utilization
844
+ metrics[f'inference/{key}'] = float(value * 100.0)
845
+ else:
846
+ metrics[f'inference/{key}'] = value
847
+
848
+ if HAVE_WANDB and self.context.metrics_writer.__name__ == "wandb":
849
+ self.context.metrics_writer.log(metrics, commit=True)
850
+ else:
851
+ raise ValueError(
852
+ f"Unsupported metrics writer type: {type(self.context.metrics_writer)}"
853
+ )
854
+
783
855
  # Print context state.
784
856
  if verbose:
785
857
  context = self.context
@@ -0,0 +1,127 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ import os
4
+ import warnings
5
+ from enum import Enum, auto
6
+ from pathlib import Path
7
+
8
+ from torch.cuda.memory import CUDAPluggableAllocator
9
+ from torch.utils.cpp_extension import CUDA_HOME, load_inline
10
+
11
+ from megatron.core.utils import is_torch_min_version
12
+
13
+ try:
14
+ if is_torch_min_version("2.8.0"):
15
+ from torch.cuda.memory import MemPool
16
+ else:
17
+ from torch.cuda import MemPool
18
+ _has_mem_pool = True
19
+ except ImportError:
20
+ _has_mem_pool = False
21
+
22
+
23
+ class CompilationState(Enum):
24
+ """Enum to distinguish between unified memory (UVM) compilation states."""
25
+
26
+ UNATTEMPTED = auto() # Compilation has not been attempted.
27
+ FAILURE = auto() # Compilation attempted, but failed.
28
+ SUCCESS = auto() # Compilation attempted, and succeeded.
29
+
30
+
31
+ # Compilation vars.
32
+ _compilation_state = CompilationState.UNATTEMPTED
33
+ _alloc = None # must remain global until process exit.
34
+ _mod = None # must remain global until process exit.
35
+
36
+
37
+ class UnifiedMemoryUnsupportedError(Exception):
38
+ """Unified memory is not supported on this system."""
39
+
40
+ pass
41
+
42
+
43
+ def compile_allocator():
44
+ """Attempt to compile UVM allocator."""
45
+
46
+ global _compilation_state, _alloc, _mod
47
+
48
+ if _compilation_state != CompilationState.UNATTEMPTED:
49
+ return
50
+
51
+ _mempool_c_src = r"""
52
+ #include <cuda_runtime_api.h>
53
+ #include <cstddef>
54
+
55
+ #define EXPORT extern "C"
56
+
57
+ EXPORT void* managed_malloc(size_t size, int device, void* stream) {
58
+ (void)stream;
59
+ int cur = -1;
60
+ cudaGetDevice(&cur);
61
+ if (device != cur && device >= 0) cudaSetDevice(device);
62
+
63
+ // cudaMallocManaged allows for more memory to be allocated than the device memory size.
64
+ // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
65
+ void* ptr = nullptr;
66
+ cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
67
+ if (err != cudaSuccess) return nullptr;
68
+
69
+ if (device >= 0) {
70
+ // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
71
+ // This is a hint that tries to prevent data from being migrated away from the device.
72
+ cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
73
+ // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
74
+ // Even if the memory has to be migrated away from the device, it still does not page fault.
75
+ // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
76
+ // but there is no harm in adding this flag as well for future-proofing.
77
+ cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
78
+ }
79
+ return ptr;
80
+ }
81
+
82
+ EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
83
+ // Memory allocated with cudaMallocManaged should be released with cudaFree.
84
+ (void)size; (void)device; (void)stream;
85
+ if (ptr) cudaFree(ptr);
86
+ }
87
+ """
88
+
89
+ # Build the .so upon import; this avoids issues.
90
+ if _has_mem_pool:
91
+ _extra_ldflags = ["-lcudart"]
92
+ if CUDA_HOME:
93
+ _cuda_lib = os.path.join(CUDA_HOME, "lib64")
94
+ if os.path.isdir(_cuda_lib):
95
+ _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
96
+ try:
97
+ _mod = load_inline(
98
+ name="managed_alloc_runtime",
99
+ cpp_sources=[_mempool_c_src],
100
+ functions=[],
101
+ with_cuda=True,
102
+ extra_ldflags=_extra_ldflags,
103
+ verbose=False,
104
+ )
105
+ _so_path = Path(_mod.__file__).as_posix()
106
+ _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
107
+ _compilation_state = CompilationState.SUCCESS
108
+ except (RuntimeError, ImportError, OSError):
109
+ warnings.warn("Failed to create unified memory mempool.")
110
+ _compilation_state = CompilationState.FAILURE
111
+
112
+
113
+ def create_unified_mempool() -> "MemPool":
114
+ """Create a unified memory mempool using CUDA managed memory.
115
+
116
+ Returns:
117
+ (MemPool) Unified memory mempool.
118
+ """
119
+
120
+ # Attempt to compile allocator.
121
+ compile_allocator()
122
+
123
+ # Return mempool.
124
+ if _compilation_state != CompilationState.SUCCESS:
125
+ raise UnifiedMemoryUnsupportedError()
126
+ else:
127
+ return MemPool(allocator=_alloc)
@@ -375,14 +375,13 @@ class GPTModel(LanguageModule):
375
375
  )
376
376
  or self.config.flash_decode
377
377
  )
378
- and rotary_pos_cos is not None
379
378
  and inference_context.is_static_batching()
380
379
  ):
381
380
  current_batch_size = input_ids.shape[0]
382
381
  sequence_len_offset = torch.tensor(
383
382
  [inference_context.sequence_len_offset] * current_batch_size,
384
383
  dtype=torch.int32,
385
- device=rotary_pos_cos.device, # Co-locate this with the rotary tensors
384
+ device=torch.cuda.current_device(),
386
385
  )
387
386
  else:
388
387
  sequence_len_offset = None
@@ -4,7 +4,7 @@
4
4
  MAJOR = 0
5
5
  MINOR = 16
6
6
  PATCH = 0
7
- PRE_RELEASE = 'rc0.dev129362'
7
+ PRE_RELEASE = 'rc0.dev129924'
8
8
 
9
9
  # Use the following formatting: (major, minor, patch, pre-release)
10
10
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -48,15 +48,26 @@ except ImportError:
48
48
  rearrange = None
49
49
 
50
50
  try:
51
- from flashattn_hopper.flash_attn_interface import _flash_attn_forward
52
- from flashattn_hopper.flash_attn_interface import (
51
+ from flash_attn_3.flash_attn_interface import _flash_attn_forward
52
+ from flash_attn_3.flash_attn_interface import (
53
53
  flash_attn_with_kvcache as flash_attn3_with_kvcache,
54
54
  )
55
55
 
56
56
  HAVE_FA3 = True
57
- except:
57
+ except ImportError as e:
58
58
  HAVE_FA3 = False
59
59
 
60
+ if not HAVE_FA3:
61
+ try:
62
+ from flashattn_hopper.flash_attn_interface import _flash_attn_forward
63
+ from flashattn_hopper.flash_attn_interface import (
64
+ flash_attn_with_kvcache as flash_attn3_with_kvcache,
65
+ )
66
+
67
+ HAVE_FA3 = True
68
+ except ImportError as e:
69
+ pass
70
+
60
71
  try:
61
72
  from flash_mla import flash_mla_with_kvcache, get_mla_metadata
62
73
 
@@ -1182,7 +1182,11 @@ class CudaGraphManager(torch.nn.Module):
1182
1182
 
1183
1183
  if runner is None:
1184
1184
  if _CudagraphGlobalRecord.cudagraph_created:
1185
- assert False
1185
+ assert False, (
1186
+ f"`cudagraph_created` is set to True but no matching cudagraph "
1187
+ f"runners were found. This module has {len(self.cudagraph_runners)} "
1188
+ f"existing runners. Use `get_mismatch_errors` to debug mismatches."
1189
+ )
1186
1190
  else:
1187
1191
  runner = _CudaGraphRunner(
1188
1192
  megatron_module,
@@ -126,6 +126,8 @@ class DotProductAttention(MegatronModule):
126
126
  )
127
127
  ),
128
128
  )
129
+ if config.perform_initialization:
130
+ self.softmax_offset = config.init_method(self.softmax_offset)
129
131
  else:
130
132
  raise ValueError("Softmax type not supported")
131
133
 
@@ -66,6 +66,8 @@ class Router(ABC, MegatronModule):
66
66
  """Reset the router parameters."""
67
67
  if self.config.perform_initialization:
68
68
  self.config.init_method(self.weight)
69
+ if self.bias is not None:
70
+ self.config.init_method(self.bias)
69
71
  self.weight.data = self.weight.data.to(dtype=self.config.params_dtype)
70
72
  setattr(self.weight, 'sequence_parallel', self.config.sequence_parallel)
71
73
  if self.bias is not None:
@@ -15,8 +15,11 @@ logger = logging.getLogger(__name__)
15
15
  class PipelineParallelLayerLayout:
16
16
  """Configuration of custom pipeline parallel layer partitioning."""
17
17
 
18
- def __repr__(self):
19
- return self.input_data
18
+ def __repr__(self) -> str:
19
+ if isinstance(self.input_data, str):
20
+ return self.input_data
21
+ else:
22
+ return str(self.input_data)
20
23
 
21
24
  def __init__(self, layout: str | list, pipeline_model_parallel_size: int):
22
25
  """Initialize PipelineParallelLayerLayout from a list or a str.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev129362
3
+ Version: 0.16.0rc0.dev129924
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>