megatron-core 0.16.0rc0.dev111286__tar.gz → 0.16.0rc0.dev112436__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic.

Files changed (361)
  1. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/dynamic_context.py +133 -69
  3. megatron_core-0.16.0rc0.dev112436/megatron/core/inference/unified_memory.py +127 -0
  4. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/package_info.py +1 -1
  5. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron_core.egg-info/PKG-INFO +1 -1
  6. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron_core.egg-info/SOURCES.txt +0 -2
  7. megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -72
  8. megatron_core-0.16.0rc0.dev111286/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -210
  9. megatron_core-0.16.0rc0.dev111286/megatron/core/inference/unified_memory.py +0 -89
  10. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/MANIFEST.in +0 -0
  11. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/README.md +0 -0
  12. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/README.md +0 -0
  13. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/__init__.py +0 -0
  14. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/activations.py +0 -0
  15. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/config.py +0 -0
  16. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/config_logger.py +0 -0
  17. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/__init__.py +0 -0
  18. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/bert_dataset.py +0 -0
  19. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/blended_dataset.py +0 -0
  20. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  21. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  22. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/gpt_dataset.py +0 -0
  23. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/helpers.cpp +0 -0
  24. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/helpers.py +0 -0
  25. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/indexed_dataset.py +0 -0
  26. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/masked_dataset.py +0 -0
  27. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/megatron_dataset.py +0 -0
  28. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  29. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/multimodal_dataset.py +0 -0
  30. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/object_storage_utils.py +0 -0
  31. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/__init__.py +0 -0
  32. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/config/__init__.py +0 -0
  33. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  34. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/config/config.py +0 -0
  35. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  36. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  37. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/db/__init__.py +0 -0
  38. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/db/build.py +0 -0
  39. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/db/dataset.py +0 -0
  40. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/db/utils.py +0 -0
  41. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/external_libs.py +0 -0
  42. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/__init__.py +0 -0
  43. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/build.py +0 -0
  44. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/factory.py +0 -0
  45. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/index.py +0 -0
  46. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  47. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  48. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  49. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/utils.py +0 -0
  50. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/index/validate.py +0 -0
  51. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/__init__.py +0 -0
  52. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  53. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  54. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/query.py +0 -0
  55. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  56. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/query/utils.py +0 -0
  57. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/retro/utils.py +0 -0
  58. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/t5_dataset.py +0 -0
  59. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/utils.py +0 -0
  60. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/datasets/utils_s3.py +0 -0
  61. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/__init__.py +0 -0
  62. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/core.py +0 -0
  63. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  64. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  65. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/mapping.py +0 -0
  66. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  67. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/serialization.py +0 -0
  68. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  69. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  70. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  71. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  72. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  73. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  74. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  75. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  76. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  77. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  78. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  79. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  80. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  81. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  82. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  83. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  84. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/utils.py +0 -0
  85. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/dist_checkpointing/validation.py +0 -0
  86. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/__init__.py +0 -0
  87. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/data_parallel_base.py +0 -0
  88. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  89. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  90. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/finalize_model_grads.py +0 -0
  91. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/__init__.py +0 -0
  92. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  93. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  94. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  95. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  96. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  97. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  98. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  99. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  100. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  101. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  102. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  103. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  104. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  105. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  106. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/energy_monitor.py +0 -0
  107. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/enums.py +0 -0
  108. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/__init__.py +0 -0
  109. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/data_type.py +0 -0
  110. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/export_config.py +0 -0
  111. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/model_type.py +0 -0
  112. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/__init__.py +0 -0
  113. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  114. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  115. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  116. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  117. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  118. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  119. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  120. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  121. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  122. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  123. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  124. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  125. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/extensions/__init__.py +0 -0
  126. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/extensions/kitchen.py +0 -0
  127. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/extensions/transformer_engine.py +0 -0
  128. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  129. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fp4_utils.py +0 -0
  130. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fp8_utils.py +0 -0
  131. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/full_cuda_graph.py +0 -0
  132. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/__init__.py +0 -0
  133. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  134. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  135. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  136. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  137. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  138. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_indices_converter.py +0 -0
  139. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_layer_norm.py +0 -0
  140. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  141. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  142. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_softmax.py +0 -0
  143. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  144. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/hyper_comm_grid.py +0 -0
  145. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/__init__.py +0 -0
  146. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/async_stream.py +0 -0
  147. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/common_inference_params.py +0 -0
  148. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/communication_utils.py +0 -0
  149. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/__init__.py +0 -0
  150. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/base_context.py +0 -0
  151. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  152. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
  153. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/contexts/static_context.py +0 -0
  154. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  155. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/engines/__init__.py +0 -0
  156. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/engines/abstract_engine.py +0 -0
  157. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/engines/dynamic_engine.py +0 -0
  158. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/engines/mcore_engine.py +0 -0
  159. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/engines/static_engine.py +0 -0
  160. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/headers.py +0 -0
  161. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/inference_client.py +0 -0
  162. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/inference_request.py +0 -0
  163. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  164. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  165. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  166. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  167. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  168. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  169. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  170. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  171. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/sampling_params.py +0 -0
  172. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/scheduler.py +0 -0
  173. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  174. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  175. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  176. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +0 -0
  177. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  178. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  179. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  180. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  181. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  182. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  183. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  184. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference/utils.py +0 -0
  185. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/inference_params.py +0 -0
  186. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/jit.py +0 -0
  187. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/model_parallel_config.py +0 -0
  188. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/T5/__init__.py +0 -0
  189. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/T5/t5_model.py +0 -0
  190. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/T5/t5_spec.py +0 -0
  191. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/__init__.py +0 -0
  192. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/backends.py +0 -0
  193. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/bert/__init__.py +0 -0
  194. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  195. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/bert/bert_lm_head.py +0 -0
  196. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/bert/bert_model.py +0 -0
  197. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/bert/pooler.py +0 -0
  198. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/__init__.py +0 -0
  199. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/__init__.py +0 -0
  200. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  201. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  202. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  203. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  204. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  205. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/language_module/__init__.py +0 -0
  206. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/language_module/language_module.py +0 -0
  207. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  208. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/vision_module/__init__.py +0 -0
  209. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  210. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/__init__.py +0 -0
  211. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  212. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  213. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/gpt_model.py +0 -0
  214. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  215. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  216. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/huggingface/__init__.py +0 -0
  217. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/huggingface/clip_model.py +0 -0
  218. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/huggingface/module.py +0 -0
  219. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/huggingface/qwen_model.py +0 -0
  220. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mamba/__init__.py +0 -0
  221. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  222. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mamba/mamba_model.py +0 -0
  223. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/__init__.py +0 -0
  224. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/config/__init__.py +0 -0
  225. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/config/base_configs.py +0 -0
  226. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/model/__init__.py +0 -0
  227. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/model/base.py +0 -0
  228. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/submodules/audio.py +0 -0
  229. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/submodules/base.py +0 -0
  230. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/mimo/submodules/vision.py +0 -0
  231. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/multimodal/__init__.py +0 -0
  232. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/multimodal/context_parallel.py +0 -0
  233. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/multimodal/llava_model.py +0 -0
  234. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/multimodal/llava_spec.py +0 -0
  235. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/__init__.py +0 -0
  236. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/base_attention.py +0 -0
  237. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/config.py +0 -0
  238. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/decoder_attention.py +0 -0
  239. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/decoder_spec.py +0 -0
  240. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/encoder_attention.py +0 -0
  241. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/encoder_spec.py +0 -0
  242. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/model.py +0 -0
  243. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/retro/utils.py +0 -0
  244. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/vision/__init__.py +0 -0
  245. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/vision/clip_vit_model.py +0 -0
  246. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/vision/multimodal_projector.py +0 -0
  247. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/vision/radio.py +0 -0
  248. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  249. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/msc_utils.py +0 -0
  250. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/nccl_allocator.py +0 -0
  251. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/num_microbatches_calculator.py +0 -0
  252. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/__init__.py +0 -0
  253. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/clip_grads.py +0 -0
  254. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  255. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  256. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  257. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/grad_scaler.py +0 -0
  258. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/optimizer.py +0 -0
  259. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer/optimizer_config.py +0 -0
  260. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/optimizer_param_scheduler.py +0 -0
  261. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/packed_seq_params.py +0 -0
  262. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/parallel_state.py +0 -0
  263. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/__init__.py +0 -0
  264. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  265. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  266. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  267. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/schedules.py +0 -0
  268. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/pipeline_parallel/utils.py +0 -0
  269. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/__init__.py +0 -0
  270. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/__init__.py +0 -0
  271. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  272. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  273. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  274. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/layers.py +0 -0
  275. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  276. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  277. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/process_groups_config.py +0 -0
  278. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/quantization/__init__.py +0 -0
  279. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/quantization/quant_config.py +0 -0
  280. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/quantization/utils.py +0 -0
  281. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/requirements.txt +0 -0
  282. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/rerun_state_machine.py +0 -0
  283. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/safe_globals.py +0 -0
  284. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/__init__.py +0 -0
  285. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mamba_block.py +0 -0
  286. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  287. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  288. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mamba_layer.py +0 -0
  289. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mamba_mixer.py +0 -0
  290. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/mlp_layer.py +0 -0
  291. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/ssm/triton_cache_manager.py +0 -0
  292. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/__init__.py +0 -0
  293. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  294. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/data.py +0 -0
  295. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/layers.py +0 -0
  296. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/mappings.py +0 -0
  297. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/random.py +0 -0
  298. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tensor_parallel/utils.py +0 -0
  299. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/timers.py +0 -0
  300. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/__init__.py +0 -0
  301. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  302. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  303. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/__init__.py +0 -0
  304. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  305. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  306. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  307. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  308. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  309. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  314. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  315. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  318. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/__init__.py +0 -0
  323. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/attention.py +0 -0
  324. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/cuda_graphs.py +0 -0
  325. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  326. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  327. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/dot_product_attention.py +0 -0
  328. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/enums.py +0 -0
  329. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  330. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  331. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  332. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/identity_op.py +0 -0
  333. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/mlp.py +0 -0
  334. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/module.py +0 -0
  335. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/__init__.py +0 -0
  336. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/experts.py +0 -0
  337. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  338. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  339. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/moe_layer.py +0 -0
  340. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/moe_utils.py +0 -0
  341. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/router.py +0 -0
  342. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/shared_experts.py +0 -0
  343. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  344. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  345. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/multi_latent_attention.py +0 -0
  346. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/multi_token_prediction.py +0 -0
  347. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  348. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/spec_utils.py +0 -0
  349. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/torch_layer_norm.py +0 -0
  350. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/torch_norm.py +0 -0
  351. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/transformer_block.py +0 -0
  352. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/transformer_config.py +0 -0
  353. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/transformer_layer.py +0 -0
  354. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/transformer/utils.py +0 -0
  355. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron/core/utils.py +0 -0
  356. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron_core.egg-info/dependency_links.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron_core.egg-info/requires.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/megatron_core.egg-info/top_level.txt +0 -0
  359. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/pyproject.toml +0 -0
  360. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/setup.cfg +0 -0
  361. {megatron_core-0.16.0rc0.dev111286 → megatron_core-0.16.0rc0.dev112436}/setup.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev112436
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -16,14 +16,16 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
-from megatron.core.inference.unified_memory import create_unified_mempool, has_unified_memory
+from megatron.core.inference.unified_memory import (
+    UnifiedMemoryUnsupportedError,
+    create_unified_mempool,
+)
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide
 
-from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
 
@@ -323,16 +325,20 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.params_dtype = params_dtype
         self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
+
+        # Unified memory.
         self.unified_memory_level = unified_memory_level
         if unified_memory_level > 0:
-            if not has_unified_memory and torch.distributed.get_rank() == 0:
-                warnings.warn(
-                    "Unified memory requested but not available; defaulting to GPU memory."
-                )
-                self.unified_memory_level = 0
-            else:
+            try:
                 self.unified_memory_mempool = create_unified_mempool()
+            except UnifiedMemoryUnsupportedError:
+                if torch.distributed.get_rank() == 0:
+                    warnings.warn(
+                        "Unified memory requested but not available; defaulting to GPU memory."
+                    )
+                self.unified_memory_level = 0
 
+        # Request and token counts.
         self.total_request_count = 0
         self.active_token_count = 0
         self.paused_request_count = 0
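
Note on the hunk above: the old `has_unified_memory` capability flag is replaced by an exception-based API, so the context simply attempts the allocation and falls back on `UnifiedMemoryUnsupportedError`. A minimal caller-side sketch of that pattern, using only the two names imported in this diff (the helper function and its return convention here are illustrative, not the actual Megatron-Core implementation):

    # Illustrative sketch of the new fallback pattern; not the upstream code.
    import warnings

    from megatron.core.inference.unified_memory import (
        UnifiedMemoryUnsupportedError,
        create_unified_mempool,
    )

    def make_mempool(unified_memory_level: int):
        """Return a unified-memory pool, or None to signal plain GPU memory."""
        if unified_memory_level <= 0:
            return None
        try:
            return create_unified_mempool()
        except UnifiedMemoryUnsupportedError:
            # Same fallback the context applies: warn and use GPU memory.
            warnings.warn(
                "Unified memory requested but not available; defaulting to GPU memory."
            )
            return None
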
@@ -448,26 +454,30 @@ class DynamicInferenceContext(BaseInferenceContext):
             num_cuda_graphs is not None
         )
 
-        # Attention metadata initialization (tensors are now handled by MHAMetadata classes)
+        # `*_cudagraph_only` tensors are for use with cuda graphs to maintain
+        # consistent input shapes, which is required to use cuda graphs.
+        # During these steps, the `*_cudagraph_only`
+        # tensors are used, otherwise their same-name but un-suffixed
+        # corresponding tensors are used.
 
-        self.graph_attn_metadata = {}
-        self.non_graph_attn_metadata = {}
-        self.active_attn_metadata = None
-
-        self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata(
-            block_count_total=block_count_total,
-            max_kv_block_count=self.max_kv_block_count,
-            max_requests=self.max_requests,
-            block_size_tokens=self.block_size_tokens,
-            max_seqlen=self.max_sequence_length,
+        self.query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_query_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
+        )
+        self.cu_kv_seq_lengths_cudagraph_only = torch.full(
+            (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
         )
 
-        self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata(
-            block_count_total=block_count_total,
-            max_kv_block_count=self.max_kv_block_count,
-            max_requests=self.max_requests,
-            block_size_tokens=self.block_size_tokens,
-            max_seqlen=self.max_sequence_length,
+        self.request_to_kv_block_ids_cudagraph_only = torch.full(
+            (self.max_requests, self.max_kv_block_count),
+            0,
+            dtype=torch.int,
+            device=torch.cuda.current_device(),
         )
 
         # Guaranteed active requests.
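
The `*_cudagraph_only` buffers above follow the usual CUDA-graph pattern: allocate once at the maximum size so the tensor keeps a static address and shape, copy each step's live values into a prefix, and hand fixed-size views to the graphed kernels. A small self-contained sketch of that pattern (sizes, names, and the CPU device here are illustrative; in the real context the buffers live on the current CUDA device):

    # Minimal sketch of the static-buffer pattern behind the *_cudagraph_only tensors.
    import torch

    max_requests = 8  # assumed upper bound, fixed at init time

    # Allocated once; its address and shape never change, so it is CUDA-graph safe.
    kv_seq_lengths_static = torch.zeros(max_requests, dtype=torch.int32)

    def update_step(live_kv_lengths: torch.Tensor, padded_request_count: int) -> torch.Tensor:
        """Copy this step's lengths into the static buffer and return a fixed-size view."""
        kv_seq_lengths_static.zero_()
        kv_seq_lengths_static[: live_kv_lengths.numel()] = live_kv_lengths
        # The view has a stable shape (padded_request_count), regardless of how
        # many requests are really active this step.
        return kv_seq_lengths_static[:padded_request_count]

    view = update_step(torch.tensor([3, 5], dtype=torch.int32), padded_request_count=4)
    print(view.tolist())  # [3, 5, 0, 0]
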
@@ -617,18 +627,11 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def cu_query_lengths(self) -> Tuple[Tensor, int]:
         """Cumulative query sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"],
-        )
+        return self.cu_query_seq_lengths, self.max_seqlen_q
 
-    def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]:
+    def cu_kv_lengths(self) -> Tensor:
         """Cumulative key/value sequence lengths."""
-        return (
-            self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"],
-            self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_k"],
-        )
+        return (self.cu_kv_seq_lengths, self.kv_seq_lengths, self.max_seqlen_k)
 
     def get_active_sequence_lengths(self) -> Tensor:
         """Total sequence length (query + key) for active requests."""
@@ -706,16 +709,12 @@ class DynamicInferenceContext(BaseInferenceContext):
         to blocks within the block-level memory buffer.
         """
         if self.cache_mla_latent:
-            return (
-                self.memory_buffer[layer_number - 1],
-                None,
-                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
-            )
+            return (self.memory_buffer[layer_number - 1], None, self.block_table)
         else:
             return (
                 self.memory_buffer[0, layer_number - 1],
                 self.memory_buffer[1, layer_number - 1],
-                self.active_attn_metadata["mha_metadata"].state_data["block_table"],
+                self.block_table,
             )
 
     def apply_fused_qk_rotary_emb(
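
For the block table returned above: each request owns a list of fixed-size KV blocks inside a shared buffer, and row r of the block table holds the block ids for request r. An illustrative lookup only (shapes and values assumed, not taken from the source):

    import torch

    block_size_tokens, max_kv_block_count = 4, 8   # assumed sizes
    num_blocks, num_heads, head_dim = 32, 2, 16

    # Shared key buffer for all requests, addressed by block id.
    key_buffer = torch.randn(num_blocks, block_size_tokens, num_heads, head_dim)

    # block_table[r, i] = id of the i-th KV block owned by request r (0 = unused here).
    block_table = torch.zeros(3, max_kv_block_count, dtype=torch.long)
    block_table[0, :2] = torch.tensor([5, 9])      # request 0 owns blocks 5 and 9

    # Gather the keys for request 0's first 6 cached tokens.
    keys = key_buffer[block_table[0, :2]].reshape(-1, num_heads, head_dim)[:6]
    print(keys.shape)  # torch.Size([6, 2, 16])
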
@@ -825,12 +824,17 @@ class DynamicInferenceContext(BaseInferenceContext):
 
     def reset_attention_state(self) -> None:
         """Reset state used within attention, after each step."""
-        # Attention metadata reset is now handled by MHAMetadata.reset()
-        for attn_metadata in self.non_graph_attn_metadata.values():
-            attn_metadata.reset()
-        for attn_metadata in self.graph_attn_metadata.values():
-            attn_metadata.reset()
-        self.active_attn_metadata = None
+        self.max_seqlen_q = None
+        self.max_seqlen_k = None
+        self.cu_query_seq_lengths = None
+        self.cu_query_seq_lengths_cudagraph_only.fill_(0)
+        self.query_seq_lengths_cudagraph_only.fill_(0)
+        self.cu_kv_seq_lengths = None
+        self.cu_kv_seq_lengths_cudagraph_only.fill_(0)
+        self.kv_seq_lengths = None
+        self.kv_seq_lengths_cudagraph_only.fill_(0)
+        self.request_to_kv_block_ids_cudagraph_only.fill_(0)
+        self.block_table = None
 
     def using_cuda_graph_this_step(self) -> bool:
         """Returns True if cuda graphs are being used for this step."""
@@ -930,29 +934,89 @@ class DynamicInferenceContext(BaseInferenceContext):
             self.active_token_count : self.padded_active_token_count
         ] = 0
 
-        real_req_batch_size = (
-            self.total_request_count - self.paused_request_count
-        )  # how many requests are indeed active
-        self.active_attn_metadata = (
-            self.graph_attn_metadata
-            if self.using_cuda_graph_this_step()
-            else self.non_graph_attn_metadata
-        )
-
         # Update cu_query_seq_lengths, max_seqlen_q.
-        active_slice = slice(self.paused_request_count, self.total_request_count)
-        query_lengths_view = self.request_query_lengths[active_slice]
-        request_kv_length_offsets_view = self.request_kv_length_offsets[active_slice]
-        request_to_kv_block_ids_view = self.request_to_kv_block_ids[active_slice]
-        self.active_attn_metadata["mha_metadata"].update(
-            request_query_lengths=query_lengths_view,
-            request_kv_length_offsets=request_kv_length_offsets_view,
-            request_to_kv_block_ids=request_to_kv_block_ids_view,
-            padded_active_token_count=self.padded_active_token_count,
-            real_batch_size=real_req_batch_size,
-            padded_active_request_count=self.padded_active_request_count,
-        )
-        # All attention metadata calculations are now handled by MHAMetadata.update()
+        query_lengths = self.request_query_lengths[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.query_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = query_lengths
+            if self.is_decode_only():
+                self.cu_query_seq_lengths = None  # ensure no accidental use
+                self.max_seqlen_q = 1
+            else:
+                self.cu_query_seq_lengths_cudagraph_only[
+                    1 : self.padded_active_request_count + 1
+                ] = torch.cumsum(
+                    self.query_seq_lengths_cudagraph_only[: self.padded_active_request_count], dim=0
+                )
+
+                # The following will be passed to the FA kernel.
+                self.cu_query_seq_lengths = self.cu_query_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+                self.max_seqlen_q = self.padded_active_token_count
+        else:
+            cu_query_lengths = torch.cumsum(query_lengths, dim=0)
+            self.cu_query_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_query_seq_lengths[1:] = cu_query_lengths
+            self.max_seqlen_q = query_lengths.max().item()
+
+        kv_seq_lengths = self.request_kv_length_offsets + self.request_query_lengths
+        self.kv_seq_lengths = kv_seq_lengths[self.paused_request_count : self.total_request_count]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            # Re-assign `kv_seq_lengths` to be a view of the first
+            # `active_cuda_graph_request_count` tokens of `kv_seq_lengths_decode_only`,
+            # such that `kv_seq_lengths` has a static memory address and is therefore
+            # cuda graph compatible. This allows `kv_seq_lengths` to transition between
+            # cuda graph sizes, which makes multi-batch-size cuda graphs possible.
+            self.kv_seq_lengths_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = self.kv_seq_lengths
+            self.kv_seq_lengths = self.kv_seq_lengths_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+            self.max_seqlen_k = self.max_sequence_length
+            if self.is_decode_only():
+                self.cu_kv_seq_lengths = None  # ensure no accidental use
+            else:
+                cu_kv_lengths = torch.cumsum(self.kv_seq_lengths, dim=0)
+                # The following will be passed to the FA kernel.
+                self.cu_kv_seq_lengths_cudagraph_only[1 : cu_kv_lengths.size(0) + 1] = cu_kv_lengths
+                self.cu_kv_seq_lengths = self.cu_kv_seq_lengths_cudagraph_only[
+                    : (self.padded_active_request_count + 1)
+                ]
+        else:
+            self.cu_kv_seq_lengths = torch.full(
+                (self.total_request_count - self.paused_request_count + 1,),
+                0,
+                dtype=torch.int32,
+                device=torch.cuda.current_device(),
+            )
+            self.cu_kv_seq_lengths[1:] = torch.cumsum(self.kv_seq_lengths, dim=0)
+            self.max_seqlen_k = self.kv_seq_lengths.max().item()
+
+        # Update KV block IDs, block table.
+        request_to_kv_block_ids = self.request_to_kv_block_ids[
+            self.paused_request_count : self.total_request_count
+        ]
+        if self.is_decode_only() or self.using_cuda_graph_this_step():
+            self.request_to_kv_block_ids_cudagraph_only[
+                0 : self.total_request_count - self.paused_request_count
+            ] = request_to_kv_block_ids
+            self.block_table = self.request_to_kv_block_ids_cudagraph_only[
+                : self.padded_active_request_count
+            ]
+        else:
+            self.block_table = self.request_to_kv_block_ids[
+                self.paused_request_count : self.total_request_count
+            ]
 
     def reset(self) -> None:
         """Reset entire context.
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import os
+import warnings
+from enum import Enum, auto
+from pathlib import Path
+
+from torch.cuda.memory import CUDAPluggableAllocator
+from torch.utils.cpp_extension import CUDA_HOME, load_inline
+
+from megatron.core.utils import is_torch_min_version
+
+try:
+    if is_torch_min_version("2.8.0"):
+        from torch.cuda.memory import MemPool
+    else:
+        from torch.cuda import MemPool
+    _has_mem_pool = True
+except ImportError:
+    _has_mem_pool = False
+
+
+class CompilationState(Enum):
+    """Enum to distinguish between unified memory (UVM) compilation states."""
+
+    UNATTEMPTED = auto()  # Compilation has not been attempted.
+    FAILURE = auto()  # Compilation attempted, but failed.
+    SUCCESS = auto()  # Compilation attempted, and succeeded.
+
+
+# Compilation vars.
+_compilation_state = CompilationState.UNATTEMPTED
+_alloc = None  # must remain global until process exit.
+_mod = None  # must remain global until process exit.
+
+
+class UnifiedMemoryUnsupportedError(Exception):
+    """Unified memory is not supported on this system."""
+
+    pass
+
+
+def compile_allocator():
+    """Attempt to compile UVM allocator."""
+
+    global _compilation_state, _alloc, _mod
+
+    if _compilation_state != CompilationState.UNATTEMPTED:
+        return
+
+    _mempool_c_src = r"""
+    #include <cuda_runtime_api.h>
+    #include <cstddef>
+
+    #define EXPORT extern "C"
+
+    EXPORT void* managed_malloc(size_t size, int device, void* stream) {
+        (void)stream;
+        int cur = -1;
+        cudaGetDevice(&cur);
+        if (device != cur && device >= 0) cudaSetDevice(device);
+
+        // cudaMallocManaged allows for more memory to be allocated than the device memory size.
+        // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
+        void* ptr = nullptr;
+        cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
+        if (err != cudaSuccess) return nullptr;
+
+        if (device >= 0) {
+            // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
+            // This is a hint that tries to prevent data from being migrated away from the device.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+            // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
+            // Even if the memory has to be migrated away from the device, it still does not page fault.
+            // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
+            // but there is no harm in adding this flag as well for future-proofing.
+            cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+        }
+        return ptr;
+    }
+
+    EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
+        // Memory allocated with cudaMallocManaged should be released with cudaFree.
+        (void)size; (void)device; (void)stream;
+        if (ptr) cudaFree(ptr);
+    }
+    """
+
+    # Build the .so upon import; this avoids issues.
+    if _has_mem_pool:
+        _extra_ldflags = ["-lcudart"]
+        if CUDA_HOME:
+            _cuda_lib = os.path.join(CUDA_HOME, "lib64")
+            if os.path.isdir(_cuda_lib):
+                _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
+        try:
+            _mod = load_inline(
+                name="managed_alloc_runtime",
+                cpp_sources=[_mempool_c_src],
+                functions=[],
+                with_cuda=True,
+                extra_ldflags=_extra_ldflags,
+                verbose=False,
+            )
+            _so_path = Path(_mod.__file__).as_posix()
+            _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
+            _compilation_state = CompilationState.SUCCESS
+        except (RuntimeError, ImportError, OSError):
+            warnings.warn("Failed to create unified memory mempool.")
+            _compilation_state = CompilationState.FAILURE
+
+
+def create_unified_mempool() -> MemPool:
+    """Create a unified memory mempool using CUDA managed memory.
+
+    Returns:
+        (MemPool) Unified memory mempool.
+    """
+
+    # Attempt to compile allocator.
+    compile_allocator()
+
+    # Return mempool.
+    if _compilation_state != CompilationState.SUCCESS:
+        raise UnifiedMemoryUnsupportedError()
+    else:
+        return MemPool(allocator=_alloc)
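To route allocations through this pool, the MemPool returned by create_unified_mempool() would typically be activated with torch.cuda.use_mem_pool, so tensors created inside the context are backed by cudaMallocManaged and may exceed device memory. A hedged usage sketch (assumes a CUDA-enabled PyTorch build recent enough to expose MemPool; the tensor shape is illustrative):

    import torch

    from megatron.core.inference.unified_memory import (
        UnifiedMemoryUnsupportedError,
        create_unified_mempool,
    )

    try:
        pool = create_unified_mempool()
    except UnifiedMemoryUnsupportedError:
        pool = None  # fall back to the default caching allocator

    if pool is not None:
        # Allocations inside this context come from the managed-memory allocator,
        # so they can spill to host memory under device-memory pressure.
        with torch.cuda.use_mem_pool(pool):
            kv_cache = torch.empty(1024, 1024, device="cuda")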
@@ -4,7 +4,7 @@
 MAJOR = 0
 MINOR = 16
 PATCH = 0
-PRE_RELEASE = 'rc0.dev111286'
+PRE_RELEASE = 'rc0.dev112436'
 
 # Use the following formatting: (major, minor, patch, pre-release)
 VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev111286
+Version: 0.16.0rc0.dev112436
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -169,8 +169,6 @@ megatron/core/inference/contexts/dynamic_block_allocator.py
 megatron/core/inference/contexts/dynamic_context.py
 megatron/core/inference/contexts/fused_kv_append_kernel.py
 megatron/core/inference/contexts/static_context.py
-megatron/core/inference/contexts/attention_context/metadata_base.py
-megatron/core/inference/contexts/attention_context/mha_metadata.py
 megatron/core/inference/engines/__init__.py
 megatron/core/inference/engines/abstract_engine.py
 megatron/core/inference/engines/dynamic_engine.py
@@ -1,72 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
-
-class MetadataBase:
-    """
-    Base class for attention metadata.
-    High-performance attention kernels often require input metadata in specific
-    formats—such as cumulative query lengths, cumulative key/value lengths,
-    and similar structures. Moreover, when using CUDA Graphs, these metadata
-    buffers must be statically allocated. This class serves as a unified container
-    that manages all such metadata in one place.
-    """
-
-    def __init__(self):
-        """
-        Initialize the metadata.
-        """
-        self.state_data = {}
-
-    def update(self, *args, **kwargs):
-        """
-        Construct the metadata from request states.
-        """
-        pass
-
-    def reset(self):
-        """
-        Reset the metadata.
-        """
-        pass
-
-    def tensor_copy_and_pad(
-        self,
-        tensor_buf,
-        unpadded_tensor,
-        real_batch_size,
-        padded_batch_size,
-        is_cumulative_tensor=False,
-        pad_value=0,
-    ):
-        """
-        Copy the unpadded tensor to the tensor_buf,
-        pad the tensor_buf with zero or the last value of the tensor,
-        depending on whether the tensor is cumulative.
-        Args:
-            tensor_buf: The destination tensor, at least padded_batch_size long.
-            unpadded_tensor: The tensor to copy, at least real_batch_size long.
-            real_batch_size: The real batch size.
-            padded_batch_size: Padded boundary of the tensor.
-            is_cumulative_tensor: Whether the tensor is cumulative.
-                If True, we pad the tensor_buf with the last value of the unpadded_tensor.
-            pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
-        """
-        assert real_batch_size <= padded_batch_size
-        assert tensor_buf.shape[0] >= padded_batch_size
-        assert unpadded_tensor.shape[0] >= real_batch_size
-        if is_cumulative_tensor:
-            if real_batch_size == 0:
-                value = pad_value
-            else:
-                value = unpadded_tensor[real_batch_size - 1]
-        else:
-            value = pad_value
-        tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
-        tensor_buf[real_batch_size:padded_batch_size] = value
-        return tensor_buf
-
-    def __str__(self):
-        """
-        Return a string representation of the metadata.
-        """
-        return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])