megatron-core 0.16.0rc0.dev100285__tar.gz → 0.16.0rc0.dev101543__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megatron-core might be problematic. See the registry's release page for more details.

Files changed (361)
  1. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/dynamic_context.py +126 -77
  3. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/engines/dynamic_engine.py +1 -15
  4. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +1 -2
  5. megatron_core-0.16.0rc0.dev101543/megatron/core/inference/unified_memory.py +89 -0
  6. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/gpt_model.py +2 -1
  7. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/package_info.py +1 -1
  8. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron_core.egg-info/PKG-INFO +1 -1
  9. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron_core.egg-info/SOURCES.txt +0 -2
  10. megatron_core-0.16.0rc0.dev100285/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -72
  11. megatron_core-0.16.0rc0.dev100285/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -220
  12. megatron_core-0.16.0rc0.dev100285/megatron/core/inference/unified_memory.py +0 -127
  13. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/MANIFEST.in +0 -0
  14. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/README.md +0 -0
  15. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/README.md +0 -0
  16. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/__init__.py +0 -0
  17. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/activations.py +0 -0
  18. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/config.py +0 -0
  19. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/config_logger.py +0 -0
  20. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/__init__.py +0 -0
  21. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/bert_dataset.py +0 -0
  22. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/blended_dataset.py +0 -0
  23. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  24. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  25. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/gpt_dataset.py +0 -0
  26. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/helpers.cpp +0 -0
  27. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/helpers.py +0 -0
  28. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/indexed_dataset.py +0 -0
  29. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/masked_dataset.py +0 -0
  30. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/megatron_dataset.py +0 -0
  31. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  32. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/multimodal_dataset.py +0 -0
  33. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/object_storage_utils.py +0 -0
  34. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/__init__.py +0 -0
  35. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/config/__init__.py +0 -0
  36. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  37. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/config/config.py +0 -0
  38. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  39. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  40. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/db/__init__.py +0 -0
  41. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/db/build.py +0 -0
  42. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/db/dataset.py +0 -0
  43. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/db/utils.py +0 -0
  44. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/external_libs.py +0 -0
  45. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/__init__.py +0 -0
  46. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/build.py +0 -0
  47. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/factory.py +0 -0
  48. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/index.py +0 -0
  49. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  50. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  51. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  52. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/utils.py +0 -0
  53. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/index/validate.py +0 -0
  54. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/__init__.py +0 -0
  55. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  56. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  57. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/query.py +0 -0
  58. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  59. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/query/utils.py +0 -0
  60. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/retro/utils.py +0 -0
  61. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/t5_dataset.py +0 -0
  62. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/utils.py +0 -0
  63. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/datasets/utils_s3.py +0 -0
  64. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/__init__.py +0 -0
  65. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/core.py +0 -0
  66. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  67. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  68. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/mapping.py +0 -0
  69. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  70. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/serialization.py +0 -0
  71. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  72. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  73. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  74. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  75. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  76. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  77. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  78. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  79. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  80. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  81. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  82. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  83. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  84. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  85. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  86. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  87. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/utils.py +0 -0
  88. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/dist_checkpointing/validation.py +0 -0
  89. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/__init__.py +0 -0
  90. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/data_parallel_base.py +0 -0
  91. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  92. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  93. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/finalize_model_grads.py +0 -0
  94. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/__init__.py +0 -0
  95. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  96. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  97. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  98. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  99. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  100. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  101. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  102. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  103. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  104. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  105. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  106. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  107. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  108. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  109. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/energy_monitor.py +0 -0
  110. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/enums.py +0 -0
  111. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/__init__.py +0 -0
  112. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/data_type.py +0 -0
  113. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/export_config.py +0 -0
  114. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/model_type.py +0 -0
  115. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/__init__.py +0 -0
  116. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  117. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  118. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  119. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  120. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  121. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  122. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  123. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  124. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  125. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  126. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  127. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  128. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/extensions/__init__.py +0 -0
  129. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/extensions/kitchen.py +0 -0
  130. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/extensions/transformer_engine.py +0 -0
  131. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  132. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fp4_utils.py +0 -0
  133. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fp8_utils.py +0 -0
  134. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/full_cuda_graph.py +0 -0
  135. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/__init__.py +0 -0
  136. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  137. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  138. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  139. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  140. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  141. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_indices_converter.py +0 -0
  142. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_layer_norm.py +0 -0
  143. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  144. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  145. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_softmax.py +0 -0
  146. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  147. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/hyper_comm_grid.py +0 -0
  148. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/__init__.py +0 -0
  149. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/async_stream.py +0 -0
  150. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/common_inference_params.py +0 -0
  151. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/communication_utils.py +0 -0
  152. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/__init__.py +0 -0
  153. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/base_context.py +0 -0
  154. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  155. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
  156. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/contexts/static_context.py +0 -0
  157. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  158. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/engines/__init__.py +0 -0
  159. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/engines/abstract_engine.py +0 -0
  160. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/engines/mcore_engine.py +0 -0
  161. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/engines/static_engine.py +0 -0
  162. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/headers.py +0 -0
  163. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/inference_client.py +0 -0
  164. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/inference_request.py +0 -0
  165. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  166. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  167. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  168. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  169. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  170. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  171. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  172. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  173. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/sampling_params.py +0 -0
  174. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/scheduler.py +0 -0
  175. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  176. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  177. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  178. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  179. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  180. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  181. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  182. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  183. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  184. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  185. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference/utils.py +0 -0
  186. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/inference_params.py +0 -0
  187. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/jit.py +0 -0
  188. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/model_parallel_config.py +0 -0
  189. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/T5/__init__.py +0 -0
  190. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/T5/t5_model.py +0 -0
  191. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/T5/t5_spec.py +0 -0
  192. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/__init__.py +0 -0
  193. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/backends.py +0 -0
  194. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/bert/__init__.py +0 -0
  195. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  196. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/bert/bert_lm_head.py +0 -0
  197. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/bert/bert_model.py +0 -0
  198. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/bert/pooler.py +0 -0
  199. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/__init__.py +0 -0
  200. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/__init__.py +0 -0
  201. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  202. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  203. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  204. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  205. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  206. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/language_module/__init__.py +0 -0
  207. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/language_module/language_module.py +0 -0
  208. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  209. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/vision_module/__init__.py +0 -0
  210. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  211. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/__init__.py +0 -0
  212. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  213. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  214. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  215. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  216. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/huggingface/__init__.py +0 -0
  217. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/huggingface/clip_model.py +0 -0
  218. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/huggingface/module.py +0 -0
  219. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/huggingface/qwen_model.py +0 -0
  220. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mamba/__init__.py +0 -0
  221. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  222. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mamba/mamba_model.py +0 -0
  223. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/__init__.py +0 -0
  224. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/config/__init__.py +0 -0
  225. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/config/base_configs.py +0 -0
  226. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/model/__init__.py +0 -0
  227. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/model/base.py +0 -0
  228. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/submodules/audio.py +0 -0
  229. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/submodules/base.py +0 -0
  230. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/mimo/submodules/vision.py +0 -0
  231. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/multimodal/__init__.py +0 -0
  232. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/multimodal/context_parallel.py +0 -0
  233. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/multimodal/llava_model.py +0 -0
  234. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/multimodal/llava_spec.py +0 -0
  235. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/__init__.py +0 -0
  236. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/base_attention.py +0 -0
  237. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/config.py +0 -0
  238. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/decoder_attention.py +0 -0
  239. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/decoder_spec.py +0 -0
  240. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/encoder_attention.py +0 -0
  241. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/encoder_spec.py +0 -0
  242. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/model.py +0 -0
  243. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/retro/utils.py +0 -0
  244. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/vision/__init__.py +0 -0
  245. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/vision/clip_vit_model.py +0 -0
  246. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/vision/multimodal_projector.py +0 -0
  247. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/vision/radio.py +0 -0
  248. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  249. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/msc_utils.py +0 -0
  250. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/nccl_allocator.py +0 -0
  251. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/num_microbatches_calculator.py +0 -0
  252. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/__init__.py +0 -0
  253. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/clip_grads.py +0 -0
  254. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  255. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  256. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  257. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/grad_scaler.py +0 -0
  258. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/optimizer.py +0 -0
  259. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer/optimizer_config.py +0 -0
  260. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/optimizer_param_scheduler.py +0 -0
  261. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/packed_seq_params.py +0 -0
  262. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/parallel_state.py +0 -0
  263. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/__init__.py +0 -0
  264. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  265. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  266. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  267. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/schedules.py +0 -0
  268. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/pipeline_parallel/utils.py +0 -0
  269. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/__init__.py +0 -0
  270. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/__init__.py +0 -0
  271. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  272. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  273. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  274. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/layers.py +0 -0
  275. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  276. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  277. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/process_groups_config.py +0 -0
  278. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/quantization/__init__.py +0 -0
  279. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/quantization/quant_config.py +0 -0
  280. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/quantization/utils.py +0 -0
  281. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/requirements.txt +0 -0
  282. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/rerun_state_machine.py +0 -0
  283. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/safe_globals.py +0 -0
  284. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/__init__.py +0 -0
  285. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mamba_block.py +0 -0
  286. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  287. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  288. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mamba_layer.py +0 -0
  289. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mamba_mixer.py +0 -0
  290. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/mlp_layer.py +0 -0
  291. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/ssm/triton_cache_manager.py +0 -0
  292. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/__init__.py +0 -0
  293. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  294. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/data.py +0 -0
  295. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/layers.py +0 -0
  296. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/mappings.py +0 -0
  297. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/random.py +0 -0
  298. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tensor_parallel/utils.py +0 -0
  299. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/timers.py +0 -0
  300. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/__init__.py +0 -0
  301. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  302. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  303. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/__init__.py +0 -0
  304. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  305. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  306. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  307. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  308. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  309. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  314. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  315. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  318. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/__init__.py +0 -0
  323. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/attention.py +0 -0
  324. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/cuda_graphs.py +0 -0
  325. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  326. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  327. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/dot_product_attention.py +0 -0
  328. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/enums.py +0 -0
  329. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  330. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  331. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  332. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/identity_op.py +0 -0
  333. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/mlp.py +0 -0
  334. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/module.py +0 -0
  335. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/__init__.py +0 -0
  336. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/experts.py +0 -0
  337. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  338. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  339. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/moe_layer.py +0 -0
  340. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/moe_utils.py +0 -0
  341. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/router.py +0 -0
  342. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/shared_experts.py +0 -0
  343. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  344. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  345. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/multi_latent_attention.py +0 -0
  346. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/multi_token_prediction.py +0 -0
  347. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  348. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/spec_utils.py +0 -0
  349. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/torch_layer_norm.py +0 -0
  350. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/torch_norm.py +0 -0
  351. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/transformer_block.py +0 -0
  352. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/transformer_config.py +0 -0
  353. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/transformer_layer.py +0 -0
  354. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/transformer/utils.py +0 -0
  355. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron/core/utils.py +0 -0
  356. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron_core.egg-info/dependency_links.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron_core.egg-info/requires.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/megatron_core.egg-info/top_level.txt +0 -0
  359. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/pyproject.toml +0 -0
  360. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/setup.cfg +0 -0
  361. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101543}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev100285
3
+ Version: 0.16.0rc0.dev101543
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -16,17 +16,13 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
16
16
  from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
17
17
  InferenceWrapperConfig,
18
18
  )
19
- from megatron.core.inference.unified_memory import (
20
- UnifiedMemoryUnsupportedError,
21
- create_unified_mempool,
22
- )
19
+ from megatron.core.inference.unified_memory import create_unified_mempool, has_unified_memory
23
20
  from megatron.core.inference.utils import tensor_swap
24
21
  from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
25
22
  from megatron.core.package_info import __version__ as mcore_version
26
23
  from megatron.core.transformer import TransformerConfig
27
24
  from megatron.core.utils import divide as core_divide
28
25
 
29
- from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
30
26
  from .base_context import BaseInferenceContext
31
27
  from .dynamic_block_allocator import BlockAllocator
32
28
 
@@ -326,20 +322,16 @@ class DynamicInferenceContext(BaseInferenceContext):
326
322
  self.params_dtype = params_dtype
327
323
  self.num_layers = num_layers
328
324
  self.max_sequence_length = max_sequence_length
329
-
330
- # Unified memory.
331
325
  self.unified_memory_level = unified_memory_level
332
326
  if unified_memory_level > 0:
333
- try:
334
- self.unified_memory_mempool = create_unified_mempool()
335
- except UnifiedMemoryUnsupportedError:
336
- if torch.distributed.get_rank() == 0:
337
- warnings.warn(
338
- "Unified memory requested but not available; defaulting to GPU memory."
339
- )
327
+ if not has_unified_memory and torch.distributed.get_rank() == 0:
328
+ warnings.warn(
329
+ "Unified memory requested but not available; defaulting to GPU memory."
330
+ )
340
331
  self.unified_memory_level = 0
332
+ else:
333
+ self.unified_memory_mempool = create_unified_mempool()
341
334
 
342
- # Request and token counts.
343
335
  self.total_request_count = 0
344
336
  self.active_token_count = 0
345
337
  self.paused_request_count = 0
@@ -455,26 +447,30 @@ class DynamicInferenceContext(BaseInferenceContext):
455
447
  num_cuda_graphs is not None
456
448
  )
457
449
 
458
- # Attention metadata initialization (tensors are now handled by MHAMetadata classes)
450
+ # `*_cudagraph_only` tensors are for use with cuda graphs to maintain
451
+ # consistent input shapes, which is required to use cuda graphs.
452
+ # During these steps, the `*_cudagraph_only`
453
+ # tensors are used, otherwise their same-name but un-suffixed
454
+ # corresponding tensors are used.
459
455
 
460
- self.graph_attn_metadata = {}
461
- self.non_graph_attn_metadata = {}
462
- self.active_attn_metadata = None
463
-
464
- self.graph_attn_metadata["mha_metadata"] = GraphedMHAMetadata(
465
- block_count_total=block_count_total,
466
- max_kv_block_count=self.max_kv_block_count,
467
- max_requests=self.max_requests,
468
- block_size_tokens=self.block_size_tokens,
469
- max_seqlen=self.max_sequence_length,
456
+ self.query_seq_lengths_cudagraph_only = torch.full(
457
+ (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
458
+ )
459
+ self.cu_query_seq_lengths_cudagraph_only = torch.full(
460
+ (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
461
+ )
462
+ self.kv_seq_lengths_cudagraph_only = torch.full(
463
+ (self.max_requests,), 0, dtype=torch.int32, device=torch.cuda.current_device()
464
+ )
465
+ self.cu_kv_seq_lengths_cudagraph_only = torch.full(
466
+ (self.max_requests + 1,), 0, dtype=torch.int32, device=torch.cuda.current_device()
470
467
  )
471
468
 
472
- self.non_graph_attn_metadata["mha_metadata"] = NonGraphedMHAMetadata(
473
- block_count_total=block_count_total,
474
- max_kv_block_count=self.max_kv_block_count,
475
- max_requests=self.max_requests,
476
- block_size_tokens=self.block_size_tokens,
477
- max_seqlen=self.max_sequence_length,
469
+ self.request_to_kv_block_ids_cudagraph_only = torch.full(
470
+ (self.max_requests, self.max_kv_block_count),
471
+ 0,
472
+ dtype=torch.int,
473
+ device=torch.cuda.current_device(),
478
474
  )
479
475
 
480
476
  # Guaranteed active requests.
@@ -624,18 +620,11 @@ class DynamicInferenceContext(BaseInferenceContext):
624
620
 
625
621
  def cu_query_lengths(self) -> Tuple[Tensor, int]:
626
622
  """Cumulative query sequence lengths."""
627
- return (
628
- self.active_attn_metadata["mha_metadata"].state_data["cu_query_seq_lengths"],
629
- self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_q"],
630
- )
623
+ return self.cu_query_seq_lengths, self.max_seqlen_q
631
624
 
632
- def cu_kv_lengths(self) -> Tuple[Tensor, Tensor, int]:
625
+ def cu_kv_lengths(self) -> Tensor:
633
626
  """Cumulative key/value sequence lengths."""
634
- return (
635
- self.active_attn_metadata["mha_metadata"].state_data["cu_kv_seq_lengths"],
636
- self.active_attn_metadata["mha_metadata"].state_data["kv_seq_lengths"],
637
- self.active_attn_metadata["mha_metadata"].state_data["max_seqlen_k"],
638
- )
627
+ return (self.cu_kv_seq_lengths, self.kv_seq_lengths, self.max_seqlen_k)
639
628
 
640
629
  def get_active_sequence_lengths(self) -> Tensor:
641
630
  """Total sequence length (query + key) for active requests."""
@@ -713,16 +702,12 @@ class DynamicInferenceContext(BaseInferenceContext):
713
702
  to blocks within the block-level memory buffer.
714
703
  """
715
704
  if self.cache_mla_latent:
716
- return (
717
- self.memory_buffer[layer_number - 1],
718
- None,
719
- self.active_attn_metadata["mha_metadata"].state_data["block_table"],
720
- )
705
+ return (self.memory_buffer[layer_number - 1], None, self.block_table)
721
706
  else:
722
707
  return (
723
708
  self.memory_buffer[0, layer_number - 1],
724
709
  self.memory_buffer[1, layer_number - 1],
725
- self.active_attn_metadata["mha_metadata"].state_data["block_table"],
710
+ self.block_table,
726
711
  )
727
712
 
728
713
  def apply_fused_qk_rotary_emb(
@@ -832,12 +817,17 @@ class DynamicInferenceContext(BaseInferenceContext):
832
817
 
833
818
  def reset_attention_state(self) -> None:
834
819
  """Reset state used within attention, after each step."""
835
- # Attention metadata reset is now handled by MHAMetadata.reset()
836
- for attn_metadata in self.non_graph_attn_metadata.values():
837
- attn_metadata.reset()
838
- for attn_metadata in self.graph_attn_metadata.values():
839
- attn_metadata.reset()
840
- self.active_attn_metadata = None
820
+ self.max_seqlen_q = None
821
+ self.max_seqlen_k = None
822
+ self.cu_query_seq_lengths = None
823
+ self.cu_query_seq_lengths_cudagraph_only.fill_(0)
824
+ self.query_seq_lengths_cudagraph_only.fill_(0)
825
+ self.cu_kv_seq_lengths = None
826
+ self.cu_kv_seq_lengths_cudagraph_only.fill_(0)
827
+ self.kv_seq_lengths = None
828
+ self.kv_seq_lengths_cudagraph_only.fill_(0)
829
+ self.request_to_kv_block_ids_cudagraph_only.fill_(0)
830
+ self.block_table = None
841
831
 
842
832
  def using_cuda_graph_this_step(self) -> bool:
843
833
  """Returns True if cuda graphs are being used for this step."""
@@ -937,30 +927,89 @@ class DynamicInferenceContext(BaseInferenceContext):
937
927
  self.active_token_count : self.padded_active_token_count
938
928
  ] = 0
939
929
 
940
- real_req_batch_size = (
941
- self.total_request_count - self.paused_request_count
942
- ) # how many requests are indeed active
943
- self.active_attn_metadata = (
944
- self.graph_attn_metadata
945
- if self.using_cuda_graph_this_step()
946
- else self.non_graph_attn_metadata
947
- )
948
-
949
930
  # Update cu_query_seq_lengths, max_seqlen_q.
950
- active_slice = slice(self.paused_request_count, self.total_request_count)
951
- query_lengths_view = self.request_query_lengths[active_slice]
952
- request_kv_length_offsets_view = self.request_kv_length_offsets[active_slice]
953
- request_to_kv_block_ids_view = self.request_to_kv_block_ids[active_slice]
954
- self.active_attn_metadata["mha_metadata"].update(
955
- request_query_lengths=query_lengths_view,
956
- request_kv_length_offsets=request_kv_length_offsets_view,
957
- request_to_kv_block_ids=request_to_kv_block_ids_view,
958
- padded_active_token_count=self.padded_active_token_count,
959
- real_batch_size=real_req_batch_size,
960
- padded_active_request_count=self.padded_active_request_count,
961
- decode_only=self.is_decode_only(),
962
- )
963
- # All attention metadata calculations are now handled by MHAMetadata.update()
931
+ query_lengths = self.request_query_lengths[
932
+ self.paused_request_count : self.total_request_count
933
+ ]
934
+ if self.is_decode_only() or self.using_cuda_graph_this_step():
935
+ self.query_seq_lengths_cudagraph_only[
936
+ 0 : self.total_request_count - self.paused_request_count
937
+ ] = query_lengths
938
+ if self.is_decode_only():
939
+ self.cu_query_seq_lengths = None # ensure no accidental use
940
+ self.max_seqlen_q = 1
941
+ else:
942
+ self.cu_query_seq_lengths_cudagraph_only[
943
+ 1 : self.padded_active_request_count + 1
944
+ ] = torch.cumsum(
945
+ self.query_seq_lengths_cudagraph_only[: self.padded_active_request_count], dim=0
946
+ )
947
+
948
+ # The following will be passed to the FA kernel.
949
+ self.cu_query_seq_lengths = self.cu_query_seq_lengths_cudagraph_only[
950
+ : (self.padded_active_request_count + 1)
951
+ ]
952
+ self.max_seqlen_q = self.padded_active_token_count
953
+ else:
954
+ cu_query_lengths = torch.cumsum(query_lengths, dim=0)
955
+ self.cu_query_seq_lengths = torch.full(
956
+ (self.total_request_count - self.paused_request_count + 1,),
957
+ 0,
958
+ dtype=torch.int32,
959
+ device=torch.cuda.current_device(),
960
+ )
961
+ self.cu_query_seq_lengths[1:] = cu_query_lengths
962
+ self.max_seqlen_q = query_lengths.max().item()
963
+
964
+ kv_seq_lengths = self.request_kv_length_offsets + self.request_query_lengths
965
+ self.kv_seq_lengths = kv_seq_lengths[self.paused_request_count : self.total_request_count]
966
+ if self.is_decode_only() or self.using_cuda_graph_this_step():
967
+ # Re-assign `kv_seq_lengths` to be a view of the first
968
+ # `active_cuda_graph_request_count` tokens of `kv_seq_lengths_decode_only`,
969
+ # such that `kv_seq_lengths` has a static memory address and is therefore
970
+ # cuda graph compatible. This allows `kv_seq_lengths` to transition between,
971
+ # cuda graph sizes, which makes multi-batch-size cuda graphs possible.
972
+ self.kv_seq_lengths_cudagraph_only[
973
+ 0 : self.total_request_count - self.paused_request_count
974
+ ] = self.kv_seq_lengths
975
+ self.kv_seq_lengths = self.kv_seq_lengths_cudagraph_only[
976
+ : self.padded_active_request_count
977
+ ]
978
+ self.max_seqlen_k = self.max_sequence_length
979
+ if self.is_decode_only():
980
+ self.cu_kv_seq_lengths = None # ensure no accidental use
981
+ else:
982
+ cu_kv_lengths = torch.cumsum(self.kv_seq_lengths, dim=0)
983
+ # The following will be passed to the FA kernel.
984
+ self.cu_kv_seq_lengths_cudagraph_only[1 : cu_kv_lengths.size(0) + 1] = cu_kv_lengths
985
+ self.cu_kv_seq_lengths = self.cu_kv_seq_lengths_cudagraph_only[
986
+ : (self.padded_active_request_count + 1)
987
+ ]
988
+ else:
989
+ self.cu_kv_seq_lengths = torch.full(
990
+ (self.total_request_count - self.paused_request_count + 1,),
991
+ 0,
992
+ dtype=torch.int32,
993
+ device=torch.cuda.current_device(),
994
+ )
995
+ self.cu_kv_seq_lengths[1:] = torch.cumsum(self.kv_seq_lengths, dim=0)
996
+ self.max_seqlen_k = self.kv_seq_lengths.max().item()
997
+
998
+ # Update KV block IDs, block table.
999
+ request_to_kv_block_ids = self.request_to_kv_block_ids[
1000
+ self.paused_request_count : self.total_request_count
1001
+ ]
1002
+ if self.is_decode_only() or self.using_cuda_graph_this_step():
1003
+ self.request_to_kv_block_ids_cudagraph_only[
1004
+ 0 : self.total_request_count - self.paused_request_count
1005
+ ] = request_to_kv_block_ids
1006
+ self.block_table = self.request_to_kv_block_ids_cudagraph_only[
1007
+ : self.padded_active_request_count
1008
+ ]
1009
+ else:
1010
+ self.block_table = self.request_to_kv_block_ids[
1011
+ self.paused_request_count : self.total_request_count
1012
+ ]
964
1013
 
965
1014
  def reset(self) -> None:
966
1015
  """Reset entire context.
@@ -165,17 +165,6 @@ class DynamicInferenceEngine(AbstractEngine):
165
165
  context = self.context
166
166
  controller = self.controller
167
167
 
168
- config = controller.inference_wrapped_model.inference_wrapper_config
169
- moe_pad_experts = config.moe_pad_experts_for_cuda_graph_inference
170
-
171
- if moe_pad_experts and context.non_decode_cuda_graphs:
172
- context.non_decode_cuda_graphs = False
173
- if torch.distributed.get_rank() == 0:
174
- warnings.warn(
175
- "MoE models do not support non-decode cuda graphs. "
176
- "Forcing non_decode_cuda_graphs to False."
177
- )
178
-
179
168
  time_start = time.time()
180
169
  mem_stats_start = torch.cuda.memory_stats()
181
170
 
@@ -185,18 +174,15 @@ class DynamicInferenceEngine(AbstractEngine):
185
174
  context.cuda_graph_token_counts,
186
175
  )
187
176
  for warmup_engine_mode in [WarmupEngineMode.DECODE, WarmupEngineMode.NON_DECODE]:
188
- # Check whether to skip non-decode graphs.
177
+ # Iterate cuda graph dims.
189
178
  if (
190
179
  warmup_engine_mode == WarmupEngineMode.NON_DECODE
191
180
  and not context.non_decode_cuda_graphs
192
181
  ):
193
182
  continue
194
-
195
183
  tbar = enumerate(context.cuda_graph_token_counts)
196
184
  if HAVE_TQDM:
197
185
  tbar = tqdm(tbar, total=len(context.cuda_graph_token_counts))
198
-
199
- # Iterate cuda graph dims.
200
186
  for tbar_idx, cuda_graph_token_count in tbar:
201
187
  if (
202
188
  cuda_graph_token_count == 1
@@ -508,8 +508,7 @@ class TextGenerationController:
508
508
  inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference
509
509
  )
510
510
  if moe_pad_experts_for_cuda_graph_inference:
511
- assert warmup_engine_mode is not WarmupEngineMode.NON_DECODE
512
- if context.is_decode_only():
511
+ if context.is_decode_only() or warmup_engine_mode is not None:
513
512
  capacity_factor = model_config.num_moe_experts / model_config.moe_router_topk
514
513
  set_decode_expert_padding(unwrapped_model, True, capacity_factor=capacity_factor)
515
514
  else:
@@ -0,0 +1,89 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+
3
+ import os
4
+ import warnings
5
+ from pathlib import Path
6
+
7
+ from torch.cuda.memory import CUDAPluggableAllocator
8
+ from torch.utils.cpp_extension import CUDA_HOME, load_inline
9
+
10
+ from megatron.core.utils import is_torch_min_version
11
+
12
+ try:
13
+ if is_torch_min_version("2.8.0"):
14
+ from torch.cuda.memory import MemPool
15
+ else:
16
+ from torch.cuda import MemPool
17
+ _has_mem_pool = True
18
+ except ImportError:
19
+ _has_mem_pool = False
20
+
21
+ _mempool_c_src = r"""
22
+ #include <cuda_runtime_api.h>
23
+ #include <cstddef>
24
+
25
+ #define EXPORT extern "C"
26
+
27
+ EXPORT void* managed_malloc(size_t size, int device, void* stream) {
28
+ (void)stream;
29
+ int cur = -1;
30
+ cudaGetDevice(&cur);
31
+ if (device != cur && device >= 0) cudaSetDevice(device);
32
+
33
+ // cudaMallocManaged allows for more memory to be allocated than the device memory size.
34
+ // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
35
+ void* ptr = nullptr;
36
+ cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
37
+ if (err != cudaSuccess) return nullptr;
38
+
39
+ if (device >= 0) {
40
+ // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
41
+ // This is a hint that tries to prevent data from being migrated away from the device.
42
+ cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
43
+ // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
44
+ // Even if the memory has to be migrated away from the device, it still does not page fault.
45
+ // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
46
+ // but there is no harm in adding this flag as well for future-proofing.
47
+ cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
48
+ }
49
+ return ptr;
50
+ }
51
+
52
+ EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
53
+ // Memory allocated with cudaMallocManaged should be released with cudaFree.
54
+ (void)size; (void)device; (void)stream;
55
+ if (ptr) cudaFree(ptr);
56
+ }
57
+ """
58
+
59
+ # Avoid linting errors.
60
+ has_unified_memory = False
61
+ _alloc = None
62
+
63
+ # Build the .so upon import; this avoids issues.
64
+ if _has_mem_pool:
65
+ _extra_ldflags = ["-lcudart"]
66
+ if CUDA_HOME:
67
+ _cuda_lib = os.path.join(CUDA_HOME, "lib64")
68
+ if os.path.isdir(_cuda_lib):
69
+ _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
70
+ try:
71
+ _mod = load_inline(
72
+ name="managed_alloc_runtime",
73
+ cpp_sources=[_mempool_c_src],
74
+ functions=[],
75
+ with_cuda=True,
76
+ extra_ldflags=_extra_ldflags,
77
+ verbose=False,
78
+ )
79
+ _so_path = Path(_mod.__file__).as_posix()
80
+ _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
81
+ has_unified_memory = True
82
+ except (RuntimeError, ImportError, OSError):
83
+ warnings.warn("Failed to create unified memory mempool.")
84
+
85
+
86
+ def create_unified_mempool():
87
+ """Create a unified memory mempool using CUDA managed memory."""
88
+ assert has_unified_memory
89
+ return MemPool(allocator=_alloc)
@@ -375,13 +375,14 @@ class GPTModel(LanguageModule):
375
375
  )
376
376
  or self.config.flash_decode
377
377
  )
378
+ and rotary_pos_cos is not None
378
379
  and inference_context.is_static_batching()
379
380
  ):
380
381
  current_batch_size = input_ids.shape[0]
381
382
  sequence_len_offset = torch.tensor(
382
383
  [inference_context.sequence_len_offset] * current_batch_size,
383
384
  dtype=torch.int32,
384
- device=torch.cuda.current_device(),
385
+ device=rotary_pos_cos.device, # Co-locate this with the rotary tensors
385
386
  )
386
387
  else:
387
388
  sequence_len_offset = None
@@ -4,7 +4,7 @@
4
4
  MAJOR = 0
5
5
  MINOR = 16
6
6
  PATCH = 0
7
- PRE_RELEASE = 'rc0.dev100285'
7
+ PRE_RELEASE = 'rc0.dev101543'
8
8
 
9
9
  # Use the following formatting: (major, minor, patch, pre-release)
10
10
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: megatron-core
3
- Version: 0.16.0rc0.dev100285
3
+ Version: 0.16.0rc0.dev101543
4
4
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
5
5
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
6
6
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -169,8 +169,6 @@ megatron/core/inference/contexts/dynamic_block_allocator.py
169
169
  megatron/core/inference/contexts/dynamic_context.py
170
170
  megatron/core/inference/contexts/fused_kv_append_kernel.py
171
171
  megatron/core/inference/contexts/static_context.py
172
- megatron/core/inference/contexts/attention_context/metadata_base.py
173
- megatron/core/inference/contexts/attention_context/mha_metadata.py
174
172
  megatron/core/inference/engines/__init__.py
175
173
  megatron/core/inference/engines/abstract_engine.py
176
174
  megatron/core/inference/engines/dynamic_engine.py
@@ -1,72 +0,0 @@
1
- # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
-
3
-
4
- class MetadataBase:
5
- """
6
- Base class for attention metadata.
7
- High-performance attention kernels often require input metadata in specific
8
- formats—such as cumulative query lengths, cumulative key/value lengths,
9
- and similar structures. Moreover, when using CUDA Graphs, these metadata
10
- buffers must be statically allocated. This class serves as a unified container
11
- that manages all such metadata in one place.
12
- """
13
-
14
- def __init__(self):
15
- """
16
- Initialize the metadata.
17
- """
18
- self.state_data = {}
19
-
20
- def update(self, *args, **kwargs):
21
- """
22
- Construct the metadata from request states.
23
- """
24
- pass
25
-
26
- def reset(self):
27
- """
28
- Reset the metadata.
29
- """
30
- pass
31
-
32
- def tensor_copy_and_pad(
33
- self,
34
- tensor_buf,
35
- unpadded_tensor,
36
- real_batch_size,
37
- padded_batch_size,
38
- is_cumulative_tensor=False,
39
- pad_value=0,
40
- ):
41
- """
42
- Copy the unpadded tensor to the tensor_buf,
43
- pad the tensor_buf with zero or the last value of the tensor,
44
- depending on whether the tensor is cumulative.
45
- Args:
46
- tensor_buf: The destination tensor, at least padded_batch_size long.
47
- unpadded_tensor: The tensor to copy, at least real_batch_size long.
48
- real_batch_size: The real batch size.
49
- padded_batch_size: Padded boundary of the tensor.
50
- is_cumulative_tensor: Whether the tensor is cumulative.
51
- If True, we pad the tensor_buf with the last value of the unpadded_tensor.
52
- pad_value: The value to pad the tensor_buf with when the tensor is not cumulative.
53
- """
54
- assert real_batch_size <= padded_batch_size
55
- assert tensor_buf.shape[0] >= padded_batch_size
56
- assert unpadded_tensor.shape[0] >= real_batch_size
57
- if is_cumulative_tensor:
58
- if real_batch_size == 0:
59
- value = pad_value
60
- else:
61
- value = unpadded_tensor[real_batch_size - 1]
62
- else:
63
- value = pad_value
64
- tensor_buf[0:real_batch_size] = unpadded_tensor[:real_batch_size]
65
- tensor_buf[real_batch_size:padded_batch_size] = value
66
- return tensor_buf
67
-
68
- def __str__(self):
69
- """
70
- Return a string representation of the metadata.
71
- """
72
- return "\n".join([f"{key}: {value}" for key, value in self.state_data.items()])