megatron-core 0.16.0rc0.dev127802__tar.gz → 0.16.0rc0.dev128858__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megatron-core might be problematic.

Files changed (361)
  1. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/async_stream.py +2 -8
  3. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_context.py +32 -188
  4. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/fused_kv_append_kernel.py +2 -2
  5. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -7
  6. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/dynamic_engine.py +13 -27
  7. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/static_engine.py +7 -3
  8. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_client.py +1 -3
  9. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +2 -4
  10. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/utils.py +0 -28
  11. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_model.py +3 -1
  12. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_model.py +1 -30
  13. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/package_info.py +1 -1
  14. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_block.py +25 -16
  15. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +2 -29
  16. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_layer.py +5 -5
  17. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_mixer.py +57 -301
  18. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/utils.py +1 -143
  19. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/PKG-INFO +1 -1
  20. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/SOURCES.txt +0 -1
  21. megatron_core-0.16.0rc0.dev127802/megatron/core/inference/contexts/attention_context/mamba_metadata.py +0 -106
  22. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/MANIFEST.in +0 -0
  23. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/README.md +0 -0
  24. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/README.md +0 -0
  25. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/__init__.py +0 -0
  26. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/activations.py +0 -0
  27. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config.py +0 -0
  28. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/config_logger.py +0 -0
  29. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/__init__.py +0 -0
  30. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/bert_dataset.py +0 -0
  31. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_dataset.py +0 -0
  32. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  33. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  34. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/gpt_dataset.py +0 -0
  35. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.cpp +0 -0
  36. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/helpers.py +0 -0
  37. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/indexed_dataset.py +0 -0
  38. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/masked_dataset.py +0 -0
  39. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_dataset.py +0 -0
  40. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  41. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/multimodal_dataset.py +0 -0
  42. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/object_storage_utils.py +0 -0
  43. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/__init__.py +0 -0
  44. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/__init__.py +0 -0
  45. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  46. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/config.py +0 -0
  47. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  48. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  49. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/__init__.py +0 -0
  50. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/build.py +0 -0
  51. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/dataset.py +0 -0
  52. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/db/utils.py +0 -0
  53. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/external_libs.py +0 -0
  54. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/__init__.py +0 -0
  55. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/build.py +0 -0
  56. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/factory.py +0 -0
  57. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/index.py +0 -0
  58. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  59. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  60. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  61. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/utils.py +0 -0
  62. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/index/validate.py +0 -0
  63. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/__init__.py +0 -0
  64. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  65. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  66. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/query.py +0 -0
  67. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  68. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/query/utils.py +0 -0
  69. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/retro/utils.py +0 -0
  70. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/t5_dataset.py +0 -0
  71. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils.py +0 -0
  72. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/datasets/utils_s3.py +0 -0
  73. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/__init__.py +0 -0
  74. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/core.py +0 -0
  75. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  76. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  77. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/mapping.py +0 -0
  78. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  79. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/serialization.py +0 -0
  80. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  81. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  82. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  83. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  84. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  85. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  86. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  87. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  88. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  89. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  90. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  91. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  92. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  93. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  94. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  95. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  96. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/utils.py +0 -0
  97. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/dist_checkpointing/validation.py +0 -0
  98. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/__init__.py +0 -0
  99. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/data_parallel_base.py +0 -0
  100. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  101. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  102. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/finalize_model_grads.py +0 -0
  103. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/__init__.py +0 -0
  104. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  105. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  106. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  107. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  108. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  109. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  110. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  111. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  112. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  113. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  114. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  115. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  116. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  117. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  118. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/energy_monitor.py +0 -0
  119. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/enums.py +0 -0
  120. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/__init__.py +0 -0
  121. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/data_type.py +0 -0
  122. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/export_config.py +0 -0
  123. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/model_type.py +0 -0
  124. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/__init__.py +0 -0
  125. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  126. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  127. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  128. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  129. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  130. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  131. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  132. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  133. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  134. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  135. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  136. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  137. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/__init__.py +0 -0
  138. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/kitchen.py +0 -0
  139. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine.py +0 -0
  140. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  141. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp4_utils.py +0 -0
  142. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fp8_utils.py +0 -0
  143. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/full_cuda_graph.py +0 -0
  144. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/__init__.py +0 -0
  145. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  146. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  147. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  148. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  149. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  150. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_indices_converter.py +0 -0
  151. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_layer_norm.py +0 -0
  152. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  153. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  154. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_softmax.py +0 -0
  155. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  156. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/hyper_comm_grid.py +0 -0
  157. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/__init__.py +0 -0
  158. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/common_inference_params.py +0 -0
  159. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/communication_utils.py +0 -0
  160. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/__init__.py +0 -0
  161. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
  162. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/attention_context/mha_metadata.py +0 -0
  163. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/base_context.py +0 -0
  164. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  165. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/contexts/static_context.py +0 -0
  166. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/__init__.py +0 -0
  167. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/abstract_engine.py +0 -0
  168. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/engines/mcore_engine.py +0 -0
  169. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/headers.py +0 -0
  170. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/inference_request.py +0 -0
  171. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  172. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  173. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  174. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  175. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  176. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  177. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  178. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  179. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/sampling_params.py +0 -0
  180. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/scheduler.py +0 -0
  181. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  182. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  183. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  184. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  185. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  186. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  187. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  188. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  189. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  190. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  191. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference/unified_memory.py +0 -0
  192. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/inference_params.py +0 -0
  193. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/jit.py +0 -0
  194. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/model_parallel_config.py +0 -0
  195. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/__init__.py +0 -0
  196. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_model.py +0 -0
  197. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/T5/t5_spec.py +0 -0
  198. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/__init__.py +0 -0
  199. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/backends.py +0 -0
  200. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/__init__.py +0 -0
  201. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  202. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_lm_head.py +0 -0
  203. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/bert_model.py +0 -0
  204. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/bert/pooler.py +0 -0
  205. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/__init__.py +0 -0
  206. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/__init__.py +0 -0
  207. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  208. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  209. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  210. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  211. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  212. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/__init__.py +0 -0
  213. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/language_module/language_module.py +0 -0
  214. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  215. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/__init__.py +0 -0
  216. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  217. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/__init__.py +0 -0
  218. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  219. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  220. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  221. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  222. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/__init__.py +0 -0
  223. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/clip_model.py +0 -0
  224. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/module.py +0 -0
  225. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/huggingface/qwen_model.py +0 -0
  226. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/__init__.py +0 -0
  227. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  228. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/__init__.py +0 -0
  229. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/__init__.py +0 -0
  230. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/config/base_configs.py +0 -0
  231. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/__init__.py +0 -0
  232. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/model/base.py +0 -0
  233. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/audio.py +0 -0
  234. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/base.py +0 -0
  235. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/mimo/submodules/vision.py +0 -0
  236. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/__init__.py +0 -0
  237. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/context_parallel.py +0 -0
  238. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_model.py +0 -0
  239. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/multimodal/llava_spec.py +0 -0
  240. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/__init__.py +0 -0
  241. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/base_attention.py +0 -0
  242. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/config.py +0 -0
  243. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_attention.py +0 -0
  244. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/decoder_spec.py +0 -0
  245. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_attention.py +0 -0
  246. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/encoder_spec.py +0 -0
  247. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/model.py +0 -0
  248. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/retro/utils.py +0 -0
  249. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/__init__.py +0 -0
  250. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/clip_vit_model.py +0 -0
  251. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/multimodal_projector.py +0 -0
  252. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/radio.py +0 -0
  253. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  254. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/msc_utils.py +0 -0
  255. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/nccl_allocator.py +0 -0
  256. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/num_microbatches_calculator.py +0 -0
  257. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/__init__.py +0 -0
  258. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/clip_grads.py +0 -0
  259. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  260. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  261. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  262. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/grad_scaler.py +0 -0
  263. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer.py +0 -0
  264. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer/optimizer_config.py +0 -0
  265. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/optimizer_param_scheduler.py +0 -0
  266. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/packed_seq_params.py +0 -0
  267. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/parallel_state.py +0 -0
  268. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/__init__.py +0 -0
  269. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  270. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  271. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  272. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/schedules.py +0 -0
  273. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/pipeline_parallel/utils.py +0 -0
  274. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/__init__.py +0 -0
  275. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/__init__.py +0 -0
  276. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  277. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  278. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  279. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/layers.py +0 -0
  280. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  281. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  282. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/process_groups_config.py +0 -0
  283. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/__init__.py +0 -0
  284. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/quant_config.py +0 -0
  285. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/quantization/utils.py +0 -0
  286. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/requirements.txt +0 -0
  287. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/rerun_state_machine.py +0 -0
  288. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/safe_globals.py +0 -0
  289. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/__init__.py +0 -0
  290. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  291. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/mlp_layer.py +0 -0
  292. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/ssm/triton_cache_manager.py +0 -0
  293. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/__init__.py +0 -0
  294. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  295. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/data.py +0 -0
  296. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/layers.py +0 -0
  297. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/mappings.py +0 -0
  298. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/random.py +0 -0
  299. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tensor_parallel/utils.py +0 -0
  300. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/timers.py +0 -0
  301. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/__init__.py +0 -0
  302. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  303. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  304. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/__init__.py +0 -0
  305. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  306. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  307. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  308. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  309. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  313. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  314. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  315. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  318. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  322. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  323. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/__init__.py +0 -0
  324. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/attention.py +0 -0
  325. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/cuda_graphs.py +0 -0
  326. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  327. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  328. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/dot_product_attention.py +0 -0
  329. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/enums.py +0 -0
  330. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  331. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  332. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  333. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/identity_op.py +0 -0
  334. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/mlp.py +0 -0
  335. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/module.py +0 -0
  336. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/__init__.py +0 -0
  337. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/experts.py +0 -0
  338. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  339. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  340. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_layer.py +0 -0
  341. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/moe_utils.py +0 -0
  342. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/router.py +0 -0
  343. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/shared_experts.py +0 -0
  344. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  345. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  346. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_latent_attention.py +0 -0
  347. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/multi_token_prediction.py +0 -0
  348. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  349. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/spec_utils.py +0 -0
  350. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_layer_norm.py +0 -0
  351. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/torch_norm.py +0 -0
  352. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_block.py +0 -0
  353. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_config.py +0 -0
  354. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/transformer_layer.py +0 -0
  355. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron/core/transformer/utils.py +0 -0
  356. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/dependency_links.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/requires.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/megatron_core.egg-info/top_level.txt +0 -0
  359. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/pyproject.toml +0 -0
  360. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.cfg +0 -0
  361. {megatron_core-0.16.0rc0.dev127802 → megatron_core-0.16.0rc0.dev128858}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.16.0rc0.dev127802
+ Version: 0.16.0rc0.dev128858
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -9,7 +9,6 @@ import asyncio
  from typing import Any, AsyncGenerator, Callable, Optional, Type, Union

  from megatron.core.inference.inference_request import InferenceRequest
- from megatron.core.utils import get_asyncio_loop

  STOP_ITERATION = Exception()

@@ -21,17 +20,12 @@ class AsyncStream:
      Adopted from https://github.com/vllm-project/vllm/blob/eb881ed006ca458b052905e33f0d16dbb428063a/vllm/v1/engine/async_stream.py # pylint: disable=line-too-long
      """

-     def __init__(
-         self,
-         request_id: int,
-         cancel: Callable[[str], None],
-         loop: Optional[asyncio.AbstractEventLoop] = None,
-     ) -> None:
+     def __init__(self, request_id: int, cancel: Callable[[str], None]) -> None:
          self._request_id = request_id
          self._cancel = cancel
          self._queue: asyncio.Queue = asyncio.Queue()
          self._finished = False
-         self._loop = get_asyncio_loop(loop)
+         self._loop = asyncio.get_running_loop()

      def put(self, item: Union[InferenceRequest, Exception]) -> None:
          """Adds a new value to the stream"""
@@ -23,14 +23,9 @@ from megatron.core.inference.unified_memory import (
  from megatron.core.inference.utils import tensor_swap
  from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
  from megatron.core.package_info import __version__ as mcore_version
- from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-     Symbols,
-     get_layer_maps_from_layer_type_list,
- )
  from megatron.core.transformer import TransformerConfig
  from megatron.core.utils import divide as core_divide

- from .attention_context.mamba_metadata import MambaMetadata
  from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
  from .base_context import BaseInferenceContext
  from .dynamic_block_allocator import BlockAllocator
@@ -232,17 +227,8 @@ class DynamicInferenceContext(BaseInferenceContext):
              where the cuda graph batch sizes range from 1 to `max_requests` (as
              computed below). Due to rounding, the actual number of cuda graphs may
              not equal this argument.
-         materialize_only_last_token_logits (Optional[bool]): Whether to only
-             materialize logits for the last token. This should be set to False
-             if returning log probs.
-         layer_type_list (Optional[List[str]]): A list of strings that indicates
-             the layer type (Mamba / Attention / MLP) for each layer.
-             See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list
-             of symbols. This must be provided for hybrid models.
-         mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request.
-             This must be provided for hybrid models.
-         mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request.
-             This must be provided for hybrid models.
+         materialize_only_last_token_logits (bool): If True, only the last token logits
+             are materialized in the context.
          use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode
              engine steps.
          unified_memory_level (Optional[int]): Set unified memory usage within the
@@ -273,10 +259,7 @@ class DynamicInferenceContext(BaseInferenceContext):
          kv_lora_rank: Optional[int] = None,
          qk_pos_emb_head_dim: Optional[int] = None,
          num_cuda_graphs: Optional[int] = None,
-         materialize_only_last_token_logits: Optional[bool] = True,
-         layer_type_list: Optional[List[str]] = None,
-         mamba_conv_states_shape: Optional[Tuple[int]] = None,
-         mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+         materialize_only_last_token_logits: bool = True,
          use_cuda_graphs_for_non_decode_steps: bool = True,
          use_flashinfer_fused_rope: bool = False,
          unified_memory_level: Optional[int] = 0,
@@ -300,41 +283,6 @@ class DynamicInferenceContext(BaseInferenceContext):
          tp_size = tensor_model_parallel_size
          hidden_size_per_attention_head = core_divide(projection_size, num_attention_heads)
          num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)
-
-         # Mamba states.
-         self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list
-         if self.is_hybrid_model:
-             assert (
-                 mamba_conv_states_shape is not None
-             ), "`mamba_conv_states_shape` must be specified for hybrid models"
-             assert (
-                 mamba_ssm_states_shape is not None
-             ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-             assert (
-                 not use_cuda_graphs_for_non_decode_steps
-             ), "Non-decode CUDA graphs not yet supported for hybrid models"
-
-             # For hybrid models, the layer map converts the global layer index to the
-             # corresponding attention layer index or Mamba layer index depending on the
-             # layer type.
-             attention_layer_map, mamba_layer_map, _ = get_layer_maps_from_layer_type_list(
-                 layer_type_list
-             )
-             self.num_attention_layers = len(attention_layer_map)
-             self.num_mamba_layers = len(mamba_layer_map)
-             self.layer_map = attention_layer_map | mamba_layer_map
-         else:
-             # The layer map is the identity function for pure Transformer models.
-             self.num_attention_layers = num_layers
-             self.num_mamba_layers = 0
-             (mamba_conv_states_shape, mamba_ssm_states_shape) = (None, None)
-             self.layer_map = {i: i for i in range(self.num_attention_layers)}
-
-         if self.num_attention_layers == 0:
-             raise NotImplementedError(
-                 f"Using `DynamicInferenceContext` with no attention is not supported."
-             )
-
          # Block size tokens, bytes.
          dtype_size_bytes = params_dtype.itemsize
          self.block_size_tokens = block_size_tokens
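For readers not familiar with the hybrid-model bookkeeping removed above: the deleted block built a `layer_map` that translates a model-wide layer index into an index within that layer type's own state buffers (attention KV cache vs. Mamba states), falling back to an identity map for pure Transformer models. A toy illustration of the idea, using made-up symbols rather than the package's actual `Symbols` constants or `get_layer_maps_from_layer_type_list` implementation:

    # Illustrative only: 'M' marks a Mamba layer, '*' an attention layer.
    layer_type_list = ['M', '*', 'M', 'M', '*']

    attention_layer_map = {}
    mamba_layer_map = {}
    for global_idx, layer_type in enumerate(layer_type_list):
        if layer_type == '*':
            attention_layer_map[global_idx] = len(attention_layer_map)
        elif layer_type == 'M':
            mamba_layer_map[global_idx] = len(mamba_layer_map)

    # Global layer index -> per-type index, as the removed code merged the maps.
    layer_map = attention_layer_map | mamba_layer_map
    print(layer_map)  # {1: 0, 4: 1, 0: 0, 2: 1, 3: 2}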
@@ -349,38 +297,24 @@
          self.block_size_bytes = (
              dtype_size_bytes
              * 2 # key, value
-             * self.num_attention_layers
+             * num_layers
              * self.block_size_tokens
              * num_attention_heads_per_partition
              * hidden_size_per_attention_head
          )
-         assert self.block_size_bytes > 0

          # Adjust buffer to be a multiple of block size.
          buffer_size_bytes = int(buffer_size_gb * 1024**3)
          buffer_size_bytes_rem = buffer_size_bytes % self.block_size_bytes
          buffer_size_bytes = buffer_size_bytes - buffer_size_bytes_rem

-         mamba_states_memory_per_request = 0
-         if self.is_hybrid_model:
-             mamba_states_memory_per_request += math.prod(mamba_conv_states_shape)
-             mamba_states_memory_per_request += math.prod(mamba_ssm_states_shape)
-             mamba_states_memory_per_request *= self.num_mamba_layers
-             mamba_states_memory_per_request *= dtype_size_bytes
-
-         # Compute max_requets, max_tokens from buffer size, overflow factor, and Mamba state size.
+         # Compute max_requets, max_tokens from buffer size and overflow factor.
          def bytes_to_max_requests_and_tokens(n_bytes):
-             bytes_per_token = self.block_size_bytes / self.block_size_tokens
-             cost_per_request_bytes = (
-                 mamba_states_memory_per_request + max_sequence_length * bytes_per_token
+             n_tokens = n_bytes / self.block_size_bytes * self.block_size_tokens
+             n_requests = n_tokens / max_sequence_length
+             return self.round_up_requests(int(n_requests), tp_size=tp_size), self.round_up_tokens(
+                 int(n_tokens), tp_size=tp_size
              )
-             # TODO(ksanthanam): Leave room for an extra request in the event of padding
-             # for non-decode CUDA graphs
-             n_requests = n_bytes / cost_per_request_bytes
-             n_tokens = n_requests * max_sequence_length
-             n_requests = self.round_up_requests(int(n_requests), tp_size=tp_size)
-             n_tokens = self.round_up_tokens(int(n_tokens), tp_size=tp_size)
-             return n_requests, n_tokens

          self.max_requests, self.max_tokens = bytes_to_max_requests_and_tokens(buffer_size_bytes)
          if buffer_overflow_factor is not None:
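For orientation, the replacement sizing logic above derives token and request capacity purely from the KV-cache block size, with no per-request Mamba state term. A rough worked example with made-up numbers (the dimensions below are illustrative assumptions, and the `round_up_requests` / `round_up_tokens` adjustments from the real code are omitted):

    # Hypothetical example values; not taken from the package.
    dtype_size_bytes = 2              # e.g. bf16
    num_layers = 32
    block_size_tokens = 256
    heads_per_partition = 8
    head_dim = 128
    max_sequence_length = 4096
    buffer_size_gb = 20.0

    block_size_bytes = (
        dtype_size_bytes
        * 2  # key and value
        * num_layers
        * block_size_tokens
        * heads_per_partition
        * head_dim
    )  # 33,554,432 bytes (32 MiB) per block with these numbers

    buffer_size_bytes = int(buffer_size_gb * 1024**3)
    buffer_size_bytes -= buffer_size_bytes % block_size_bytes  # multiple of block size

    # New formula: tokens first, then requests.
    n_tokens = buffer_size_bytes / block_size_bytes * block_size_tokens  # 163,840.0
    n_requests = n_tokens / max_sequence_length                         # 40.0
    print(int(n_tokens), int(n_requests))

With these assumptions the 20 GiB buffer holds 640 blocks, i.e. roughly 163,840 cacheable tokens and 40 full-length concurrent requests before rounding.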
@@ -405,6 +339,7 @@ class DynamicInferenceContext(BaseInferenceContext):

  # Initialize context state.
  self.params_dtype = params_dtype
+ self.num_layers = num_layers
  self.max_sequence_length = max_sequence_length

  # Unified memory.
@@ -455,11 +390,8 @@ class DynamicInferenceContext(BaseInferenceContext):
  self.token_to_position_in_request = torch.empty_like(self.token_to_input_ids)
  self.token_to_local_position_within_kv_block = torch.empty_like(self.token_to_input_ids)

- # Calculate the total number of chunks available in the buffer
- total_mamba_states_memory = mamba_states_memory_per_request * self.max_requests
- block_count_total = (
- max(0, buffer_size_bytes - total_mamba_states_memory) // self.block_size_bytes
- )
+ # Calculate the total number of blocks available in the buffer
+ block_count_total = buffer_size_bytes // self.block_size_bytes

  # Memory buffer.
  ctx_manager = (
@@ -470,12 +402,7 @@ class DynamicInferenceContext(BaseInferenceContext):
  with ctx_manager:
  if cache_mla_latent:
  self.memory_buffer = torch.full(
- (
- self.num_attention_layers,
- block_count_total,
- self.block_size_tokens,
- kv_reduced_dim,
- ),
+ (self.num_layers, block_count_total, self.block_size_tokens, kv_reduced_dim),
  -1,
  dtype=self.params_dtype,
  device=torch.cuda.current_device(),
@@ -484,7 +411,7 @@ class DynamicInferenceContext(BaseInferenceContext):
  self.memory_buffer = torch.full(
  (
  2, # key and value
- self.num_attention_layers,
+ self.num_layers,
  block_count_total,
  self.block_size_tokens,
  num_attention_heads_per_partition,
@@ -589,34 +516,14 @@ class DynamicInferenceContext(BaseInferenceContext):
  block_count_total=block_count_total, gtd_block_count=self.gtd_block_count
  )

- # Optional state tensors for hybrid models
- if self.is_hybrid_model:
- self.mamba_metadata = MambaMetadata(max_requests=self.max_requests)
-
- with ctx_manager:
- self.mamba_conv_states = torch.zeros(
- (self.num_mamba_layers, self.max_requests) + mamba_conv_states_shape,
- dtype=self.params_dtype,
- device=torch.cuda.current_device(),
- )
- self.mamba_ssm_states = torch.zeros(
- (self.num_mamba_layers, self.max_requests) + mamba_ssm_states_shape,
- dtype=self.params_dtype,
- device=torch.cuda.current_device(),
- )
-
- else:
- self.mamba_metadata = None
-
  # Store the dummy block idx reference for convenience
  self.dummy_block_idx = self.block_allocator.dummy_block_idx

  # Deal with chunked prefill
  self.chunked_prefill_request_id = -1

- # Reset attention and Mamba state.
+ # Reset attention state.
  self.reset_attention_state()
- self.reset_mamba_state()

  if use_flashinfer_fused_rope is True:
  assert HAVE_FLASHINFER, "flashinfer is not installed"
@@ -721,8 +628,7 @@ class DynamicInferenceContext(BaseInferenceContext):
  """Test if all active requests are in decode phase.

  For a request in prefill phase active_tokens = query length
- Once the request moves to decode phase active tokens is 1 for that request.
- So if all active requests are in decode phase, they will be equal to active token count.
+ Once the request moves to decode phase active tokens is 1 for that request. So if all active requests are in decode phase, they will be equal to active token count.
  """
  total_active_requests = self.total_request_count - self.paused_request_count
  return total_active_requests == self.active_token_count
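The docstring above can be illustrated with a toy example; the numbers below are invented and only show why the two counts coincide exactly when every active request is in the decode phase:

    # 5 requests total, 1 paused -> 4 active.
    total_request_count, paused_request_count = 5, 1

    active_token_count = 4        # all 4 active requests decoding: 1 token each
    print((total_request_count - paused_request_count) == active_token_count)  # True

    active_token_count = 3 + 38   # 3 decoding + 1 prefilling a 38-token prompt chunk
    print((total_request_count - paused_request_count) == active_token_count)  # False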
@@ -758,7 +664,11 @@ class DynamicInferenceContext(BaseInferenceContext):

  def get_active_request_count(self):
  """Returns the current number of active requests."""
- return self.total_request_count - self.paused_request_count
+ active_sequence_lengths = self.get_active_sequence_lengths()
+ max_sequence_lengths = self.get_max_sequence_lengths()
+ active_requests_mask = torch.less(active_sequence_lengths, max_sequence_lengths).byte()
+ active_request_count = (active_requests_mask == 1).sum().item()
+ return active_request_count

  def append_key_value_cache(self, layer_number: int, key: Tensor, value: Tensor) -> None:
  """Append to KV cache.
@@ -768,12 +678,10 @@ class DynamicInferenceContext(BaseInferenceContext):
  key (Tensor): Key tensor.
  value (Tensor): Value tensor.
  """
- attention_layer_number = self.layer_map[layer_number - 1]
-
  if triton_append_key_value_cache is not None and not self.cache_mla_latent:
  # currently does not support MLA latent cache
  return triton_append_key_value_cache(
- layer_number=attention_layer_number,
+ layer_number=layer_number,
  key=key,
  value=value,
  memory_buffer=self.memory_buffer,
@@ -798,14 +706,14 @@ class DynamicInferenceContext(BaseInferenceContext):
  if self.cache_mla_latent:
  # We pass the kv_concat as the key in cache_mla_latent
  kv_concat = key
- self.memory_buffer[attention_layer_number, block_idx, local_kv_seq_idx] = kv_concat[
+ self.memory_buffer[layer_number - 1, block_idx, local_kv_seq_idx] = kv_concat[
  : self.padded_active_token_count
  ]
  else:
- self.memory_buffer[0, attention_layer_number, block_idx, local_kv_seq_idx] = key[
+ self.memory_buffer[0, layer_number - 1, block_idx, local_kv_seq_idx] = key[
  : self.padded_active_token_count
  ]
- self.memory_buffer[1, attention_layer_number, block_idx, local_kv_seq_idx] = value[
+ self.memory_buffer[1, layer_number - 1, block_idx, local_kv_seq_idx] = value[
  : self.padded_active_token_count
  ]

@@ -819,30 +727,19 @@ class DynamicInferenceContext(BaseInferenceContext):
  (Tuple[Tensor, Tensor]) The key and value pointer tensors that point
  to blocks within the block-level memory buffer.
  """
- attention_layer_number = self.layer_map[layer_number - 1]
  if self.cache_mla_latent:
  return (
- self.memory_buffer[attention_layer_number],
+ self.memory_buffer[layer_number - 1],
  None,
  self.active_attn_metadata["mha_metadata"].state_data["block_table"],
  )
  else:
  return (
- self.memory_buffer[0, attention_layer_number],
- self.memory_buffer[1, attention_layer_number],
+ self.memory_buffer[0, layer_number - 1],
+ self.memory_buffer[1, layer_number - 1],
  self.active_attn_metadata["mha_metadata"].state_data["block_table"],
  )

- def mamba_states_cache(self, layer_number: int) -> Tuple[Tensor, Tensor]:
- """Returns the Mamba state tensors for the given layer."""
- assert self.is_hybrid_model, "Only hybrid models have Mamba state tensors"
-
- mamba_layer_number = self.layer_map[layer_number - 1]
- conv_state = self.mamba_conv_states[mamba_layer_number]
- ssm_state = self.mamba_ssm_states[mamba_layer_number]
-
- return (conv_state, ssm_state)
-
  def apply_fused_qk_rotary_emb(
  self, query: Tensor, key: Tensor, cos_sin_emb: Tensor, config: TransformerConfig
  ) -> Tuple[Tensor, Tensor]:
@@ -957,16 +854,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  attn_metadata.reset()
  self.active_attn_metadata = None

- if self.is_hybrid_model:
- self.mamba_metadata.reset_cudagraph_mapping()
-
- def reset_mamba_state(self) -> None:
- """Reset state used within Mamba layers."""
- if self.is_hybrid_model:
- self.mamba_conv_states.fill_(0)
- self.mamba_ssm_states.fill_(0)
- self.mamba_metadata.reset()
-
  def using_cuda_graph_this_step(self) -> bool:
  """Returns True if cuda graphs are being used for this step."""
  has_cuda_graphs = self.cuda_graph_token_counts is not None
@@ -1090,17 +977,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  )
  # All attention metadata calculations are now handled by MHAMetadata.update()

- # Create Mamba state block table if it's a hybrid model
- if self.is_hybrid_model:
- active_mamba_indices = self.mamba_metadata.request_to_mamba_state_idx[
- self.paused_request_count : self.total_request_count
- ]
-
- if self.is_decode_only() or self.using_cuda_graph_this_step():
- self.mamba_metadata.update_cudagraph_mapping(
- active_mamba_indices, self.total_request_count - self.paused_request_count
- )
-
  def reset(self) -> None:
  """Reset entire context.

@@ -1142,13 +1018,15 @@ class DynamicInferenceContext(BaseInferenceContext):

  # Reset available block count.
  self.reset_attention_state()
- self.reset_mamba_state()
  self.block_allocator.reset()
  self.request_to_kv_block_ids.fill_(-1)

  # Reset chunked prefill state
  self.chunked_prefill_request_id = -1

+ # Reset chunked prefill state
+ self.chunked_prefill_request_id = -1
+
  def current_input_and_position_ids(
  self, *, num_warmup_tokens: Optional[int] = None
  ) -> Tuple[Tensor, Tensor]:
@@ -1320,18 +1198,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  self.token_to_local_position_within_kv_block[
  self.active_token_count : self.active_token_count + chunk_length
  ] = (token_offset_range % self.block_size_tokens)
-
- if self.is_hybrid_model and not is_chunked_prefill:
- # Allocate a slot for Mamba states
- mamba_idx = self.mamba_metadata.allocate_slot()
- if mamba_idx is None:
- raise ContextOverflowError(req.request_id, "No Mamba slots available")
-
- # Initialize the allocated Mamba state
- self.mamba_conv_states[:, mamba_idx] = 0.0
- self.mamba_ssm_states[:, mamba_idx] = 0.0
- self.mamba_metadata.request_to_mamba_state_idx[self.total_request_count] = mamba_idx
-
  self.active_token_count += chunk_length
  self.total_request_count += 0 if req.finished_chunk_token_count > 0 else 1

@@ -1350,11 +1216,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  self.request_last_kv_block_id[dst_idxs] = self.request_last_kv_block_id[src_idxs]
  self.request_last_kv_block_offset[dst_idxs] = self.request_last_kv_block_offset[src_idxs]

- if self.is_hybrid_model:
- self.mamba_metadata.request_to_mamba_state_idx[dst_idxs] = (
- self.mamba_metadata.request_to_mamba_state_idx[src_idxs]
- )
-
  def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
  """
  Swaps all the relevent booking tensors with src idxs to dst idxs
@@ -1369,9 +1230,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  tensor_swap(self.request_last_kv_block_id, src_idxs, dst_idxs)
  tensor_swap(self.request_last_kv_block_offset, src_idxs, dst_idxs)

- if self.is_hybrid_model:
- tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)
-
  # TODO: see if we can compile this function
  def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
  """Update context state after calling engine.step().
@@ -1443,17 +1301,10 @@ class DynamicInferenceContext(BaseInferenceContext):
  non_zero_values_in_kv_memory = kv_blocks_assigned[kv_blocks_assigned != -1]
  self.block_allocator.release_memory_blocks(non_zero_values_in_kv_memory)

- if self.is_hybrid_model:
- self.mamba_metadata.free_slots(finished_idxs)
-
  # Reset request/token counts.
  self.request_to_kv_block_ids.fill_(-1)
  self.total_request_count = 0
  self.active_token_count = 0
-
- # Reset Mamba state.
- self.reset_mamba_state()
-
  return

  # 3. Concatenate the paused tokens to the active tokens if present.
@@ -1481,10 +1332,6 @@ class DynamicInferenceContext(BaseInferenceContext):
  # and updates it instead of the original tensor.
  self.request_to_kv_block_ids[finished_idxs] = -1

- if self.is_hybrid_model:
- # Get the Mamba state indices for finished requests and free them
- self.mamba_metadata.free_slots(finished_idxs)
-
  if active_request_count > 0:
  finished_idxs_on_left = (
  torch.nonzero(active_requests_mask[:active_request_count] == 0, as_tuple=True)[
@@ -1504,10 +1351,8 @@ class DynamicInferenceContext(BaseInferenceContext):
  next_tokens=next_tokens,
  )

- # Reset chunk ids for recently moved requests.
+ # Reset block ids for recently moved requests.
  self.request_to_kv_block_ids[active_idxs_on_right] = -1
- if self.is_hybrid_model:
- self.mamba_metadata.request_to_mamba_state_idx[active_idxs_on_right] = -1

  # 5. We identify requests that require a new block and add them to the paused requests (i.e move them left) :-
  # a) Put requests that have filled their current block and require a new one in a pause state temporarily
@@ -1605,7 +1450,6 @@ class DynamicInferenceContext(BaseInferenceContext):

  # 7. We make changes to the request book keeping tesnsors and setup the tokens for next iteration
  self.total_request_count = active_request_count + self.paused_request_count
-
  # All these active requests are in decode phase, so they need only 1 token per request
  self.active_token_count = active_request_count
  # Always the first section of token input ids are only used.
@@ -119,8 +119,8 @@ def triton_append_key_value_cache(

  _, num_heads, h_dim = key.shape

- key_cache = memory_buffer[0, layer_number]
- value_cache = memory_buffer[1, layer_number]
+ key_cache = memory_buffer[0, layer_number - 1]
+ value_cache = memory_buffer[1, layer_number - 1]

  key_to_cache = key[:n_tokens]
  value_to_cache = value[:n_tokens]
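The preceding hunks drop the hybrid-model layer map and index the KV buffer directly with the 1-based layer_number. A minimal sketch of that convention, with invented shapes:

    import torch

    # Toy buffer: (K/V, layers, blocks, tokens per block, heads, head dim).
    num_layers, blocks, block_tokens, heads, head_dim = 4, 8, 16, 2, 32
    memory_buffer = torch.zeros(2, num_layers, blocks, block_tokens, heads, head_dim)

    layer_number = 3                                # 1-based, as passed in by the layer
    key_cache = memory_buffer[0, layer_number - 1]  # 0-based slot on the layer axis
    value_cache = memory_buffer[1, layer_number - 1]
    print(key_cache.shape)                          # torch.Size([8, 16, 2, 32])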
@@ -1,8 +1,6 @@
  # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

- import faulthandler
  import logging
- import signal
  from collections import deque
  from itertools import cycle
  from multiprocessing import Event
@@ -25,11 +23,6 @@ try:
  except:
  HAVE_MSGPACK = False

- # Register faulthandler to emit stack traces upon process kill.
- faulthandler.enable()
- faulthandler.register(signal.SIGTERM, all_threads=False, chain=True)
- faulthandler.register(signal.SIGINT, all_threads=False, chain=True)
-

  class DataParallelInferenceCoordinator:
  """
@@ -33,8 +33,8 @@ from megatron.core.inference.sampling_params import SamplingParams
  from megatron.core.inference.text_generation_controllers.text_generation_controller import (
  TextGenerationController,
  )
- from megatron.core.inference.utils import Counter, await_process_event
- from megatron.core.utils import get_asyncio_loop, trace_async_exceptions
+ from megatron.core.inference.utils import Counter
+ from megatron.core.utils import get_asyncio_loop

  try:
  from tqdm import tqdm
@@ -293,11 +293,7 @@ class DynamicInferenceEngine(AbstractEngine):
  self.capture_stats = capture_stats

  async def start_listening_to_data_parallel_coordinator(
- self,
- inference_coordinator_port: int,
- launch_inference_coordinator: bool = True,
- *,
- loop: Optional[asyncio.AbstractEventLoop] = None,
+ self, inference_coordinator_port: int, launch_inference_coordinator: bool = True
  ):
  """Initializes ZMQ communication to connect the engine with an inference coordinator.

@@ -411,14 +407,12 @@ class DynamicInferenceEngine(AbstractEngine):
  torch.distributed.barrier(parallel_state.get_tensor_model_parallel_group())

  if launch_inference_coordinator and torch.distributed.get_rank() == 0:
- await await_process_event(coordinator_ready_event, self.inference_coordinator_process)
+ coordinator_ready_event.wait()
  logging.info("Inference co-ordinator is ready to receive requests!")

  # Finally run the engine infinite loop
- loop = get_asyncio_loop(loop)
- self.engine_loop_task = loop.create_task(self.run_engine_with_coordinator(loop=loop))
+ self.engine_loop_task = asyncio.create_task(self.run_engine_with_coordinator())

- @trace_async_exceptions
  async def _notify_cond_for_new_request(self):
  """Helper function to notify condition variable when a new request is added."""
  async with self._cond:
@@ -472,7 +466,7 @@ class DynamicInferenceEngine(AbstractEngine):
  self.waiting_request_ids.append(request_id)

  # Create a new asyncio Future to notify the user when the request has completed.
- self.request_completion_futures[request_id] = self._loop.create_future()
+ self.request_completion_futures[request_id] = asyncio.Future()
  return self.request_completion_futures[request_id]

  def add_request(
@@ -647,7 +641,7 @@ class DynamicInferenceEngine(AbstractEngine):
  if request_can_be_added and request_tokens_can_be_added and kv_cache_available:
  self.context.add_request(req)
  self._loop.call_soon_threadsafe(
- self._loop.create_task, self._notify_cond_for_new_request()
+ asyncio.create_task, self._notify_cond_for_new_request()
  )
  req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
  req.add_event_add()
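The call_soon_threadsafe(asyncio.create_task, ...) pattern used here hands a coroutine to a loop that may be running on another thread. A standalone sketch of that idiom, with invented names:

    import asyncio
    import threading
    import time

    async def notify():
        print("notified on the loop thread")

    async def main():
        loop = asyncio.get_running_loop()

        def worker():
            time.sleep(0.1)
            # create_task must run on the loop's own thread, so schedule it thread-safely.
            loop.call_soon_threadsafe(asyncio.create_task, notify())

        threading.Thread(target=worker).start()
        await asyncio.sleep(0.5)

    asyncio.run(main())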
@@ -708,7 +702,7 @@ class DynamicInferenceEngine(AbstractEngine):

  # is_continuing_chunked_prefill is True if we are scheduling next
  # chunk of a existing chunked prefill request
- is_continuing_chunked_prefill = self.context.chunked_prefill_request_id >= 0
+ is_continuing_chunked_prefill = self.context.chunked_prefill_request_id > 0

  # Use remaining prompt tokens for scheduling decisions
  remaining_len = len(req.remaining_prompt_tokens)
@@ -726,7 +720,7 @@ class DynamicInferenceEngine(AbstractEngine):
  self.context.chunked_prefill_request_id = -1
  self.context.add_request(req)
  self._loop.call_soon_threadsafe(
- self._loop.create_task, self._notify_cond_for_new_request()
+ asyncio.create_task, self._notify_cond_for_new_request()
  )
  req.remaining_prompt_tokens = req.remaining_prompt_tokens.new_empty(0)
  req.add_event_add()
@@ -738,7 +732,7 @@ class DynamicInferenceEngine(AbstractEngine):
  chunk_length = self.context.max_tokens - self.context.active_token_count
  self.context.add_request(req, chunk_length=chunk_length)
  self._loop.call_soon_threadsafe(
- self._loop.create_task, self._notify_cond_for_new_request()
+ asyncio.create_task, self._notify_cond_for_new_request()
  )
  self.context.chunked_prefill_request_id = req.request_id
  req.remaining_prompt_tokens = req.remaining_prompt_tokens[chunk_length:]
@@ -945,7 +939,7 @@ class DynamicInferenceEngine(AbstractEngine):
  result = self.step_modern()
  finished_requests_list.extend(result["finished_requests"])

- # Ensure requests are returned in the same order they were passed in
+ # Ensure requests are returned in the same order they were passed in.
  finished_requests_list.sort(key=lambda x: x.request_id)

  return finished_requests_list
@@ -1045,12 +1039,8 @@ class DynamicInferenceEngine(AbstractEngine):
  self.zmq_context.term()
  parallel_state.destroy_model_parallel()

- @trace_async_exceptions
- async def run_engine(
- self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
- ):
+ async def run_engine(self, *, verbose: Optional[bool] = False):
  """Continually steps the engine asynchronously."""
- self._loop = get_asyncio_loop(loop)
  try:
  while True:
  # Wait until there are active requests before proceeding.
@@ -1064,12 +1054,8 @@ class DynamicInferenceEngine(AbstractEngine):
  except asyncio.CancelledError:
  pass

- @trace_async_exceptions
- async def run_engine_with_coordinator(
- self, *, loop: Optional[asyncio.AbstractEventLoop] = None, verbose: Optional[bool] = False
- ):
+ async def run_engine_with_coordinator(self, *, verbose: Optional[bool] = False):
  """Continually steps the engine asynchronously."""
- self._loop = get_asyncio_loop(loop)
  try:
  while True:
  self.schedule_requests()
@@ -17,7 +17,6 @@ from megatron.core.inference.scheduler import Scheduler
  from megatron.core.inference.text_generation_controllers.text_generation_controller import (
  TextGenerationController,
  )
- from megatron.core.utils import get_asyncio_loop

  try:
  from tqdm import tqdm
@@ -218,6 +217,11 @@ class StaticInferenceEngine(AbstractEngine):
  generated tokens, texts and log probs if required
  """
  assert hasattr(self, 'dynamic_engine'), "Dynamic engine not initialized"
+ try:
+ loop = asyncio.get_running_loop()
+ except RuntimeError: # 'RuntimeError: There is no current event loop...'
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)

  if common_inference_params:
  sampling_params = common_inference_params
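The added lines use the standard get-or-create event loop idiom. A self-contained sketch (not package code) of the same pattern:

    import asyncio

    def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
        try:
            return asyncio.get_running_loop()
        except RuntimeError:  # no loop is running in this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop

    loop = get_or_create_event_loop()
    print(loop.run_until_complete(asyncio.sleep(0, result="ok")))  # ok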
@@ -381,8 +385,8 @@ class StaticInferenceEngine(AbstractEngine):
  torch.cuda.set_device(cuda_device)
  self.run_engine()

- async def run_engine_async(self, loop: Optional[asyncio.AbstractEventLoop] = None):
+ async def run_engine_async(self):
  """Runs the engine asynchronously using asyncio"""
- loop = get_asyncio_loop(loop)
+ loop = asyncio.get_running_loop()

  await loop.run_in_executor(None, self._wrapped_run_engine, torch.cuda.current_device())
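run_engine_async now fetches the running loop and offloads the blocking engine loop to the default thread-pool executor. An illustrative, standalone sketch of that idiom with an invented blocking function:

    import asyncio
    import time

    def blocking_engine_loop(device_id: int) -> str:
        time.sleep(0.2)  # stand-in for a long-running, blocking engine loop
        return f"engine finished on device {device_id}"

    async def main():
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, blocking_engine_loop, 0)
        print(result)

    asyncio.run(main())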