megatron-core 0.16.0rc0.dev100285__tar.gz → 0.16.0rc0.dev101328__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megatron-core might be problematic.

Files changed (361)
  1. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/PKG-INFO +1 -1
  2. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/attention_context/mha_metadata.py +2 -12
  3. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/dynamic_context.py +7 -15
  4. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/engines/dynamic_engine.py +1 -15
  5. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_controllers/text_generation_controller.py +1 -2
  6. megatron_core-0.16.0rc0.dev101328/megatron/core/inference/unified_memory.py +89 -0
  7. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/gpt_model.py +2 -1
  8. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/package_info.py +1 -1
  9. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron_core.egg-info/PKG-INFO +1 -1
  10. megatron_core-0.16.0rc0.dev100285/megatron/core/inference/unified_memory.py +0 -127
  11. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/MANIFEST.in +0 -0
  12. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/README.md +0 -0
  13. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/README.md +0 -0
  14. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/__init__.py +0 -0
  15. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/activations.py +0 -0
  16. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/config.py +0 -0
  17. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/config_logger.py +0 -0
  18. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/__init__.py +0 -0
  19. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/bert_dataset.py +0 -0
  20. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/blended_dataset.py +0 -0
  21. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/blended_megatron_dataset_builder.py +0 -0
  22. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/blended_megatron_dataset_config.py +0 -0
  23. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/gpt_dataset.py +0 -0
  24. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/helpers.cpp +0 -0
  25. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/helpers.py +0 -0
  26. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/indexed_dataset.py +0 -0
  27. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/masked_dataset.py +0 -0
  28. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/megatron_dataset.py +0 -0
  29. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/megatron_tokenizer.py +0 -0
  30. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/multimodal_dataset.py +0 -0
  31. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/object_storage_utils.py +0 -0
  32. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/__init__.py +0 -0
  33. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/config/__init__.py +0 -0
  34. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/config/bert_embedders.py +0 -0
  35. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/config/config.py +0 -0
  36. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/config/gpt_chunk_datasets.py +0 -0
  37. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/config/tokenizers.py +0 -0
  38. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/db/__init__.py +0 -0
  39. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/db/build.py +0 -0
  40. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/db/dataset.py +0 -0
  41. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/db/utils.py +0 -0
  42. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/external_libs.py +0 -0
  43. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/__init__.py +0 -0
  44. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/build.py +0 -0
  45. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/factory.py +0 -0
  46. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/index.py +0 -0
  47. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/indexes/__init__.py +0 -0
  48. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/indexes/faiss_base.py +0 -0
  49. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/indexes/faiss_par_add.py +0 -0
  50. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/utils.py +0 -0
  51. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/index/validate.py +0 -0
  52. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/__init__.py +0 -0
  53. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/gpt_chunk_dataset.py +0 -0
  54. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py +0 -0
  55. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/query.py +0 -0
  56. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/retro_dataset.py +0 -0
  57. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/query/utils.py +0 -0
  58. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/retro/utils.py +0 -0
  59. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/t5_dataset.py +0 -0
  60. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/utils.py +0 -0
  61. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/datasets/utils_s3.py +0 -0
  62. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/__init__.py +0 -0
  63. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/core.py +0 -0
  64. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/dict_utils.py +0 -0
  65. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/exchange_utils.py +0 -0
  66. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/mapping.py +0 -0
  67. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/optimizer.py +0 -0
  68. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/serialization.py +0 -0
  69. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/state_dict_utils.py +0 -0
  70. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/__init__.py +0 -0
  71. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/async_utils.py +0 -0
  72. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/base.py +0 -0
  73. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/cached_metadata_filesystem_reader.py +0 -0
  74. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/checkpointable.py +0 -0
  75. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/common.py +0 -0
  76. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/filesystem_async.py +0 -0
  77. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/fully_parallel.py +0 -0
  78. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/resharding.py +0 -0
  79. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/state_dict_saver.py +0 -0
  80. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/tensorstore.py +0 -0
  81. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/torch.py +0 -0
  82. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/two_stage.py +0 -0
  83. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/strategies/zarr.py +0 -0
  84. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/tensor_aware_state_dict.py +0 -0
  85. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/utils.py +0 -0
  86. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/dist_checkpointing/validation.py +0 -0
  87. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/__init__.py +0 -0
  88. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/data_parallel_base.py +0 -0
  89. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/distributed_data_parallel.py +0 -0
  90. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/distributed_data_parallel_config.py +0 -0
  91. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/finalize_model_grads.py +0 -0
  92. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/__init__.py +0 -0
  93. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/mcore_fsdp_adapter.py +0 -0
  94. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/__init__.py +0 -0
  95. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/__init__.py +0 -0
  96. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/distributed_data_parallel_config.py +0 -0
  97. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py +0 -0
  98. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py +0 -0
  99. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/package_info.py +0 -0
  100. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py +0 -0
  101. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py +0 -0
  102. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/fsdp/src/megatron_fsdp/utils.py +0 -0
  103. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/param_and_grad_buffer.py +0 -0
  104. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py +0 -0
  105. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/torch_fully_sharded_data_parallel.py +0 -0
  106. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/distributed/torch_fully_sharded_data_parallel_config.py +0 -0
  107. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/energy_monitor.py +0 -0
  108. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/enums.py +0 -0
  109. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/__init__.py +0 -0
  110. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/data_type.py +0 -0
  111. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/export_config.py +0 -0
  112. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/model_type.py +0 -0
  113. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/__init__.py +0 -0
  114. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/engine_builder/__init__.py +0 -0
  115. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py +0 -0
  116. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py +0 -0
  117. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py +0 -0
  118. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trt_model_config.py +0 -0
  119. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trt_model_type.py +0 -0
  120. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_helper.py +0 -0
  121. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_layers.py +0 -0
  122. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py +0 -0
  123. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py +0 -0
  124. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py +0 -0
  125. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/export/trtllm/trtllm_weights_converter/utils.py +0 -0
  126. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/extensions/__init__.py +0 -0
  127. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/extensions/kitchen.py +0 -0
  128. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/extensions/transformer_engine.py +0 -0
  129. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/extensions/transformer_engine_spec_provider.py +0 -0
  130. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fp4_utils.py +0 -0
  131. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fp8_utils.py +0 -0
  132. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/full_cuda_graph.py +0 -0
  133. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/__init__.py +0 -0
  134. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_bias_dropout.py +0 -0
  135. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_bias_geglu.py +0 -0
  136. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_bias_gelu.py +0 -0
  137. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_bias_swiglu.py +0 -0
  138. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_cross_entropy.py +0 -0
  139. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_indices_converter.py +0 -0
  140. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_layer_norm.py +0 -0
  141. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_mla_yarn_rope_apply.py +0 -0
  142. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_pad_routing_map.py +0 -0
  143. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_softmax.py +0 -0
  144. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/fusions/fused_weighted_squared_relu.py +0 -0
  145. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/hyper_comm_grid.py +0 -0
  146. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/__init__.py +0 -0
  147. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/async_stream.py +0 -0
  148. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/common_inference_params.py +0 -0
  149. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/communication_utils.py +0 -0
  150. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/__init__.py +0 -0
  151. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/attention_context/metadata_base.py +0 -0
  152. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/base_context.py +0 -0
  153. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/dynamic_block_allocator.py +0 -0
  154. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/fused_kv_append_kernel.py +0 -0
  155. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/contexts/static_context.py +0 -0
  156. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/data_parallel_inference_coordinator.py +0 -0
  157. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/engines/__init__.py +0 -0
  158. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/engines/abstract_engine.py +0 -0
  159. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/engines/mcore_engine.py +0 -0
  160. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/engines/static_engine.py +0 -0
  161. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/headers.py +0 -0
  162. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/inference_client.py +0 -0
  163. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/inference_request.py +0 -0
  164. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/__init__.py +0 -0
  165. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py +0 -0
  166. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/gpt/__init__.py +0 -0
  167. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py +0 -0
  168. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py +0 -0
  169. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/multimodal/vlm_inference_wrapper.py +0 -0
  170. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/t5/__init__.py +0 -0
  171. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py +0 -0
  172. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/sampling_params.py +0 -0
  173. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/scheduler.py +0 -0
  174. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_controllers/__init__.py +0 -0
  175. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +0 -0
  176. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +0 -0
  177. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_controllers/vlm_text_generation_controller.py +0 -0
  178. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/__init__.py +0 -0
  179. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/endpoints/common.py +0 -0
  180. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/endpoints/completions.py +0 -0
  181. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/run_mcore_engine.py +0 -0
  182. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/text_generation_server.py +0 -0
  183. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/text_generation_server/tokenization.py +0 -0
  184. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference/utils.py +0 -0
  185. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/inference_params.py +0 -0
  186. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/jit.py +0 -0
  187. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/model_parallel_config.py +0 -0
  188. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/T5/__init__.py +0 -0
  189. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/T5/t5_model.py +0 -0
  190. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/T5/t5_spec.py +0 -0
  191. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/__init__.py +0 -0
  192. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/backends.py +0 -0
  193. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/bert/__init__.py +0 -0
  194. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/bert/bert_layer_specs.py +0 -0
  195. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/bert/bert_lm_head.py +0 -0
  196. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/bert/bert_model.py +0 -0
  197. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/bert/pooler.py +0 -0
  198. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/__init__.py +0 -0
  199. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/__init__.py +0 -0
  200. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/language_model_embedding.py +0 -0
  201. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/relative_pos_embedding.py +0 -0
  202. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/rope_utils.py +0 -0
  203. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/rotary_pos_embedding.py +0 -0
  204. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py +0 -0
  205. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/language_module/__init__.py +0 -0
  206. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/language_module/language_module.py +0 -0
  207. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/model_chunk_schedule_plan.py +0 -0
  208. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/vision_module/__init__.py +0 -0
  209. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/common/vision_module/vision_module.py +0 -0
  210. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/__init__.py +0 -0
  211. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/fine_grained_callables.py +0 -0
  212. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/gpt_layer_specs.py +0 -0
  213. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/heterogeneous/heterogeneous_layer_specs.py +0 -0
  214. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/gpt/moe_module_specs.py +0 -0
  215. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/huggingface/__init__.py +0 -0
  216. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/huggingface/clip_model.py +0 -0
  217. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/huggingface/module.py +0 -0
  218. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/huggingface/qwen_model.py +0 -0
  219. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mamba/__init__.py +0 -0
  220. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mamba/mamba_layer_specs.py +0 -0
  221. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mamba/mamba_model.py +0 -0
  222. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/__init__.py +0 -0
  223. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/config/__init__.py +0 -0
  224. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/config/base_configs.py +0 -0
  225. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/model/__init__.py +0 -0
  226. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/model/base.py +0 -0
  227. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/submodules/audio.py +0 -0
  228. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/submodules/base.py +0 -0
  229. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/mimo/submodules/vision.py +0 -0
  230. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/multimodal/__init__.py +0 -0
  231. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/multimodal/context_parallel.py +0 -0
  232. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/multimodal/llava_model.py +0 -0
  233. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/multimodal/llava_spec.py +0 -0
  234. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/__init__.py +0 -0
  235. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/base_attention.py +0 -0
  236. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/config.py +0 -0
  237. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/decoder_attention.py +0 -0
  238. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/decoder_spec.py +0 -0
  239. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/encoder_attention.py +0 -0
  240. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/encoder_spec.py +0 -0
  241. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/model.py +0 -0
  242. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/retro/utils.py +0 -0
  243. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/vision/__init__.py +0 -0
  244. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/vision/clip_vit_model.py +0 -0
  245. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/vision/multimodal_projector.py +0 -0
  246. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/vision/radio.py +0 -0
  247. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/models/vision/vit_layer_specs.py +0 -0
  248. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/msc_utils.py +0 -0
  249. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/nccl_allocator.py +0 -0
  250. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/num_microbatches_calculator.py +0 -0
  251. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/__init__.py +0 -0
  252. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/clip_grads.py +0 -0
  253. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/cpu_offloading/__init__.py +0 -0
  254. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/cpu_offloading/hybrid_optimizer.py +0 -0
  255. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/distrib_optimizer.py +0 -0
  256. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/grad_scaler.py +0 -0
  257. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/optimizer.py +0 -0
  258. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer/optimizer_config.py +0 -0
  259. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/optimizer_param_scheduler.py +0 -0
  260. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/packed_seq_params.py +0 -0
  261. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/parallel_state.py +0 -0
  262. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/__init__.py +0 -0
  263. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/bridge_communicator.py +0 -0
  264. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/combined_1f1b.py +0 -0
  265. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/p2p_communication.py +0 -0
  266. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/schedules.py +0 -0
  267. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/pipeline_parallel/utils.py +0 -0
  268. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/__init__.py +0 -0
  269. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/__init__.py +0 -0
  270. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/gpt/__init__.py +0 -0
  271. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/gpt/model_specs.py +0 -0
  272. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/gpt/state_dict_hooks.py +0 -0
  273. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/layers.py +0 -0
  274. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/mamba/__init__.py +0 -0
  275. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/post_training/modelopt/mamba/model_specs.py +0 -0
  276. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/process_groups_config.py +0 -0
  277. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/quantization/__init__.py +0 -0
  278. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/quantization/quant_config.py +0 -0
  279. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/quantization/utils.py +0 -0
  280. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/requirements.txt +0 -0
  281. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/rerun_state_machine.py +0 -0
  282. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/safe_globals.py +0 -0
  283. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/__init__.py +0 -0
  284. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mamba_block.py +0 -0
  285. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mamba_context_parallel.py +0 -0
  286. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mamba_hybrid_layer_allocation.py +0 -0
  287. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mamba_layer.py +0 -0
  288. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mamba_mixer.py +0 -0
  289. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/mlp_layer.py +0 -0
  290. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/ssm/triton_cache_manager.py +0 -0
  291. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/__init__.py +0 -0
  292. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/cross_entropy.py +0 -0
  293. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/data.py +0 -0
  294. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/layers.py +0 -0
  295. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/mappings.py +0 -0
  296. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/random.py +0 -0
  297. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tensor_parallel/utils.py +0 -0
  298. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/timers.py +0 -0
  299. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/__init__.py +0 -0
  300. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/base_tokenizer.py +0 -0
  301. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/megatron_tokenizer.py +0 -0
  302. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/__init__.py +0 -0
  303. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/__init__.py +0 -0
  304. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/abstract_tokenizer.py +0 -0
  305. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/bytelevel_tokenizer.py +0 -0
  306. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/chat_template.py +0 -0
  307. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/huggingface_tokenizer.py +0 -0
  308. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/megatron_hf_tokenizer.py +0 -0
  309. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/null_tokenizer.py +0 -0
  310. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/sentencepiece_tokenizer.py +0 -0
  311. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/libraries/tiktoken_tokenizer.py +0 -0
  312. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/__init__.py +0 -0
  313. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/bert_tokenizer.py +0 -0
  314. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/default_tokenizer.py +0 -0
  315. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/gpt_tokenizer.py +0 -0
  316. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/mamba_tokenizer.py +0 -0
  317. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/retro_tokenizer.py +0 -0
  318. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/models/t5_tokenizer.py +0 -0
  319. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/text_tokenizer.py +0 -0
  320. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/tokenizers/text/utils/build_tokenizer.py +0 -0
  321. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/__init__.py +0 -0
  322. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/attention.py +0 -0
  323. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/cuda_graphs.py +0 -0
  324. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/custom_layers/__init__.py +0 -0
  325. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/custom_layers/transformer_engine.py +0 -0
  326. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/dot_product_attention.py +0 -0
  327. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/enums.py +0 -0
  328. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/fsdp_dtensor_checkpoint.py +0 -0
  329. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/heterogeneous/heterogeneous_config.py +0 -0
  330. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/heterogeneous/linear_replacements.py +0 -0
  331. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/identity_op.py +0 -0
  332. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/mlp.py +0 -0
  333. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/module.py +0 -0
  334. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/__init__.py +0 -0
  335. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/experts.py +0 -0
  336. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/fused_a2a.py +0 -0
  337. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/grouped_gemm_util.py +0 -0
  338. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/moe_layer.py +0 -0
  339. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/moe_utils.py +0 -0
  340. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/router.py +0 -0
  341. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/shared_experts.py +0 -0
  342. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/token_dispatcher.py +0 -0
  343. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/moe/upcycling_utils.py +0 -0
  344. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/multi_latent_attention.py +0 -0
  345. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/multi_token_prediction.py +0 -0
  346. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/pipeline_parallel_layer_layout.py +0 -0
  347. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/spec_utils.py +0 -0
  348. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/torch_layer_norm.py +0 -0
  349. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/torch_norm.py +0 -0
  350. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/transformer_block.py +0 -0
  351. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/transformer_config.py +0 -0
  352. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/transformer_layer.py +0 -0
  353. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/transformer/utils.py +0 -0
  354. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron/core/utils.py +0 -0
  355. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron_core.egg-info/SOURCES.txt +0 -0
  356. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron_core.egg-info/dependency_links.txt +0 -0
  357. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron_core.egg-info/requires.txt +0 -0
  358. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/megatron_core.egg-info/top_level.txt +0 -0
  359. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/pyproject.toml +0 -0
  360. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/setup.cfg +0 -0
  361. {megatron_core-0.16.0rc0.dev100285 → megatron_core-0.16.0rc0.dev101328}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: megatron-core
-Version: 0.16.0rc0.dev100285
+Version: 0.16.0rc0.dev101328
 Summary: Megatron Core - a library for efficient and scalable training of transformer based models
 Author-email: NVIDIA <nemo-toolkit@nvidia.com>
 Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
megatron/core/inference/contexts/attention_context/mha_metadata.py
@@ -43,7 +43,6 @@ class MHAMetadata(MetadataBase):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
-        decode_only: bool = False,
     ):
         """
         Args:
@@ -53,7 +52,6 @@ class MHAMetadata(MetadataBase):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
-            decode_only: bool
         """
         if padded_active_request_count is None:
             padded_active_request_count = real_batch_size
@@ -100,11 +98,9 @@ class MHAMetadata(MetadataBase):
             padded_active_request_count,
             is_cumulative_tensor=True,
         )
-
-        if decode_only:
+        self._max_seqlen_q = padded_active_token_count
+        if torch.all(self._query_lengths_buf[:padded_active_request_count] <= 1):
            self._max_seqlen_q = 1
-        else:
-            self._max_seqlen_q = max(2, padded_active_token_count)
        self._max_seqlen_k = self.max_seqlen

        self.state_data = {
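With the decode_only flag removed, MHAMetadata.update() now infers decode-only batches from the query lengths themselves: if every active request contributes at most one query token, max_seqlen_q collapses to 1, otherwise the padded active token count is used. A minimal sketch of that decision logic follows; the helper name infer_max_seqlen_q and the example tensors are illustrative only and not part of megatron-core.

import torch

def infer_max_seqlen_q(query_lengths: torch.Tensor, padded_active_token_count: int) -> int:
    # Decode-only batch: every request contributes at most one new query token.
    if torch.all(query_lengths <= 1):
        return 1
    # Mixed prefill/decode batch: fall back to the padded token count as an upper bound.
    return padded_active_token_count

# Mixed batch (two prefills, one decode) vs. pure decode batch, with made-up lengths:
print(infer_max_seqlen_q(torch.tensor([5, 3, 1]), padded_active_token_count=16))  # 16
print(infer_max_seqlen_q(torch.tensor([1, 1, 1]), padded_active_token_count=16))  # 1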
@@ -152,7 +148,6 @@ class GraphedMHAMetadata(MHAMetadata):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
-        decode_only: bool = False,
     ):
         """
         Args:
@@ -162,7 +157,6 @@ class GraphedMHAMetadata(MHAMetadata):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
-            decode_only: bool
         """
         super().update(
             request_query_lengths,
@@ -171,7 +165,6 @@ class GraphedMHAMetadata(MHAMetadata):
             padded_active_token_count,
             real_batch_size,
             padded_active_request_count,
-            decode_only,
         )

     def reset(self):
@@ -191,7 +184,6 @@ class NonGraphedMHAMetadata(MHAMetadata):
         padded_active_token_count: int,
         real_batch_size: int,
         padded_active_request_count: Optional[int] = None,
-        decode_only: bool = False,
     ):
         """
         Args:
@@ -201,7 +193,6 @@ class NonGraphedMHAMetadata(MHAMetadata):
             padded_active_token_count: int
             real_batch_size: int
             padded_active_request_count: Optional[int]
-            decode_only: bool
         """
         super().update(
             request_query_lengths,
@@ -210,7 +201,6 @@ class NonGraphedMHAMetadata(MHAMetadata):
             padded_active_token_count,
             real_batch_size,
             padded_active_request_count,
-            decode_only,
         )
         if len(self.state_data["query_lengths"]) > 0:
             self.state_data["max_seqlen_q"] = torch.max(self.state_data["query_lengths"]).item()
megatron/core/inference/contexts/dynamic_context.py
@@ -16,10 +16,7 @@ from megatron.core.inference.inference_request import DynamicInferenceRequest
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
-from megatron.core.inference.unified_memory import (
-    UnifiedMemoryUnsupportedError,
-    create_unified_mempool,
-)
+from megatron.core.inference.unified_memory import create_unified_mempool, has_unified_memory
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
@@ -326,20 +323,16 @@ class DynamicInferenceContext(BaseInferenceContext):
         self.params_dtype = params_dtype
         self.num_layers = num_layers
         self.max_sequence_length = max_sequence_length
-
-        # Unified memory.
         self.unified_memory_level = unified_memory_level
         if unified_memory_level > 0:
-            try:
-                self.unified_memory_mempool = create_unified_mempool()
-            except UnifiedMemoryUnsupportedError:
-                if torch.distributed.get_rank() == 0:
-                    warnings.warn(
-                        "Unified memory requested but not available; defaulting to GPU memory."
-                    )
+            if not has_unified_memory and torch.distributed.get_rank() == 0:
+                warnings.warn(
+                    "Unified memory requested but not available; defaulting to GPU memory."
+                )
                self.unified_memory_level = 0
+            else:
+                self.unified_memory_mempool = create_unified_mempool()

-        # Request and token counts.
         self.total_request_count = 0
         self.active_token_count = 0
         self.paused_request_count = 0
@@ -958,7 +951,6 @@ class DynamicInferenceContext(BaseInferenceContext):
             padded_active_token_count=self.padded_active_token_count,
             real_batch_size=real_req_batch_size,
             padded_active_request_count=self.padded_active_request_count,
-            decode_only=self.is_decode_only(),
         )
         # All attention metadata calculations are now handled by MHAMetadata.update()

megatron/core/inference/engines/dynamic_engine.py
@@ -165,17 +165,6 @@ class DynamicInferenceEngine(AbstractEngine):
         context = self.context
         controller = self.controller

-        config = controller.inference_wrapped_model.inference_wrapper_config
-        moe_pad_experts = config.moe_pad_experts_for_cuda_graph_inference
-
-        if moe_pad_experts and context.non_decode_cuda_graphs:
-            context.non_decode_cuda_graphs = False
-            if torch.distributed.get_rank() == 0:
-                warnings.warn(
-                    "MoE models do not support non-decode cuda graphs. "
-                    "Forcing non_decode_cuda_graphs to False."
-                )
-
         time_start = time.time()
         mem_stats_start = torch.cuda.memory_stats()

@@ -185,18 +174,15 @@ class DynamicInferenceEngine(AbstractEngine):
             context.cuda_graph_token_counts,
         )
         for warmup_engine_mode in [WarmupEngineMode.DECODE, WarmupEngineMode.NON_DECODE]:
-            # Check whether to skip non-decode graphs.
+            # Iterate cuda graph dims.
             if (
                 warmup_engine_mode == WarmupEngineMode.NON_DECODE
                 and not context.non_decode_cuda_graphs
             ):
                 continue
-
             tbar = enumerate(context.cuda_graph_token_counts)
             if HAVE_TQDM:
                 tbar = tqdm(tbar, total=len(context.cuda_graph_token_counts))
-
-            # Iterate cuda graph dims.
             for tbar_idx, cuda_graph_token_count in tbar:
                 if (
                     cuda_graph_token_count == 1
megatron/core/inference/text_generation_controllers/text_generation_controller.py
@@ -508,8 +508,7 @@ class TextGenerationController:
             inference_wrapper_config.moe_pad_experts_for_cuda_graph_inference
         )
         if moe_pad_experts_for_cuda_graph_inference:
-            assert warmup_engine_mode is not WarmupEngineMode.NON_DECODE
-            if context.is_decode_only():
+            if context.is_decode_only() or warmup_engine_mode is not None:
                 capacity_factor = model_config.num_moe_experts / model_config.moe_router_topk
                 set_decode_expert_padding(unwrapped_model, True, capacity_factor=capacity_factor)
             else:
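The expert-padding capacity factor computed above is simply the ratio of the expert count to the router top-k. A quick arithmetic check with hypothetical configuration values (not taken from this diff):

# Hypothetical MoE config values for illustration only.
num_moe_experts = 64
moe_router_topk = 8
capacity_factor = num_moe_experts / moe_router_topk  # 64 / 8 = 8.0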
@@ -0,0 +1,89 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+ import os
+ import warnings
+ from pathlib import Path
+
+ from torch.cuda.memory import CUDAPluggableAllocator
+ from torch.utils.cpp_extension import CUDA_HOME, load_inline
+
+ from megatron.core.utils import is_torch_min_version
+
+ try:
+     if is_torch_min_version("2.8.0"):
+         from torch.cuda.memory import MemPool
+     else:
+         from torch.cuda import MemPool
+     _has_mem_pool = True
+ except ImportError:
+     _has_mem_pool = False
+
+ _mempool_c_src = r"""
+ #include <cuda_runtime_api.h>
+ #include <cstddef>
+
+ #define EXPORT extern "C"
+
+ EXPORT void* managed_malloc(size_t size, int device, void* stream) {
+     (void)stream;
+     int cur = -1;
+     cudaGetDevice(&cur);
+     if (device != cur && device >= 0) cudaSetDevice(device);
+
+     // cudaMallocManaged allows for more memory to be allocated than the device memory size.
+     // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
+     void* ptr = nullptr;
+     cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
+     if (err != cudaSuccess) return nullptr;
+
+     if (device >= 0) {
+         // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
+         // This is a hint that tries to prevent data from being migrated away from the device.
+         cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
+         // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
+         // Even if the memory has to be migrated away from the device, it still does not page fault.
+         // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
+         // but there is no harm in adding this flag as well for future-proofing.
+         cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
+     }
+     return ptr;
+ }
+
+ EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
+     // Memory allocated with cudaMallocManaged should be released with cudaFree.
+     (void)size; (void)device; (void)stream;
+     if (ptr) cudaFree(ptr);
+ }
+ """
+
+ # Avoid linting errors.
+ has_unified_memory = False
+ _alloc = None
+
+ # Build the .so upon import; this avoids issues.
+ if _has_mem_pool:
+     _extra_ldflags = ["-lcudart"]
+     if CUDA_HOME:
+         _cuda_lib = os.path.join(CUDA_HOME, "lib64")
+         if os.path.isdir(_cuda_lib):
+             _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
+     try:
+         _mod = load_inline(
+             name="managed_alloc_runtime",
+             cpp_sources=[_mempool_c_src],
+             functions=[],
+             with_cuda=True,
+             extra_ldflags=_extra_ldflags,
+             verbose=False,
+         )
+         _so_path = Path(_mod.__file__).as_posix()
+         _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
+         has_unified_memory = True
+     except (RuntimeError, ImportError, OSError):
+         warnings.warn("Failed to create unified memory mempool.")
+
+
+ def create_unified_mempool():
+     """Create a unified memory mempool using CUDA managed memory."""
+     assert has_unified_memory
+     return MemPool(allocator=_alloc)
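
Taken together, the new module compiles the managed-memory allocator once at import time and exposes a `has_unified_memory` flag plus `create_unified_mempool()`. The diff does not show the call sites, so the following is only a hypothetical caller-side sketch; it assumes `torch.cuda.use_mem_pool` is available in the installed PyTorch and that falling back to ordinary device memory is acceptable:

    # Hypothetical usage sketch -- not part of this diff.
    import torch

    from megatron.core.inference import unified_memory

    _uvm_pool = None


    def managed_empty(shape, dtype=torch.bfloat16):
        """Allocate a CUDA buffer from unified (managed) memory when available."""
        global _uvm_pool
        if not unified_memory.has_unified_memory:
            # The allocator failed to build; fall back to regular device memory.
            return torch.empty(shape, dtype=dtype, device="cuda")
        if _uvm_pool is None:
            _uvm_pool = unified_memory.create_unified_mempool()
        # Allocations made inside this context go through cudaMallocManaged,
        # so the buffer can be oversubscribed beyond the GPU's physical memory.
        with torch.cuda.use_mem_pool(_uvm_pool):
            return torch.empty(shape, dtype=dtype, device="cuda")

Compared with the previous revision of this file (removed at the end of this diff), callers test the `has_unified_memory` flag instead of catching `UnifiedMemoryUnsupportedError` from a lazy `compile_allocator()` call.
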
@@ -375,13 +375,14 @@ class GPTModel(LanguageModule):
                  )
                  or self.config.flash_decode
              )
+             and rotary_pos_cos is not None
              and inference_context.is_static_batching()
          ):
              current_batch_size = input_ids.shape[0]
              sequence_len_offset = torch.tensor(
                  [inference_context.sequence_len_offset] * current_batch_size,
                  dtype=torch.int32,
-                 device=torch.cuda.current_device(),
+                 device=rotary_pos_cos.device,  # Co-locate this with the rotary tensors
              )
          else:
              sequence_len_offset = None
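
The hunk above guards the fast path on `rotary_pos_cos` being present and then builds `sequence_len_offset` on the device of the rotary tensors rather than on `torch.cuda.current_device()`, so the offsets land next to the data that consumes them. A small, self-contained sketch of that co-location pattern; the tensor shapes and names here are illustrative, not taken from the model:

    import torch

    # Stand-in for the precomputed rotary cache; illustrative only.
    rotary_pos_cos = torch.randn(4096, 64, device="cuda:0")

    current_batch_size = 8
    sequence_len_offset = torch.tensor(
        [0] * current_batch_size,
        dtype=torch.int32,
        # Same device as the rotary cache, so no implicit cross-device copy later.
        device=rotary_pos_cos.device,
    )
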
@@ -4,7 +4,7 @@
  MAJOR = 0
  MINOR = 16
  PATCH = 0
- PRE_RELEASE = 'rc0.dev100285'
+ PRE_RELEASE = 'rc0.dev101328'

  # Use the following formatting: (major, minor, patch, pre-release)
  VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: megatron-core
- Version: 0.16.0rc0.dev100285
+ Version: 0.16.0rc0.dev101328
  Summary: Megatron Core - a library for efficient and scalable training of transformer based models
  Author-email: NVIDIA <nemo-toolkit@nvidia.com>
  Maintainer-email: NVIDIA <nemo-toolkit@nvidia.com>
@@ -1,127 +0,0 @@
- # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
- import os
- import warnings
- from enum import Enum, auto
- from pathlib import Path
-
- from torch.cuda.memory import CUDAPluggableAllocator
- from torch.utils.cpp_extension import CUDA_HOME, load_inline
-
- from megatron.core.utils import is_torch_min_version
-
- try:
-     if is_torch_min_version("2.8.0"):
-         from torch.cuda.memory import MemPool
-     else:
-         from torch.cuda import MemPool
-     _has_mem_pool = True
- except ImportError:
-     _has_mem_pool = False
-
-
- class CompilationState(Enum):
-     """Enum to distinguish between unified memory (UVM) compilation states."""
-
-     UNATTEMPTED = auto()  # Compilation has not been attempted.
-     FAILURE = auto()  # Compilation attempted, but failed.
-     SUCCESS = auto()  # Compilation attempted, and succeeded.
-
-
- # Compilation vars.
- _compilation_state = CompilationState.UNATTEMPTED
- _alloc = None  # must remain global until process exit.
- _mod = None  # must remain global until process exit.
-
-
- class UnifiedMemoryUnsupportedError(Exception):
-     """Unified memory is not supported on this system."""
-
-     pass
-
-
- def compile_allocator():
-     """Attempt to compile UVM allocator."""
-
-     global _compilation_state, _alloc, _mod
-
-     if _compilation_state != CompilationState.UNATTEMPTED:
-         return
-
-     _mempool_c_src = r"""
- #include <cuda_runtime_api.h>
- #include <cstddef>
-
- #define EXPORT extern "C"
-
- EXPORT void* managed_malloc(size_t size, int device, void* stream) {
-     (void)stream;
-     int cur = -1;
-     cudaGetDevice(&cur);
-     if (device != cur && device >= 0) cudaSetDevice(device);
-
-     // cudaMallocManaged allows for more memory to be allocated than the device memory size.
-     // The cudaMemAttachGlobal flag makes the memory accessible from both host and device.
-     void* ptr = nullptr;
-     cudaError_t err = cudaMallocManaged(&ptr, (size_t)size, cudaMemAttachGlobal);
-     if (err != cudaSuccess) return nullptr;
-
-     if (device >= 0) {
-         // cudaMemAdviseSetPreferredLocation sets the preferred location for the memory.
-         // This is a hint that tries to prevent data from being migrated away from the device.
-         cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetPreferredLocation, device);
-         // cudaMemAdviseSetAccessedBy ensures the memory always lives in the device's page table.
-         // Even if the memory has to be migrated away from the device, it still does not page fault.
-         // The CUDA docs claim that cudaMemAdviseSetPreferredLocation completely overrides this flag,
-         // but there is no harm in adding this flag as well for future-proofing.
-         cudaMemAdvise(ptr, (size_t)size, cudaMemAdviseSetAccessedBy, device);
-     }
-     return ptr;
- }
-
- EXPORT void managed_free(void* ptr, size_t size, int device, void* stream) {
-     // Memory allocated with cudaMallocManaged should be released with cudaFree.
-     (void)size; (void)device; (void)stream;
-     if (ptr) cudaFree(ptr);
- }
- """
-
-     # Build the .so upon import; this avoids issues.
-     if _has_mem_pool:
-         _extra_ldflags = ["-lcudart"]
-         if CUDA_HOME:
-             _cuda_lib = os.path.join(CUDA_HOME, "lib64")
-             if os.path.isdir(_cuda_lib):
-                 _extra_ldflags = [f"-L{_cuda_lib}", "-lcudart"]
-         try:
-             _mod = load_inline(
-                 name="managed_alloc_runtime",
-                 cpp_sources=[_mempool_c_src],
-                 functions=[],
-                 with_cuda=True,
-                 extra_ldflags=_extra_ldflags,
-                 verbose=False,
-             )
-             _so_path = Path(_mod.__file__).as_posix()
-             _alloc = CUDAPluggableAllocator(_so_path, "managed_malloc", "managed_free").allocator()
-             _compilation_state = CompilationState.SUCCESS
-         except (RuntimeError, ImportError, OSError):
-             warnings.warn("Failed to create unified memory mempool.")
-             _compilation_state = CompilationState.FAILURE
-
-
- def create_unified_mempool() -> "MemPool":
-     """Create a unified memory mempool using CUDA managed memory.
-
-     Returns:
-         (MemPool) Unified memory mempool.
-     """
-
-     # Attempt to compile allocator.
-     compile_allocator()
-
-     # Return mempool.
-     if _compilation_state != CompilationState.SUCCESS:
-         raise UnifiedMemoryUnsupportedError()
-     else:
-         return MemPool(allocator=_alloc)