vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
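For orientation, a minimal smoke test of the package's public API is sketched below. This is not part of the diff; it assumes the wheel above is installed into a CPython 3.12 environment (for example with pip install vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl) and uses the offline LLM entry point shipped in vllm/entrypoints/llm.py from the file list. The model id is illustrative only.

    # Minimal sketch: check that the installed vllm-cpu wheel imports and generates text.
    # Assumes the wheel listed above is installed in the current environment.
    # "facebook/opt-125m" is just a small example model; any Hugging Face model id works.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(temperature=0.8, max_tokens=32)
    for output in llm.generate(["The capital of France is"], params):
        print(output.outputs[0].text)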
Files changed (1398)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2044 -0
  5. vllm/_ipex_ops.py +393 -0
  6. vllm/_version.py +34 -0
  7. vllm/assets/__init__.py +0 -0
  8. vllm/assets/audio.py +45 -0
  9. vllm/assets/base.py +41 -0
  10. vllm/assets/image.py +50 -0
  11. vllm/assets/video.py +145 -0
  12. vllm/attention/__init__.py +15 -0
  13. vllm/attention/backends/__init__.py +0 -0
  14. vllm/attention/backends/abstract.py +204 -0
  15. vllm/attention/backends/utils.py +33 -0
  16. vllm/attention/layer.py +645 -0
  17. vllm/attention/layers/__init__.py +0 -0
  18. vllm/attention/layers/chunked_local_attention.py +93 -0
  19. vllm/attention/layers/cross_attention.py +162 -0
  20. vllm/attention/layers/encoder_only_attention.py +86 -0
  21. vllm/attention/ops/__init__.py +0 -0
  22. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  23. vllm/attention/ops/common.py +345 -0
  24. vllm/attention/ops/flashmla.py +192 -0
  25. vllm/attention/ops/merge_attn_states.py +43 -0
  26. vllm/attention/ops/paged_attn.py +262 -0
  27. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  28. vllm/attention/ops/prefix_prefill.py +928 -0
  29. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  30. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  31. vllm/attention/ops/triton_decode_attention.py +691 -0
  32. vllm/attention/ops/triton_flash_attention.py +984 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +175 -0
  35. vllm/attention/ops/triton_unified_attention.py +894 -0
  36. vllm/attention/selector.py +245 -0
  37. vllm/attention/utils/__init__.py +0 -0
  38. vllm/attention/utils/fa_utils.py +85 -0
  39. vllm/attention/utils/kv_sharing_utils.py +33 -0
  40. vllm/beam_search.py +87 -0
  41. vllm/benchmarks/__init__.py +0 -0
  42. vllm/benchmarks/datasets.py +2723 -0
  43. vllm/benchmarks/latency.py +170 -0
  44. vllm/benchmarks/lib/__init__.py +3 -0
  45. vllm/benchmarks/lib/endpoint_request_func.py +533 -0
  46. vllm/benchmarks/lib/ready_checker.py +73 -0
  47. vllm/benchmarks/lib/utils.py +80 -0
  48. vllm/benchmarks/serve.py +1358 -0
  49. vllm/benchmarks/throughput.py +696 -0
  50. vllm/collect_env.py +823 -0
  51. vllm/compilation/__init__.py +0 -0
  52. vllm/compilation/activation_quant_fusion.py +189 -0
  53. vllm/compilation/backends.py +650 -0
  54. vllm/compilation/base_static_graph.py +56 -0
  55. vllm/compilation/collective_fusion.py +1188 -0
  56. vllm/compilation/compiler_interface.py +573 -0
  57. vllm/compilation/counter.py +47 -0
  58. vllm/compilation/cuda_graph.py +199 -0
  59. vllm/compilation/cuda_piecewise_backend.py +117 -0
  60. vllm/compilation/decorators.py +400 -0
  61. vllm/compilation/fix_functionalization.py +205 -0
  62. vllm/compilation/fusion.py +383 -0
  63. vllm/compilation/fusion_attn.py +295 -0
  64. vllm/compilation/fx_utils.py +84 -0
  65. vllm/compilation/inductor_pass.py +136 -0
  66. vllm/compilation/monitor.py +57 -0
  67. vllm/compilation/noop_elimination.py +158 -0
  68. vllm/compilation/pass_manager.py +125 -0
  69. vllm/compilation/post_cleanup.py +20 -0
  70. vllm/compilation/sequence_parallelism.py +478 -0
  71. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  72. vllm/compilation/vllm_inductor_pass.py +156 -0
  73. vllm/compilation/wrapper.py +136 -0
  74. vllm/config/__init__.py +814 -0
  75. vllm/config/cache.py +220 -0
  76. vllm/config/compilation.py +673 -0
  77. vllm/config/device.py +74 -0
  78. vllm/config/kv_events.py +50 -0
  79. vllm/config/kv_transfer.py +111 -0
  80. vllm/config/load.py +113 -0
  81. vllm/config/lora.py +132 -0
  82. vllm/config/model.py +1912 -0
  83. vllm/config/multimodal.py +129 -0
  84. vllm/config/observability.py +99 -0
  85. vllm/config/parallel.py +524 -0
  86. vllm/config/pooler.py +97 -0
  87. vllm/config/scheduler.py +287 -0
  88. vllm/config/speculative.py +568 -0
  89. vllm/config/speech_to_text.py +39 -0
  90. vllm/config/structured_outputs.py +64 -0
  91. vllm/config/utils.py +145 -0
  92. vllm/connections.py +186 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +311 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +41 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +440 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +317 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +295 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +323 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +28 -0
  106. vllm/distributed/device_communicators/pynccl.py +340 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +186 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +416 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +589 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +635 -0
  113. vllm/distributed/device_communicators/symm_mem.py +136 -0
  114. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  115. vllm/distributed/device_communicators/xpu_communicator.py +94 -0
  116. vllm/distributed/eplb/__init__.py +8 -0
  117. vllm/distributed/eplb/eplb_state.py +620 -0
  118. vllm/distributed/eplb/rebalance_algo.py +239 -0
  119. vllm/distributed/eplb/rebalance_execute.py +424 -0
  120. vllm/distributed/kv_events.py +362 -0
  121. vllm/distributed/kv_transfer/README.md +29 -0
  122. vllm/distributed/kv_transfer/__init__.py +13 -0
  123. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  124. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  125. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  126. vllm/distributed/kv_transfer/kv_connector/factory.py +113 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +261 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +388 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +168 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +100 -0
  132. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +328 -0
  133. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1473 -0
  134. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +485 -0
  135. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +488 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +550 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +267 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +418 -0
  140. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  141. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  142. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  144. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  145. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  146. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  147. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  148. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  149. vllm/distributed/parallel_state.py +1532 -0
  150. vllm/distributed/tpu_distributed_utils.py +178 -0
  151. vllm/distributed/utils.py +536 -0
  152. vllm/engine/__init__.py +0 -0
  153. vllm/engine/arg_utils.py +1778 -0
  154. vllm/engine/async_llm_engine.py +6 -0
  155. vllm/engine/llm_engine.py +6 -0
  156. vllm/engine/metrics.py +577 -0
  157. vllm/engine/metrics_types.py +84 -0
  158. vllm/engine/protocol.py +333 -0
  159. vllm/entrypoints/__init__.py +0 -0
  160. vllm/entrypoints/api_server.py +178 -0
  161. vllm/entrypoints/chat_utils.py +1705 -0
  162. vllm/entrypoints/cli/__init__.py +12 -0
  163. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  164. vllm/entrypoints/cli/benchmark/base.py +25 -0
  165. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  166. vllm/entrypoints/cli/benchmark/main.py +55 -0
  167. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  168. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  169. vllm/entrypoints/cli/collect_env.py +36 -0
  170. vllm/entrypoints/cli/main.py +60 -0
  171. vllm/entrypoints/cli/openai.py +233 -0
  172. vllm/entrypoints/cli/run_batch.py +67 -0
  173. vllm/entrypoints/cli/serve.py +232 -0
  174. vllm/entrypoints/cli/types.py +29 -0
  175. vllm/entrypoints/constants.py +10 -0
  176. vllm/entrypoints/context.py +481 -0
  177. vllm/entrypoints/harmony_utils.py +436 -0
  178. vllm/entrypoints/launcher.py +164 -0
  179. vllm/entrypoints/llm.py +1629 -0
  180. vllm/entrypoints/logger.py +79 -0
  181. vllm/entrypoints/openai/__init__.py +0 -0
  182. vllm/entrypoints/openai/api_server.py +1953 -0
  183. vllm/entrypoints/openai/cli_args.py +288 -0
  184. vllm/entrypoints/openai/logits_processors.py +90 -0
  185. vllm/entrypoints/openai/protocol.py +2757 -0
  186. vllm/entrypoints/openai/run_batch.py +491 -0
  187. vllm/entrypoints/openai/serving_chat.py +1597 -0
  188. vllm/entrypoints/openai/serving_classification.py +173 -0
  189. vllm/entrypoints/openai/serving_completion.py +692 -0
  190. vllm/entrypoints/openai/serving_embedding.py +631 -0
  191. vllm/entrypoints/openai/serving_engine.py +992 -0
  192. vllm/entrypoints/openai/serving_models.py +288 -0
  193. vllm/entrypoints/openai/serving_pooling.py +276 -0
  194. vllm/entrypoints/openai/serving_responses.py +1709 -0
  195. vllm/entrypoints/openai/serving_score.py +479 -0
  196. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  197. vllm/entrypoints/openai/serving_transcription.py +136 -0
  198. vllm/entrypoints/openai/speech_to_text.py +388 -0
  199. vllm/entrypoints/openai/tool_parsers/__init__.py +55 -0
  200. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  201. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  202. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  203. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  204. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  205. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  206. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +455 -0
  207. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  208. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  209. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  210. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  211. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  212. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  213. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +39 -0
  214. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  216. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +93 -0
  217. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  218. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  219. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  220. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1137 -0
  221. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  222. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  223. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  224. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  225. vllm/entrypoints/renderer.py +395 -0
  226. vllm/entrypoints/score_utils.py +232 -0
  227. vllm/entrypoints/ssl.py +75 -0
  228. vllm/entrypoints/tool.py +139 -0
  229. vllm/entrypoints/tool_server.py +206 -0
  230. vllm/entrypoints/utils.py +233 -0
  231. vllm/env_override.py +23 -0
  232. vllm/envs.py +1590 -0
  233. vllm/executor/__init__.py +0 -0
  234. vllm/executor/executor_base.py +381 -0
  235. vllm/executor/msgspec_utils.py +35 -0
  236. vllm/executor/ray_distributed_executor.py +699 -0
  237. vllm/executor/ray_utils.py +410 -0
  238. vllm/executor/uniproc_executor.py +176 -0
  239. vllm/forward_context.py +402 -0
  240. vllm/inputs/__init__.py +30 -0
  241. vllm/inputs/data.py +356 -0
  242. vllm/inputs/parse.py +151 -0
  243. vllm/inputs/preprocess.py +664 -0
  244. vllm/logger.py +229 -0
  245. vllm/logging_utils/__init__.py +10 -0
  246. vllm/logging_utils/dump_input.py +81 -0
  247. vllm/logging_utils/formatter.py +79 -0
  248. vllm/logging_utils/log_time.py +32 -0
  249. vllm/logits_process.py +119 -0
  250. vllm/logprobs.py +28 -0
  251. vllm/lora/__init__.py +0 -0
  252. vllm/lora/layers/__init__.py +34 -0
  253. vllm/lora/layers/base.py +69 -0
  254. vllm/lora/layers/base_linear.py +185 -0
  255. vllm/lora/layers/column_parallel_linear.py +609 -0
  256. vllm/lora/layers/logits_processor.py +247 -0
  257. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  258. vllm/lora/layers/replicated_linear.py +60 -0
  259. vllm/lora/layers/row_parallel_linear.py +196 -0
  260. vllm/lora/layers/utils.py +65 -0
  261. vllm/lora/layers/vocal_parallel_embedding.py +174 -0
  262. vllm/lora/lora_weights.py +199 -0
  263. vllm/lora/models.py +816 -0
  264. vllm/lora/ops/__init__.py +0 -0
  265. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  266. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  267. vllm/lora/ops/torch_ops/__init__.py +16 -0
  268. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  269. vllm/lora/ops/triton_ops/__init__.py +12 -0
  270. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  271. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  272. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  273. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  274. vllm/lora/ops/triton_ops/utils.py +126 -0
  275. vllm/lora/ops/xla_ops/__init__.py +7 -0
  276. vllm/lora/ops/xla_ops/lora_ops.py +144 -0
  277. vllm/lora/peft_helper.py +127 -0
  278. vllm/lora/punica_wrapper/__init__.py +10 -0
  279. vllm/lora/punica_wrapper/punica_base.py +458 -0
  280. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  281. vllm/lora/punica_wrapper/punica_gpu.py +272 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  284. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  285. vllm/lora/punica_wrapper/utils.py +136 -0
  286. vllm/lora/request.py +97 -0
  287. vllm/lora/resolver.py +85 -0
  288. vllm/lora/utils.py +246 -0
  289. vllm/lora/worker_manager.py +267 -0
  290. vllm/model_executor/__init__.py +12 -0
  291. vllm/model_executor/custom_op.py +194 -0
  292. vllm/model_executor/layers/__init__.py +0 -0
  293. vllm/model_executor/layers/activation.py +575 -0
  294. vllm/model_executor/layers/attention_layer_base.py +23 -0
  295. vllm/model_executor/layers/fla/__init__.py +8 -0
  296. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  297. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  298. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  299. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  300. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  301. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  302. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  303. vllm/model_executor/layers/fla/ops/index.py +39 -0
  304. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  305. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  306. vllm/model_executor/layers/fla/ops/op.py +39 -0
  307. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  308. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  309. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  310. vllm/model_executor/layers/fused_moe/__init__.py +89 -0
  311. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +322 -0
  312. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +141 -0
  313. vllm/model_executor/layers/fused_moe/config.py +804 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  545. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +300 -0
  546. vllm/model_executor/layers/fused_moe/cutlass_moe.py +957 -0
  547. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +362 -0
  548. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  549. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +361 -0
  550. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +274 -0
  551. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +268 -0
  552. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +300 -0
  553. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +184 -0
  554. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +993 -0
  555. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +239 -0
  556. vllm/model_executor/layers/fused_moe/fused_moe.py +1890 -0
  557. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +307 -0
  558. vllm/model_executor/layers/fused_moe/layer.py +2195 -0
  559. vllm/model_executor/layers/fused_moe/modular_kernel.py +1038 -0
  560. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  561. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  562. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  563. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  564. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +341 -0
  565. vllm/model_executor/layers/fused_moe/prepare_finalize.py +70 -0
  566. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +424 -0
  567. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  568. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  569. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +143 -0
  570. vllm/model_executor/layers/fused_moe/trtllm_moe.py +191 -0
  571. vllm/model_executor/layers/fused_moe/utils.py +274 -0
  572. vllm/model_executor/layers/layernorm.py +395 -0
  573. vllm/model_executor/layers/lightning_attn.py +661 -0
  574. vllm/model_executor/layers/linear.py +1603 -0
  575. vllm/model_executor/layers/logits_processor.py +106 -0
  576. vllm/model_executor/layers/mamba/__init__.py +0 -0
  577. vllm/model_executor/layers/mamba/abstract.py +42 -0
  578. vllm/model_executor/layers/mamba/linear_attn.py +403 -0
  579. vllm/model_executor/layers/mamba/mamba_mixer.py +466 -0
  580. vllm/model_executor/layers/mamba/mamba_mixer2.py +764 -0
  581. vllm/model_executor/layers/mamba/mamba_utils.py +186 -0
  582. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  583. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1092 -0
  584. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  585. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  586. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +242 -0
  587. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +527 -0
  588. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +724 -0
  589. vllm/model_executor/layers/mamba/ops/ssd_combined.py +238 -0
  590. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +200 -0
  591. vllm/model_executor/layers/mamba/short_conv.py +253 -0
  592. vllm/model_executor/layers/mla.py +173 -0
  593. vllm/model_executor/layers/pooler.py +719 -0
  594. vllm/model_executor/layers/quantization/__init__.py +157 -0
  595. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  596. vllm/model_executor/layers/quantization/awq.py +228 -0
  597. vllm/model_executor/layers/quantization/awq_marlin.py +554 -0
  598. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  599. vllm/model_executor/layers/quantization/base_config.py +170 -0
  600. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  601. vllm/model_executor/layers/quantization/bitsandbytes.py +627 -0
  602. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  603. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +797 -0
  604. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2074 -0
  605. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  606. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  607. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  608. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  609. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  610. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +185 -0
  611. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  612. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  613. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  614. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +157 -0
  615. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  616. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +238 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +153 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +46 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  625. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  626. vllm/model_executor/layers/quantization/experts_int8.py +223 -0
  627. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  628. vllm/model_executor/layers/quantization/fp8.py +1098 -0
  629. vllm/model_executor/layers/quantization/gguf.py +599 -0
  630. vllm/model_executor/layers/quantization/gptq.py +340 -0
  631. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  632. vllm/model_executor/layers/quantization/gptq_marlin.py +751 -0
  633. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  634. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  635. vllm/model_executor/layers/quantization/inc.py +61 -0
  636. vllm/model_executor/layers/quantization/input_quant_fp8.py +156 -0
  637. vllm/model_executor/layers/quantization/ipex_quant.py +415 -0
  638. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  639. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  640. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  641. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  642. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  643. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  644. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  645. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  646. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  647. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  648. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  649. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  650. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  651. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +161 -0
  652. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  653. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  654. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  655. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  656. vllm/model_executor/layers/quantization/kv_cache.py +143 -0
  657. vllm/model_executor/layers/quantization/modelopt.py +1596 -0
  658. vllm/model_executor/layers/quantization/moe_wna16.py +484 -0
  659. vllm/model_executor/layers/quantization/mxfp4.py +988 -0
  660. vllm/model_executor/layers/quantization/petit.py +306 -0
  661. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  662. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  663. vllm/model_executor/layers/quantization/quark/quark.py +432 -0
  664. vllm/model_executor/layers/quantization/quark/quark_moe.py +561 -0
  665. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  666. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  667. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +239 -0
  668. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  669. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  670. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  671. vllm/model_executor/layers/quantization/rtn.py +466 -0
  672. vllm/model_executor/layers/quantization/schema.py +86 -0
  673. vllm/model_executor/layers/quantization/torchao.py +214 -0
  674. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  675. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  676. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  677. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  889. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +79 -0
  890. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +248 -0
  891. vllm/model_executor/layers/quantization/utils/fp8_utils.py +949 -0
  892. vllm/model_executor/layers/quantization/utils/gptq_utils.py +146 -0
  893. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  894. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  895. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  896. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  897. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  898. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  899. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  900. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  901. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +141 -0
  902. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  903. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  904. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  905. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  906. vllm/model_executor/layers/quantization/utils/quant_utils.py +641 -0
  907. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  908. vllm/model_executor/layers/resampler.py +270 -0
  909. vllm/model_executor/layers/rotary_embedding/__init__.py +204 -0
  910. vllm/model_executor/layers/rotary_embedding/base.py +177 -0
  911. vllm/model_executor/layers/rotary_embedding/common.py +150 -0
  912. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +138 -0
  913. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  914. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  915. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  916. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  917. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  918. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  919. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  920. vllm/model_executor/layers/rotary_embedding/mrope.py +1321 -0
  921. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  922. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  923. vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +86 -0
  924. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  925. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  926. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  927. vllm/model_executor/layers/utils.py +195 -0
  928. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  929. vllm/model_executor/model_loader/__init__.py +138 -0
  930. vllm/model_executor/model_loader/base_loader.py +52 -0
  931. vllm/model_executor/model_loader/bitsandbytes_loader.py +788 -0
  932. vllm/model_executor/model_loader/default_loader.py +277 -0
  933. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  934. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  935. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  936. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  937. vllm/model_executor/model_loader/tensorizer.py +738 -0
  938. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  939. vllm/model_executor/model_loader/tpu.py +114 -0
  940. vllm/model_executor/model_loader/utils.py +292 -0
  941. vllm/model_executor/model_loader/weight_utils.py +990 -0
  942. vllm/model_executor/models/__init__.py +33 -0
  943. vllm/model_executor/models/adapters.py +542 -0
  944. vllm/model_executor/models/aimv2.py +246 -0
  945. vllm/model_executor/models/apertus.py +579 -0
  946. vllm/model_executor/models/arcee.py +422 -0
  947. vllm/model_executor/models/arctic.py +558 -0
  948. vllm/model_executor/models/aria.py +650 -0
  949. vllm/model_executor/models/aya_vision.py +468 -0
  950. vllm/model_executor/models/baichuan.py +474 -0
  951. vllm/model_executor/models/bailing_moe.py +642 -0
  952. vllm/model_executor/models/bamba.py +514 -0
  953. vllm/model_executor/models/bert.py +665 -0
  954. vllm/model_executor/models/bert_with_rope.py +687 -0
  955. vllm/model_executor/models/blip.py +339 -0
  956. vllm/model_executor/models/blip2.py +712 -0
  957. vllm/model_executor/models/bloom.py +374 -0
  958. vllm/model_executor/models/chameleon.py +1139 -0
  959. vllm/model_executor/models/chatglm.py +476 -0
  960. vllm/model_executor/models/clip.py +407 -0
  961. vllm/model_executor/models/cohere2_vision.py +481 -0
  962. vllm/model_executor/models/commandr.py +465 -0
  963. vllm/model_executor/models/config.py +445 -0
  964. vllm/model_executor/models/dbrx.py +471 -0
  965. vllm/model_executor/models/deepseek.py +497 -0
  966. vllm/model_executor/models/deepseek_eagle.py +240 -0
  967. vllm/model_executor/models/deepseek_mtp.py +289 -0
  968. vllm/model_executor/models/deepseek_v2.py +1444 -0
  969. vllm/model_executor/models/deepseek_vl2.py +658 -0
  970. vllm/model_executor/models/dots1.py +546 -0
  971. vllm/model_executor/models/dots_ocr.py +873 -0
  972. vllm/model_executor/models/ernie45.py +43 -0
  973. vllm/model_executor/models/ernie45_moe.py +607 -0
  974. vllm/model_executor/models/ernie45_vl.py +1527 -0
  975. vllm/model_executor/models/ernie45_vl_moe.py +727 -0
  976. vllm/model_executor/models/ernie_mtp.py +268 -0
  977. vllm/model_executor/models/exaone.py +550 -0
  978. vllm/model_executor/models/exaone4.py +533 -0
  979. vllm/model_executor/models/fairseq2_llama.py +154 -0
  980. vllm/model_executor/models/falcon.py +509 -0
  981. vllm/model_executor/models/falcon_h1.py +674 -0
  982. vllm/model_executor/models/fuyu.py +399 -0
  983. vllm/model_executor/models/gemma.py +425 -0
  984. vllm/model_executor/models/gemma2.py +422 -0
  985. vllm/model_executor/models/gemma3.py +555 -0
  986. vllm/model_executor/models/gemma3_mm.py +721 -0
  987. vllm/model_executor/models/gemma3n.py +1113 -0
  988. vllm/model_executor/models/gemma3n_mm.py +761 -0
  989. vllm/model_executor/models/glm.py +23 -0
  990. vllm/model_executor/models/glm4.py +304 -0
  991. vllm/model_executor/models/glm4_1v.py +1690 -0
  992. vllm/model_executor/models/glm4_moe.py +727 -0
  993. vllm/model_executor/models/glm4_moe_mtp.py +301 -0
  994. vllm/model_executor/models/glm4v.py +654 -0
  995. vllm/model_executor/models/gpt2.py +380 -0
  996. vllm/model_executor/models/gpt_bigcode.py +344 -0
  997. vllm/model_executor/models/gpt_j.py +339 -0
  998. vllm/model_executor/models/gpt_neox.py +330 -0
  999. vllm/model_executor/models/gpt_oss.py +712 -0
  1000. vllm/model_executor/models/granite.py +489 -0
  1001. vllm/model_executor/models/granite_speech.py +794 -0
  1002. vllm/model_executor/models/granitemoe.py +550 -0
  1003. vllm/model_executor/models/granitemoehybrid.py +614 -0
  1004. vllm/model_executor/models/granitemoeshared.py +332 -0
  1005. vllm/model_executor/models/gritlm.py +262 -0
  1006. vllm/model_executor/models/grok1.py +547 -0
  1007. vllm/model_executor/models/h2ovl.py +536 -0
  1008. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1009. vllm/model_executor/models/hyperclovax_vision.py +1192 -0
  1010. vllm/model_executor/models/idefics2_vision_model.py +417 -0
  1011. vllm/model_executor/models/idefics3.py +756 -0
  1012. vllm/model_executor/models/interfaces.py +959 -0
  1013. vllm/model_executor/models/interfaces_base.py +192 -0
  1014. vllm/model_executor/models/intern_vit.py +441 -0
  1015. vllm/model_executor/models/internlm2.py +450 -0
  1016. vllm/model_executor/models/internlm2_ve.py +148 -0
  1017. vllm/model_executor/models/interns1.py +838 -0
  1018. vllm/model_executor/models/interns1_vit.py +418 -0
  1019. vllm/model_executor/models/internvl.py +1423 -0
  1020. vllm/model_executor/models/jais.py +373 -0
  1021. vllm/model_executor/models/jamba.py +591 -0
  1022. vllm/model_executor/models/jina_vl.py +144 -0
  1023. vllm/model_executor/models/keye.py +1680 -0
  1024. vllm/model_executor/models/keye_vl1_5.py +602 -0
  1025. vllm/model_executor/models/kimi_vl.py +618 -0
  1026. vllm/model_executor/models/lfm2.py +548 -0
  1027. vllm/model_executor/models/llama.py +669 -0
  1028. vllm/model_executor/models/llama4.py +746 -0
  1029. vllm/model_executor/models/llama4_eagle.py +239 -0
  1030. vllm/model_executor/models/llama_eagle.py +179 -0
  1031. vllm/model_executor/models/llama_eagle3.py +296 -0
  1032. vllm/model_executor/models/llava.py +870 -0
  1033. vllm/model_executor/models/llava_next.py +571 -0
  1034. vllm/model_executor/models/llava_next_video.py +476 -0
  1035. vllm/model_executor/models/llava_onevision.py +942 -0
  1036. vllm/model_executor/models/longcat_flash.py +715 -0
  1037. vllm/model_executor/models/longcat_flash_mtp.py +352 -0
  1038. vllm/model_executor/models/mamba.py +275 -0
  1039. vllm/model_executor/models/mamba2.py +291 -0
  1040. vllm/model_executor/models/medusa.py +169 -0
  1041. vllm/model_executor/models/midashenglm.py +792 -0
  1042. vllm/model_executor/models/mimo.py +188 -0
  1043. vllm/model_executor/models/mimo_mtp.py +280 -0
  1044. vllm/model_executor/models/minicpm.py +631 -0
  1045. vllm/model_executor/models/minicpm3.py +230 -0
  1046. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1047. vllm/model_executor/models/minicpmo.py +770 -0
  1048. vllm/model_executor/models/minicpmv.py +1784 -0
  1049. vllm/model_executor/models/minimax_text_01.py +986 -0
  1050. vllm/model_executor/models/minimax_vl_01.py +426 -0
  1051. vllm/model_executor/models/mistral3.py +628 -0
  1052. vllm/model_executor/models/mixtral.py +606 -0
  1053. vllm/model_executor/models/mllama4.py +1076 -0
  1054. vllm/model_executor/models/mlp_speculator.py +206 -0
  1055. vllm/model_executor/models/modernbert.py +374 -0
  1056. vllm/model_executor/models/module_mapping.py +72 -0
  1057. vllm/model_executor/models/molmo.py +1567 -0
  1058. vllm/model_executor/models/moonvit.py +673 -0
  1059. vllm/model_executor/models/motif.py +345 -0
  1060. vllm/model_executor/models/mpt.py +329 -0
  1061. vllm/model_executor/models/nano_nemotron_vl.py +1394 -0
  1062. vllm/model_executor/models/nemotron.py +507 -0
  1063. vllm/model_executor/models/nemotron_h.py +565 -0
  1064. vllm/model_executor/models/nemotron_nas.py +481 -0
  1065. vllm/model_executor/models/nemotron_vl.py +652 -0
  1066. vllm/model_executor/models/nvlm_d.py +203 -0
  1067. vllm/model_executor/models/olmo.py +404 -0
  1068. vllm/model_executor/models/olmo2.py +439 -0
  1069. vllm/model_executor/models/olmoe.py +483 -0
  1070. vllm/model_executor/models/opt.py +412 -0
  1071. vllm/model_executor/models/orion.py +348 -0
  1072. vllm/model_executor/models/ovis.py +559 -0
  1073. vllm/model_executor/models/ovis2_5.py +642 -0
  1074. vllm/model_executor/models/paligemma.py +411 -0
  1075. vllm/model_executor/models/persimmon.py +343 -0
  1076. vllm/model_executor/models/phi.py +356 -0
  1077. vllm/model_executor/models/phi3.py +19 -0
  1078. vllm/model_executor/models/phi3v.py +698 -0
  1079. vllm/model_executor/models/phi4_multimodal.py +1475 -0
  1080. vllm/model_executor/models/phi4mm.py +1279 -0
  1081. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1082. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1083. vllm/model_executor/models/phimoe.py +679 -0
  1084. vllm/model_executor/models/pixtral.py +1345 -0
  1085. vllm/model_executor/models/plamo2.py +978 -0
  1086. vllm/model_executor/models/qwen.py +361 -0
  1087. vllm/model_executor/models/qwen2.py +523 -0
  1088. vllm/model_executor/models/qwen2_5_omni_thinker.py +984 -0
  1089. vllm/model_executor/models/qwen2_5_vl.py +1481 -0
  1090. vllm/model_executor/models/qwen2_audio.py +489 -0
  1091. vllm/model_executor/models/qwen2_moe.py +558 -0
  1092. vllm/model_executor/models/qwen2_rm.py +122 -0
  1093. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1094. vllm/model_executor/models/qwen3.py +341 -0
  1095. vllm/model_executor/models/qwen3_moe.py +692 -0
  1096. vllm/model_executor/models/qwen3_next.py +1266 -0
  1097. vllm/model_executor/models/qwen3_next_mtp.py +281 -0
  1098. vllm/model_executor/models/qwen3_vl.py +1613 -0
  1099. vllm/model_executor/models/qwen3_vl_moe.py +358 -0
  1100. vllm/model_executor/models/qwen_vl.py +795 -0
  1101. vllm/model_executor/models/radio.py +576 -0
  1102. vllm/model_executor/models/registry.py +990 -0
  1103. vllm/model_executor/models/roberta.py +252 -0
  1104. vllm/model_executor/models/rvl.py +103 -0
  1105. vllm/model_executor/models/seed_oss.py +485 -0
  1106. vllm/model_executor/models/siglip.py +540 -0
  1107. vllm/model_executor/models/siglip2navit.py +689 -0
  1108. vllm/model_executor/models/skyworkr1v.py +911 -0
  1109. vllm/model_executor/models/smolvlm.py +44 -0
  1110. vllm/model_executor/models/solar.py +504 -0
  1111. vllm/model_executor/models/stablelm.py +341 -0
  1112. vllm/model_executor/models/starcoder2.py +354 -0
  1113. vllm/model_executor/models/step3_text.py +510 -0
  1114. vllm/model_executor/models/step3_vl.py +1072 -0
  1115. vllm/model_executor/models/swin.py +475 -0
  1116. vllm/model_executor/models/tarsier.py +639 -0
  1117. vllm/model_executor/models/telechat2.py +151 -0
  1118. vllm/model_executor/models/teleflm.py +79 -0
  1119. vllm/model_executor/models/terratorch.py +294 -0
  1120. vllm/model_executor/models/transformers.py +948 -0
  1121. vllm/model_executor/models/ultravox.py +654 -0
  1122. vllm/model_executor/models/utils.py +808 -0
  1123. vllm/model_executor/models/vision.py +404 -0
  1124. vllm/model_executor/models/voxtral.py +786 -0
  1125. vllm/model_executor/models/whisper.py +963 -0
  1126. vllm/model_executor/models/zamba2.py +960 -0
  1127. vllm/model_executor/parameter.py +620 -0
  1128. vllm/model_executor/utils.py +86 -0
  1129. vllm/model_executor/warmup/__init__.py +0 -0
  1130. vllm/model_executor/warmup/deep_gemm_warmup.py +230 -0
  1131. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1132. vllm/multimodal/__init__.py +33 -0
  1133. vllm/multimodal/audio.py +116 -0
  1134. vllm/multimodal/base.py +27 -0
  1135. vllm/multimodal/cache.py +697 -0
  1136. vllm/multimodal/evs.py +273 -0
  1137. vllm/multimodal/hasher.py +102 -0
  1138. vllm/multimodal/image.py +130 -0
  1139. vllm/multimodal/inputs.py +987 -0
  1140. vllm/multimodal/parse.py +511 -0
  1141. vllm/multimodal/processing.py +2148 -0
  1142. vllm/multimodal/profiling.py +284 -0
  1143. vllm/multimodal/registry.py +345 -0
  1144. vllm/multimodal/utils.py +503 -0
  1145. vllm/multimodal/video.py +319 -0
  1146. vllm/outputs.py +324 -0
  1147. vllm/platforms/__init__.py +263 -0
  1148. vllm/platforms/cpu.py +340 -0
  1149. vllm/platforms/cuda.py +668 -0
  1150. vllm/platforms/interface.py +620 -0
  1151. vllm/platforms/rocm.py +497 -0
  1152. vllm/platforms/tpu.py +233 -0
  1153. vllm/platforms/xpu.py +243 -0
  1154. vllm/plugins/__init__.py +72 -0
  1155. vllm/plugins/io_processors/__init__.py +68 -0
  1156. vllm/plugins/io_processors/interface.py +67 -0
  1157. vllm/plugins/lora_resolvers/README.md +16 -0
  1158. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1159. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1160. vllm/pooling_params.py +191 -0
  1161. vllm/profiler/__init__.py +0 -0
  1162. vllm/profiler/layerwise_profile.py +375 -0
  1163. vllm/profiler/utils.py +148 -0
  1164. vllm/py.typed +2 -0
  1165. vllm/ray/__init__.py +0 -0
  1166. vllm/ray/lazy_utils.py +22 -0
  1167. vllm/ray/ray_env.py +72 -0
  1168. vllm/reasoning/__init__.py +29 -0
  1169. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1170. vllm/reasoning/basic_parsers.py +156 -0
  1171. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1172. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1173. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1174. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1175. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1176. vllm/reasoning/mistral_reasoning_parser.py +56 -0
  1177. vllm/reasoning/qwen3_reasoning_parser.py +72 -0
  1178. vllm/reasoning/seedoss_reasoning_parser.py +28 -0
  1179. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1180. vllm/sampling_params.py +593 -0
  1181. vllm/scalar_type.py +349 -0
  1182. vllm/scripts.py +15 -0
  1183. vllm/sequence.py +103 -0
  1184. vllm/tasks.py +11 -0
  1185. vllm/test_utils.py +129 -0
  1186. vllm/third_party/__init__.py +0 -0
  1187. vllm/third_party/pynvml.py +6140 -0
  1188. vllm/tracing.py +136 -0
  1189. vllm/transformers_utils/__init__.py +24 -0
  1190. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1191. vllm/transformers_utils/chat_templates/registry.py +70 -0
  1192. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1193. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1194. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1195. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1196. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1197. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1198. vllm/transformers_utils/config.py +1102 -0
  1199. vllm/transformers_utils/config_parser_base.py +20 -0
  1200. vllm/transformers_utils/configs/__init__.py +63 -0
  1201. vllm/transformers_utils/configs/arctic.py +207 -0
  1202. vllm/transformers_utils/configs/chatglm.py +72 -0
  1203. vllm/transformers_utils/configs/deepseek_v3.py +101 -0
  1204. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1205. vllm/transformers_utils/configs/dotsocr.py +69 -0
  1206. vllm/transformers_utils/configs/eagle.py +84 -0
  1207. vllm/transformers_utils/configs/falcon.py +90 -0
  1208. vllm/transformers_utils/configs/jais.py +237 -0
  1209. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1210. vllm/transformers_utils/configs/medusa.py +63 -0
  1211. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1212. vllm/transformers_utils/configs/mistral.py +165 -0
  1213. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1214. vllm/transformers_utils/configs/moonvit.py +33 -0
  1215. vllm/transformers_utils/configs/nemotron.py +205 -0
  1216. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1217. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1218. vllm/transformers_utils/configs/olmo3.py +80 -0
  1219. vllm/transformers_utils/configs/ovis.py +176 -0
  1220. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1221. vllm/transformers_utils/configs/radio.py +91 -0
  1222. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1223. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1224. vllm/transformers_utils/configs/speculators/base.py +111 -0
  1225. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1226. vllm/transformers_utils/configs/ultravox.py +116 -0
  1227. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1228. vllm/transformers_utils/dynamic_module.py +60 -0
  1229. vllm/transformers_utils/processor.py +299 -0
  1230. vllm/transformers_utils/processors/__init__.py +16 -0
  1231. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1232. vllm/transformers_utils/processors/ovis.py +420 -0
  1233. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1234. vllm/transformers_utils/runai_utils.py +104 -0
  1235. vllm/transformers_utils/s3_utils.py +93 -0
  1236. vllm/transformers_utils/tokenizer.py +292 -0
  1237. vllm/transformers_utils/tokenizer_base.py +154 -0
  1238. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1239. vllm/transformers_utils/tokenizers/mistral.py +521 -0
  1240. vllm/transformers_utils/utils.py +108 -0
  1241. vllm/triton_utils/__init__.py +16 -0
  1242. vllm/triton_utils/importing.py +96 -0
  1243. vllm/usage/__init__.py +0 -0
  1244. vllm/usage/usage_lib.py +259 -0
  1245. vllm/utils/__init__.py +3566 -0
  1246. vllm/utils/deep_gemm.py +319 -0
  1247. vllm/utils/flashinfer.py +443 -0
  1248. vllm/utils/jsontree.py +178 -0
  1249. vllm/utils/tensor_schema.py +235 -0
  1250. vllm/v1/__init__.py +0 -0
  1251. vllm/v1/attention/__init__.py +0 -0
  1252. vllm/v1/attention/backends/__init__.py +0 -0
  1253. vllm/v1/attention/backends/cpu_attn.py +919 -0
  1254. vllm/v1/attention/backends/flash_attn.py +795 -0
  1255. vllm/v1/attention/backends/flashinfer.py +1181 -0
  1256. vllm/v1/attention/backends/flex_attention.py +861 -0
  1257. vllm/v1/attention/backends/gdn_attn.py +332 -0
  1258. vllm/v1/attention/backends/linear_attn.py +67 -0
  1259. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1260. vllm/v1/attention/backends/mamba2_attn.py +232 -0
  1261. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1262. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1263. vllm/v1/attention/backends/mla/common.py +1783 -0
  1264. vllm/v1/attention/backends/mla/cutlass_mla.py +248 -0
  1265. vllm/v1/attention/backends/mla/flashattn_mla.py +271 -0
  1266. vllm/v1/attention/backends/mla/flashinfer_mla.py +114 -0
  1267. vllm/v1/attention/backends/mla/flashmla.py +203 -0
  1268. vllm/v1/attention/backends/mla/flashmla_sparse.py +544 -0
  1269. vllm/v1/attention/backends/mla/indexer.py +342 -0
  1270. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1271. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1272. vllm/v1/attention/backends/pallas.py +409 -0
  1273. vllm/v1/attention/backends/rocm_aiter_fa.py +549 -0
  1274. vllm/v1/attention/backends/rocm_attn.py +426 -0
  1275. vllm/v1/attention/backends/short_conv_attn.py +94 -0
  1276. vllm/v1/attention/backends/tree_attn.py +451 -0
  1277. vllm/v1/attention/backends/triton_attn.py +361 -0
  1278. vllm/v1/attention/backends/utils.py +990 -0
  1279. vllm/v1/attention/backends/xformers.py +438 -0
  1280. vllm/v1/core/__init__.py +0 -0
  1281. vllm/v1/core/block_pool.py +416 -0
  1282. vllm/v1/core/encoder_cache_manager.py +333 -0
  1283. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1284. vllm/v1/core/kv_cache_manager.py +399 -0
  1285. vllm/v1/core/kv_cache_utils.py +1291 -0
  1286. vllm/v1/core/sched/__init__.py +0 -0
  1287. vllm/v1/core/sched/async_scheduler.py +47 -0
  1288. vllm/v1/core/sched/interface.py +158 -0
  1289. vllm/v1/core/sched/output.py +166 -0
  1290. vllm/v1/core/sched/request_queue.py +224 -0
  1291. vllm/v1/core/sched/scheduler.py +1296 -0
  1292. vllm/v1/core/sched/utils.py +69 -0
  1293. vllm/v1/core/single_type_kv_cache_manager.py +671 -0
  1294. vllm/v1/cudagraph_dispatcher.py +125 -0
  1295. vllm/v1/engine/__init__.py +203 -0
  1296. vllm/v1/engine/async_llm.py +742 -0
  1297. vllm/v1/engine/coordinator.py +357 -0
  1298. vllm/v1/engine/core.py +1235 -0
  1299. vllm/v1/engine/core_client.py +1334 -0
  1300. vllm/v1/engine/detokenizer.py +349 -0
  1301. vllm/v1/engine/exceptions.py +17 -0
  1302. vllm/v1/engine/llm_engine.py +370 -0
  1303. vllm/v1/engine/logprobs.py +201 -0
  1304. vllm/v1/engine/output_processor.py +576 -0
  1305. vllm/v1/engine/parallel_sampling.py +133 -0
  1306. vllm/v1/engine/processor.py +545 -0
  1307. vllm/v1/engine/utils.py +860 -0
  1308. vllm/v1/executor/__init__.py +0 -0
  1309. vllm/v1/executor/abstract.py +137 -0
  1310. vllm/v1/executor/multiproc_executor.py +726 -0
  1311. vllm/v1/executor/ray_distributed_executor.py +108 -0
  1312. vllm/v1/executor/utils.py +23 -0
  1313. vllm/v1/kv_cache_interface.py +375 -0
  1314. vllm/v1/kv_offload/__init__.py +0 -0
  1315. vllm/v1/kv_offload/abstract.py +165 -0
  1316. vllm/v1/kv_offload/backend.py +96 -0
  1317. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1318. vllm/v1/kv_offload/backends/cpu.py +61 -0
  1319. vllm/v1/kv_offload/cpu.py +75 -0
  1320. vllm/v1/kv_offload/factory.py +56 -0
  1321. vllm/v1/kv_offload/lru_manager.py +132 -0
  1322. vllm/v1/kv_offload/mediums.py +39 -0
  1323. vllm/v1/kv_offload/spec.py +61 -0
  1324. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1325. vllm/v1/kv_offload/worker/cpu_gpu.py +171 -0
  1326. vllm/v1/kv_offload/worker/worker.py +142 -0
  1327. vllm/v1/metrics/__init__.py +0 -0
  1328. vllm/v1/metrics/loggers.py +741 -0
  1329. vllm/v1/metrics/prometheus.py +82 -0
  1330. vllm/v1/metrics/ray_wrappers.py +152 -0
  1331. vllm/v1/metrics/reader.py +246 -0
  1332. vllm/v1/metrics/stats.py +257 -0
  1333. vllm/v1/outputs.py +161 -0
  1334. vllm/v1/pool/__init__.py +0 -0
  1335. vllm/v1/pool/metadata.py +77 -0
  1336. vllm/v1/request.py +241 -0
  1337. vllm/v1/sample/__init__.py +0 -0
  1338. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1339. vllm/v1/sample/logits_processor/builtin.py +275 -0
  1340. vllm/v1/sample/logits_processor/interface.py +97 -0
  1341. vllm/v1/sample/logits_processor/state.py +161 -0
  1342. vllm/v1/sample/metadata.py +43 -0
  1343. vllm/v1/sample/ops/__init__.py +0 -0
  1344. vllm/v1/sample/ops/bad_words.py +39 -0
  1345. vllm/v1/sample/ops/logprobs.py +26 -0
  1346. vllm/v1/sample/ops/penalties.py +43 -0
  1347. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1348. vllm/v1/sample/rejection_sampler.py +623 -0
  1349. vllm/v1/sample/sampler.py +285 -0
  1350. vllm/v1/sample/tpu/__init__.py +0 -0
  1351. vllm/v1/sample/tpu/metadata.py +124 -0
  1352. vllm/v1/sample/tpu/sampler.py +213 -0
  1353. vllm/v1/serial_utils.py +423 -0
  1354. vllm/v1/spec_decode/__init__.py +0 -0
  1355. vllm/v1/spec_decode/eagle.py +1011 -0
  1356. vllm/v1/spec_decode/medusa.py +66 -0
  1357. vllm/v1/spec_decode/metadata.py +62 -0
  1358. vllm/v1/spec_decode/metrics.py +211 -0
  1359. vllm/v1/spec_decode/ngram_proposer.py +276 -0
  1360. vllm/v1/spec_decode/utils.py +14 -0
  1361. vllm/v1/structured_output/__init__.py +295 -0
  1362. vllm/v1/structured_output/backend_guidance.py +245 -0
  1363. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1364. vllm/v1/structured_output/backend_outlines.py +320 -0
  1365. vllm/v1/structured_output/backend_types.py +134 -0
  1366. vllm/v1/structured_output/backend_xgrammar.py +327 -0
  1367. vllm/v1/structured_output/request.py +86 -0
  1368. vllm/v1/structured_output/utils.py +454 -0
  1369. vllm/v1/utils.py +396 -0
  1370. vllm/v1/worker/__init__.py +0 -0
  1371. vllm/v1/worker/block_table.py +210 -0
  1372. vllm/v1/worker/cpu_model_runner.py +175 -0
  1373. vllm/v1/worker/cpu_worker.py +156 -0
  1374. vllm/v1/worker/gpu_input_batch.py +863 -0
  1375. vllm/v1/worker/gpu_model_runner.py +4160 -0
  1376. vllm/v1/worker/gpu_ubatch_wrapper.py +399 -0
  1377. vllm/v1/worker/gpu_worker.py +710 -0
  1378. vllm/v1/worker/kv_connector_model_runner_mixin.py +132 -0
  1379. vllm/v1/worker/lora_model_runner_mixin.py +183 -0
  1380. vllm/v1/worker/tpu_input_batch.py +587 -0
  1381. vllm/v1/worker/tpu_model_runner.py +1946 -0
  1382. vllm/v1/worker/tpu_worker.py +346 -0
  1383. vllm/v1/worker/ubatch_splitting.py +192 -0
  1384. vllm/v1/worker/ubatch_utils.py +27 -0
  1385. vllm/v1/worker/ubatching.py +224 -0
  1386. vllm/v1/worker/utils.py +344 -0
  1387. vllm/v1/worker/worker_base.py +65 -0
  1388. vllm/v1/worker/xpu_model_runner.py +57 -0
  1389. vllm/v1/worker/xpu_worker.py +179 -0
  1390. vllm/version.py +41 -0
  1391. vllm/vllm_flash_attn/.gitkeep +0 -0
  1392. vllm/worker/__init__.py +0 -0
  1393. vllm/worker/worker_base.py +279 -0
  1394. vllm_cpu-0.11.0.post2.dist-info/METADATA +348 -0
  1395. vllm_cpu-0.11.0.post2.dist-info/RECORD +1398 -0
  1396. vllm_cpu-0.11.0.post2.dist-info/WHEEL +5 -0
  1397. vllm_cpu-0.11.0.post2.dist-info/entry_points.txt +5 -0
  1398. vllm_cpu-0.11.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1783 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ # MLA Common Components
5
+
6
+ This file implements common components for MLA implementations.
7
+
8
+ First we define:
9
+
10
+ Sq as Q sequence length
11
+ Skv as KV sequence length
12
+
13
+ MLA has two possible ways of computing: a data-movement friendly approach and a
14
+ compute friendly approach. We generally want to use the compute friendly
15
+ approach for "prefill" (i.e. the ratio Sq / Skv is "large", near 1)
16
+ and the data-movement friendly approach for "decode" (i.e. the ratio
17
+ Sq / Skv is "small"; for decode Sq is typically 1 token per request).
18
+
19
+ NOTE: what we deem small and large is currently determined by whether the
20
+ scheduler labels the request as prefill or decode, but this is something we
21
+ should probably tune.
22
+
23
+ Main reference: DeepseekV2 paper, and FlashInfer Implementation
24
+ (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
25
+
26
+ Deepseek's MLA attention works the following way:
27
+ * Use a single latent vector to represent the per-token entry of the KV cache.
28
+ * For decode (i.e. the memory friendly approach) the attention "simulates" a
29
+ multi-head attention, while the compute is similar to multi-query attention.
30
+
31
+ Below is an example of both paths assuming batch size = 1
32
+
33
+ ## More Extent Definitions:
34
+
35
+ C Context length, `Skv - Sq`
36
+ H hidden size
37
+ N number of attention heads
38
+ Lq latent dimension for Q 1536 in DSV3
39
+ Lkv latent dimension for K/V 512 in DSV3
40
+ P nope dimension, no rope. 128 in DSV3
41
+ R rope dimension, goes through rope. 64 in DSV3
42
+ V V head dim. 128 in DSV3
43
+
44
+ ## Vector/Matrix Definitions
45
+
46
+ h_t hidden states (input to attention) shape [Sq, H]
47
+ q_c latent/compressed Q shape [Sq, Lq]
48
+ q_nope uncompressed Q (no-rope) shape [Sq, N, P]
49
+ q_pe uncompressed Q (rope) shape [Sq, N, R]
50
+ kv_c latent/compressed KV shape [Skv, Lkv]
51
+ k_pe decoupled k position embeddings shape [Skv, R]
52
+ new_kv_c new kv_c from current iter shape [Sq, Lkv]
53
+ new_k_pe new k_pe from current iter shape [Sq, R]
54
+ cache_kv_c cached kv_c from previous iters shape [C, Lkv]
55
+ cache_k_pe cached k_pe from previous iters shape [C, R]
56
+ W_DQ project h_t to q_c shape [H, Lq]
57
+ W_UQ project q_c to q_nope shape [Lq, N * P]
58
+ W_QR project q_c to q_pe shape [Lq, N * R]
59
+ W_DKV project h_t to kv_c shape [H, Lkv]
60
+ W_UK project kv_c to k_nope shape [Lkv, N, P]
61
+ W_KR project h_t to k_pe shape [H, R]
62
+ W_UV project kv_c to v shape [Lkv, N, V]
63
+ W_O project v to h_t shape [N * V, H]
64
+
65
+
66
+ ## Compute Friendly Approach (i.e. "_forward_prefill"):
67
+
68
+ q_c = h_t @ W_DQ
69
+ q_nope = (q_c @ W_UQ).view(Sq, N, P)
70
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
71
+ new_kv_c = h_t @ W_DKV
72
+ new_k_pe = RoPE(h_t @ W_KR)
73
+ kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0)
74
+ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0)
75
+ k_nope = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P)
76
+ v = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V)
77
+
78
+ // MHA with QK headdim = P + R
79
+ // V headdim = V
80
+ // spda_o shape [Sq, N, V]
81
+ spda_o = scaled_dot_product_attention(
82
+ torch.cat([q_nope, q_pe], dim=-1),
83
+ torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
84
+ v
85
+ )
86
+ return spda_o @ W_O
87
+
88
+ NOTE: in the actual code,
89
+ `kv_b_proj` is [W_UK; W_UV] concatenated per head
90
+ `q_b_proj` is [W_UQ; W_QR] concatenated per head
91
+ `out_proj` is W_O
92
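As a shape sanity check with the DSV3 sizes above (and assuming N = 128 heads,
as used in the workspace sizing comment further down): `kv_b_proj` has weight
shape [Lkv, N * (P + V)] = [512, 32768] and `q_b_proj` has weight shape
[Lq, N * (P + R)] = [1536, 24576].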
+
93
+
94
+ ## Data-Movement Friendly Approach (i.e. "_forward_decode"):
95
+
96
+ Runtime
97
+ q_c = h_t @ W_DQ
98
+ q_nope = (q_c @ W_UQ).view(-1, N, P)
99
+ ql_nope = einsum("snh,lnh->snl", q_nope, W_UK)
100
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
101
+ new_kv_c = h_t @ W_DKV
102
+ new_k_pe = RoPE(h_t @ W_KR)
103
+ kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0)
104
+ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0)
105
+
106
+ // MQA with QK headdim = Lkv + R
107
+ // V headdim = Lkv
108
+ // spda_o shape [Sq, N, Lkv]
109
+ // NOTE: this is less compute-friendly since Lkv > P
110
+ // but is more data-movement friendly since it's MQA vs MHA
111
+ spda_o = scaled_dot_product_attention(
112
+ torch.cat([ql_nope, q_pe], dim=-1),
113
+ torch.cat([kv_c, k_pe], dim=-1),
114
+ kv_c
115
+ )
116
+
117
+ o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV)
118
+ return o.view(-1, N * V) @ W_O
119
+
120
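The MQA path above produces the same attention scores as the MHA path because
W_UK can be absorbed into the query side (associativity of the two matmuls).
A minimal standalone sketch with toy sizes, illustrative only and not part of
this module:

    import torch
    Sq, Skv, N, P, Lkv = 2, 5, 4, 8, 16
    q_nope = torch.randn(Sq, N, P)
    W_UK = torch.randn(Lkv, N, P)
    kv_c = torch.randn(Skv, Lkv)
    k_nope = torch.einsum("kl,lnp->knp", kv_c, W_UK)      # MHA keys
    ql_nope = torch.einsum("snp,lnp->snl", q_nope, W_UK)  # absorbed MQA queries
    mha_scores = torch.einsum("snp,knp->snk", q_nope, k_nope)
    mqa_scores = torch.einsum("snl,kl->snk", ql_nope, kv_c)
    assert torch.allclose(mha_scores, mqa_scores, atol=1e-4)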
+
121
+ ## Chunked Prefill
122
+
123
+ For chunked prefill we want to use the compute friendly algorithm. We are
124
+ assuming a sufficiently large Sq / Skv ratio; in the future we may want to
125
+ switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is small.
126
+
127
+ However, the compute-friendly approach can potentially run out of memory if Skv
128
+ is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)`
129
+
130
+ To mitigate this, we chunk the computation of attention with respect to the
131
+ current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can use a
132
+ fixed workspace size.
133
+
134
+ The chunked prefill approach is as follows:
135
+
136
+ MCC Max chunk of context to process per iter, computed dynamically,
137
+ used to bound the memory usage
138
+
139
+ q_c = h_t @ W_DQ
140
+ q_nope = (q_c @ W_UQ).view(Sq, N, P)
141
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
142
+ new_kv_c = h_t @ W_DKV
143
+ new_k_pe = RoPE(h_t @ W_KR)
144
+ new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P)
145
+ new_v = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V)
146
+
147
+ // MHA between queries and new KV
148
+ // with QK headdim = P + R
149
+ // V headdim = V
150
+ // curr_o shape [Sq, N, V]
151
+ // curr_lse shape [N, Sq], this is just the order FA returns it in
152
+ curr_o, curr_lse = scaled_dot_product_attention(
153
+ torch.cat([q_nope, q_pe], dim=-1),
154
+ torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
155
+ new_v,
156
+ causal=True,
157
+ return_softmax_lse=True
158
+ )
159
+
160
+ // Compute attention with the already existing context
161
+ for chunk_idx in range(cdiv(C, MCC)):
162
+ chunk_start = chunk_idx * MCC
163
+ chunk_end = min(chunk_start + MCC, C)
164
+ Sc = chunk_end - chunk_start
165
+ cache_kv_c_chunk = cache_kv_c[chunk_start:chunk_end]
166
+ cache_k_pe_chunk = cache_k_pe[chunk_start:chunk_end]
167
+ cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P)
168
+ cache_v_chunk = (cache_kv_c_chunk @ W_UV).view(-1, N, V)
169
+
170
+ chunk_o, chunk_lse = scaled_dot_product_attention(
171
+ torch.cat([q_nope, q_pe], dim=-1),
172
+ torch.cat([cache_k_nope_chunk,
173
+ cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)],
174
+ dim=-1),
175
+ cache_v_chunk,
176
+ causal=False,
177
+ return_softmax_lse=True
178
+ )
179
+
180
+ curr_o, curr_lse = merge_attn_states(
181
+ suffix_output=curr_o,
182
+ suffix_lse=curr_lse,
183
+ prefix_output=chunk_o,
184
+ prefix_lse=chunk_lse,
185
+ )
186
+
187
+ return curr_o @ W_O
188
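As a concrete, purely hypothetical instance of the loop bounds above: with
C = 1000 cached context tokens and MCC = 384, cdiv(1000, 384) = 3 iterations
cover the token ranges [0, 384), [384, 768) and [768, 1000), and each
iteration's (chunk_o, chunk_lse) is folded into (curr_o, curr_lse) via
merge_attn_states.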
+ """
189
+
190
+ import functools
191
+ from abc import abstractmethod
192
+ from dataclasses import dataclass, field
193
+ from typing import Generic, Optional, TypeVar, Union
194
+
195
+ import torch
196
+ from tqdm import tqdm
197
+
198
+ import vllm.envs as envs
199
+ from vllm import _custom_ops as ops
200
+ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
201
+ AttentionMetadata,
202
+ MLAAttentionImpl)
203
+ from vllm.attention.backends.utils import get_mla_dims
204
+ from vllm.attention.ops.common import cp_lse_ag_out_rs
205
+ from vllm.attention.ops.merge_attn_states import merge_attn_states
206
+ from vllm.attention.utils.fa_utils import get_flash_attn_version
207
+ from vllm.config import VllmConfig, get_current_vllm_config
208
+ from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
209
+ from vllm.logger import init_logger
210
+ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
211
+ LinearBase,
212
+ UnquantizedLinearMethod)
213
+ from vllm.platforms import current_platform
214
+ from vllm.utils import cdiv, round_down
215
+ from vllm.utils.flashinfer import has_nvidia_artifactory
216
+ from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
217
+ CommonAttentionMetadata,
218
+ get_per_layer_parameters,
219
+ infer_global_hyperparameters,
220
+ split_decodes_and_prefills)
221
+ from vllm.v1.kv_cache_interface import AttentionSpec
222
+
223
+ try:
224
+ from vllm.vllm_flash_attn import flash_attn_varlen_func
225
+ is_vllm_fa = True
226
+ except ImportError:
227
+ # For rocm use upstream flash attention
228
+ if current_platform.is_rocm():
229
+ from flash_attn import flash_attn_varlen_func
230
+ is_vllm_fa = False
231
+
232
+ try:
233
+ from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
234
+ from flashinfer.prefill import ( # noqa: F401
235
+ cudnn_batch_prefill_with_kv_cache)
236
+ flashinfer_available = True
237
+ except ImportError:
238
+ flashinfer_available = False
239
+
240
+
241
+ def is_rocm_aiter_fp8bmm_enabled() -> bool:
242
+ return current_platform.is_rocm() \
243
+ and envs.VLLM_ROCM_USE_AITER_FP8BMM \
244
+ and envs.VLLM_ROCM_USE_AITER
245
+
246
+
247
+ if is_rocm_aiter_fp8bmm_enabled():
248
+ from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip
249
+ batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant
250
+ as aiter_triton_fp8_bmm)
251
+
252
+ def dynamic_per_batched_tensor_quant(
253
+ x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn):
254
+ DTYPE_MAX = torch.finfo(dtype).max
255
+ min_val, max_val = x.aminmax()
256
+ amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10)
257
+ scale = DTYPE_MAX / amax
258
+ x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX)
259
+ return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
260
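# Hypothetical usage sketch of the helper above: for a bf16 tensor W,
#   Wq, inv_scale = dynamic_per_batched_tensor_quant(W)
# yields an fp8 copy plus the reciprocal scale, so W is approximately
# Wq.to(torch.bfloat16) * inv_scale (a single scale for the whole tensor).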
+
261
+
262
+ logger = init_logger(__name__)
263
+
264
+ CUDNN_WORKSPACE_SIZE = 12800
265
+
266
+
267
+ class MLACommonBackend(AttentionBackend):
268
+
269
+ accept_output_buffer: bool = True
270
+
271
+ @staticmethod
272
+ def get_name() -> str:
273
+ return "TRITON_MLA"
274
+
275
+ @staticmethod
276
+ def get_metadata_cls() -> type["AttentionMetadata"]:
277
+ return MLACommonMetadata
278
+
279
+ @staticmethod
280
+ def get_builder_cls() -> type["MLACommonMetadataBuilder"]:
281
+ return MLACommonMetadataBuilder
282
+
283
+ @staticmethod
284
+ def get_kv_cache_shape(
285
+ num_blocks: int,
286
+ block_size: int,
287
+ num_kv_heads: int, # assumed to be 1 for MLA
288
+ head_size: int,
289
+ cache_dtype_str: str = "auto",
290
+ ) -> tuple[int, ...]:
291
+ return (num_blocks, block_size, head_size)
292
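# For reference: with DeepSeek-style MLA the per-token cache entry is the
# latent kv_c (Lkv = 512) concatenated with k_pe (R = 64), so head_size is
# 576, matching get_supported_head_sizes() below.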
+
293
+ @classmethod
294
+ def get_supported_dtypes(cls) -> list[torch.dtype]:
295
+ return [torch.float16, torch.bfloat16]
296
+
297
+ @classmethod
298
+ def get_supported_head_sizes(cls) -> list[int]:
299
+ return [576]
300
+
301
+ @classmethod
302
+ def validate_head_size(cls, head_size: int) -> None:
303
+ supported_head_sizes = cls.get_supported_head_sizes()
304
+ if head_size not in supported_head_sizes:
305
+ attn_type = cls.__name__.removesuffix("Backend")
306
+ raise ValueError(
307
+ f"Head size {head_size} is not supported by {attn_type}. "
308
+ f"Supported head sizes are: {supported_head_sizes}. "
309
+ "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
310
+ "FlexAttention backend which supports all head sizes.")
311
+
312
+
313
+ @dataclass
314
+ class MLACommonPrefillMetadata:
315
+ """ Prefill Specific Metadata """
316
+
317
+ @dataclass
318
+ class ChunkedContextMetadata:
319
+ # New for MLA (compared to FlashAttention)
320
+ # For handling chunked prefill
321
+ cu_seq_lens: torch.Tensor
322
+ starts: torch.Tensor
323
+ seq_tot: list[int]
324
+ max_seq_lens: list[int]
325
+ seq_lens: torch.Tensor
326
+ workspace: torch.Tensor
327
+
328
+ # for mla DCP
329
+ cp_chunk_seq_lens: Optional[list[list[int]]] = None
330
+ origin_context_lens: Optional[list[int]] = None
331
+ cp_cu_seq_lens: Optional[torch.Tensor] = None
332
+ chunk_size: Optional[int] = None
333
+ cu_seq_lens_lst: Optional[list[list[int]]] = None
334
+
335
+ block_table: torch.Tensor
336
+ query_start_loc: torch.Tensor
337
+ max_query_len: int
338
+ chunked_context: Optional[ChunkedContextMetadata] = None
339
+
340
+
341
+ @dataclass
342
+ class FlashInferPrefillMetadata(MLACommonPrefillMetadata):
343
+ prefill_main: Optional['BatchPrefillWithRaggedKVCacheWrapper'] = None
344
+ prefill_chunks: list['BatchPrefillWithRaggedKVCacheWrapper'] = field(
345
+ default_factory=list)
346
+
347
+
348
+ @dataclass
349
+ class CudnnPrefillMetadata(MLACommonPrefillMetadata):
350
+
351
+ class ChunkedContextMetadata(
352
+ MLACommonPrefillMetadata.ChunkedContextMetadata):
353
+ seq_lens: torch.Tensor
354
+
355
+ query_seq_lens: Optional[torch.Tensor] = None
356
+ cudnn_workspace: Optional[torch.Tensor] = None
357
+
358
+
359
+ @dataclass
360
+ class MLACommonDecodeMetadata:
361
+ block_table: torch.Tensor
362
+ seq_lens: torch.Tensor
363
+
364
+
365
+ D = TypeVar("D", bound=MLACommonDecodeMetadata)
366
+
367
+
368
+ @dataclass
369
+ class MLACommonMetadata(Generic[D]):
370
+ """Metadata for MLACommon.
371
+
372
+ NOTE: Please read the comment at the top of the file before trying to
373
+ understand this class
374
+ """
375
+ # NOTE(sang): Definition of context_len, query_len, and seq_len.
376
+ # |---------- N-1 iteration --------|
377
+ # |---------------- N iteration ---------------------|
378
+ # |- tokenA -|......................|-- newTokens ---|
379
+ # |---------- context_len ----------|
380
+ # |-------------------- seq_len ---------------------|
381
+ # |-- query_len ---|
382
+
383
+ num_reqs: int
384
+ max_query_len: int
385
+ max_seq_len: int
386
+
387
+ num_actual_tokens: int # Number of tokens excluding padding.
388
+ query_start_loc: torch.Tensor
389
+ slot_mapping: torch.Tensor
390
+
391
+ # New for MLA (compared to FlashAttention)
392
+ # For handling prefill decode split
393
+ num_decodes: int
394
+ num_decode_tokens: int
395
+ num_prefills: int
396
+
397
+ # The dimension of the attention heads
398
+ head_dim: Optional[int] = None
399
+
400
+ decode: Optional[D] = None
401
+ prefill: Optional[Union[MLACommonPrefillMetadata,
402
+ FlashInferPrefillMetadata,
403
+ CudnnPrefillMetadata]] = None
404
+
405
+ def __post_init__(self):
406
+ if self.head_dim is not None:
407
+ MLACommonBackend.validate_head_size(self.head_dim)
408
+
409
+
410
+ M = TypeVar("M", bound=MLACommonMetadata)
411
+ A = TypeVar("A")
412
+
413
+
414
+ def use_flashinfer_prefill() -> bool:
415
+ # For Blackwell, default to FlashInfer prefill if it's available since
416
+ # it is faster than FA2.
417
+ return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available
418
+ and not envs.VLLM_USE_CUDNN_PREFILL
419
+ and current_platform.is_device_capability(100))
420
+
421
+
422
+ def use_cudnn_prefill() -> bool:
423
+ return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL
424
+ and current_platform.is_device_capability(100)
425
+ and has_nvidia_artifactory())
426
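# For reference, the gating above is purely environment/platform driven: e.g.
# VLLM_USE_CUDNN_PREFILL=1 selects the cuDNN path on device capability 100 when
# flashinfer is importable and has_nvidia_artifactory() is true; otherwise
# FlashInfer prefill is preferred there unless VLLM_DISABLE_FLASHINFER_PREFILL
# is set.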
+
427
+
428
+ # Currently 394MB, this can be tuned based on GEMM sizes used.
429
+ # Chosen to be the same as sglang:
430
+ # https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37
431
+ FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024
432
+
433
+
434
+ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
435
+ """
436
+ NOTE: Please read the comment at the top of the file before trying to
437
+ understand this class
438
+ """
439
+ reorder_batch_threshold: int = 1
440
+
441
+ @staticmethod
442
+ def determine_chunked_prefill_workspace_size(
443
+ vllm_config: VllmConfig) -> int:
444
+ scheduler_config = vllm_config.scheduler_config
445
+ cache_config = vllm_config.cache_config
446
+ model_config = vllm_config.model_config
447
+
448
+ chunked_prefill_workspace_size = min(
449
+ # Try for 8 full-length requests or at least 4 pages per request
450
+ max(8 * model_config.max_model_len,
451
+ 4 * scheduler_config.max_num_seqs * cache_config.block_size),
452
+ # For long-context models try not to over-allocate (which would limit
453
+ # kv-cache space) by capping the workspace at 64k tokens,
454
+ # which would result in the workspace being:
455
+ # 2*(576)*(64*1024) = 144mb
456
+ # (assuming 576 MLA head dim, and fp16)
457
+ # which would result in up-projected context being
458
+ # 2*(192*128)*(64*1024) = 3gb
459
+ # (assuming 192 QK head dim, 128 heads, and fp16)
460
+ 64 * 1024)
461
+
462
+ # Enforce that we have enough for at least 1 page per request
463
+ chunked_prefill_workspace_size = max(
464
+ chunked_prefill_workspace_size,
465
+ scheduler_config.max_num_seqs * cache_config.block_size)
466
+
467
+ return chunked_prefill_workspace_size
468
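# Illustrative sizing only (hypothetical values, not defaults): with
# max_model_len = 4096, max_num_seqs = 256 and block_size = 16 the helper gives
# min(max(8 * 4096, 4 * 256 * 16), 64 * 1024) = min(32768, 65536) = 32768,
# and the final clamp max(32768, 256 * 16) leaves 32768 workspace tokens.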
+
469
+ def __init__(self,
470
+ kv_cache_spec: AttentionSpec,
471
+ layer_names: list[str],
472
+ vllm_config: VllmConfig,
473
+ device: torch.device,
474
+ metadata_cls: Optional[type[M]] = None):
475
+ self.metadata_cls = metadata_cls \
476
+ if metadata_cls is not None else MLACommonMetadata
477
+ self.kv_cache_spec = kv_cache_spec
478
+ scheduler_config = vllm_config.scheduler_config
479
+ self.model_config = vllm_config.model_config
480
+ parallel_config = vllm_config.parallel_config
481
+ self.compilation_config = vllm_config.compilation_config
482
+ self.device = device
483
+
484
+ self.num_heads = self.model_config.get_num_attention_heads(
485
+ parallel_config)
486
+ self.mla_dims = get_mla_dims(self.model_config)
487
+ self.aot_schedule = current_platform.is_cuda()
488
+ try:
489
+ self.dcp_world_size = get_dcp_group().world_size
490
+ self.dcp_rank = get_dcp_group().rank_in_group
491
+ except AssertionError:
492
+ # DCP might not be initialized in testing
493
+ self.dcp_world_size = 1
494
+ self.dcp_rank = 0
495
+
496
+ # Don't try to access the runner on AMD
497
+ if self.aot_schedule:
498
+ self.page_size = self.kv_cache_spec.block_size
499
+
500
+ self.chunked_prefill_workspace_size = \
501
+ self.determine_chunked_prefill_workspace_size(vllm_config)
502
+
503
+ if self.dcp_world_size > 1:
504
+ # Note(hc): The local kvcache is incomplete when DCP is triggered,
505
+ # an additional kvcache allgather across the DCP group is therefore
506
+ # required, so the workspace has to be enlarged by 1/DCP relative
507
+ # to the original TP allocation.
508
+ assert self.chunked_prefill_workspace_size % \
509
+ self.dcp_world_size == 0
510
+ self.chunked_prefill_workspace = torch.empty(
511
+ (self.chunked_prefill_workspace_size +
512
+ self.chunked_prefill_workspace_size // self.dcp_world_size,
513
+ self.model_config.get_head_size()),
514
+ dtype=self.model_config.dtype,
515
+ device=device,
516
+ )
517
+ else:
518
+ self.chunked_prefill_workspace = torch.empty(
519
+ (self.chunked_prefill_workspace_size,
520
+ self.model_config.get_head_size()),
521
+ dtype=self.model_config.dtype,
522
+ device=device,
523
+ )
524
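# For reference: with a 65536-token workspace and dcp_world_size = 4 the DCP
# branch above allocates 65536 + 65536 // 4 = 81920 rows;
# _context_parallel_compute_prefill_context later splits them as
# 81920 // (4 + 1) = 16384 rows for the local gather and the remaining
# 65536 rows for the all-gather result.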
+
525
+ self._use_cudnn_prefill = use_cudnn_prefill()
526
+ self._use_fi_prefill = use_flashinfer_prefill()
527
+ self.prefill_metadata_cls = (
528
+ FlashInferPrefillMetadata
529
+ if self._use_fi_prefill else CudnnPrefillMetadata
530
+ if self._use_cudnn_prefill else MLACommonPrefillMetadata)
531
+
532
+ if self._use_fi_prefill:
533
+ self._workspace_buffer = torch.empty(
534
+ FLASHINFER_WORKSPACE_BUFFER_SIZE,
535
+ dtype=torch.uint8,
536
+ device=device)
537
+
538
+ self._fi_prefill_main: Optional[
539
+ BatchPrefillWithRaggedKVCacheWrapper] = None
540
+ self._fi_prefill_chunks: list[
541
+ BatchPrefillWithRaggedKVCacheWrapper] = []
542
+
543
+ self._global_hyperparameters = infer_global_hyperparameters(
544
+ get_per_layer_parameters(vllm_config, layer_names,
545
+ MLACommonImpl))
546
+
547
+ if self._use_cudnn_prefill:
548
+ self.cudnn_workspace = torch.empty(
549
+ CUDNN_WORKSPACE_SIZE * scheduler_config.max_num_seqs,
550
+ dtype=torch.int8,
551
+ device=device,
552
+ )
553
+
554
+ def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata):
555
+ qo_indptr = prefill.query_start_loc
556
+
557
+ has_context = False
558
+ if prefill.chunked_context is not None:
559
+ chunked_context = prefill.chunked_context
560
+ has_context = True
561
+
562
+ if self._fi_prefill_main is None:
563
+ self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper(
564
+ self._workspace_buffer, "NHD", backend="cutlass")
565
+
566
+ if has_context:
567
+ num_chunks = chunked_context.cu_seq_lens.shape[0]
568
+ # Allocate more prefill chunk wrappers if needed
569
+ if len(self._fi_prefill_chunks) < num_chunks:
570
+ for _ in range(len(self._fi_prefill_chunks), num_chunks):
571
+ self._fi_prefill_chunks.append(
572
+ BatchPrefillWithRaggedKVCacheWrapper(
573
+ self._workspace_buffer, "NHD", backend="cutlass"))
574
+ assert num_chunks <= len(self._fi_prefill_chunks)
575
+
576
+ # In MLA, the non-latent num_qo_heads == num_kv_heads
577
+ num_qo_heads = self.num_heads
578
+ num_kv_heads = num_qo_heads
579
+
580
+ # Sanity: Verify that num_kv_heads == 1 since it is latent space
581
+ assert self.kv_cache_spec.num_kv_heads == 1
582
+
583
+ # Get non-latent head_dim_qk and head_dim_vo
584
+ head_dim_qk = (self.mla_dims.qk_nope_head_dim +
585
+ self.mla_dims.qk_rope_head_dim)
586
+ head_dim_vo = self.mla_dims.v_head_dim
587
+
588
+ # For main run, qo_indptr == kv_indptr
589
+ kv_indptr = qo_indptr.clone()
590
+
591
+ # Prepare main prefill
592
+ self._fi_prefill_main.plan(
593
+ qo_indptr=qo_indptr,
594
+ kv_indptr=kv_indptr,
595
+ num_qo_heads=num_qo_heads,
596
+ num_kv_heads=num_kv_heads,
597
+ head_dim_qk=head_dim_qk,
598
+ head_dim_vo=head_dim_vo,
599
+ causal=True, # This is main run
600
+ sm_scale=self._global_hyperparameters.sm_scale,
601
+ window_left=self._global_hyperparameters.window_left,
602
+ logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
603
+ q_data_type=self.model_config.dtype,
604
+ )
605
+
606
+ # Prepare context prefills
607
+ if has_context:
608
+ for i in range(num_chunks):
609
+ kv_indptr_chunk = chunked_context.cu_seq_lens[i]
610
+
611
+ self._fi_prefill_chunks[i].plan(
612
+ qo_indptr=qo_indptr,
613
+ kv_indptr=kv_indptr_chunk,
614
+ num_qo_heads=num_qo_heads,
615
+ num_kv_heads=num_kv_heads,
616
+ head_dim_qk=head_dim_qk,
617
+ head_dim_vo=head_dim_vo,
618
+ causal=False, # This is context run
619
+ sm_scale=self._global_hyperparameters.sm_scale,
620
+ window_left=self._global_hyperparameters.window_left,
621
+ logits_soft_cap=self._global_hyperparameters.
622
+ logits_soft_cap,
623
+ q_data_type=self.model_config.dtype,
624
+ )
625
+
626
+ prefill.prefill_main = self._fi_prefill_main
627
+ prefill.prefill_chunks = self._fi_prefill_chunks
628
+
629
+ def _build_decode(self, block_table_tensor: torch.Tensor,
630
+ seq_lens_cpu: torch.Tensor,
631
+ seq_lens_device: torch.Tensor,
632
+ query_start_loc_cpu: torch.Tensor,
633
+ query_start_loc_device: torch.Tensor,
634
+ num_decode_tokens: int) -> MLACommonDecodeMetadata:
635
+ return MLACommonDecodeMetadata(
636
+ block_table=block_table_tensor,
637
+ seq_lens=seq_lens_device,
638
+ )
639
+
640
+ def build_for_cudagraph_capture(
641
+ self, common_attn_metadata: CommonAttentionMetadata) -> M:
642
+ """
643
+ This method builds the metadata for full cudagraph capture.
644
+ Currently, only decode is supported for full cudagraphs with MLA.
645
+ """
646
+ m = common_attn_metadata
647
+ assert m.num_reqs <= (m.num_actual_tokens *
648
+ self.reorder_batch_threshold), \
649
+ "MLA only supports decode-only full CUDAGraph capture. " \
650
+ "Make sure all cudagraph capture sizes <= max_num_seq."
651
+
652
+ assert m.max_query_len <= self.reorder_batch_threshold # decode only
653
+
654
+ return self.build(0, m)
655
+
656
+ def build(self,
657
+ common_prefix_len: int,
658
+ common_attn_metadata: CommonAttentionMetadata,
659
+ fast_build: bool = False) -> M:
660
+ num_reqs = common_attn_metadata.num_reqs
661
+ num_tokens = common_attn_metadata.num_actual_tokens
662
+ max_query_len = common_attn_metadata.max_query_len
663
+ max_seq_len = common_attn_metadata.max_seq_len
664
+
665
+ # Note(simon): be careful about the CPU <> GPU memory movement in this
666
+ # function. We should avoid GPU -> CPU sync as much as possible because
667
+ # it blocks on all previous kernels.
668
+ device = self.device
669
+ block_table_tensor = common_attn_metadata.block_table_tensor
670
+ slot_mapping = common_attn_metadata.slot_mapping
671
+
672
+ query_start_loc = common_attn_metadata.query_start_loc
673
+ query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
674
+ seq_lens = common_attn_metadata.seq_lens
675
+ seq_lens_cpu = common_attn_metadata.seq_lens_cpu
676
+
677
+ query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
678
+
679
+ num_computed_tokens_cpu = (common_attn_metadata.seq_lens_cpu -
680
+ query_seq_lens_cpu)
681
+
682
+ num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
683
+ split_decodes_and_prefills(common_attn_metadata,
684
+ decode_threshold=self.reorder_batch_threshold)
685
+
686
+ # Note(hc): update seq_lens of decode reqs under DCP.
687
+ if self.dcp_world_size > 1:
688
+ seq_lens[:num_decodes] = seq_lens[:num_decodes] \
689
+ // self.dcp_world_size + (self.dcp_rank <= \
690
+ (seq_lens[:num_decodes] - 1) % self.dcp_world_size)
691
+
692
+ assert num_decodes + num_prefills == num_reqs
693
+ assert num_decode_tokens + num_prefill_tokens == num_tokens
694
+
695
+ prefill_metadata = None
696
+ if num_prefills > 0:
697
+ reqs_start = num_decodes # prefill_start
698
+
699
+ context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
700
+ # Note(hc): The context lengths in the perspective of dcp rank0.
701
+ cp_context_lens_cpu = torch.ceil(context_lens_cpu.float() /
702
+ self.dcp_world_size).int()
703
+ origin_context_lens = context_lens_cpu.tolist()
704
+ max_context_len_cpu = context_lens_cpu.max().item()
705
+ num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
706
+ prefill_query_start_loc = query_start_loc[
707
+ reqs_start:] - query_start_loc[reqs_start]
708
+
709
+ chunked_context_metadata = None
710
+ if max_context_len_cpu > 0:
711
+ # NOTE: it is recommended you read the `Chunked Prefill` section
712
+ # in the comment at the top of the file before trying to
713
+ # understand the following code
714
+
715
+ # currently we allocate an equal amount of workspace for each
716
+ # prefill in the batch, we could probably use a more advanced
717
+ # algorithm here and allocate more workspace to prefills with
718
+ # longer context lengths
719
+ max_context_chunk = (self.chunked_prefill_workspace_size //
720
+ num_prefills_with_context_cpu)
721
+
722
+ if self.aot_schedule:
723
+ # align max_context_chunk to page_size by rounding down,
724
+ # currently the `gather_and_maybe_dequant_cache` kernel
725
+ # cannot handle `context_chunk_starts` that are not aligned
726
+ # to page_size
727
+ max_context_chunk = round_down(max_context_chunk,
728
+ self.page_size)
729
+
730
+ assert max_context_chunk > 0
731
+ num_chunks = cdiv(max_context_len_cpu, max_context_chunk)
732
+
733
+ # if `max_context_chunk = 256`, `num_chunks = 3`, and
734
+ # `num_prefills_with_context = 4`, create a tensor that looks
735
+ # like
736
+ # [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]
737
+ # Note(simon): this is done on the CPU because downstream code
738
+ # makes use of `to_list`.
739
+ chunk_starts = \
740
+ torch.arange(num_chunks, dtype=torch.int32) \
741
+ .unsqueeze(1).expand(-1, num_prefills) \
742
+ * max_context_chunk
743
+ chunk_ends = torch.min(context_lens_cpu.unsqueeze(0),
744
+ chunk_starts + max_context_chunk)
745
+ chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0)
746
+
747
+ cu_seq_lens_cpu = torch.zeros(num_chunks,
748
+ num_prefills + 1,
749
+ dtype=torch.int32,
750
+ pin_memory=True)
751
+ torch.cumsum(chunk_seq_lens,
752
+ dim=1,
753
+ out=cu_seq_lens_cpu[:, 1:],
754
+ dtype=torch.int32)
755
+
756
+ if self.dcp_world_size > 1:
757
+ # Note(hc): The above max_context_chunk already enforces
758
+ # block_size alignment; DCP just needs that the block_size can
759
+ # be divisible by dcp_world_size, because DCP uses
760
+ # cp_gather_cache, which does not require `cp_chunk_starts`
761
+ # to be aligned to page_size.
762
+ assert max_context_chunk % self.dcp_world_size == 0
763
+ cp_max_context_chunk = max_context_chunk // \
764
+ self.dcp_world_size
765
+ cp_chunk_starts = \
766
+ torch.arange(num_chunks, dtype=torch.int32) \
767
+ .unsqueeze(1).expand(-1, num_prefills) \
768
+ * cp_max_context_chunk
769
+ cp_chunk_ends = torch.min(
770
+ cp_context_lens_cpu.unsqueeze(0),
771
+ cp_chunk_starts + cp_max_context_chunk)
772
+ cp_chunk_seq_lens = (cp_chunk_ends -
773
+ cp_chunk_starts).clamp(min=0)
774
+
775
+ cp_cu_seq_lens_cpu = torch.zeros(num_chunks,
776
+ num_prefills + 1,
777
+ dtype=torch.int32,
778
+ pin_memory=True)
779
+ torch.cumsum(cp_chunk_seq_lens,
780
+ dim=1,
781
+ out=cp_cu_seq_lens_cpu[:, 1:],
782
+ dtype=torch.int32)
783
+
784
+ chunked_context_metadata_cls = \
785
+ CudnnPrefillMetadata.ChunkedContextMetadata \
786
+ if self._use_cudnn_prefill else \
787
+ MLACommonPrefillMetadata.ChunkedContextMetadata
788
+ if self.dcp_world_size > 1:
789
+ chunked_context_metadata = \
790
+ chunked_context_metadata_cls(
791
+ cu_seq_lens=cu_seq_lens_cpu \
792
+ .to(device, non_blocking=True),
793
+ starts=cp_chunk_starts.to(device, non_blocking=True),
794
+ seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(),
795
+ max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
796
+ seq_lens=chunk_seq_lens,
797
+ workspace=self.chunked_prefill_workspace,
798
+ cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(),
799
+ origin_context_lens=origin_context_lens,
800
+ cp_cu_seq_lens=cp_cu_seq_lens_cpu \
801
+ .to(device, non_blocking=True),
802
+ chunk_size=max_context_chunk,
803
+ cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
804
+ )
805
+ else:
806
+ chunked_context_metadata = \
807
+ chunked_context_metadata_cls(
808
+ cu_seq_lens=cu_seq_lens_cpu \
809
+ .to(device, non_blocking=True),
810
+ starts=chunk_starts.to(device, non_blocking=True),
811
+ seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
812
+ max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
813
+ seq_lens=chunk_seq_lens,
814
+ workspace=self.chunked_prefill_workspace,
815
+ )
816
+
817
+ if self._use_cudnn_prefill:
818
+ chunked_context_metadata.seq_lens = chunk_seq_lens
819
+
820
+ assert max(chunked_context_metadata.max_seq_lens) <= \
821
+ self.chunked_prefill_workspace_size
822
+
823
+ prefill_metadata = self.prefill_metadata_cls(
824
+ block_table=block_table_tensor[reqs_start:, ...],
825
+ query_start_loc=prefill_query_start_loc,
826
+ max_query_len=max_query_len,
827
+ chunked_context=chunked_context_metadata,
828
+ )
829
+
830
+ if self._use_cudnn_prefill:
831
+ assert isinstance(prefill_metadata, CudnnPrefillMetadata)
832
+ prefill_metadata.query_seq_lens = prefill_query_start_loc[1:] \
833
+ - prefill_query_start_loc[:-1]
834
+ prefill_metadata.cudnn_workspace = self.cudnn_workspace
835
+
836
+ decode_metadata = None
837
+ if num_decodes > 0:
838
+ decode_metadata = self._build_decode(
839
+ block_table_tensor=block_table_tensor[:num_decodes, ...],
840
+ seq_lens_cpu=seq_lens_cpu[:num_decodes],
841
+ seq_lens_device=seq_lens[:num_decodes],
842
+ query_start_loc_cpu=query_start_loc_cpu[:num_decodes + 1],
843
+ query_start_loc_device=query_start_loc[:num_decodes + 1],
844
+ num_decode_tokens=num_decode_tokens,
845
+ )
846
+
847
+ attn_metadata = self.metadata_cls(
848
+ num_reqs=common_attn_metadata.num_reqs,
849
+ max_query_len=common_attn_metadata.max_query_len,
850
+ max_seq_len=max_seq_len,
851
+ num_actual_tokens=num_tokens,
852
+ query_start_loc=query_start_loc,
853
+ slot_mapping=slot_mapping,
854
+ head_dim=self.model_config.get_head_size(),
855
+ # MLACommonMetadata Chunk prefill specific
856
+ num_decodes=num_decodes,
857
+ num_decode_tokens=num_decode_tokens,
858
+ num_prefills=num_prefills,
859
+ prefill=prefill_metadata,
860
+ decode=decode_metadata,
861
+ )
862
+
863
+ if self._use_fi_prefill and num_prefills > 0:
864
+ assert isinstance(attn_metadata.prefill, FlashInferPrefillMetadata)
865
+ self._build_fi_prefill_wrappers(attn_metadata.prefill)
866
+
867
+ return attn_metadata
868
+
869
+
870
+ def reorg_kvcache(
871
+ allgatered_kv_c_normed: torch.Tensor,
872
+ allgatered_k_pe: torch.Tensor,
873
+ cp_chunk_seq_lens_lst: list[int],
874
+ origin_context_lens: list[int],
875
+ cp_world_size: int,
876
+ sum_seq_len: int,
877
+ max_seq_len: int,
878
+ chunk_size: int,
879
+ chunk_idx: int,
880
+ toks: int,
881
+ ) -> tuple[torch.Tensor, torch.Tensor]:
882
+ """
883
+ Reorganize the kvcache after the CP local gather into the TP layout expected by the attn kernel.
884
+
885
+ Args:
886
+ cp_chunk_seq_lens_lst: chunk context lengths under CP.
887
+ origin_context_lens: origin full context lengths under CP.
888
+ cp_world_size: CP size.
889
+ sum_seq_len: the sum of cp_chunk_seq_lens_lst.
890
+ max_seq_len: the max value of cp_chunk_seq_lens_lst.
891
+ chunk_size: equal to max_context_chunk from
892
+ chunked_context_metadata building.
893
+ chunk_idx: chunk idx of chunked_prefill.
894
+ toks: the number of tokens for local gather cache.
895
+ """
896
+ kv_c_segments = []
897
+ k_pe_segments = []
898
+ src_token_idx = 0
899
+ max_seq_len_check = 0
900
+ for cp_chunk_seq_len, origin_context_len in zip(cp_chunk_seq_lens_lst,
901
+ origin_context_lens):
902
+ chunk_context_len = chunk_size
903
+ if cp_chunk_seq_len != 0:
904
+ chunk_context_len = min(
905
+ chunk_context_len, origin_context_len - chunk_size * chunk_idx)
906
+ cp_target_rank = (chunk_context_len - 1) % cp_world_size
907
+ cur_seq_len = 0
908
+ for rank in range(cp_world_size):
909
+ if rank > cp_target_rank and cp_chunk_seq_len:
910
+ real_cp_chunk_seq_len = cp_chunk_seq_len - 1
911
+ else:
912
+ real_cp_chunk_seq_len = cp_chunk_seq_len
913
+ if real_cp_chunk_seq_len:
914
+ kv_c_segment = allgatered_kv_c_normed[rank * toks +
915
+ src_token_idx:rank *
916
+ toks + src_token_idx +
917
+ real_cp_chunk_seq_len]
918
+ k_pe_segment = allgatered_k_pe[rank * toks +
919
+ src_token_idx:rank * toks +
920
+ src_token_idx +
921
+ real_cp_chunk_seq_len]
922
+ kv_c_segments.append(kv_c_segment)
923
+ k_pe_segments.append(k_pe_segment)
924
+ cur_seq_len += real_cp_chunk_seq_len
925
+ max_seq_len_check = max(max_seq_len_check, cur_seq_len)
926
+ src_token_idx += cp_chunk_seq_len
927
+ reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
928
+ reorganized_k_pe = torch.cat(k_pe_segments, dim=0)
929
+ assert reorganized_kv_c_normed.shape[0] == sum_seq_len
930
+ assert reorganized_k_pe.shape[0] == sum_seq_len
931
+ assert max_seq_len_check == max_seq_len
932
+ return reorganized_kv_c_normed, reorganized_k_pe
933
+
934
+
935
+ # TODO(Lucas): rename MLACommonBaseImpl -> MLACommonImpl,
936
+ # and MLACommonImpl -> MLACommonDenseImpl or something like that
937
+ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
938
+ """
939
+ NOTE: Please read the comment at the top of the file before trying to
940
+ understand this class
941
+ """
942
+
943
+ def __init__(
944
+ self,
945
+ num_heads: int,
946
+ head_size: int,
947
+ scale: float,
948
+ num_kv_heads: int,
949
+ alibi_slopes: Optional[list[float]],
950
+ sliding_window: Optional[int],
951
+ kv_cache_dtype: str,
952
+ logits_soft_cap: Optional[float],
953
+ attn_type: str,
954
+ kv_sharing_target_layer_name: Optional[str],
955
+ # MLA Specific Arguments
956
+ q_lora_rank: Optional[int],
957
+ kv_lora_rank: int,
958
+ qk_nope_head_dim: int,
959
+ qk_rope_head_dim: int,
960
+ qk_head_dim: int,
961
+ v_head_dim: int,
962
+ kv_b_proj: ColumnParallelLinear,
963
+ indexer=None,
964
+ q_pad_num_heads: Optional[int] = None,
965
+ ) -> None:
966
+ if kv_sharing_target_layer_name is not None:
967
+ raise NotImplementedError("KV sharing is not supported for MLA")
968
+
969
+ self.num_heads = num_heads
970
+ self.head_size = head_size
971
+ self.scale = float(scale)
972
+ self.num_kv_heads = num_kv_heads
973
+ self.kv_cache_dtype = kv_cache_dtype
974
+
975
+ self.q_lora_rank = q_lora_rank
976
+ self.kv_lora_rank = kv_lora_rank
977
+ self.qk_nope_head_dim = qk_nope_head_dim
978
+ self.qk_rope_head_dim = qk_rope_head_dim
979
+ self.qk_head_dim = qk_head_dim
980
+ self.v_head_dim = v_head_dim
981
+ self.kv_b_proj = kv_b_proj
982
+ self.indexer = indexer
983
+ self.q_pad_num_heads = q_pad_num_heads
984
+
985
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
986
+
987
+ def get_layer_weight(layer):
988
+ WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
989
+ for attr in WEIGHT_NAMES:
990
+ if hasattr(layer, attr):
991
+ return getattr(layer, attr)
992
+ raise AttributeError(
993
+ f"Layer '{layer}' has no recognized weight attribute:"
994
+ f" {WEIGHT_NAMES}.")
995
+
996
+ def get_and_maybe_dequant_weights(layer: LinearBase):
997
+ if not isinstance(layer.quant_method, UnquantizedLinearMethod):
998
+ # NOTE: This should only be used offline, since it's O(N^3)
999
+ eye = torch.eye(layer.input_size_per_partition,
1000
+ dtype=act_dtype,
1001
+ device=get_layer_weight(layer).device)
1002
+ dequant_weights = layer.quant_method.apply(layer,
1003
+ eye,
1004
+ bias=None)
1005
+ del eye
1006
+ # standardize to (output, input)
1007
+ return dequant_weights.T
1008
+ return layer.weight
1009
+
1010
+ # we currently do not have quantized bmm's which are needed for
1011
+ # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
1012
+ # the bmm's in 16-bit, the extra memory overhead of this is fairly low
1013
+ kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
1014
+ assert kv_b_proj_weight.shape == (
1015
+ self.kv_lora_rank,
1016
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
1017
+ f"{kv_b_proj_weight.shape=}, "
1018
+ f"{self.kv_lora_rank=}, "
1019
+ f"{self.num_heads=}, "
1020
+ f"{self.qk_nope_head_dim=}, "
1021
+ f"{self.v_head_dim=}")
1022
+ kv_b_proj_weight = kv_b_proj_weight.view(
1023
+ self.kv_lora_rank,
1024
+ self.num_heads,
1025
+ self.qk_nope_head_dim + self.v_head_dim,
1026
+ )
1027
+
1028
+ W_UK, W_UV = kv_b_proj_weight.split(
1029
+ [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
1030
+
1031
+ if is_rocm_aiter_fp8bmm_enabled():
1032
+ W_K = W_UK.transpose(0, 1) # 16 512 128
1033
+ W_V = W_UV.permute(1, 2, 0) # 16 128 512
1034
+ self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
1035
+ W_K, dtype=current_platform.fp8_dtype())
1036
+ self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(
1037
+ W_V, dtype=current_platform.fp8_dtype())
1038
+
1039
+ # The kernel operates on non-padded inputs. Hence, pre-compiling
1040
+ # triton kernel to avoid runtime compilation for unseen batch sizes
1041
+ # Pre-compile for batch sizes 1 to 1024 to cover most use-cases.
1042
+ # On DS-R1, this step adds roughly 50s to the model loading time.
1043
+ max_batch_size = 1024 # [ToDo] Find the optimal upper limit
1044
+ pre_compilation_list = list(range(1, max_batch_size + 1))
1045
+ if is_global_first_rank():
1046
+ pre_compilation_list = tqdm(
1047
+ pre_compilation_list,
1048
+ desc="[Aiter Triton] Pre-compiling fp8 BMM kernel",
1049
+ total=max_batch_size,
1050
+ )
1051
+
1052
+ for m in pre_compilation_list:
1053
+ x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]),
1054
+ dtype=torch.bfloat16,
1055
+ device=self.W_K.device)
1056
+ aiter_triton_fp8_bmm(x,
1057
+ self.W_K,
1058
+ self.W_K_scale,
1059
+ group_size=128,
1060
+ transpose_bm=True)
1061
+
1062
+ x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]),
1063
+ dtype=torch.bfloat16,
1064
+ device=self.W_V.device)
1065
+ aiter_triton_fp8_bmm(x,
1066
+ self.W_V,
1067
+ self.W_V_scale,
1068
+ group_size=128,
1069
+ transpose_bm=True)
1070
+ else:
1071
+ # Convert from (L, N, V) to (N, L, V)
1072
+ self.W_UV = W_UV.transpose(0, 1)
1073
+ # Convert from (L, N, P) to (N, P, L)
1074
+ self.W_UK_T = W_UK.permute(1, 2, 0)
1075
+
1076
+ def _v_up_proj(self, x: torch.Tensor, out: torch.Tensor):
1077
+ # Convert from (B, N, L) to (N, B, L)
1078
+ x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
1079
+ if is_rocm_aiter_fp8bmm_enabled():
1080
+ # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V)
1081
+ x = aiter_triton_fp8_bmm(x,
1082
+ self.W_V,
1083
+ self.W_V_scale,
1084
+ group_size=128,
1085
+ transpose_bm=True)
1086
+ # Convert from (B, N, V) to (B, N * V)
1087
+ x = x.reshape(-1, self.num_heads * self.v_head_dim)
1088
+ # Copy result
1089
+ out.copy_(x)
1090
+ else:
1091
+ # Convert from (B, N * V) to (N, B, V)
1092
+ out = out.view(-1, self.num_heads, self.v_head_dim).transpose(0, 1)
1093
+
1094
+ # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
1095
+ torch.bmm(x, self.W_UV, out=out) # Reuse "out" to make it "hot"
1096
+
1097
+ # Convert from (N, B, V) to (B, N * V)
1098
+ out_new = out.transpose(0, 1).reshape(
1099
+ -1, self.num_heads * self.v_head_dim)
1100
+
1101
+ # Adjust output buffer shape back to the original (B, N * V)
1102
+ N, B, V = out.shape
1103
+ out.resize_((B, N * V))
1104
+ out.copy_(out_new) # Copy result
1105
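# For reference, the shape flow through _v_up_proj above (B = number of
# tokens): x (B, N, L) -> (N, B, L); bmm with W_UV (N, L, V) -> (N, B, V);
# the result is then written back into `out` as (B, N * V).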
+
1106
+
1107
+ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
1108
+ """
1109
+ NOTE: Please read the comment at the top of the file before trying to
1110
+ understand this class
1111
+ """
1112
+
1113
+ def __init__(self, *args, **kwargs) -> None:
1114
+ super().__init__(*args, **kwargs)
1115
+
1116
+ if use_flashinfer_prefill():
1117
+ logger.debug_once("Using FlashInfer prefill for MLA")
1118
+ self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
1119
+ self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
1120
+ self._pad_v = False
1121
+ elif use_cudnn_prefill():
1122
+ logger.debug_once("Using CUDNN prefill for MLA")
1123
+ self._run_prefill_context_chunk = \
1124
+ self._run_prefill_context_chunk_cudnn
1125
+ self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
1126
+ self._pad_v = False
1127
+ else: # Use FlashAttention
1128
+ logger.debug_once("Using FlashAttention prefill for MLA")
1129
+ self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
1130
+ self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa
1131
+
1132
+ # Handle the differences between the flash_attn_varlen from
1133
+ # flash_attn and the one from vllm_flash_attn. The former is used on
1134
+ # ROCm and the latter has an additional parameter to control
1135
+ # FA2 vs FA3
1136
+ self.flash_attn_varlen_func = flash_attn_varlen_func
1137
+ self.vllm_flash_attn_version = get_flash_attn_version()
1138
+ if self.vllm_flash_attn_version is not None:
1139
+ self.flash_attn_varlen_func = \
1140
+ functools.partial(flash_attn_varlen_func,
1141
+ fa_version=self.vllm_flash_attn_version)
1142
+
1143
+ # For MLA the v head dim is smaller than qk head dim so we pad out
1144
+ # v with 0s to match the qk head dim for attention backends that do
1145
+ # not support different headdims
1146
+ # We don't need to pad V if we are on a hopper system with FA3
1147
+ self._pad_v = self.vllm_flash_attn_version is None or not (
1148
+ self.vllm_flash_attn_version == 3
1149
+ and current_platform.get_device_capability()[0] == 9)
1150
+
1151
+ self.dcp_world_size: Optional[int] = None
1152
+
1153
+ self.chunked_prefill_workspace_size = \
1154
+ MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
1155
+ get_current_vllm_config())
1156
+
1157
+ def _flash_attn_varlen_diff_headdims(self,
1158
+ q,
1159
+ k,
1160
+ v,
1161
+ return_softmax_lse=False,
1162
+ softmax_scale=None,
1163
+ **kwargs):
1164
+ maybe_padded_v = v
1165
+ if self._pad_v:
1166
+ maybe_padded_v = torch.nn.functional.pad(
1167
+ v, [0, q.shape[-1] - v.shape[-1]], value=0)
1168
+
1169
+ if is_vllm_fa:
1170
+ kwargs["return_softmax_lse"] = return_softmax_lse
1171
+ else:
1172
+ # ROCm leverages the upstream flash_attn, which takes a parameter
1173
+ # called "return_attn_probs" instead of return_softmax_lse
1174
+ kwargs["return_attn_probs"] = return_softmax_lse
1175
+
1176
+ attn_out = self.flash_attn_varlen_func(
1177
+ q=q,
1178
+ k=k,
1179
+ v=maybe_padded_v,
1180
+ softmax_scale=softmax_scale,
1181
+ **kwargs,
1182
+ )
1183
+
1184
+ # Unpack the output if there are multiple results
1185
+ lse = None
1186
+ if isinstance(attn_out, tuple):
1187
+ attn_out, lse = attn_out[0], attn_out[1]
1188
+
1189
+ # Remain consistent with old `flash_attn_varlen_func` where there
1190
+ # is only one output tensor if `return_softmax_lse` is False.
1191
+ if return_softmax_lse:
1192
+ return attn_out, lse
1193
+ return attn_out
1194
+
1195
+ def _run_prefill_new_tokens_fa(self, prefill: MLACommonPrefillMetadata, q,
1196
+ k, v, return_softmax_lse):
1197
+ return self._flash_attn_varlen_diff_headdims(
1198
+ q=q,
1199
+ k=k,
1200
+ v=v,
1201
+ cu_seqlens_q=prefill.query_start_loc,
1202
+ cu_seqlens_k=prefill.query_start_loc,
1203
+ max_seqlen_q=prefill.max_query_len,
1204
+ max_seqlen_k=prefill.max_query_len,
1205
+ softmax_scale=self.scale,
1206
+ causal=True,
1207
+ return_softmax_lse=return_softmax_lse,
1208
+ )
1209
+
1210
+ def _run_prefill_new_tokens_fi(self, prefill: MLACommonPrefillMetadata, q,
1211
+ k, v, return_softmax_lse):
1212
+ assert isinstance(prefill, FlashInferPrefillMetadata)
1213
+ assert prefill.prefill_main is not None
1214
+ ret = prefill.prefill_main.run(
1215
+ q=q,
1216
+ k=k,
1217
+ v=v,
1218
+ return_lse=return_softmax_lse,
1219
+ )
1220
+
1221
+ if isinstance(ret, tuple):
1222
+ # Convert from (q_len, num_heads) to (num_heads, q_len)
1223
+ return ret[0], ret[1].transpose(0, 1).contiguous()
1224
+ return ret
1225
+
1226
+ def _run_prefill_new_tokens_cudnn(self, prefill: MLACommonPrefillMetadata,
1227
+ q, k, v, return_softmax_lse):
1228
+ assert isinstance(prefill, CudnnPrefillMetadata)
1229
+ assert prefill.query_seq_lens is not None
1230
+ output, lse = cudnn_batch_prefill_with_kv_cache(
1231
+ q=q,
1232
+ k_cache=k,
1233
+ v_cache=v,
1234
+ scale=self.scale,
1235
+ workspace_buffer=prefill.cudnn_workspace,
1236
+ max_token_per_sequence=prefill.max_query_len,
1237
+ max_sequence_kv=prefill.max_query_len,
1238
+ actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
1239
+ actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
1240
+ causal=True,
1241
+ return_lse=True, # do not support False for now
1242
+ is_cuda_graph_compatible=
1243
+ True,  # Indicates whether actual_seq_lens are on GPU or CPU.
1244
+ )
1245
+ if return_softmax_lse:
1246
+ return output, lse
1247
+ return output
1248
+
1249
+ def _run_prefill_context_chunk_fa(self, prefill: MLACommonPrefillMetadata,
1250
+ chunk_idx: int, q, k, v):
1251
+ assert prefill.chunked_context is not None
1252
+ return self._flash_attn_varlen_diff_headdims(
1253
+ q=q,
1254
+ k=k,
1255
+ v=v,
1256
+ cu_seqlens_q=prefill.query_start_loc,
1257
+ cu_seqlens_k=prefill.chunked_context.cu_seq_lens[chunk_idx],
1258
+ max_seqlen_q=prefill.max_query_len,
1259
+ max_seqlen_k=prefill.chunked_context.max_seq_lens[chunk_idx],
1260
+ softmax_scale=self.scale,
1261
+ causal=False, # Context is unmasked
1262
+ return_softmax_lse=True,
1263
+ )
1264
+
1265
+ def _run_prefill_context_chunk_fi(self, prefill: MLACommonPrefillMetadata,
1266
+ chunk_idx: int, q, k, v):
1267
+ assert isinstance(prefill, FlashInferPrefillMetadata)
1268
+ attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
1269
+ q=q,
1270
+ k=k,
1271
+ v=v,
1272
+ return_lse=True,
1273
+ )
1274
+ # Convert from (q_len, num_heads) to (num_heads, q_len)
1275
+ return attn_out, lse.transpose(0, 1).contiguous()
1276
+
1277
+ def _run_prefill_context_chunk_cudnn(self,
1278
+ prefill: MLACommonPrefillMetadata,
1279
+ chunk_idx: int, q, k, v):
1280
+ assert isinstance(prefill, CudnnPrefillMetadata)
1281
+ assert prefill.chunked_context is not None
1282
+ assert prefill.chunked_context.seq_lens[chunk_idx] is not None
1283
+ assert prefill.query_seq_lens is not None
1284
+ return cudnn_batch_prefill_with_kv_cache(
1285
+ q=q,
1286
+ k_cache=k,
1287
+ v_cache=v,
1288
+ scale=self.scale,
1289
+ workspace_buffer=prefill.cudnn_workspace,
1290
+ max_token_per_sequence=prefill.max_query_len,
1291
+ max_sequence_kv=prefill.chunked_context.max_seq_lens[chunk_idx],
1292
+ actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
1293
+ actual_seq_lens_kv=prefill.chunked_context.seq_lens[chunk_idx].
1294
+ view(-1, 1, 1, 1),
1295
+ causal=False,
1296
+ return_lse=True,
1297
+ is_cuda_graph_compatible=
1298
+ True,  # Indicates whether actual_seq_lens are on GPU or CPU.
1299
+ )
1300
+
1301
+ def process_weights_after_loading(self, act_dtype: torch.dtype):
1302
+
1303
+ def get_layer_weight(layer):
1304
+ WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
1305
+ for attr in WEIGHT_NAMES:
1306
+ if hasattr(layer, attr):
1307
+ return getattr(layer, attr)
1308
+ raise AttributeError(
1309
+ f"Layer '{layer}' has no recognized weight attribute:"
1310
+ f" {WEIGHT_NAMES}.")
1311
+
1312
+ def get_and_maybe_dequant_weights(layer: LinearBase):
1313
+ if not isinstance(layer.quant_method, UnquantizedLinearMethod):
1314
+ # NOTE: This should only be used offline, since it's O(N^3)
1315
+ eye = torch.eye(layer.input_size_per_partition,
1316
+ dtype=act_dtype,
1317
+ device=get_layer_weight(layer).device)
1318
+ dequant_weights = layer.quant_method.apply(layer,
1319
+ eye,
1320
+ bias=None)
1321
+ del eye
1322
+ # standardize to (output, input)
1323
+ return dequant_weights.T
1324
+ return layer.weight
1325
+
1326
+ # we currently do not have quantized bmm's which are needed for
1327
+ # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
1328
+ # the bmm's in 16-bit, the extra memory overhead of this is fairly low
1329
+ kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
1330
+ assert kv_b_proj_weight.shape == (
1331
+ self.kv_lora_rank,
1332
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
1333
+ f"{kv_b_proj_weight.shape=}, "
1334
+ f"{self.kv_lora_rank=}, "
1335
+ f"{self.num_heads=}, "
1336
+ f"{self.qk_nope_head_dim=}, "
1337
+ f"{self.v_head_dim=}")
1338
+ kv_b_proj_weight = kv_b_proj_weight.view(
1339
+ self.kv_lora_rank,
1340
+ self.num_heads,
1341
+ self.qk_nope_head_dim + self.v_head_dim,
1342
+ )
1343
+
1344
+ W_UK, W_UV = kv_b_proj_weight.split(
1345
+ [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
1346
+
1347
+ if is_rocm_aiter_fp8bmm_enabled():
1348
+ W_K = W_UK.transpose(0, 1) # 16 512 128
1349
+ W_V = W_UV.permute(1, 2, 0) # 16 128 512
1350
+ self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(
1351
+ W_K, dtype=current_platform.fp8_dtype())
1352
+ self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(
1353
+ W_V, dtype=current_platform.fp8_dtype())
1354
+
1355
+ # The kernel operates on non-padded inputs. Hence, pre-compiling
1356
+ # triton kernel to avoid runtime compilation for unseen batch sizes
1357
+ # Pre-compile for batch sizes 1 to 1024 to cover most use-cases.
1358
+ # On DS-R1, this step adds roughly 50s to the model loading time.
1359
+ max_batch_size = 1024 # [ToDo] Find the optimal upper limit
1360
+ pre_compilation_list = list(range(1, max_batch_size + 1))
1361
+ if is_global_first_rank():
1362
+ pre_compilation_list = tqdm(
1363
+ pre_compilation_list,
1364
+ desc="[Aiter Triton] Pre-compiling fp8 BMM kernel",
1365
+ total=max_batch_size,
1366
+ )
1367
+
1368
+ for m in pre_compilation_list:
1369
+ x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]),
1370
+ dtype=torch.bfloat16,
1371
+ device=self.W_K.device)
1372
+ aiter_triton_fp8_bmm(x,
1373
+ self.W_K,
1374
+ self.W_K_scale,
1375
+ group_size=128,
1376
+ transpose_bm=True)
1377
+
1378
+ x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]),
1379
+ dtype=torch.bfloat16,
1380
+ device=self.W_V.device)
1381
+ aiter_triton_fp8_bmm(x,
1382
+ self.W_V,
1383
+ self.W_V_scale,
1384
+ group_size=128,
1385
+ transpose_bm=True)
1386
+ else:
1387
+ # Convert from (L, N, V) to (N, L, V)
1388
+ self.W_UV = W_UV.transpose(0, 1)
1389
+ # Convert from (L, N, P) to (N, P, L)
1390
+ self.W_UK_T = W_UK.permute(1, 2, 0)
1391
+
1392
+ def _compute_prefill_context(
1393
+ self,
1394
+ q: torch.Tensor,
1395
+ kv_c_and_k_pe_cache: torch.Tensor,
1396
+ attn_metadata: MLACommonMetadata,
1397
+ k_scale: torch.Tensor,
1398
+ ):
1399
+ assert attn_metadata.prefill is not None
1400
+ prefill_metadata = attn_metadata.prefill
1401
+ assert prefill_metadata.chunked_context is not None
1402
+
1403
+ output = None
1404
+ iters = len(prefill_metadata.chunked_context.seq_tot)
1405
+ workspace = prefill_metadata.chunked_context.workspace
1406
+
1407
+ for i in range(iters):
1408
+ toks = prefill_metadata.chunked_context.seq_tot[i]
1409
+
1410
+ ops.gather_and_maybe_dequant_cache(
1411
+ src_cache=kv_c_and_k_pe_cache,
1412
+ dst=workspace,
1413
+ block_table=prefill_metadata.block_table,
1414
+ cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i],
1415
+ batch_size=attn_metadata.num_prefills,
1416
+ kv_cache_dtype=self.kv_cache_dtype,
1417
+ scale=k_scale,
1418
+ seq_starts=prefill_metadata.chunked_context.starts[i],
1419
+ )
1420
+
1421
+ kv_c_normed = workspace[:toks]\
1422
+ [..., :self.kv_lora_rank]
1423
+ k_pe = workspace[:toks]\
1424
+ [..., self.kv_lora_rank:].unsqueeze(1)
1425
+
1426
+ kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
1427
+ -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
1428
+ k_nope, v = kv_nope\
1429
+ .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
1430
+
1431
+ k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
1432
+ dim=-1)
1433
+
1434
+ attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
1435
+ prefill=prefill_metadata,
1436
+ chunk_idx=i,
1437
+ q=q,
1438
+ k=k,
1439
+ v=v,
1440
+ )
1441
+
1442
+ if output is None:
1443
+ output = attn_output
1444
+ output_lse = attn_softmax_lse
1445
+ else:
1446
+ output_tmp = torch.empty_like(output)
1447
+ output_lse_tmp = torch.empty_like(output_lse)
1448
+ merge_attn_states(
1449
+ output=output_tmp,
1450
+ output_lse=output_lse_tmp,
1451
+ prefix_output=output,
1452
+ prefix_lse=output_lse,
1453
+ suffix_output=attn_output,
1454
+ suffix_lse=attn_softmax_lse,
1455
+ )
1456
+ output = output_tmp
1457
+ output_lse = output_lse_tmp
1458
+
1459
+ return output, output_lse
1460
+
1461
+ def _context_parallel_compute_prefill_context(
1462
+ self,
1463
+ q: torch.Tensor,
1464
+ kv_c_and_k_pe_cache: torch.Tensor,
1465
+ attn_metadata: MLACommonMetadata,
1466
+ k_scale: torch.Tensor,
1467
+ dcp_world_size: int,
1468
+ ):
1469
+ assert k_scale is None, "DCP does not support scaled kvcache yet."
1470
+ assert attn_metadata.prefill is not None
1471
+ prefill_metadata = attn_metadata.prefill
1472
+ assert prefill_metadata.chunked_context is not None
1473
+ assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None
1474
+ assert prefill_metadata.chunked_context.origin_context_lens is not None
1475
+ assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None
1476
+ assert prefill_metadata.chunked_context.chunk_size is not None
1477
+ assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None
1478
+
1479
+ output = None
1480
+ iters = len(prefill_metadata.chunked_context.seq_tot)
1481
+ workspace = prefill_metadata.chunked_context.workspace
1482
+
1483
+ for i in range(iters):
1484
+ toks = prefill_metadata.chunked_context.seq_tot[i]
1485
+ ops.cp_gather_cache(
1486
+ src_cache=kv_c_and_k_pe_cache,
1487
+ dst=workspace,
1488
+ block_table=prefill_metadata.block_table,
1489
+ cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i],
1490
+ batch_size=attn_metadata.num_prefills,
1491
+ seq_starts=prefill_metadata.chunked_context.starts[i],
1492
+ )
1493
+ # workspace
1494
+ # |------- N tokens --------|--------- N*dcp_size tokens ----------|
1495
+ # |<- use for loca_gather ->|<--------- use for allgather -------->|
1496
+ allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
1497
+ assert allgather_offset * (dcp_world_size +
1498
+ 1) == workspace.shape[0]
1499
+ assert toks <= allgather_offset
1500
+ local_gathered_kvcache = workspace[:toks]
1501
+ cur_allgather_workspace = workspace[
1502
+ allgather_offset:allgather_offset * (1 + dcp_world_size)]
1503
+ assert toks * dcp_world_size <= cur_allgather_workspace.shape[0]
1504
+ cur_allgather_kvcache = cur_allgather_workspace[:toks *
1505
+ dcp_world_size]
1506
+ cur_allgather_kvcache.copy_(get_dcp_group().all_gather(
1507
+ local_gathered_kvcache, dim=0))
1508
+ assert cur_allgather_kvcache.shape[
1509
+ -1] == self.kv_lora_rank + self.qk_rope_head_dim
1510
+ allgatered_kv_c_normed, allgatered_k_pe = \
1511
+ cur_allgather_kvcache.unsqueeze(
1512
+ 1).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
1513
+
1514
+ kv_c_normed, k_pe = reorg_kvcache(
1515
+ allgatered_kv_c_normed,
1516
+ allgatered_k_pe,
1517
+ cp_chunk_seq_lens_lst=prefill_metadata.chunked_context.
1518
+ cp_chunk_seq_lens[i],
1519
+ origin_context_lens=prefill_metadata.chunked_context.
1520
+ origin_context_lens,
1521
+ cp_world_size=dcp_world_size,
1522
+ sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i]
1523
+ [-1],
1524
+ max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
1525
+ chunk_size=prefill_metadata.chunked_context.chunk_size,
1526
+ chunk_idx=i,
1527
+ toks=toks)
1528
+
1529
+ kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
1530
+ -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
1531
+ k_nope, v = kv_nope\
1532
+ .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
1533
+ k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
1534
+ dim=-1)
1535
+
1536
+ attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
1537
+ prefill=prefill_metadata,
1538
+ chunk_idx=i,
1539
+ q=q,
1540
+ k=k,
1541
+ v=v,
1542
+ )
1543
+
1544
+ if output is None:
1545
+ output = attn_output
1546
+ output_lse = attn_softmax_lse
1547
+ else:
1548
+ output_tmp = torch.empty_like(output)
1549
+ output_lse_tmp = torch.empty_like(output_lse)
1550
+ merge_attn_states(
1551
+ output=output_tmp,
1552
+ output_lse=output_lse_tmp,
1553
+ prefix_output=output,
1554
+ prefix_lse=output_lse,
1555
+ suffix_output=attn_output,
1556
+ suffix_lse=attn_softmax_lse,
1557
+ )
1558
+ output = output_tmp
1559
+ output_lse = output_lse_tmp
1560
+
1561
+ return output, output_lse
1562
+
1563
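The context-parallel variant above reuses one workspace buffer, carving it into a local-gather region (the first N tokens) and an all-gather region (the next N * dcp_world_size tokens). A small sketch of that offset arithmetic with toy numbers, for illustration only:

    # Assume the workspace holds rows for (1 + dcp_world_size) * N tokens.
    dcp_world_size = 4
    N = 1024                                   # local-gather capacity in tokens
    workspace_rows = (dcp_world_size + 1) * N

    allgather_offset = workspace_rows // (dcp_world_size + 1)
    assert allgather_offset == N
    assert allgather_offset * (dcp_world_size + 1) == workspace_rows

    toks = 700                                 # tokens gathered on this rank
    local_rows = (0, toks)                     # region written by the local gather
    allgather_rows = (allgather_offset,
                      allgather_offset + toks * dcp_world_size)
    assert allgather_rows[1] <= workspace_rows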
+    def _forward_prefill(
+        self,
+        q: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+        k_scale: torch.Tensor,
+    ) -> torch.Tensor:
+        # TODO (zyongye): Prefill function here
+        assert attn_metadata.prefill is not None
+        assert self.dcp_world_size is not None
+
+        has_context = attn_metadata.prefill.chunked_context is not None
+        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
+            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope, v = kv_nope\
+            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+        k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+        output = self._run_prefill_new_tokens(
+            prefill=attn_metadata.prefill,
+            q=q,
+            k=k,
+            v=v,
+            return_softmax_lse=has_context,
+        )
+
+        if has_context:
+            suffix_output, suffix_lse = output
+            if self.dcp_world_size > 1:
+                context_output, context_lse = \
+                    self._context_parallel_compute_prefill_context(
+                        q, kv_c_and_k_pe_cache, attn_metadata,
+                        k_scale=None, dcp_world_size=self.dcp_world_size)
+            else:
+                context_output, context_lse = \
+                    self._compute_prefill_context(
+                        q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
+
+            output = torch.empty_like(suffix_output)
+            merge_attn_states(
+                output=output,
+                prefix_output=context_output,
+                prefix_lse=context_lse,
+                suffix_output=suffix_output,
+                suffix_lse=suffix_lse,
+            )
+
+        # unpad if necessary
+        if self._pad_v:
+            output = output[..., :v.shape[-1]]
+
+        return output.flatten(start_dim=-2)
+
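In the prefill path the rotary key k_pe is shared by all heads, so it is broadcast with expand() before being concatenated onto the per-head k_nope. A toy illustration of that broadcast, with made-up sizes:

    import torch

    T, N = 8, 16                          # tokens, heads (illustrative)
    nope_dim, rope_dim = 128, 64

    k_nope = torch.randn(T, N, nope_dim)  # per-head component
    k_pe = torch.randn(T, 1, rope_dim)    # one rotary key per token

    # expand() creates a broadcast view; the same rope key is reused by every head.
    k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
    assert k.shape == (T, N, nope_dim + rope_dim)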
+    @abstractmethod
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: M,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        raise NotImplementedError
+
+    def forward(
+        self,
+        layer: AttentionLayer,
+        q: torch.Tensor,
+        k_c_normed: torch.Tensor,  # key in unified attn
+        k_pe: torch.Tensor,  # value in unified attn
+        kv_cache: torch.Tensor,
+        attn_metadata: M,
+        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
+        output_block_scale: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        assert output is not None, "Output tensor must be provided."
+
+        if output_scale is not None or output_block_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported"
+                " for MLACommonImpl")
+
+        if attn_metadata is None:
+            # During the profile run, simulate the worst-case output size
+            # for `self.kv_b_proj(kv_c_normed)` in `_compute_prefill_context`,
+            # since this allocation can be large.
+            _ = torch.empty(
+                (self.chunked_prefill_workspace_size, self.num_heads,
+                 self.qk_nope_head_dim + self.v_head_dim),
+                device=k_c_normed.device,
+                dtype=k_c_normed.dtype,
+            )
+
+            # The zero fill is required when used with DP + EP
+            # to ensure all ranks within a DP group compute the
+            # same expert outputs.
+            return output.fill_(0)
+
+        if self.dcp_world_size is None:
+            self.dcp_world_size = get_dcp_group().world_size
+
+        fp8_attention = self.kv_cache_dtype.startswith("fp8")
+
+        num_actual_toks = attn_metadata.num_actual_tokens
+
+        # Inputs and outputs may be padded for CUDA graphs
+        output_padded = output
+        output = output[:num_actual_toks, ...]
+        q = q[:num_actual_toks, ...]
+        k_c_normed = k_c_normed[:num_actual_toks, ...]
+        k_pe = k_pe[:num_actual_toks, ...]
+
+        assert attn_metadata.num_decodes is not None and \
+            attn_metadata.num_prefills is not None and \
+            attn_metadata.num_decode_tokens is not None
+
+        has_decode = attn_metadata.num_decodes > 0
+        has_prefill = attn_metadata.num_prefills > 0
+        num_decode_tokens = attn_metadata.num_decode_tokens
+
+        decode_q = q[:num_decode_tokens]
+
+        prefill_q = q[num_decode_tokens:]
+        prefill_k_pe = k_pe[num_decode_tokens:]
+        prefill_k_c_normed = k_c_normed[num_decode_tokens:]
+
+        # write the latent and rope to kv cache
+        if kv_cache.numel() > 0:
+            ops.concat_and_cache_mla(
+                k_c_normed,
+                k_pe.squeeze(1),
+                kv_cache,
+                attn_metadata.slot_mapping.flatten(),
+                kv_cache_dtype=self.kv_cache_dtype,
+                scale=layer._k_scale,
+            )
+
+        if fp8_attention:
+            kv_cache = kv_cache.view(current_platform.fp8_dtype())
+
+        if has_prefill:
+            output[num_decode_tokens:] = self._forward_prefill(
+                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
+                attn_metadata, layer._k_scale)
+
+        if has_decode:
+            assert attn_metadata.decode is not None
+            decode_q_nope, decode_q_pe = decode_q.split(
+                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+            # Convert from (B, N, P) to (N, B, P)
+            decode_q_nope = decode_q_nope.transpose(0, 1)
+
+            # Pads the number of heads if necessary (for the underlying kernel)
+            if self.q_pad_num_heads is not None:
+                B, N, L = decode_q_pe.shape
+                decode_pe_padded = decode_q_pe.new_empty(
+                    (B, self.q_pad_num_heads, L))
+                decode_pe_padded.resize_((B, N, L))
+                decode_pe_padded.copy_(decode_q_pe)
+                decode_q_pe = decode_pe_padded
+
+            if is_rocm_aiter_fp8bmm_enabled():
+                # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L)
+                decode_ql_nope = aiter_triton_fp8_bmm(decode_q_nope,
+                                                      self.W_K,
+                                                      self.W_K_scale,
+                                                      group_size=128,
+                                                      transpose_bm=True)
+            else:
+                # Pads the number of heads if necessary (for the underlying kernel)
+                N, B, P = decode_q_nope.shape
+                _, _, L = self.W_UK_T.shape
+                if self.q_pad_num_heads is not None:
+                    decode_ql_nope = decode_q_nope.new_empty(
+                        (self.q_pad_num_heads, B, L))
+                    decode_ql_nope.resize_((N, B, L))
+
+                else:
+                    decode_ql_nope = decode_q_nope.new_empty((N, B, L))
+
+                # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
+                torch.bmm(decode_q_nope, self.W_UK_T, out=decode_ql_nope)
+                # Convert from (N, B, L) to (B, N, L)
+                decode_ql_nope = decode_ql_nope.transpose(0, 1)
+
+            if fp8_attention:
+                ql_nope_shape = decode_ql_nope.shape
+                decode_ql_nope, _ = ops.scaled_fp8_quant(
+                    decode_ql_nope.reshape([
+                        ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]
+                    ]), layer._q_scale)
+                decode_ql_nope = decode_ql_nope.reshape(ql_nope_shape)
+                q_pe_shape = decode_q_pe.shape
+                decode_q_pe, _ = ops.scaled_fp8_quant(
+                    decode_q_pe.reshape(
+                        [q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]),
+                    layer._q_scale)
+                decode_q_pe = decode_q_pe.reshape(q_pe_shape)
+
+            decode_q = (decode_ql_nope, decode_q_pe)
+            if self.dcp_world_size > 1:
+                assert not fp8_attention, \
+                    "DCP does not support an fp8 KV cache yet."
+                # concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P)
+                decode_q = torch.cat(decode_q, dim=-1)
+                # decode_q does an all-gather over the head dim.
+                decode_q = get_dcp_group().all_gather(decode_q, dim=1)
+
+            # call decode attn
+            attn_out, lse = self._forward_decode(decode_q, kv_cache,
+                                                 attn_metadata, layer)
+
+            # Correct the DCP attn_out using the LSE.
+            if self.dcp_world_size > 1:
+                attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group())
+
+            # v_up projection
+            self._v_up_proj(attn_out, out=output[:num_decode_tokens])
+        return output_padded
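With decode context parallelism, each rank's _forward_decode only attends over its shard of the KV cache, so the partial outputs are renormalized across ranks using their LSEs (done above via cp_lse_ag_out_rs). Conceptually this is the same log-sum-exp merge as in the prefill path, applied across ranks; a single-process sketch of that reduction, purely illustrative and not the collective used above:

    import torch

    def combine_rank_outputs(outs, lses):
        # outs[r]: (tokens, heads, head_dim) partial output from rank r
        # lses[r]: (tokens, heads) log-sum-exp of rank r's attention scores
        lse_all = torch.stack(lses)            # (ranks, tokens, heads)
        lse = torch.logsumexp(lse_all, dim=0)  # combined normalizer
        weights = torch.exp(lse_all - lse)     # per-rank share of softmax mass
        out_all = torch.stack(outs)            # (ranks, tokens, heads, head_dim)
        return (weights.unsqueeze(-1) * out_all).sum(dim=0)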