vllm-cpu-avx512bf16 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1175) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1742 -0
  4. vllm/_ipex_ops.py +243 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +15 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +44 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +33 -0
  16. vllm/assets/video.py +114 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +305 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1494 -0
  23. vllm/attention/backends/flash_attn.py +999 -0
  24. vllm/attention/backends/flashinfer.py +1100 -0
  25. vllm/attention/backends/flashmla.py +242 -0
  26. vllm/attention/backends/hpu_attn.py +309 -0
  27. vllm/attention/backends/ipex_attn.py +394 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1381 -0
  30. vllm/attention/backends/pallas.py +347 -0
  31. vllm/attention/backends/placeholder_attn.py +399 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +970 -0
  34. vllm/attention/backends/torch_sdpa.py +691 -0
  35. vllm/attention/backends/triton_mla.py +113 -0
  36. vllm/attention/backends/utils.py +609 -0
  37. vllm/attention/backends/xformers.py +798 -0
  38. vllm/attention/layer.py +452 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +245 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +367 -0
  45. vllm/attention/ops/flashmla.py +115 -0
  46. vllm/attention/ops/hpu_paged_attn.py +87 -0
  47. vllm/attention/ops/ipex_attn.py +194 -0
  48. vllm/attention/ops/merge_attn_states.py +42 -0
  49. vllm/attention/ops/nki_flash_attn.py +905 -0
  50. vllm/attention/ops/paged_attn.py +255 -0
  51. vllm/attention/ops/prefix_prefill.py +901 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +99 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  54. vllm/attention/ops/triton_decode_attention.py +673 -0
  55. vllm/attention/ops/triton_flash_attention.py +1374 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  57. vllm/attention/ops/triton_unified_attention.py +337 -0
  58. vllm/attention/selector.py +186 -0
  59. vllm/attention/utils/fa_utils.py +54 -0
  60. vllm/beam_search.py +82 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +921 -0
  63. vllm/benchmarks/endpoint_request_func.py +160 -0
  64. vllm/benchmarks/latency.py +184 -0
  65. vllm/benchmarks/serve.py +925 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +69 -0
  68. vllm/collect_env.py +818 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +88 -0
  71. vllm/compilation/backends.py +560 -0
  72. vllm/compilation/base_piecewise_backend.py +71 -0
  73. vllm/compilation/collective_fusion.py +126 -0
  74. vllm/compilation/compiler_interface.py +533 -0
  75. vllm/compilation/counter.py +33 -0
  76. vllm/compilation/cuda_piecewise_backend.py +213 -0
  77. vllm/compilation/decorators.py +249 -0
  78. vllm/compilation/fix_functionalization.py +190 -0
  79. vllm/compilation/fusion.py +617 -0
  80. vllm/compilation/fx_utils.py +61 -0
  81. vllm/compilation/inductor_pass.py +114 -0
  82. vllm/compilation/monitor.py +38 -0
  83. vllm/compilation/multi_output_match.py +108 -0
  84. vllm/compilation/noop_elimination.py +136 -0
  85. vllm/compilation/pass_manager.py +77 -0
  86. vllm/compilation/sequence_parallelism.py +267 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  88. vllm/compilation/vllm_inductor_pass.py +66 -0
  89. vllm/compilation/wrapper.py +129 -0
  90. vllm/config.py +4600 -0
  91. vllm/connections.py +173 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +398 -0
  95. vllm/core/block/common.py +370 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  97. vllm/core/block/interfaces.py +318 -0
  98. vllm/core/block/naive_block.py +465 -0
  99. vllm/core/block/prefix_caching_block.py +1134 -0
  100. vllm/core/block/utils.py +27 -0
  101. vllm/core/block_manager.py +520 -0
  102. vllm/core/evictor.py +156 -0
  103. vllm/core/interfaces.py +134 -0
  104. vllm/core/placeholder_block_space_manager.py +99 -0
  105. vllm/core/scheduler.py +2092 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +280 -0
  108. vllm/distributed/__init__.py +5 -0
  109. vllm/distributed/communication_op.py +40 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +126 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +144 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +167 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +303 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +258 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  120. vllm/distributed/device_communicators/pynccl.py +217 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +541 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  125. vllm/distributed/kv_events.py +296 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +11 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +126 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +202 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +91 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +5 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +259 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +189 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +851 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  152. vllm/distributed/parallel_state.py +1294 -0
  153. vllm/distributed/utils.py +520 -0
  154. vllm/engine/__init__.py +0 -0
  155. vllm/engine/arg_utils.py +1649 -0
  156. vllm/engine/async_llm_engine.py +1274 -0
  157. vllm/engine/async_timeout.py +191 -0
  158. vllm/engine/llm_engine.py +2153 -0
  159. vllm/engine/metrics.py +717 -0
  160. vllm/engine/metrics_types.py +96 -0
  161. vllm/engine/multiprocessing/__init__.py +188 -0
  162. vllm/engine/multiprocessing/client.py +755 -0
  163. vllm/engine/multiprocessing/engine.py +459 -0
  164. vllm/engine/output_processor/__init__.py +0 -0
  165. vllm/engine/output_processor/interfaces.py +74 -0
  166. vllm/engine/output_processor/multi_step.py +215 -0
  167. vllm/engine/output_processor/single_step.py +144 -0
  168. vllm/engine/output_processor/stop_checker.py +130 -0
  169. vllm/engine/output_processor/util.py +27 -0
  170. vllm/engine/protocol.py +310 -0
  171. vllm/entrypoints/__init__.py +0 -0
  172. vllm/entrypoints/api_server.py +177 -0
  173. vllm/entrypoints/chat_utils.py +1298 -0
  174. vllm/entrypoints/cli/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/base.py +38 -0
  177. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  178. vllm/entrypoints/cli/benchmark/main.py +53 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  180. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  181. vllm/entrypoints/cli/collect_env.py +34 -0
  182. vllm/entrypoints/cli/main.py +62 -0
  183. vllm/entrypoints/cli/openai.py +204 -0
  184. vllm/entrypoints/cli/serve.py +141 -0
  185. vllm/entrypoints/cli/types.py +24 -0
  186. vllm/entrypoints/launcher.py +146 -0
  187. vllm/entrypoints/llm.py +1503 -0
  188. vllm/entrypoints/logger.py +49 -0
  189. vllm/entrypoints/openai/__init__.py +0 -0
  190. vllm/entrypoints/openai/api_server.py +1376 -0
  191. vllm/entrypoints/openai/cli_args.py +306 -0
  192. vllm/entrypoints/openai/logits_processors.py +89 -0
  193. vllm/entrypoints/openai/protocol.py +1890 -0
  194. vllm/entrypoints/openai/run_batch.py +439 -0
  195. vllm/entrypoints/openai/serving_chat.py +1192 -0
  196. vllm/entrypoints/openai/serving_classification.py +159 -0
  197. vllm/entrypoints/openai/serving_completion.py +590 -0
  198. vllm/entrypoints/openai/serving_embedding.py +200 -0
  199. vllm/entrypoints/openai/serving_engine.py +985 -0
  200. vllm/entrypoints/openai/serving_models.py +314 -0
  201. vllm/entrypoints/openai/serving_pooling.py +231 -0
  202. vllm/entrypoints/openai/serving_score.py +432 -0
  203. vllm/entrypoints/openai/serving_tokenization.py +151 -0
  204. vllm/entrypoints/openai/serving_transcription.py +421 -0
  205. vllm/entrypoints/openai/tool_parsers/__init__.py +22 -0
  206. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  207. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +369 -0
  208. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +258 -0
  209. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +236 -0
  210. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  211. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +215 -0
  212. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +307 -0
  213. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +302 -0
  214. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +266 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  216. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +111 -0
  217. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +296 -0
  218. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  219. vllm/entrypoints/score_utils.py +49 -0
  220. vllm/entrypoints/ssl.py +74 -0
  221. vllm/entrypoints/utils.py +219 -0
  222. vllm/env_override.py +34 -0
  223. vllm/envs.py +896 -0
  224. vllm/executor/__init__.py +0 -0
  225. vllm/executor/executor_base.py +400 -0
  226. vllm/executor/mp_distributed_executor.py +243 -0
  227. vllm/executor/msgspec_utils.py +29 -0
  228. vllm/executor/multiproc_worker_utils.py +312 -0
  229. vllm/executor/ray_distributed_executor.py +700 -0
  230. vllm/executor/ray_utils.py +398 -0
  231. vllm/executor/uniproc_executor.py +138 -0
  232. vllm/forward_context.py +147 -0
  233. vllm/inputs/__init__.py +40 -0
  234. vllm/inputs/data.py +330 -0
  235. vllm/inputs/parse.py +150 -0
  236. vllm/inputs/preprocess.py +908 -0
  237. vllm/inputs/registry.py +214 -0
  238. vllm/jsontree.py +79 -0
  239. vllm/logger.py +211 -0
  240. vllm/logging_utils/__init__.py +7 -0
  241. vllm/logging_utils/dump_input.py +84 -0
  242. vllm/logging_utils/formatter.py +17 -0
  243. vllm/logits_process.py +118 -0
  244. vllm/lora/__init__.py +0 -0
  245. vllm/lora/fully_sharded_layers.py +354 -0
  246. vllm/lora/layers.py +1284 -0
  247. vllm/lora/lora.py +198 -0
  248. vllm/lora/models.py +817 -0
  249. vllm/lora/ops/__init__.py +0 -0
  250. vllm/lora/ops/torch_ops/__init__.py +15 -0
  251. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  252. vllm/lora/ops/triton_ops/__init__.py +11 -0
  253. vllm/lora/ops/triton_ops/kernel_utils.py +242 -0
  254. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  255. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  256. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  257. vllm/lora/ops/triton_ops/utils.py +119 -0
  258. vllm/lora/ops/xla_ops/__init__.py +6 -0
  259. vllm/lora/ops/xla_ops/lora_ops.py +106 -0
  260. vllm/lora/ops/xla_ops/pallas.py +133 -0
  261. vllm/lora/peft_helper.py +135 -0
  262. vllm/lora/punica_wrapper/__init__.py +9 -0
  263. vllm/lora/punica_wrapper/punica_base.py +484 -0
  264. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  265. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  266. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  267. vllm/lora/punica_wrapper/punica_selector.py +19 -0
  268. vllm/lora/punica_wrapper/punica_tpu.py +325 -0
  269. vllm/lora/punica_wrapper/utils.py +163 -0
  270. vllm/lora/request.py +98 -0
  271. vllm/lora/resolver.py +84 -0
  272. vllm/lora/utils.py +239 -0
  273. vllm/lora/worker_manager.py +253 -0
  274. vllm/model_executor/__init__.py +15 -0
  275. vllm/model_executor/custom_op.py +151 -0
  276. vllm/model_executor/guided_decoding/__init__.py +180 -0
  277. vllm/model_executor/guided_decoding/guidance_decoding.py +62 -0
  278. vllm/model_executor/guided_decoding/guidance_logits_processors.py +103 -0
  279. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  280. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  281. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  282. vllm/model_executor/guided_decoding/outlines_logits_processors.py +283 -0
  283. vllm/model_executor/guided_decoding/utils.py +241 -0
  284. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  285. vllm/model_executor/layers/__init__.py +0 -0
  286. vllm/model_executor/layers/activation.py +368 -0
  287. vllm/model_executor/layers/fused_moe/__init__.py +53 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  449. vllm/model_executor/layers/fused_moe/cutlass_moe.py +382 -0
  450. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +227 -0
  451. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +755 -0
  452. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +231 -0
  453. vllm/model_executor/layers/fused_moe/fused_moe.py +1722 -0
  454. vllm/model_executor/layers/fused_moe/layer.py +1366 -0
  455. vllm/model_executor/layers/fused_moe/modular_kernel.py +364 -0
  456. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +242 -0
  457. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  458. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +188 -0
  459. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  460. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +146 -0
  461. vllm/model_executor/layers/fused_moe/prepare_finalize.py +60 -0
  462. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +372 -0
  463. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +112 -0
  464. vllm/model_executor/layers/fused_moe/utils.py +97 -0
  465. vllm/model_executor/layers/layernorm.py +287 -0
  466. vllm/model_executor/layers/lightning_attn.py +651 -0
  467. vllm/model_executor/layers/linear.py +1523 -0
  468. vllm/model_executor/layers/logits_processor.py +196 -0
  469. vllm/model_executor/layers/mamba/__init__.py +0 -0
  470. vllm/model_executor/layers/mamba/mamba2_metadata.py +124 -0
  471. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  472. vllm/model_executor/layers/mamba/mamba_mixer2.py +615 -0
  473. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  474. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  475. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
  476. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  477. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  478. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  479. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  480. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  481. vllm/model_executor/layers/pooler.py +343 -0
  482. vllm/model_executor/layers/quantization/__init__.py +156 -0
  483. vllm/model_executor/layers/quantization/aqlm.py +375 -0
  484. vllm/model_executor/layers/quantization/auto_round.py +308 -0
  485. vllm/model_executor/layers/quantization/awq.py +185 -0
  486. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  487. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  488. vllm/model_executor/layers/quantization/base_config.py +150 -0
  489. vllm/model_executor/layers/quantization/bitblas.py +460 -0
  490. vllm/model_executor/layers/quantization/bitsandbytes.py +397 -0
  491. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  492. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +644 -0
  493. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1252 -0
  494. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +21 -0
  495. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  496. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  497. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  498. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +92 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +120 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +214 -0
  505. vllm/model_executor/layers/quantization/deepspeedfp.py +194 -0
  506. vllm/model_executor/layers/quantization/experts_int8.py +195 -0
  507. vllm/model_executor/layers/quantization/fbgemm_fp8.py +171 -0
  508. vllm/model_executor/layers/quantization/fp8.py +876 -0
  509. vllm/model_executor/layers/quantization/gguf.py +564 -0
  510. vllm/model_executor/layers/quantization/gptq.py +277 -0
  511. vllm/model_executor/layers/quantization/gptq_bitblas.py +444 -0
  512. vllm/model_executor/layers/quantization/gptq_marlin.py +647 -0
  513. vllm/model_executor/layers/quantization/gptq_marlin_24.py +296 -0
  514. vllm/model_executor/layers/quantization/hqq_marlin.py +331 -0
  515. vllm/model_executor/layers/quantization/ipex_quant.py +249 -0
  516. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  517. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  518. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  519. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  520. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  521. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  522. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  523. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +130 -0
  524. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  525. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  526. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  527. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  528. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  529. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  530. vllm/model_executor/layers/quantization/kv_cache.py +138 -0
  531. vllm/model_executor/layers/quantization/marlin.py +260 -0
  532. vllm/model_executor/layers/quantization/modelopt.py +734 -0
  533. vllm/model_executor/layers/quantization/moe_wna16.py +448 -0
  534. vllm/model_executor/layers/quantization/neuron_quant.py +68 -0
  535. vllm/model_executor/layers/quantization/ptpc_fp8.py +126 -0
  536. vllm/model_executor/layers/quantization/qqq.py +274 -0
  537. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  538. vllm/model_executor/layers/quantization/quark/quark.py +440 -0
  539. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  540. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +8 -0
  541. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  542. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +125 -0
  543. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +145 -0
  544. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  545. vllm/model_executor/layers/quantization/quark/utils.py +104 -0
  546. vllm/model_executor/layers/quantization/schema.py +85 -0
  547. vllm/model_executor/layers/quantization/torchao.py +143 -0
  548. vllm/model_executor/layers/quantization/tpu_int8.py +120 -0
  549. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  550. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  551. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +207 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  754. vllm/model_executor/layers/quantization/utils/fp8_utils.py +611 -0
  755. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  756. vllm/model_executor/layers/quantization/utils/int8_utils.py +484 -0
  757. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  758. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  759. vllm/model_executor/layers/quantization/utils/marlin_utils.py +475 -0
  760. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +277 -0
  761. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +324 -0
  762. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  763. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +463 -0
  764. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +125 -0
  765. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +44 -0
  766. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +61 -0
  767. vllm/model_executor/layers/quantization/utils/quant_utils.py +572 -0
  768. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  769. vllm/model_executor/layers/rejection_sampler.py +405 -0
  770. vllm/model_executor/layers/resampler.py +269 -0
  771. vllm/model_executor/layers/rotary_embedding.py +1861 -0
  772. vllm/model_executor/layers/sampler.py +1203 -0
  773. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  774. vllm/model_executor/layers/typical_acceptance_sampler.py +165 -0
  775. vllm/model_executor/layers/utils.py +99 -0
  776. vllm/model_executor/layers/vocab_parallel_embedding.py +486 -0
  777. vllm/model_executor/model_loader/__init__.py +75 -0
  778. vllm/model_executor/model_loader/base_loader.py +24 -0
  779. vllm/model_executor/model_loader/bitsandbytes_loader.py +582 -0
  780. vllm/model_executor/model_loader/default_loader.py +295 -0
  781. vllm/model_executor/model_loader/dummy_loader.py +37 -0
  782. vllm/model_executor/model_loader/gguf_loader.py +113 -0
  783. vllm/model_executor/model_loader/neuron.py +475 -0
  784. vllm/model_executor/model_loader/neuronx_distributed.py +622 -0
  785. vllm/model_executor/model_loader/runai_streamer_loader.py +120 -0
  786. vllm/model_executor/model_loader/sharded_state_loader.py +211 -0
  787. vllm/model_executor/model_loader/tensorizer.py +632 -0
  788. vllm/model_executor/model_loader/tensorizer_loader.py +122 -0
  789. vllm/model_executor/model_loader/utils.py +301 -0
  790. vllm/model_executor/model_loader/weight_utils.py +781 -0
  791. vllm/model_executor/models/__init__.py +27 -0
  792. vllm/model_executor/models/adapters.py +247 -0
  793. vllm/model_executor/models/aimv2.py +199 -0
  794. vllm/model_executor/models/arctic.py +558 -0
  795. vllm/model_executor/models/aria.py +656 -0
  796. vllm/model_executor/models/aya_vision.py +461 -0
  797. vllm/model_executor/models/baichuan.py +473 -0
  798. vllm/model_executor/models/bamba.py +542 -0
  799. vllm/model_executor/models/bart.py +937 -0
  800. vllm/model_executor/models/bert.py +517 -0
  801. vllm/model_executor/models/bert_with_rope.py +714 -0
  802. vllm/model_executor/models/blip.py +338 -0
  803. vllm/model_executor/models/blip2.py +717 -0
  804. vllm/model_executor/models/bloom.py +372 -0
  805. vllm/model_executor/models/chameleon.py +1135 -0
  806. vllm/model_executor/models/chatglm.py +477 -0
  807. vllm/model_executor/models/clip.py +411 -0
  808. vllm/model_executor/models/commandr.py +471 -0
  809. vllm/model_executor/models/constant_size_cache.py +136 -0
  810. vllm/model_executor/models/dbrx.py +471 -0
  811. vllm/model_executor/models/deepseek.py +485 -0
  812. vllm/model_executor/models/deepseek_mtp.py +268 -0
  813. vllm/model_executor/models/deepseek_v2.py +842 -0
  814. vllm/model_executor/models/deepseek_vl2.py +647 -0
  815. vllm/model_executor/models/eagle.py +259 -0
  816. vllm/model_executor/models/exaone.py +550 -0
  817. vllm/model_executor/models/fairseq2_llama.py +153 -0
  818. vllm/model_executor/models/falcon.py +509 -0
  819. vllm/model_executor/models/falcon_h1.py +684 -0
  820. vllm/model_executor/models/florence2.py +1102 -0
  821. vllm/model_executor/models/fuyu.py +388 -0
  822. vllm/model_executor/models/gemma.py +424 -0
  823. vllm/model_executor/models/gemma2.py +424 -0
  824. vllm/model_executor/models/gemma3.py +532 -0
  825. vllm/model_executor/models/gemma3_mm.py +708 -0
  826. vllm/model_executor/models/glm.py +22 -0
  827. vllm/model_executor/models/glm4.py +304 -0
  828. vllm/model_executor/models/glm4v.py +647 -0
  829. vllm/model_executor/models/gpt2.py +327 -0
  830. vllm/model_executor/models/gpt_bigcode.py +334 -0
  831. vllm/model_executor/models/gpt_j.py +338 -0
  832. vllm/model_executor/models/gpt_neox.py +331 -0
  833. vllm/model_executor/models/granite.py +492 -0
  834. vllm/model_executor/models/granite_speech.py +778 -0
  835. vllm/model_executor/models/granitemoe.py +436 -0
  836. vllm/model_executor/models/granitemoehybrid.py +585 -0
  837. vllm/model_executor/models/granitemoeshared.py +340 -0
  838. vllm/model_executor/models/gritlm.py +223 -0
  839. vllm/model_executor/models/grok1.py +545 -0
  840. vllm/model_executor/models/h2ovl.py +545 -0
  841. vllm/model_executor/models/idefics2_vision_model.py +388 -0
  842. vllm/model_executor/models/idefics3.py +767 -0
  843. vllm/model_executor/models/interfaces.py +571 -0
  844. vllm/model_executor/models/interfaces_base.py +163 -0
  845. vllm/model_executor/models/intern_vit.py +475 -0
  846. vllm/model_executor/models/internlm2.py +454 -0
  847. vllm/model_executor/models/internlm2_ve.py +146 -0
  848. vllm/model_executor/models/internvl.py +1405 -0
  849. vllm/model_executor/models/jais.py +372 -0
  850. vllm/model_executor/models/jamba.py +591 -0
  851. vllm/model_executor/models/kimi_vl.py +576 -0
  852. vllm/model_executor/models/llama.py +643 -0
  853. vllm/model_executor/models/llama4.py +531 -0
  854. vllm/model_executor/models/llama_eagle.py +166 -0
  855. vllm/model_executor/models/llama_eagle3.py +257 -0
  856. vllm/model_executor/models/llava.py +865 -0
  857. vllm/model_executor/models/llava_next.py +585 -0
  858. vllm/model_executor/models/llava_next_video.py +470 -0
  859. vllm/model_executor/models/llava_onevision.py +955 -0
  860. vllm/model_executor/models/mamba.py +272 -0
  861. vllm/model_executor/models/mamba2.py +302 -0
  862. vllm/model_executor/models/mamba_cache.py +75 -0
  863. vllm/model_executor/models/medusa.py +218 -0
  864. vllm/model_executor/models/mimo.py +191 -0
  865. vllm/model_executor/models/mimo_mtp.py +284 -0
  866. vllm/model_executor/models/minicpm.py +590 -0
  867. vllm/model_executor/models/minicpm3.py +229 -0
  868. vllm/model_executor/models/minicpmo.py +758 -0
  869. vllm/model_executor/models/minicpmv.py +1286 -0
  870. vllm/model_executor/models/minimax_cache.py +35 -0
  871. vllm/model_executor/models/minimax_text_01.py +1303 -0
  872. vllm/model_executor/models/minimax_vl_01.py +363 -0
  873. vllm/model_executor/models/mistral3.py +603 -0
  874. vllm/model_executor/models/mixtral.py +487 -0
  875. vllm/model_executor/models/mixtral_quant.py +452 -0
  876. vllm/model_executor/models/mllama.py +1623 -0
  877. vllm/model_executor/models/mllama4.py +838 -0
  878. vllm/model_executor/models/mlp_speculator.py +205 -0
  879. vllm/model_executor/models/modernbert.py +329 -0
  880. vllm/model_executor/models/module_mapping.py +71 -0
  881. vllm/model_executor/models/molmo.py +1567 -0
  882. vllm/model_executor/models/moonvit.py +629 -0
  883. vllm/model_executor/models/mpt.py +330 -0
  884. vllm/model_executor/models/nemotron.py +507 -0
  885. vllm/model_executor/models/nemotron_nas.py +483 -0
  886. vllm/model_executor/models/nvlm_d.py +215 -0
  887. vllm/model_executor/models/olmo.py +388 -0
  888. vllm/model_executor/models/olmo2.py +413 -0
  889. vllm/model_executor/models/olmoe.py +446 -0
  890. vllm/model_executor/models/opt.py +411 -0
  891. vllm/model_executor/models/orion.py +348 -0
  892. vllm/model_executor/models/ovis.py +554 -0
  893. vllm/model_executor/models/paligemma.py +397 -0
  894. vllm/model_executor/models/persimmon.py +343 -0
  895. vllm/model_executor/models/phi.py +355 -0
  896. vllm/model_executor/models/phi3.py +18 -0
  897. vllm/model_executor/models/phi3_small.py +464 -0
  898. vllm/model_executor/models/phi3v.py +722 -0
  899. vllm/model_executor/models/phi4mm.py +1245 -0
  900. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  901. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  902. vllm/model_executor/models/phimoe.py +664 -0
  903. vllm/model_executor/models/pixtral.py +1315 -0
  904. vllm/model_executor/models/plamo2.py +737 -0
  905. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  906. vllm/model_executor/models/qwen.py +361 -0
  907. vllm/model_executor/models/qwen2.py +567 -0
  908. vllm/model_executor/models/qwen2_5_omni_thinker.py +903 -0
  909. vllm/model_executor/models/qwen2_5_vl.py +1171 -0
  910. vllm/model_executor/models/qwen2_audio.py +409 -0
  911. vllm/model_executor/models/qwen2_moe.py +539 -0
  912. vllm/model_executor/models/qwen2_rm.py +131 -0
  913. vllm/model_executor/models/qwen2_vl.py +1410 -0
  914. vllm/model_executor/models/qwen3.py +320 -0
  915. vllm/model_executor/models/qwen3_moe.py +534 -0
  916. vllm/model_executor/models/qwen_vl.py +784 -0
  917. vllm/model_executor/models/registry.py +618 -0
  918. vllm/model_executor/models/roberta.py +273 -0
  919. vllm/model_executor/models/siglip.py +523 -0
  920. vllm/model_executor/models/skyworkr1v.py +950 -0
  921. vllm/model_executor/models/smolvlm.py +51 -0
  922. vllm/model_executor/models/solar.py +505 -0
  923. vllm/model_executor/models/stablelm.py +342 -0
  924. vllm/model_executor/models/starcoder2.py +355 -0
  925. vllm/model_executor/models/telechat2.py +139 -0
  926. vllm/model_executor/models/teleflm.py +78 -0
  927. vllm/model_executor/models/transformers.py +507 -0
  928. vllm/model_executor/models/ultravox.py +655 -0
  929. vllm/model_executor/models/utils.py +730 -0
  930. vllm/model_executor/models/vision.py +146 -0
  931. vllm/model_executor/models/whisper.py +746 -0
  932. vllm/model_executor/models/zamba2.py +1008 -0
  933. vllm/model_executor/parameter.py +458 -0
  934. vllm/model_executor/pooling_metadata.py +71 -0
  935. vllm/model_executor/sampling_metadata.py +596 -0
  936. vllm/model_executor/utils.py +53 -0
  937. vllm/multimodal/__init__.py +32 -0
  938. vllm/multimodal/audio.py +105 -0
  939. vllm/multimodal/base.py +218 -0
  940. vllm/multimodal/hasher.py +117 -0
  941. vllm/multimodal/image.py +96 -0
  942. vllm/multimodal/inputs.py +872 -0
  943. vllm/multimodal/parse.py +460 -0
  944. vllm/multimodal/processing.py +1894 -0
  945. vllm/multimodal/profiling.py +273 -0
  946. vllm/multimodal/registry.py +330 -0
  947. vllm/multimodal/utils.py +392 -0
  948. vllm/multimodal/video.py +197 -0
  949. vllm/outputs.py +525 -0
  950. vllm/platforms/__init__.py +290 -0
  951. vllm/platforms/cpu.py +205 -0
  952. vllm/platforms/cuda.py +461 -0
  953. vllm/platforms/hpu.py +105 -0
  954. vllm/platforms/interface.py +492 -0
  955. vllm/platforms/neuron.py +152 -0
  956. vllm/platforms/rocm.py +388 -0
  957. vllm/platforms/tpu.py +215 -0
  958. vllm/platforms/xpu.py +155 -0
  959. vllm/plugins/__init__.py +86 -0
  960. vllm/plugins/lora_resolvers/README.md +15 -0
  961. vllm/plugins/lora_resolvers/__init__.py +0 -0
  962. vllm/plugins/lora_resolvers/filesystem_resolver.py +49 -0
  963. vllm/pooling_params.py +53 -0
  964. vllm/profiler/__init__.py +0 -0
  965. vllm/profiler/layerwise_profile.py +374 -0
  966. vllm/profiler/utils.py +147 -0
  967. vllm/prompt_adapter/__init__.py +0 -0
  968. vllm/prompt_adapter/layers.py +82 -0
  969. vllm/prompt_adapter/models.py +357 -0
  970. vllm/prompt_adapter/request.py +36 -0
  971. vllm/prompt_adapter/utils.py +97 -0
  972. vllm/prompt_adapter/worker_manager.py +178 -0
  973. vllm/py.typed +2 -0
  974. vllm/reasoning/__init__.py +14 -0
  975. vllm/reasoning/abs_reasoning_parsers.py +191 -0
  976. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  977. vllm/reasoning/granite_reasoning_parser.py +362 -0
  978. vllm/reasoning/qwen3_reasoning_parser.py +150 -0
  979. vllm/sampling_params.py +590 -0
  980. vllm/scalar_type.py +346 -0
  981. vllm/scripts.py +14 -0
  982. vllm/sequence.py +1567 -0
  983. vllm/spec_decode/__init__.py +0 -0
  984. vllm/spec_decode/batch_expansion.py +505 -0
  985. vllm/spec_decode/draft_model_runner.py +349 -0
  986. vllm/spec_decode/interfaces.py +98 -0
  987. vllm/spec_decode/medusa_worker.py +137 -0
  988. vllm/spec_decode/metrics.py +212 -0
  989. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  990. vllm/spec_decode/mqa_scorer.py +159 -0
  991. vllm/spec_decode/multi_step_worker.py +422 -0
  992. vllm/spec_decode/ngram_worker.py +195 -0
  993. vllm/spec_decode/proposer_worker_base.py +58 -0
  994. vllm/spec_decode/smaller_tp_proposer_worker.py +195 -0
  995. vllm/spec_decode/spec_decode_worker.py +1325 -0
  996. vllm/spec_decode/target_model_runner.py +44 -0
  997. vllm/spec_decode/top1_proposer.py +274 -0
  998. vllm/spec_decode/util.py +276 -0
  999. vllm/test_utils.py +129 -0
  1000. vllm/third_party/__init__.py +0 -0
  1001. vllm/third_party/pynvml.py +6139 -0
  1002. vllm/tracing.py +130 -0
  1003. vllm/transformers_utils/__init__.py +23 -0
  1004. vllm/transformers_utils/chat_templates/__init__.py +4 -0
  1005. vllm/transformers_utils/chat_templates/registry.py +59 -0
  1006. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1007. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1008. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1009. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1010. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1011. vllm/transformers_utils/config.py +835 -0
  1012. vllm/transformers_utils/configs/__init__.py +58 -0
  1013. vllm/transformers_utils/configs/arctic.py +206 -0
  1014. vllm/transformers_utils/configs/chatglm.py +71 -0
  1015. vllm/transformers_utils/configs/cohere2.py +194 -0
  1016. vllm/transformers_utils/configs/dbrx.py +279 -0
  1017. vllm/transformers_utils/configs/deepseek_vl2.py +215 -0
  1018. vllm/transformers_utils/configs/eagle.py +84 -0
  1019. vllm/transformers_utils/configs/exaone.py +189 -0
  1020. vllm/transformers_utils/configs/falcon.py +89 -0
  1021. vllm/transformers_utils/configs/h2ovl.py +15 -0
  1022. vllm/transformers_utils/configs/internvl.py +53 -0
  1023. vllm/transformers_utils/configs/jais.py +237 -0
  1024. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  1025. vllm/transformers_utils/configs/medusa.py +62 -0
  1026. vllm/transformers_utils/configs/minimax_text_01.py +69 -0
  1027. vllm/transformers_utils/configs/minimax_vl_01.py +70 -0
  1028. vllm/transformers_utils/configs/mllama.py +30 -0
  1029. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  1030. vllm/transformers_utils/configs/moonvit.py +32 -0
  1031. vllm/transformers_utils/configs/mpt.py +179 -0
  1032. vllm/transformers_utils/configs/nemotron.py +204 -0
  1033. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  1034. vllm/transformers_utils/configs/ovis.py +183 -0
  1035. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  1036. vllm/transformers_utils/configs/solar.py +246 -0
  1037. vllm/transformers_utils/configs/telechat2.py +63 -0
  1038. vllm/transformers_utils/configs/ultravox.py +107 -0
  1039. vllm/transformers_utils/detokenizer.py +167 -0
  1040. vllm/transformers_utils/detokenizer_utils.py +188 -0
  1041. vllm/transformers_utils/processor.py +220 -0
  1042. vllm/transformers_utils/processors/__init__.py +7 -0
  1043. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1044. vllm/transformers_utils/processors/ovis.py +419 -0
  1045. vllm/transformers_utils/s3_utils.py +161 -0
  1046. vllm/transformers_utils/tokenizer.py +301 -0
  1047. vllm/transformers_utils/tokenizer_base.py +148 -0
  1048. vllm/transformers_utils/tokenizer_group.py +119 -0
  1049. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  1050. vllm/transformers_utils/tokenizers/mistral.py +490 -0
  1051. vllm/transformers_utils/utils.py +98 -0
  1052. vllm/triton_utils/__init__.py +13 -0
  1053. vllm/triton_utils/importing.py +49 -0
  1054. vllm/usage/__init__.py +0 -0
  1055. vllm/usage/usage_lib.py +255 -0
  1056. vllm/utils.py +2844 -0
  1057. vllm/v1/__init__.py +0 -0
  1058. vllm/v1/attention/__init__.py +0 -0
  1059. vllm/v1/attention/backends/__init__.py +0 -0
  1060. vllm/v1/attention/backends/flash_attn.py +833 -0
  1061. vllm/v1/attention/backends/flashinfer.py +639 -0
  1062. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1063. vllm/v1/attention/backends/mla/common.py +926 -0
  1064. vllm/v1/attention/backends/mla/flashmla.py +150 -0
  1065. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +221 -0
  1066. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1067. vllm/v1/attention/backends/pallas.py +235 -0
  1068. vllm/v1/attention/backends/triton_attn.py +279 -0
  1069. vllm/v1/attention/backends/utils.py +18 -0
  1070. vllm/v1/core/__init__.py +0 -0
  1071. vllm/v1/core/block_pool.py +328 -0
  1072. vllm/v1/core/encoder_cache_manager.py +149 -0
  1073. vllm/v1/core/kv_cache_manager.py +372 -0
  1074. vllm/v1/core/kv_cache_utils.py +748 -0
  1075. vllm/v1/core/sched/__init__.py +0 -0
  1076. vllm/v1/core/sched/interface.py +143 -0
  1077. vllm/v1/core/sched/output.py +153 -0
  1078. vllm/v1/core/sched/scheduler.py +1015 -0
  1079. vllm/v1/core/sched/utils.py +22 -0
  1080. vllm/v1/core/single_type_kv_cache_manager.py +358 -0
  1081. vllm/v1/engine/__init__.py +171 -0
  1082. vllm/v1/engine/async_llm.py +546 -0
  1083. vllm/v1/engine/core.py +801 -0
  1084. vllm/v1/engine/core_client.py +1020 -0
  1085. vllm/v1/engine/detokenizer.py +260 -0
  1086. vllm/v1/engine/exceptions.py +16 -0
  1087. vllm/v1/engine/llm_engine.py +316 -0
  1088. vllm/v1/engine/logprobs.py +198 -0
  1089. vllm/v1/engine/mm_input_cache.py +90 -0
  1090. vllm/v1/engine/output_processor.py +427 -0
  1091. vllm/v1/engine/parallel_sampling.py +132 -0
  1092. vllm/v1/engine/processor.py +398 -0
  1093. vllm/v1/executor/__init__.py +0 -0
  1094. vllm/v1/executor/abstract.py +112 -0
  1095. vllm/v1/executor/multiproc_executor.py +532 -0
  1096. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1097. vllm/v1/kv_cache_interface.py +208 -0
  1098. vllm/v1/metrics/__init__.py +0 -0
  1099. vllm/v1/metrics/loggers.py +511 -0
  1100. vllm/v1/metrics/ray_wrappers.py +120 -0
  1101. vllm/v1/metrics/reader.py +245 -0
  1102. vllm/v1/metrics/stats.py +238 -0
  1103. vllm/v1/outputs.py +115 -0
  1104. vllm/v1/request.py +191 -0
  1105. vllm/v1/sample/__init__.py +0 -0
  1106. vllm/v1/sample/metadata.py +43 -0
  1107. vllm/v1/sample/ops/__init__.py +0 -0
  1108. vllm/v1/sample/ops/bad_words.py +38 -0
  1109. vllm/v1/sample/ops/penalties.py +58 -0
  1110. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1111. vllm/v1/sample/rejection_sampler.py +630 -0
  1112. vllm/v1/sample/sampler.py +270 -0
  1113. vllm/v1/sample/tpu/__init__.py +0 -0
  1114. vllm/v1/sample/tpu/metadata.py +123 -0
  1115. vllm/v1/sample/tpu/sampler.py +144 -0
  1116. vllm/v1/serial_utils.py +313 -0
  1117. vllm/v1/spec_decode/__init__.py +0 -0
  1118. vllm/v1/spec_decode/eagle.py +424 -0
  1119. vllm/v1/spec_decode/medusa.py +61 -0
  1120. vllm/v1/spec_decode/metadata.py +61 -0
  1121. vllm/v1/spec_decode/metrics.py +177 -0
  1122. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1123. vllm/v1/spec_decode/utils.py +45 -0
  1124. vllm/v1/structured_output/__init__.py +215 -0
  1125. vllm/v1/structured_output/backend_guidance.py +244 -0
  1126. vllm/v1/structured_output/backend_types.py +133 -0
  1127. vllm/v1/structured_output/backend_xgrammar.py +317 -0
  1128. vllm/v1/structured_output/request.py +85 -0
  1129. vllm/v1/structured_output/utils.py +174 -0
  1130. vllm/v1/utils.py +294 -0
  1131. vllm/v1/worker/__init__.py +0 -0
  1132. vllm/v1/worker/block_table.py +139 -0
  1133. vllm/v1/worker/gpu_input_batch.py +680 -0
  1134. vllm/v1/worker/gpu_model_runner.py +2084 -0
  1135. vllm/v1/worker/gpu_worker.py +373 -0
  1136. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1137. vllm/v1/worker/tpu_model_runner.py +1510 -0
  1138. vllm/v1/worker/tpu_worker.py +276 -0
  1139. vllm/v1/worker/utils.py +74 -0
  1140. vllm/v1/worker/worker_base.py +64 -0
  1141. vllm/version.py +40 -0
  1142. vllm/vllm_flash_attn/.gitkeep +0 -0
  1143. vllm/worker/__init__.py +0 -0
  1144. vllm/worker/cache_engine.py +144 -0
  1145. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1146. vllm/worker/cpu_model_runner.py +671 -0
  1147. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1148. vllm/worker/cpu_worker.py +400 -0
  1149. vllm/worker/enc_dec_model_runner.py +555 -0
  1150. vllm/worker/hpu_model_runner.py +2319 -0
  1151. vllm/worker/hpu_worker.py +483 -0
  1152. vllm/worker/model_runner.py +2178 -0
  1153. vllm/worker/model_runner_base.py +281 -0
  1154. vllm/worker/multi_step_hpu_worker.py +122 -0
  1155. vllm/worker/multi_step_model_runner.py +910 -0
  1156. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1157. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1158. vllm/worker/multi_step_tpu_worker.py +107 -0
  1159. vllm/worker/multi_step_worker.py +196 -0
  1160. vllm/worker/neuron_model_runner.py +418 -0
  1161. vllm/worker/neuron_worker.py +158 -0
  1162. vllm/worker/neuronx_distributed_model_runner.py +136 -0
  1163. vllm/worker/pooling_model_runner.py +211 -0
  1164. vllm/worker/tpu_model_runner.py +908 -0
  1165. vllm/worker/tpu_worker.py +336 -0
  1166. vllm/worker/utils.py +52 -0
  1167. vllm/worker/worker.py +574 -0
  1168. vllm/worker/worker_base.py +644 -0
  1169. vllm/worker/xpu_model_runner.py +606 -0
  1170. vllm/worker/xpu_worker.py +185 -0
  1171. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/METADATA +335 -0
  1172. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/RECORD +1175 -0
  1173. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/WHEEL +5 -0
  1174. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/entry_points.txt +5 -0
  1175. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1381 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ # MLA Common Components
4
+
5
+ This file implements common components for MLA implementations.
6
+
7
+ First we define:
8
+
9
+ Sq as Q sequence length
10
+ Skv as KV sequence length
11
+
12
+ MLA has two possible ways of computing, a data-movement friendly approach and a
13
+ compute friendly approach, we generally want to use the compute friendly
14
+ approach for "prefill" (i.e. the ratio Sq / Skv is "large", near 1)
15
+ and the data-movement friendly approach for "decode" (i.e. the ratio
16
+ Sq / Skv is "small").
17
+
18
+ NOTE what we deem small and large is currently determined by whether it is labelled
19
+ prefill or decode by the scheduler, but this is something we should probably
20
+ tune.
21
+
22
+ Main reference: DeepseekV2 paper, and FlashInfer Implementation
23
+ (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
24
+
25
+ Deepseek's MLA attention works the following way:
26
+ * Use a single latent vector to represent the per-token entry of the KV cache.
27
+ * For decode (i.e. the memory friendly approach) the attention "simulates" a
28
+ multi-head attention, while the compute is similar to multi-query attention.
29
+
30
+ Below is an example of both paths, assuming batch size = 1
31
+
32
+ ## More Extent Definitions:
33
+
34
+ C Context length, `Skv - Sq`
35
+ H hidden size
36
+ N number of attention heads
37
+ Lq latent dimension for Q 1536 in DSV3
38
+ Lkv latent dimension for K/V 512 in DSV3
39
+ P nope dimension, no rope. 128 in DSV3
40
+ R rope dimension, goes through rope. 64 in DSV3
41
+ V V head dim. 128 in DSV3
42
+
43
+ ## Vector/Matrix Definitions
44
+
45
+ h_t hidden states (input to attention) shape [Sq, H]
46
+ q_c latent/compressed Q shape [Sq, Lq]
47
+ q_nope uncompressed Q (no-rope) shape [Sq, N, P]
48
+ q_pe uncompressed Q (rope) shape [Sq, N, R]
49
+ kv_c latent/compressed KV shape [Skv, Lkv]
50
+ k_pe decoupled k position embeddings shape [Skv, R]
51
+ new_kv_c new kv_c from current iter shape [Sq, Lkv]
52
+ new_k_pe new k_pe from current iter shape [Sq, R]
53
+ cache_kv_c cached kv_c from previous iters shape [C, Lkv]
54
+ cache_k_pe cached k_pe from previous iters shape [C, R]
55
+ W_DQ project h_t to q_c shape [H, Lq]
56
+ W_UQ project q_c to q_nope shape [Lq, N * P]
57
+ W_QR project q_c to q_pe shape [Lq, N * R]
58
+ W_DKV project h_t to kv_c shape [H, Lkv]
59
+ W_UK project kv_c to k_nope shape [Lkv, N, P]
60
+ W_KR project h_t to k_pe shape [H, R]
61
+ W_UV project kv_c to v shape [Lkv, N, V]
62
+ W_O project v to h_t shape [N * V, H]
63
+
64
+
65
+ ## Compute Friendly Approach (i.e. "_forward_prefill"):
66
+
67
+ q_c = h_t @ W_DQ
68
+ q_nope = (q_c @ W_UQ).view(Sq, N, P)
69
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
70
+ new_kv_c = h_t @ W_DKV
71
+ new_k_pe = RoPE(h_t @ W_KR)
72
+ kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0)
73
+ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0)
74
+ k_nope = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P)
75
+ v = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V)
76
+
77
+ // MHA with QK headdim = P + R
78
+ // V headdim = V
79
+ // spda_o shape [Sq, N, V]
80
+ spda_o = scaled_dot_product_attention(
81
+ torch.cat([q_nope, q_pe], dim=-1),
82
+ torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
83
+ v
84
+ )
85
+ return spda_o @ W_O
86
+
87
+ NOTE: in the actual code,
88
+ `kv_b_proj` is [W_UK; W_UV] concatenated per head
89
+ `q_b_proj` is [W_UQ; W_QR] concatenated per head
90
+ `out_proj` is W_O
91
+
92
+
93
+ ## Data-Movement Friendly Approach (i.e. "_forward_decode"):
94
+
95
+ Runtime
96
+ q_c = h_t @ W_DQ
97
+ q_nope = (q_c @ W_UQ).view(-1, N, P)
98
+ ql_nope = einsum("snh,lnh->snl", q_nope, W_UK)
99
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
100
+ new_kv_c = h_t @ W_DKV
101
+ new_k_pe = RoPE(h_t @ W_KR)
102
+ kv_c = torch.cat([new_kv_c, cache_kv_c], dim=0)
103
+ k_pe = torch.cat([new_k_pe, cache_k_pe], dim=0)
104
+
105
+ // MQA with QK headdim = Lkv + R
106
+ // V headdim = Lkv
107
+ // spda_o shape [Sq, N, Lkv]
108
+ // NOTE: this is less compute-friendly since Lkv > P
109
+ // but is more data-movement friendly since it's MQA vs MHA
110
+ spda_o = scaled_dot_product_attention(
111
+ torch.cat([ql_nope, q_pe], dim=-1),
112
+ torch.cat([kv_c, k_pe], dim=-1),
113
+ kv_c
114
+ )
115
+
116
+ o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV)
117
+ return o.view(-1, N * V) @ W_O
118
+
119
+
120
+ ## Chunked Prefill
121
+
122
+ For chunked prefill we want to use the compute friendly algorithm. We are
123
+ assuming sufficiently large Sq / Skv ratio, in the future may want to switch to
124
+ the data-movement friendly approach if the chunk (i.e. `Sq`) is small.
125
+
126
+ However, the compute-friendly approach can potentially run out of memory if Skv
127
+ is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)`
128
+
129
+ To mitigate this, we chunk the computation of attention with respect to the
130
+ current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can use a
131
+ fixed workspace size.
132
+
133
+ The chunked prefill approach is as follows:
134
+
135
+ MCC Max chunk of context to process per iter, computed dynamically,
136
+ used to bound the memory usage
137
+
138
+ q_c = h_t @ W_DQ
139
+ q_nope = (q_c @ W_UQ).view(Sq, N, P)
140
+ q_pe = RoPE(q_c @ W_QR).view(Sq, N, R)
141
+ new_kv_c = h_t @ W_DKV
142
+ new_k_pe = RoPE(h_t @ W_KR)
143
+ new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P)
144
+ new_v = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V)
145
+
146
+ // MHA between queries and new KV
147
+ // with QK headdim = P + R
148
+ // V headdim = V
149
+ // curr_o shape [Sq, N, V]
150
+ // curr_lse shape [N, Sq], this is just order FA returns
151
+ curr_o, curr_lse = scaled_dot_product_attention(
152
+ torch.cat([q_nope, q_pe], dim=-1),
153
+ torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
154
+ new_v,
155
+ causal=True,
156
+ return_softmax_lse=True
157
+ )
158
+
159
+ // Compute attention with the already existing context
160
+ for chunk_idx in range(cdiv(C, MCC)):
161
+ chunk_start = chunk_idx * MCC
162
+ chunk_end = min(chunk_start + MCC, C)
163
+ Sc = chunk_end - chunk_start
164
+ cache_kv_c_chunk = cache_kv_c[chunk_start:chunk_end]
165
+ cache_k_pe_chunk = cache_k_pe[chunk_start:chunk_end]
166
+ cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P)
167
+ cache_v_chunk = (cache_kv_c_chunk @ W_UV).view(-1, N, V)
168
+
169
+ chunk_o, chunk_lse = scaled_dot_product_attention(
170
+ torch.cat([q_nope, q_pe], dim=-1),
171
+ torch.cat([cache_k_nope_chunk,
172
+ cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)],
173
+ dim=-1),
174
+ cache_v_chunk,
175
+ causal=False,
176
+ return_softmax_lse=True
177
+ )
178
+
179
+ curr_o, curr_lse = merge_attn_states(
180
+ suffix_output=curr_o,
181
+ suffix_lse=curr_lse,
182
+ prefix_output=chunk_o,
183
+ prefix_lse=chunk_lse,
184
+ )
185
+
186
+ return curr_o @ W_O
187
+ """
188
+
189
+ import functools
190
+ from abc import abstractmethod
191
+ from collections import defaultdict
192
+ from contextlib import contextmanager
193
+ from dataclasses import dataclass
194
+ from itertools import accumulate
195
+ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
196
+ Type, TypeVar)
197
+
198
+ import torch
199
+
200
+ from vllm import _custom_ops as ops
201
+ from vllm import envs
202
+ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
203
+ AttentionMetadata,
204
+ AttentionMetadataBuilder,
205
+ AttentionState, MLAAttentionImpl)
206
+ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
207
+ compute_slot_mapping_start_idx,
208
+ is_block_tables_empty)
209
+ from vllm.attention.ops.merge_attn_states import merge_attn_states
210
+ from vllm.attention.utils.fa_utils import get_flash_attn_version
211
+ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
212
+ LinearBase,
213
+ UnquantizedLinearMethod)
214
+ from vllm.multimodal import MultiModalPlaceholderMap
215
+ from vllm.platforms import current_platform
216
+ from vllm.triton_utils import HAS_TRITON
217
+ from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
218
+
219
+ if HAS_TRITON:
220
+ from vllm.attention.ops.triton_flash_attention import triton_attention
221
+ else:
222
+ triton_attention = None
223
+
224
+ try:
225
+ from vllm.vllm_flash_attn import flash_attn_varlen_func
226
+ is_vllm_fa = True
227
+ except ImportError:
228
+ is_vllm_fa = False
229
+ try:
230
+ # For rocm use upstream flash attention
231
+ from flash_attn import flash_attn_varlen_func
232
+ except ImportError:
233
+ flash_attn_varlen_func = None
234
+
235
+ if TYPE_CHECKING:
236
+ from vllm.worker.model_runner import (ModelInputForGPUBuilder,
237
+ ModelInputForGPUWithSamplingMetadata)
238
+
239
+ is_hip = current_platform.is_rocm()
240
+
241
+
242
class MLACommonBackend(AttentionBackend):
    """Backend descriptor shared by the MLA attention implementations.

    Every member is a static accessor: it either names one of the companion
    classes defined in this module or forwards to a kernel in
    `vllm._custom_ops`.
    """

    @staticmethod
    def get_name() -> str:
        """Return the identifier string of this backend."""
        backend_name = "TRITON_MLA"
        return backend_name

    @staticmethod
    def get_metadata_cls() -> Type["AttentionMetadata"]:
        """Return the metadata class used with this backend."""
        return MLACommonMetadata

    @staticmethod
    def get_builder_cls() -> Type["MLACommonMetadataBuilder"]:
        """Return the class that builds the metadata for this backend."""
        return MLACommonMetadataBuilder

    @staticmethod
    def get_state_cls() -> Type["MLACommonState"]:
        """Return the attention-state class used with this backend."""
        return MLACommonState

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,  # assumed to be 1 for MLA
        head_size: int,
    ) -> Tuple[int, ...]:
        """Return the KV-cache shape `(num_blocks, block_size, head_size)`.

        `num_kv_heads` is accepted for interface compatibility but does not
        contribute a dimension to the returned shape.
        """
        cache_shape = (num_blocks, block_size, head_size)
        return cache_shape

    @staticmethod
    def swap_blocks(
        src_kv_cache: torch.Tensor,
        dst_kv_cache: torch.Tensor,
        src_to_dst: torch.Tensor,
    ) -> None:
        """Forward the block swap request to `ops.swap_blocks`."""
        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
        """Forward the block copy request to `ops.copy_blocks_mla`."""
        ops.copy_blocks_mla(kv_caches, src_to_dists)

    @staticmethod
    def get_supported_head_sizes() -> List[int]:
        """Return a new list with the head sizes this backend accepts."""
        supported = [576]
        return supported
287
+
288
+
289
+ T = TypeVar("T", bound="MLACommonMetadata")
290
+
291
+
292
class MLACommonState(AttentionState, Generic[T]):
    """Attention state shared by the MLA backends.

    Owns the temporary buffers used while CUDA graphs are being captured
    and, when chunked prefill or prefix caching is enabled, the size of the
    context-chunk workspace (the tensor itself is allocated lazily in
    `begin_forward`).
    """

    def __init__(self, runner):
        """Record the runner and derive the context-chunk workspace size.

        Args:
            runner: Object exposing `scheduler_config`, `model_config` and
                `cache_config`; it is also stored for later use.
        """
        self.runner = runner
        self._is_graph_capturing = False

        scheduler_config = runner.scheduler_config
        self.model_config = runner.model_config
        cache_config = runner.cache_config

        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
        self.enable_prefix_caching = cache_config.enable_prefix_caching

        if self.chunked_prefill_enabled or self.enable_prefix_caching:
            self.context_chunk_workspace_size = min(
                # Make sure there is enough for 8 full length requests or at
                # least 4 pages of cache per request
                max(
                    8 * self.model_config.max_model_len, 4 *
                    scheduler_config.max_num_seqs * cache_config.block_size),
                # For long-context models try not to over-allocate, which
                # would limit kv-cache space: cap the workspace at
                # 128 * 1024 tokens. Assuming a 576 MLA head dim and a
                # 2-byte dtype the workspace is at most
                #   2 * 576 * (128 * 1024) bytes = 144 MiB.
                # NOTE(review): the previous comment quoted a 64k-token cap
                # and a 3gb up-projected context (192 QK head dim, 128
                # heads, fp16); the value in code is 128 * 1024 -- confirm
                # which one is intended.
                128 * 1024)
            assert self.context_chunk_workspace_size >= \
                scheduler_config.max_num_seqs * cache_config.block_size

    @contextmanager
    def graph_capture(self, max_batch_size: int):
        """Context manager that provides the buffers used for graph capture.

        The buffers are created on `self.runner.device` on entry and removed
        on exit. Cleanup runs in a `finally` clause so that an exception
        raised inside the `with` body does not leave the state permanently
        flagged as capturing.
        """
        self._graph_slot_mapping = torch.full((max_batch_size, ),
                                              PAD_SLOT_ID,
                                              dtype=torch.long,
                                              device=self.runner.device)
        self._graph_seq_lens = torch.ones(max_batch_size,
                                          dtype=torch.int32,
                                          device=self.runner.device)
        self._graph_block_tables = torch.from_numpy(
            self.runner.graph_block_tables).to(device=self.runner.device)

        self._positions = torch.zeros((max_batch_size, ),
                                      dtype=torch.long,
                                      device=self.runner.device)

        self._is_graph_capturing = True
        try:
            yield
        finally:
            self._is_graph_capturing = False
            del self._graph_slot_mapping
            del self._graph_seq_lens
            del self._graph_block_tables
            del self._positions

    def graph_clone(self, batch_size: int):
        """Return a fresh state of the same class bound to the same runner.

        `batch_size` is not used. May only be called during graph capture.
        """
        assert self._is_graph_capturing
        return self.__class__(self.runner)

    def graph_capture_get_metadata_for_batch(
            self,
            batch_size: int,
            is_encoder_decoder_model: bool = False) -> T:
        """Build decode-only metadata backed by the capture buffers.

        Raises:
            NotImplementedError: if `is_encoder_decoder_model` is true.
        """
        assert self._is_graph_capturing

        # Reject unsupported models before doing any work.
        if is_encoder_decoder_model:
            raise NotImplementedError(
                "MLACommonState does not support encoder/decoder yet")

        attn_metadata = self.runner.attn_backend.make_metadata(
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=False,
            use_cuda_graph=True,
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=batch_size,
            slot_mapping=self._graph_slot_mapping[:batch_size],
            seq_lens=None,
            seq_lens_tensor=self._graph_seq_lens[:batch_size],
            max_query_len=1,
            max_decode_query_len=1,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.runner.max_seq_len_to_capture,
            query_start_loc=None,
            seq_start_loc=None,
            context_lens_tensor=None,
            block_tables=self._graph_block_tables[:batch_size],
            head_dim=self.runner.model_config.get_head_size())

        return attn_metadata

    def get_graph_input_buffers(self,
                                attn_metadata,
                                is_encoder_decoder_model: bool = False):
        """Return the tensors of `attn_metadata` used as graph inputs.

        Raises:
            NotImplementedError: if `is_encoder_decoder_model` is true.
        """
        if is_encoder_decoder_model:
            raise NotImplementedError(
                "MLACommonState does not support encoder/decoder yet")

        input_buffers = {
            "slot_mapping": attn_metadata.slot_mapping,
            "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
            "block_tables": attn_metadata.decode_metadata.block_tables,
        }
        return input_buffers

    def prepare_graph_input_buffers(self,
                                    input_buffers,
                                    attn_metadata,
                                    is_encoder_decoder_model: bool = False):
        """Copy the decode tensors of `attn_metadata` into `input_buffers`.

        Only `seq_lens_tensor` and `block_tables` are copied here.

        Raises:
            NotImplementedError: if `is_encoder_decoder_model` is true; the
                check happens before any buffer is modified.
        """
        if is_encoder_decoder_model:
            raise NotImplementedError(
                "MLACommonState does not support encoder/decoder yet")

        input_buffers["seq_lens_tensor"].copy_(
            attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
        input_buffers["block_tables"].copy_(
            attn_metadata.decode_metadata.block_tables, non_blocking=True)

    def begin_forward(self, model_input):
        """Attach the context-chunk workspace to the metadata of the input.

        Does nothing unless chunked prefill or prefix caching is enabled.
        """
        if self.chunked_prefill_enabled or self.enable_prefix_caching:
            if not hasattr(self, "context_chunk_workspace"):
                # NOTE: self.runner.device does not return the correct
                # device for this process (init_device sets the correct
                # device but only on the Worker). The only way found so far
                # to get the correct device is to allocate the workspace on
                # the first call to begin_forward and use the device of the
                # input tokens.
                assert model_input.input_tokens is not None
                self.context_chunk_workspace = torch.empty(
                    (self.context_chunk_workspace_size,
                     self.model_config.get_head_size()),
                    dtype=self.model_config.dtype,
                    device=model_input.input_tokens.device,
                )

            model_input.attn_metadata.context_chunk_workspace = \
                self.context_chunk_workspace
430
+
431
+
432
@dataclass
class MLACommonMetadata(AttentionMetadata):
    """Metadata for MLACommon.

    NOTE: Please read the comment at the top of the file before trying to
    understand this class

    NOTE: Any python object stored here is not updated when it is
    cuda-graph replayed. If you have values that need to be changed
    dynamically, it should be stored in tensor. The tensor has to be
    updated from `CUDAGraphRunner.forward` API.
    """
    # Whether or not if cuda graph is enabled.
    # Cuda-graph is currently enabled for decoding only.
    # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
    use_cuda_graph: bool

    # NOTE(sang): Definition of context_len, query_len, and seq_len.
    # |---------- N-1 iteration --------|
    # |---------------- N iteration ---------------------|
    # |- tokenA -|......................|-- newTokens ---|
    # |---------- context_len ----------|
    # |-------------------- seq_len ---------------------|
    #                                   |-- query_len ---|

    # (batch_size,). The sequence length per sequence. Sequence length means
    # the computed tokens + new tokens None if it is a decoding.
    seq_lens: Optional[List[int]]
    # seq_lens stored as a tensor.
    seq_lens_tensor: Optional[torch.Tensor]

    # Maximum sequence length among prefill batch. 0 if there are decoding
    # requests only.
    max_prefill_seq_len: int
    # Maximum sequence length among decode batch. 0 if there are prefill
    # requests only.
    max_decode_seq_len: int
    # (batch_size,) A tensor of context lengths (tokens that are computed
    # so far).
    context_lens_tensor: Optional[torch.Tensor]

    # (batch_size, max_blocks_per_seq).
    # Block addresses per sequence. (Seq id -> list of physical block)
    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
    # in the kv cache. Each block can contain up to block_size tokens.
    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
    # captured.
    block_tables: Optional[torch.Tensor]

    # Maximum query length in the batch.
    max_query_len: Optional[int] = None

    # Max number of query tokens among request in the batch.
    max_decode_query_len: Optional[int] = None

    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
    # the batch, used to index into subquery. E.g., if the subquery length
    # is [4, 6], it is [0, 4, 10].
    query_start_loc: Optional[torch.Tensor] = None
    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
    # the batch, used to index into sequence. E.g., if the sequence length is
    # [4, 6], it is [0, 4, 10].
    seq_start_loc: Optional[torch.Tensor] = None

    # Lazily built results of the `prefill_metadata` / `decode_metadata`
    # properties.
    _cached_prefill_metadata: Optional[Any] = None
    _cached_decode_metadata: Optional[Any] = None

    # NOTE(review): a field without a default placed after defaulted fields;
    # presumably this re-declares a field already defined on
    # `AttentionMetadata` (so its position is inherited) -- confirm.
    num_prefill_tokens: int

    # The dimension of the attention heads
    head_dim: Optional[int] = None

    # Used when chunked prefill is enabled to simulate worst case workspace
    # allocations, hopefully to avoid going OOM
    is_profile_run: bool = False

    # New for MLA (compared to FlashAttention)
    # For chunked prefill
    context_chunk_cu_seq_lens: Optional[torch.Tensor] = None
    context_chunk_starts: Optional[torch.Tensor] = None
    context_chunk_seq_tot: Optional[List[int]] = None
    context_chunk_max_seq_lens: Optional[List[int]] = None
    # Set by MLAAttentionState in `begin_forward` so it doesn't get broadcasted
    context_chunk_workspace: Optional[torch.Tensor] = None

    def __post_init__(self):
        """Validate `head_dim` against the sizes supported by the backend.

        Raises:
            ValueError: if `head_dim` is set and is not a supported size.
        """
        supported_head_sizes = MLACommonBackend.get_supported_head_sizes()
        if self.head_dim is not None and self.head_dim \
                not in supported_head_sizes:
            # Adjacent f-strings are concatenated into ONE message; passing
            # them as separate arguments would make the error render as a
            # tuple.
            raise ValueError(
                f"Only {supported_head_sizes} are supported for head_dim, "
                f"received {self.head_dim}.")

    @property
    def prefill_metadata(self):
        """Metadata restricted to the prefill requests, or None.

        Returns None when `num_prefills` is 0. The result is built once
        and cached on the instance.
        """
        if self.num_prefills == 0:
            return None

        if self._cached_prefill_metadata is not None:
            return self._cached_prefill_metadata

        assert self.seq_lens is not None
        assert self.seq_lens_tensor is not None

        # Compute some attn_metadata fields which default to None
        query_start_loc = (None if self.query_start_loc is None else
                           self.query_start_loc[:self.num_prefills + 1])
        slot_mapping = (None if self.slot_mapping is None else
                        self.slot_mapping[:self.num_prefill_tokens])
        seq_lens = (None if self.seq_lens is None else
                    self.seq_lens[:self.num_prefills])
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[:self.num_prefills])
        seq_start_loc = (None if self.seq_start_loc is None else
                         self.seq_start_loc[:self.num_prefills + 1])
        context_lens_tensor = (None if self.context_lens_tensor is None else
                               self.context_lens_tensor[:self.num_prefills])
        block_tables = (None if self.block_tables is None else
                        self.block_tables[:self.num_prefills])

        self._cached_prefill_metadata = self.__class__(
            # Required by ModelRunner
            use_cuda_graph=False,  # Not Attention Related
            # Required by Attention Metadata
            num_prefills=self.num_prefills,
            num_prefill_tokens=self.num_prefill_tokens,
            num_decode_tokens=0,
            slot_mapping=slot_mapping,
            # Required by Attention Metadata (not used)
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=False,
            # MLACommonMetadata
            seq_lens=seq_lens,
            seq_lens_tensor=seq_lens_tensor,
            max_query_len=self.max_query_len,
            max_prefill_seq_len=self.max_prefill_seq_len,
            max_decode_query_len=0,
            max_decode_seq_len=0,
            query_start_loc=query_start_loc,
            seq_start_loc=seq_start_loc,
            context_lens_tensor=context_lens_tensor,
            block_tables=block_tables,
            head_dim=self.head_dim,
            is_profile_run=self.is_profile_run,
            # MLACommonMetadata Chunk prefill specific
            context_chunk_cu_seq_lens=self.context_chunk_cu_seq_lens,
            context_chunk_starts=self.context_chunk_starts,
            context_chunk_seq_tot=self.context_chunk_seq_tot,
            context_chunk_max_seq_lens=self.context_chunk_max_seq_lens,
        )
        return self._cached_prefill_metadata

    @property
    def decode_metadata(self):
        """Metadata restricted to the decode requests, or None.

        Returns None when `num_decode_tokens` is 0. The result is built
        once and cached on the instance.
        """
        if self.num_decode_tokens == 0:
            return None

        if self._cached_decode_metadata is not None:
            return self._cached_decode_metadata
        assert self.seq_lens_tensor is not None

        # Compute some attn_metadata fields which default to None
        slot_mapping = (None if self.slot_mapping is None else
                        self.slot_mapping[self.num_prefill_tokens:])
        seq_lens_tensor = (None if self.seq_lens_tensor is None else
                           self.seq_lens_tensor[self.num_prefills:])
        block_tables = (None if self.block_tables is None else
                        self.block_tables[self.num_prefills:])

        self._cached_decode_metadata = self.__class__(
            # Required by ModelRunner
            use_cuda_graph=self.use_cuda_graph,  # Not Attention Related
            # Required by Attention Metadata
            num_prefills=0,
            num_prefill_tokens=0,
            num_decode_tokens=self.num_decode_tokens,
            slot_mapping=slot_mapping,
            # Required by Attention Metadata (not used)
            multi_modal_placeholder_index_maps=None,
            enable_kv_scales_calculation=False,
            # MLACommonMetadata
            seq_lens=None,
            seq_lens_tensor=seq_lens_tensor,
            max_decode_query_len=self.max_decode_query_len,
            max_query_len=self.max_query_len,
            max_prefill_seq_len=0,
            max_decode_seq_len=self.max_decode_seq_len,
            # Batch may be composed of prefill|decodes, adjust query start
            # indices to refer to the start of decodes. E.g.
            # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
            query_start_loc=(self.query_start_loc[self.num_prefills:] -
                             self.query_start_loc[self.num_prefills])
            if self.query_start_loc is not None else None,
            seq_start_loc=self.seq_start_loc[self.num_prefills:]
            if self.seq_start_loc is not None else None,
            context_lens_tensor=None,
            block_tables=block_tables,
            head_dim=self.head_dim,
            is_profile_run=self.is_profile_run)
        return self._cached_decode_metadata

    def advance_step(self,
                     model_input: "ModelInputForGPUWithSamplingMetadata",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int,
                     num_seqs: int,
                     num_queries: int,
                     turn_prefills_into_decodes: bool = False):
        """
        Update metadata in-place to advance one decode step.

        Args:
            model_input: Supplies `input_tokens` and `input_positions`,
                which are handed to `_ops_advance_step`.
            sampled_token_ids: Forwarded to `_ops_advance_step`.
            block_size: Forwarded to `_ops_advance_step`.
            num_seqs: Number of (possibly padded) sequences in the batch.
            num_queries: Actual number of requests in the batch.
            turn_prefills_into_decodes: When true, the prefills currently
                recorded are re-counted as decodes before advancing.
        """
        # When using cudagraph, the num_seqs is padded to the next captured
        # batch sized, but num_queries tracks the actual number of requests in
        # the batch. For --enforce-eager mode, num_seqs == num_queries
        if num_seqs != num_queries:
            assert num_seqs > num_queries

        if turn_prefills_into_decodes:
            # When Multi-Step is enabled with Chunked-Prefill, prefills and
            # decodes are scheduled together. In the first step, all the
            # prefills turn into decodes. This update reflects that
            # conversion.
            assert self.num_decode_tokens + self.num_prefills == num_seqs
            self.num_decode_tokens += self.num_prefills
            self.num_prefills = 0
            self.num_prefill_tokens = 0
            self.max_prefill_seq_len = 0
            self.max_query_len = 1

            self.slot_mapping = self.slot_mapping[:num_seqs]
        else:
            assert self.seq_lens is not None
            assert self.max_decode_seq_len == max(self.seq_lens)

        assert self.num_prefills == 0
        assert self.num_prefill_tokens == 0
        assert self.num_decode_tokens == num_seqs
        assert self.slot_mapping.shape == (num_seqs, )

        assert self.seq_lens is not None
        assert len(self.seq_lens) == num_seqs
        assert self.seq_lens_tensor is not None
        assert self.seq_lens_tensor.shape == (num_seqs, )
        assert self.max_query_len == 1
        assert self.max_prefill_seq_len == 0

        assert self.query_start_loc is not None
        assert self.query_start_loc.shape == (num_queries + 1, )
        assert self.seq_start_loc is not None
        assert self.seq_start_loc.shape == (num_seqs + 1, )

        assert self.context_lens_tensor is not None
        assert self.context_lens_tensor.shape == (num_queries, )

        assert self.block_tables is not None
        assert self.block_tables.shape[0] == num_seqs

        # Update query lengths. Note that we update only queries and not seqs,
        # since tensors may be padded due to captured cuda graph batch size
        for i in range(num_queries):
            self.seq_lens[i] += 1
        self.max_decode_seq_len = max(self.seq_lens)

        self._ops_advance_step(num_seqs=num_seqs,
                               num_queries=num_queries,
                               block_size=block_size,
                               input_tokens=model_input.input_tokens,
                               sampled_token_ids=sampled_token_ids,
                               input_positions=model_input.input_positions)

    def _ops_advance_step(self, num_seqs: int, num_queries: int,
                          block_size: int, input_tokens: torch.Tensor,
                          sampled_token_ids: torch.Tensor,
                          input_positions: torch.Tensor) -> None:
        """Call `ops.advance_step_flashattn` with this metadata's tensors.

        `seq_lens_tensor`, `slot_mapping` and `block_tables` of this
        instance are passed along with the given arguments.
        """
        ops.advance_step_flashattn(num_seqs=num_seqs,
                                   num_queries=num_queries,
                                   block_size=block_size,
                                   input_tokens=input_tokens,
                                   sampled_token_ids=sampled_token_ids,
                                   input_positions=input_positions,
                                   seq_lens=self.seq_lens_tensor,
                                   slot_mapping=self.slot_mapping,
                                   block_tables=self.block_tables)
716
+
717
+
718
+ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
719
+ """
720
+ NOTE: Please read the comment at the top of the file before trying to
721
+ understand this class
722
+ """
723
+ BLOCK_TABLE_EXTENDER: list[list[int]] = []
724
+
725
def __init__(self, input_builder: "ModelInputForGPUBuilder"):
    """Cache the settings of `input_builder` and its runner on the builder.

    `context_chunk_workspace_size` and `page_size` are only set when
    chunked prefill or prefix caching is enabled.
    """
    runner = input_builder.runner

    self.input_builder = input_builder
    self.runner = runner
    self.sliding_window = input_builder.sliding_window
    self.block_size = input_builder.block_size

    chunked_prefill = runner.scheduler_config.chunked_prefill_enabled
    prefix_caching = runner.cache_config.enable_prefix_caching
    self.chunked_prefill_enabled = chunked_prefill
    self.enable_prefix_caching = prefix_caching

    if not (chunked_prefill or prefix_caching):
        return

    # The workspace size is owned by the runner's attention state.
    self.context_chunk_workspace_size = \
        runner.attn_state.context_chunk_workspace_size
    self.page_size = runner.block_size
740
+
741
def prepare(self):
    """Reset every accumulator used while building one batch of metadata."""
    # Per-token / per-sequence lists collected by `_add_seq_group`.
    self.slot_mapping: List[int] = []
    self.prefill_seq_lens: List[int] = []
    self.context_lens: List[int] = []
    self.block_tables: List[List[int]] = []
    self.curr_seq_lens: List[int] = []

    # Placeholder maps are created on first access of a key.
    self.multimodal_placeholder_maps: Dict[
        str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)

    # Counters.
    self.num_prefills = 0
    self.num_prefill_tokens = 0
    self.num_decode_tokens = 0

    self.has_prefix_cache_hit = False
754
+
755
def _add_seq_group(
        self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
        chunked_prefill_enabled: bool, prefix_cache_hit: bool):
    """Fold one sequence group into the builder's accumulators.

    For every sequence of the group this records its context length,
    updates the prefill or decode counters, appends its block table and
    extends the slot mapping.
    """
    is_prompt = inter_data.is_prompt
    block_tables = inter_data.block_tables
    token_lens = [len(tokens) for tokens in inter_data.input_tokens]

    per_sequence = zip(inter_data.seq_ids, token_lens,
                       inter_data.orig_seq_lens, inter_data.seq_lens,
                       inter_data.query_lens, inter_data.context_lens,
                       inter_data.curr_sliding_window_blocks)

    for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
         curr_sliding_window_block) in per_sequence:
        self.context_lens.append(context_len)

        if not is_prompt:
            self.num_decode_tokens += query_len
            self.curr_seq_lens.append(curr_seq_len)
        else:
            self.num_prefills += 1
            self.num_prefill_tokens += token_len
            self.prefill_seq_lens.append(seq_len)

        # Select the block table for this sequence.
        # TODO(sang): Combine chunked prefill and prefix caching by
        # only allowing multiple of block_size chunk size.
        # NOTE: This only works for oooooooxxx style attention.
        if prefix_cache_hit:
            # NOTE(woosuk): For flash-attn, the block table should
            # include the entries for the incoming prefill tokens.
            selected_table = block_tables[seq_id]
        elif (block_tables is not None
              and (chunked_prefill_enabled or not is_prompt)):
            selected_table = block_tables[seq_id]
            if curr_sliding_window_block != 0:
                # Keep only the trailing blocks of the sliding window.
                selected_table = selected_table[-curr_sliding_window_block:]
        else:
            selected_table = []
        self.block_tables.append(selected_table)

        # Extend the slot mapping for this sequence.
        profiling = is_block_tables_empty(block_tables)
        first_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
                                                   context_len,
                                                   self.sliding_window)
        compute_slot_mapping(profiling, self.slot_mapping, seq_id, seq_len,
                             context_len, first_idx, self.block_size,
                             inter_data.block_tables)
807
+
808
def _get_graph_runner_block_tables(
        self, num_seqs: int,
        block_tables: List[List[int]]) -> torch.Tensor:
    """Write `block_tables` into the runner's graph buffer and return it.

    The first `num_seqs` rows of `self.runner.graph_block_tables` are
    updated in place; the returned tensor is built from that slice and
    moved to `self.runner.device`.
    """
    # The shape of graph_block_tables is
    # [max batch size, max context len // block size].
    capacity, row_width = self.runner.graph_block_tables.shape
    assert capacity >= num_seqs

    rows = self.runner.graph_block_tables[:num_seqs]
    for row_idx, entries in enumerate(block_tables):
        if not entries:
            continue
        # It may be possible to have more blocks allocated due to lookahead
        # slots of multi-step, however, they are not used anyway, so the
        # excess beyond the row width can be safely ignored.
        kept = min(len(entries), row_width)
        rows[row_idx, :kept] = entries[:kept]

    as_tensor = torch.from_numpy(rows)
    return as_tensor.to(device=self.runner.device, non_blocking=True)
831
+
832
def build(self, seq_lens: List[int], query_lens: List[int],
          cuda_graph_pad_size: int, batch_size: int):
    """Build attention metadata with on-device tensors.

    Every entry of ``self.input_builder.inter_data_list`` is first fed
    through ``self._add_seq_group``; the accumulated host-side lists are
    then converted to tensors and handed to
    ``self.runner.attn_backend.make_metadata``.

    Args:
        seq_lens: The maybe padded sequence lengths of the input sequences.
        query_lens: The query lengths of the input sequences.
        cuda_graph_pad_size: The padding size for cuda graph.
                             -1 if cuda graph is not used.
        batch_size: The maybe padded batch size.

    Returns:
        The object returned by ``self.runner.attn_backend.make_metadata``.
    """
    inter_data_list = self.input_builder.inter_data_list
    prefix_cache_hit = any(inter_data.prefix_cache_hit
                           for inter_data in inter_data_list)

    for inter_data in inter_data_list:
        self._add_seq_group(inter_data,
                            self.input_builder.chunked_prefill_enabled,
                            prefix_cache_hit)

    device = self.runner.device
    use_captured_graph = cuda_graph_pad_size != -1

    max_query_len = max(query_lens)
    # Entries after the first `num_prefills` are treated as decodes.
    decode_query_lens = query_lens[self.num_prefills:]
    max_decode_query_len = max(decode_query_lens, default=1)
    max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
    max_decode_seq_len = max(self.curr_seq_lens, default=0)
    num_decode_tokens = self.num_decode_tokens
    query_start_loc = list(accumulate(query_lens, initial=0))
    seq_start_loc = list(accumulate(seq_lens, initial=0))

    num_seqs = len(seq_lens)
    if use_captured_graph:
        # Pad the host-side lists up to the captured graph's batch size.
        self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
        self.block_tables.extend(self.__class__.BLOCK_TABLE_EXTENDER *
                                 cuda_graph_pad_size)
        num_decode_tokens = batch_size - self.num_prefill_tokens

        block_tables = self._get_graph_runner_block_tables(
            num_seqs, self.block_tables)
    else:
        block_tables = make_tensor_with_pad(
            self.block_tables,
            pad=0,
            dtype=torch.int,
            device=device,
        )
    assert max_query_len > 0, f"query_lens: {query_lens}"

    assert device is not None
    pin_memory = self.runner.pin_memory
    context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
                                           device, pin_memory)
    seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
                                       pin_memory)
    slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
                                           device, pin_memory)
    query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32,
                                              device, pin_memory)
    seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                            device, pin_memory)

    context_chunk_cu_seq_lens = None
    context_chunk_starts = None
    context_chunk_seq_tot = None
    context_chunk_max_seq_lens = None

    if (self.chunked_prefill_enabled or self.enable_prefix_caching) \
        and self.num_prefills > 0 \
        and context_lens_tensor is not None \
        and context_lens_tensor[:self.num_prefills].max() > 0:

        # NOTE: it is recommended you read the `Chunked Prefill` section
        # in the comment at the top of the file before trying to
        # understand the following code

        # Only the prefill rows take part in context chunking.
        prefill_context_lens = context_lens_tensor[:self.num_prefills]

        num_prefills_with_context = \
            (prefill_context_lens > 0).sum().item()

        # currently we allocate an equal amount of workspace for each
        # prefill in the batch, we could probably use a more advanced
        # algorithm here and allocate more workspace to prefills with
        # longer context lengths
        max_context_chunk = \
            self.context_chunk_workspace_size // num_prefills_with_context

        # align max_context_chunk to page_size by rounding down,
        # currently the `gather_cache` kernel cannot handle
        # `context_chunk_starts` that are not aligned to page_size
        max_context_chunk = round_down(max_context_chunk, self.page_size)
        assert max_context_chunk > 0

        # Size the chunk count from the longest *prefill* context. Taking
        # the maximum over the whole batch would let a long decode
        # context add trailing chunks in which every prefill has length
        # zero.
        num_chunks = cdiv(int(prefill_context_lens.max().item()),
                          max_context_chunk)

        # if `max_context_chunk = 256`, `num_chunks = 3`, and
        # `num_prefills = 4`, create a tensor that looks like
        # [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]
        context_chunk_starts = \
            torch.arange(num_chunks, device=device, dtype=torch.int32)\
            .unsqueeze(1).expand(-1, self.num_prefills)\
            * max_context_chunk
        chunk_ends = torch.min(prefill_context_lens.unsqueeze(0),
                               context_chunk_starts + max_context_chunk)
        # Chunks that start past a sequence's context get length 0.
        chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0)
        _context_chunk_cu_seq_lens = chunk_seq_lens.cumsum(dim=1).to(
            torch.int32)
        # Prepend a zero column so each row is a cumulative-length vector.
        zero = torch.zeros(num_chunks, dtype=torch.int32, device=device)\
            .unsqueeze(-1)
        context_chunk_cu_seq_lens = \
            torch.cat([zero, _context_chunk_cu_seq_lens], dim=1)
        context_chunk_max_seq_lens = \
            chunk_seq_lens.max(dim=1).values.tolist()
        context_chunk_seq_tot = chunk_seq_lens.sum(dim=1).tolist()
        assert max(context_chunk_seq_tot) <= \
            self.context_chunk_workspace_size

    return self.runner.attn_backend.make_metadata(
        # Required by ModelRunner
        use_cuda_graph=use_captured_graph,  # Not Attention Related
        # Required by Attention Metadata
        num_prefills=self.num_prefills,
        slot_mapping=slot_mapping_tensor,
        num_prefill_tokens=self.num_prefill_tokens,
        num_decode_tokens=num_decode_tokens,
        # Required by Attention Metadata (not used)
        multi_modal_placeholder_index_maps=None,  # Not Attention Related
        enable_kv_scales_calculation=False,
        # MLACommonMetadata
        seq_lens=seq_lens,
        seq_lens_tensor=seq_lens_tensor,
        max_query_len=max_query_len,
        max_decode_query_len=max_decode_query_len,
        max_prefill_seq_len=max_prefill_seq_len,
        max_decode_seq_len=max_decode_seq_len,
        query_start_loc=query_start_loc_tensor,
        seq_start_loc=seq_start_loc_tensor,
        context_lens_tensor=context_lens_tensor,
        block_tables=block_tables,
        head_dim=self.runner.model_config.get_head_size(),
        is_profile_run=self.runner.in_profile_run,
        # MLACommonMetadata Chunk prefill specific
        context_chunk_cu_seq_lens=context_chunk_cu_seq_lens,
        context_chunk_starts=context_chunk_starts,
        context_chunk_seq_tot=context_chunk_seq_tot,
        context_chunk_max_seq_lens=context_chunk_max_seq_lens,
    )
982
+
983
+
984
class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
    """
    Common MLA attention implementation: owns the prefill path and the
    shared pre/post-processing, and leaves `_forward_decode` to subclasses.

    NOTE: Please read the comment at the top of the file before trying to
    understand this class
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[List[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        blocksparse_params: Optional[Dict[str, Any]],
        logits_soft_cap: Optional[float],
        attn_type: str,
        # MLA Specific Arguments
        q_lora_rank: Optional[int],
        kv_lora_rank: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        qk_head_dim: int,
        v_head_dim: int,
        kv_b_proj: ColumnParallelLinear,
    ) -> None:
        """Record the attention dimensions and select the attention kernels.

        `alibi_slopes`, `sliding_window`, `blocksparse_params`,
        `logits_soft_cap` and `attn_type` are accepted but not read by this
        constructor.
        """
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        self.kv_cache_dtype = kv_cache_dtype

        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_head_dim
        self.v_head_dim = v_head_dim
        self.kv_b_proj = kv_b_proj

        self.triton_fa_func = triton_attention
        # Handle the differences between the flash_attn_varlen from flash_attn
        # and the one from vllm_flash_attn. The former is used on RoCM and the
        # latter has an additional parameter to control FA2 vs FA3
        self.flash_attn_varlen_func = flash_attn_varlen_func
        self.vllm_flash_attn_version = get_flash_attn_version()
        if self.vllm_flash_attn_version is not None:
            self.flash_attn_varlen_func = \
                functools.partial(flash_attn_varlen_func,
                                  fa_version=self.vllm_flash_attn_version)

        # For MLA the v head dim is smaller than qk head dim so we pad out
        # v with 0s to match the qk head dim for attention backends that do
        # not support different headdims
        # We don't need to pad V if we are on a hopper system with FA3
        self._pad_v = self.vllm_flash_attn_version is None or not (
            self.vllm_flash_attn_version == 3
            and current_platform.get_device_capability()[0] == 9)

    def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale,
                                         return_softmax_lse, **kwargs):
        """Run varlen attention where `v` may have a smaller head dim than `q`.

        When `self._pad_v` is set, `v` is zero-padded on its last dimension
        up to `q.shape[-1]` before the kernel call and the output is sliced
        back to `v.shape[-1]` afterwards.

        Args:
            q, k, v: Query, key and value tensors.
            softmax_scale: Scale forwarded to the attention kernel.
            return_softmax_lse: Whether the second kernel output should be
                returned as well.
            **kwargs: Forwarded to the kernel. The triton branch reads
                `cu_seqlens_q`, `cu_seqlens_k`, `max_seqlen_q`,
                `max_seqlen_k` and `causal` from it.

        Returns:
            `attn_out` alone, or `(attn_out, lse)` when
            `return_softmax_lse` is truthy.
        """
        maybe_padded_v = v
        if self._pad_v:
            maybe_padded_v = torch.nn.functional.pad(
                v, [0, q.shape[-1] - v.shape[-1]], value=0)

        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN \
            and not return_softmax_lse:
            attn_out = self.triton_fa_func(
                q,
                k,
                maybe_padded_v,
                None,  # output
                kwargs["cu_seqlens_q"],
                kwargs["cu_seqlens_k"],
                kwargs["max_seqlen_q"],
                kwargs["max_seqlen_k"],
                kwargs["causal"],
                softmax_scale,
                None,  # bias
            )
        elif is_vllm_fa:
            attn_out = self.flash_attn_varlen_func(
                q=q,
                k=k,
                v=maybe_padded_v,
                return_softmax_lse=return_softmax_lse,
                softmax_scale=softmax_scale,
                **kwargs,
            )
        else:
            # Use return_attn_probs instead of return_softmax_lse for RoCM
            attn_out = self.flash_attn_varlen_func(
                q=q,
                k=k,
                v=maybe_padded_v,
                return_attn_probs=return_softmax_lse,
                softmax_scale=softmax_scale,
                **kwargs,
            )

        # Unpack the output if there are multiple results,
        # triton always returns (output, softmax_lse),
        # vllm_flash_attn returns (output, softmax_lse) when
        # `return_softmax_lse = True`
        # flash_attn (RoCM) returns (output, softmax_lse, ...) when
        # `return_attn_probs = True`
        rest = None
        if isinstance(attn_out, tuple):
            attn_out, *rest = attn_out

        # unpad if necessary
        if self._pad_v:
            attn_out = attn_out[..., :v.shape[-1]]

        # Remain consistent with old `flash_attn_varlen_func` where there
        # is only one output tensor if `return_softmax_lse` is False.
        if return_softmax_lse:
            assert rest is not None
            return attn_out, rest[0]
        return attn_out

    def _v_up_proj(self, x):
        """Apply the per-head `W_UV` projection and flatten the heads.

        `self.W_UV` is set by `process_weights_after_loading`.
        """
        # Convert from (B, N, L) to (N, B, L)
        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
        # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
        x = torch.bmm(x, self.W_UV)
        # Convert from (N, B, V) to (B, N * V)
        return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)

    def process_weights_after_loading(self, act_dtype: torch.dtype):
        """Derive `self.W_UV` and `self.W_UK_T` from `self.kv_b_proj`.

        The projection weight is dequantized when its quant method is not
        `UnquantizedLinearMethod`, checked against the expected shape, and
        split per head into its key ("nope") and value parts.

        Args:
            act_dtype: dtype of the identity matrix used for dequantization.
        """

        def get_layer_weight(layer):
            # Return the first weight-like attribute present on the layer.
            WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
            for attr in WEIGHT_NAMES:
                if hasattr(layer, attr):
                    return getattr(layer, attr)
            raise AttributeError(
                f"Layer '{layer}' has no recognized weight attribute:"
                f" {WEIGHT_NAMES}.")

        def get_and_maybe_dequant_weights(layer: LinearBase):
            if not isinstance(layer.quant_method, UnquantizedLinearMethod):
                # NOTE: This should only be used offline, since it's O(N^3)
                # Pushing an identity matrix through the quantized layer
                # yields its dense weight.
                eye = torch.eye(layer.input_size_per_partition,
                                dtype=act_dtype,
                                device=get_layer_weight(layer).device)
                dequant_weights = layer.quant_method.apply(layer,
                                                           eye,
                                                           bias=None)
                del eye
                # standardize to (output, input)
                return dequant_weights.T
            return layer.weight

        # we currently do not have quantized bmm's which are needed for
        # `W_UV` and `W_UK_T`, so we just store fp16/bf16 copies and perform
        # the bmm's in 16-bit, the extra memory overhead of this is fairly low
        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
        assert kv_b_proj_weight.shape == (
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
                f"{kv_b_proj_weight.shape=}, "
                f"{self.kv_lora_rank=}, "
                f"{self.num_heads=}, "
                f"{self.qk_nope_head_dim=}, "
                f"{self.v_head_dim=}")
        kv_b_proj_weight = kv_b_proj_weight.view(
            self.kv_lora_rank,
            self.num_heads,
            self.qk_nope_head_dim + self.v_head_dim,
        )

        W_UK, W_UV = kv_b_proj_weight.split(
            [self.qk_nope_head_dim, self.v_head_dim], dim=-1)

        # Convert from (L, N, V) to (N, L, V)
        self.W_UV = W_UV.transpose(0, 1)
        # Convert from (L, N, P) to (N, P, L)
        self.W_UK_T = W_UK.permute(1, 2, 0)

    def _compute_prefill_context(
        self,
        q: torch.Tensor,
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: MLACommonMetadata,
    ):
        """Attend the prefill queries to their cached context, chunk by chunk.

        For every context chunk the cached entries are gathered into the
        shared workspace, up-projected with `self.kv_b_proj`, attended to
        without a causal mask, and merged into the running result with
        `merge_attn_states`.

        Returns:
            `(output, output_lse)` accumulated over all chunks; both are
            `None` if there are no chunks.
        """
        prefill_metadata = attn_metadata.prefill_metadata
        assert prefill_metadata is not None
        assert prefill_metadata.context_chunk_seq_tot is not None
        assert prefill_metadata.context_chunk_cu_seq_lens is not None
        assert prefill_metadata.context_chunk_starts is not None
        assert prefill_metadata.context_chunk_max_seq_lens is not None
        assert prefill_metadata.context_lens_tensor is not None

        # Both accumulators start unset so the return below is always
        # well-defined, even with zero chunks.
        output = None
        output_lse = None
        iters = len(prefill_metadata.context_chunk_seq_tot)

        # Fetch from attn_metadata directly, since it is late bound by
        # MLAAttentionState, grabbing it directly from `attn_metadata` can
        # avoid any weirdness around prefill_metadata caching
        assert attn_metadata.context_chunk_workspace is not None
        workspace = attn_metadata.context_chunk_workspace

        for i in range(iters):
            toks = prefill_metadata.context_chunk_seq_tot[i]

            ops.gather_cache(
                src_cache=kv_c_and_k_pe_cache,
                dst=workspace,
                block_table=prefill_metadata.block_tables,
                cu_seq_lens=prefill_metadata.context_chunk_cu_seq_lens[i],
                batch_size=prefill_metadata.num_prefills,
                seq_starts=prefill_metadata.context_chunk_starts[i],
            )

            # Only the first `toks` workspace rows belong to this chunk; the
            # last dimension holds the latent followed by the rope part.
            chunk_kv = workspace[:toks]
            kv_c_normed = chunk_kv[..., :self.kv_lora_rank]
            k_pe = chunk_kv[..., self.kv_lora_rank:].unsqueeze(1)

            kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            k_nope, v = kv_nope\
                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

            # Broadcast the single rope key across all heads.
            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
                          dim=-1)

            attn_output, attn_softmax_lse = \
                self._flash_attn_varlen_diff_headdims(
                    q=q,
                    k=k,
                    v=v,
                    cu_seqlens_q=prefill_metadata.query_start_loc,
                    cu_seqlens_k=prefill_metadata.context_chunk_cu_seq_lens[i],
                    max_seqlen_q=prefill_metadata.max_query_len,
                    max_seqlen_k=prefill_metadata.context_chunk_max_seq_lens[i],
                    softmax_scale=self.scale,
                    causal=False,  # Context is unmasked
                    return_softmax_lse=True,
                )

            if output is None:
                output = attn_output
                output_lse = attn_softmax_lse
            else:
                output_tmp = torch.empty_like(output)
                output_lse_tmp = torch.empty_like(output_lse)
                merge_attn_states(
                    output=output_tmp,
                    output_lse=output_lse_tmp,
                    prefix_output=output,
                    prefix_lse=output_lse,
                    suffix_output=attn_output,
                    suffix_lse=attn_softmax_lse,
                )
                output = output_tmp
                output_lse = output_lse_tmp

        return output, output_lse

    def _forward_prefill(
        self,
        q: torch.Tensor,
        kv_c_normed: torch.Tensor,
        k_pe: torch.Tensor,
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: MLACommonMetadata,
    ) -> torch.Tensor:
        """Compute prefill attention, merging in cached context if present.

        The new tokens attend causally to themselves; when any prefill has
        a non-zero context length, the result is merged with the output of
        `_compute_prefill_context`.

        Returns:
            The attention output with its last two dimensions flattened.
        """

        prefill_metadata = attn_metadata.prefill_metadata
        assert prefill_metadata is not None

        # Coerce to a plain bool so the kernels receive a real flag rather
        # than a 0-dim tensor.
        has_context = prefill_metadata.context_lens_tensor is not None \
            and bool(prefill_metadata.context_lens_tensor.max() > 0)

        kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
            -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
        k_nope, v = kv_nope\
            .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

        # Broadcast the single rope key across all heads.
        k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)

        output = self._flash_attn_varlen_diff_headdims(
            q=q,
            k=k,
            v=v,
            cu_seqlens_q=prefill_metadata.query_start_loc,
            cu_seqlens_k=prefill_metadata.query_start_loc,
            max_seqlen_q=prefill_metadata.max_prefill_seq_len,
            max_seqlen_k=prefill_metadata.max_prefill_seq_len,
            softmax_scale=self.scale,
            causal=True,
            return_softmax_lse=has_context,
        )

        if has_context:
            # ROCm flash_attn_varlen_func will return 3 objects instead of 2
            suffix_output, suffix_lse = output
            context_output, context_lse = self._compute_prefill_context(
                q, kv_c_and_k_pe_cache, attn_metadata)

            output = torch.empty_like(suffix_output)
            merge_attn_states(
                output=output,
                prefix_output=context_output,
                prefix_lse=context_lse,
                suffix_output=suffix_output,
                suffix_lse=suffix_lse,
            )

        return output.flatten(start_dim=-2)

    @abstractmethod
    def _forward_decode(
        self,
        ql_nope: torch.Tensor,
        q_pe: torch.Tensor,
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: T,
    ) -> torch.Tensor:
        """Compute decode attention; must be provided by subclasses."""
        raise NotImplementedError

    def forward(
        self,
        layer: AttentionLayer,
        q: torch.Tensor,  # query in unified attn
        k_c_normed: torch.Tensor,  # key in unified attn
        k_pe: torch.Tensor,  # value in unified attn
        kv_cache: torch.Tensor,
        attn_metadata: T,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Write the new entries to the KV cache and run prefill and decode.

        The first `attn_metadata.num_prefill_tokens` tokens are handled by
        `_forward_prefill`, the remaining ones by `_forward_decode`.

        Raises:
            NotImplementedError: If `output` is not None.

        Returns:
            A tensor with `num_prefill_tokens + num_decode_tokens` rows and
            `v_head_dim * num_heads` columns.
        """
        if output is not None:
            raise NotImplementedError(
                "output is not yet supported for MLAImplBase")

        if attn_metadata.is_profile_run and \
            attn_metadata.context_chunk_workspace is not None:
            # During the profile run try to simulate the worst case output
            # size for `self.kv_b_proj(kv_c_normed)` in
            # `_compute_prefill_context` since this can be large
            _ = torch.empty(
                (attn_metadata.context_chunk_workspace.shape[0],
                 self.num_heads, self.qk_nope_head_dim + self.v_head_dim),
                device=k_c_normed.device,
                dtype=k_c_normed.dtype,
            )

        has_decode = attn_metadata.decode_metadata is not None
        has_prefill = attn_metadata.prefill_metadata is not None

        num_prefill_tokens: int = attn_metadata.num_prefill_tokens
        q = q.view(-1, self.num_heads, self.qk_head_dim)

        # Prefill tokens come first, decode tokens after them.
        decode_q = q[num_prefill_tokens:]

        prefill_q = q[:num_prefill_tokens]
        prefill_k_pe = k_pe[:num_prefill_tokens]
        prefill_k_c_normed = k_c_normed[:num_prefill_tokens]

        # write the latent and rope to kv cache
        if kv_cache.numel() > 0:
            ops.concat_and_cache_mla(
                k_c_normed,
                k_pe.squeeze(1),
                kv_cache,
                attn_metadata.slot_mapping.flatten(),
                kv_cache_dtype=self.kv_cache_dtype,
                scale=layer._k_scale,
            )

        output = torch.empty(attn_metadata.num_prefill_tokens +
                             attn_metadata.num_decode_tokens,
                             self.v_head_dim * self.num_heads,
                             device=q.device,
                             dtype=q.dtype)
        if has_prefill:
            output[:num_prefill_tokens] = self._forward_prefill(
                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
                attn_metadata)

        if has_decode:
            decode_q_nope, decode_q_pe = decode_q.split(
                [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
            # Convert from (B, N, P) to (N, B, P)
            decode_q_nope = decode_q_nope.transpose(0, 1)
            # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
            decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T)
            # Convert from (N, B, L) to (B, N, L)
            decode_ql_nope = decode_ql_nope.transpose(0, 1)

            output[num_prefill_tokens:] = self._forward_decode(
                decode_ql_nope, decode_q_pe, kv_cache, attn_metadata)

        return output