vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
vllm/attention/backends/mla/common.py
@@ -0,0 +1,1385 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ """
+ # MLA Common Components
+
+ This file implements common components for MLA implementations.
+
+ First we define:
+
+ Sq      as Q sequence length
+ Skv     as KV sequence length
+
+ MLA has two possible ways of computing: a data-movement friendly approach and
+ a compute friendly approach. We generally want to use the compute friendly
+ approach for "prefill" (i.e. when the ratio Sq / Skv is "small", near 1)
+ and the data-movement friendly approach for "decode" (i.e. when the ratio
+ Sq / Skv is "large").
+
+ NOTE: what we deem small and large is currently determined by whether the
+ scheduler labels a request prefill or decode, but this is something we should
+ probably tune.
+
+ Main reference: DeepseekV2 paper and the FlashInfer implementation
+ (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).
+
+ Deepseek's MLA attention works the following way:
+ * Use a single latent vector to represent the per-token entry of the KV cache.
+ * For decode (i.e. the memory friendly approach) the attention "simulates" a
+   multi-head attention, while the compute is similar to multi-query attention.
+
+ Below is an example of both paths assuming batch size = 1
+
+ ## More Extent Definitions:
+
+ C       Context length, `Skv - Sq`
+ H       hidden size
+ N       number of attention heads
+ Lq      latent dimension for Q              1536 in DSV3
+ Lkv     latent dimension for K/V            512 in DSV3
+ P       nope dimension, no rope.            128 in DSV3
+ R       rope dimension, goes through rope.  64 in DSV3
+ V       V head dim.                         128 in DSV3
+
+ ## Vector/Matrix Definitions
+
+ h_t         hidden states (input to attention)   shape [Sq, H]
+ q_c         latent/compressed Q                  shape [Sq, Lq]
+ q_nope      uncompressed Q (no-rope)             shape [Sq, N, P]
+ q_pe        uncompressed Q (rope)                shape [Sq, N, R]
+ kv_c        latent/compressed KV                 shape [Skv, Lkv]
+ k_pe        decoupled k position embeddings      shape [Skv, R]
+ new_kv_c    new kv_c from current iter           shape [Sq, Lkv]
+ new_k_pe    new k_pe from current iter           shape [Sq, R]
+ cache_kv_c  cached kv_c from previous iters      shape [C, Lkv]
+ cache_k_pe  cached k_pe from previous iters      shape [C, R]
+ W_DQ        project h_t to q_c                   shape [H, Lq]
+ W_UQ        project q_c to q_nope                shape [Lq, N * P]
+ W_QR        project q_c to q_pe                  shape [Lq, N * R]
+ W_DKV       project h_t to kv_c                  shape [H, Lkv]
+ W_UK        project kv_c to k_nope               shape [Lkv, N, P]
+ W_KR        project h_t to k_pe                  shape [H, R]
+ W_UV        project kv_c to v                    shape [Lkv, N, V]
+ W_O         project v to h_t                     shape [N * V, H]
+
+
+ ## Compute Friendly Approach (i.e. "_forward_prefill"):
+
+ q_c      = h_t @ W_DQ
+ q_nope   = (q_c @ W_UQ).view(Sq, N, P)
+ q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
+ new_kv_c = h_t @ W_DKV
+ new_k_pe = RoPE(h_t @ W_KR)
+ kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
+ k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
+ k_nope   = (kv_c @ W_UK.view(Lkv, N * P)).view(Skv, N, P)
+ v        = (kv_c @ W_UV.view(Lkv, N * V)).view(Skv, N, V)
+
+ // MHA with QK headdim = P + R
+ //           V headdim = V
+ //      spda_o shape [Sq, N, V]
+ spda_o = scaled_dot_product_attention(
+     torch.cat([q_nope, q_pe], dim=-1),
+     torch.cat([k_nope, k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
+     v
+ )
+ return spda_o @ W_O
+
+ NOTE: in the actual code,
+     `kv_b_proj` is [W_UK; W_UV] concatenated per head
+     `q_b_proj`  is [W_UQ; W_QR] concatenated per head
+     `out_proj`  is W_O
+
+
+ ## Data-Movement Friendly Approach (i.e. "_forward_decode"):
+
+ Runtime
+ q_c      = h_t @ W_DQ
+ q_nope   = (q_c @ W_UQ).view(-1, N, P)
+ ql_nope  = einsum("snh,lnh->snl", q_nope, W_UK)
+ q_pe     = RoPE(q_c @ W_QR).view(Sq, N, R)
+ new_kv_c = h_t @ W_DKV
+ new_k_pe = RoPE(h_t @ W_KR)
+ kv_c     = torch.cat([new_kv_c, cache_kv_c], dim=0)
+ k_pe     = torch.cat([new_k_pe, cache_k_pe], dim=0)
+
+ // MQA with QK headdim = Lkv + R
+ //           V headdim = Lkv
+ //      spda_o shape [Sq, N, Lkv]
+ // NOTE: this is less compute-friendly since Lkv > P
+ //       but is more data-movement friendly since it's MQA vs MHA
+ spda_o = scaled_dot_product_attention(
+     torch.cat([ql_nope, q_pe], dim=-1),
+     torch.cat([kv_c, k_pe], dim=-1),
+     kv_c
+ )
+
+ o = einsum("snl,lnv->snv", spda_o.reshape(-1, N, Lkv), W_UV)
+ return o.view(-1, N * V) @ W_O
+
+
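+ (Editor's note: the snippet below is an illustrative, self-contained check,
+ not part of this package. It verifies numerically that the weight-absorbed
+ MQA path above matches the up-projected MHA path; RoPE and the rope dims are
+ omitted, and the same ad-hoc scale is applied to both paths so the scores
+ are directly comparable. Names follow the definitions above.)
+
+     import torch
+
+     torch.manual_seed(0)
+     Sq, Skv, N, P, Lkv, V = 4, 6, 8, 16, 32, 16
+     scale = 0.1
+     q_nope = torch.randn(Sq, N, P)
+     kv_c = torch.randn(Skv, Lkv)  # latent KV cache entries
+     W_UK = torch.randn(Lkv, N, P)
+     W_UV = torch.randn(Lkv, N, V)
+
+     # MHA path: up-project the latent cache, attend per head.
+     k_nope = torch.einsum("kl,lnp->knp", kv_c, W_UK)
+     v = torch.einsum("kl,lnv->knv", kv_c, W_UV)
+     attn = torch.softmax(
+         scale * torch.einsum("qnp,knp->nqk", q_nope, k_nope), dim=-1)
+     o_mha = torch.einsum("nqk,knv->qnv", attn, v)
+
+     # MQA path: absorb W_UK into Q, attend against the latent cache
+     # directly, then up-project the output with W_UV.
+     ql_nope = torch.einsum("qnp,lnp->qnl", q_nope, W_UK)
+     attn = torch.softmax(
+         scale * torch.einsum("qnl,kl->nqk", ql_nope, kv_c), dim=-1)
+     o_lat = torch.einsum("nqk,kl->qnl", attn, kv_c)
+     o_mqa = torch.einsum("qnl,lnv->qnv", o_lat, W_UV)
+
+     assert torch.allclose(o_mha, o_mqa, atol=1e-4)
+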
+ ## Chunked Prefill
+
+ For chunked prefill we want to use the compute friendly algorithm. We are
+ assuming a sufficiently large Sq / Skv ratio; in the future we may want to
+ switch to the data-movement friendly approach if the chunk (i.e. `Sq`) is
+ small.
+
+ However, the compute-friendly approach can potentially run out of memory if
+ Skv is large due to: `k_nope = (kv_c @ W_UK).view(Skv, N, P)`
+
+ To mitigate this, we chunk the computation of attention with respect to the
+ current context (i.e. `cache_kv_c` and `cache_k_pe`) so that we can use a
+ fixed workspace size.
+
+ The chunked prefill approach is as follows:
+
+ MCC     Max chunk of context to process per iter, computed dynamically,
+         used to bound the memory usage
+
+ q_c        = h_t @ W_DQ
+ q_nope     = (q_c @ W_UQ).view(Sq, N, P)
+ q_pe       = RoPE(q_c @ W_QR).view(Sq, N, R)
+ new_kv_c   = h_t @ W_DKV
+ new_k_pe   = RoPE(h_t @ W_KR)
+ new_k_nope = (new_kv_c @ W_UK.view(Lkv, N * P)).view(Sq, N, P)
+ new_v      = (new_kv_c @ W_UV.view(Lkv, N * V)).view(Sq, N, V)
+
+ // MHA between queries and new KV
+ //      with QK headdim = P + R
+ //             V headdim = V
+ //      curr_o   shape [Sq, N, V]
+ //      curr_lse shape [N, Sq], this is just the order FA returns
+ curr_o, curr_lse = scaled_dot_product_attention(
+     torch.cat([q_nope, q_pe], dim=-1),
+     torch.cat([new_k_nope, new_k_pe.unsqueeze(1).expand(-1, N, -1)], dim=-1),
+     new_v,
+     causal=True,
+     return_softmax_lse=True
+ )
+
+ // Compute attention with the already existing context
+ for chunk_idx in range(cdiv(C, MCC)):
+     chunk_start = chunk_idx * MCC
+     chunk_end = min(chunk_start + MCC, C)
+     Sc = chunk_end - chunk_start
+     cache_kv_c_chunk   = cache_kv_c[chunk_start:chunk_end]
+     cache_k_pe_chunk   = cache_k_pe[chunk_start:chunk_end]
+     cache_k_nope_chunk = (cache_kv_c_chunk @ W_UK).view(-1, N, P)
+     cache_v_chunk      = (cache_kv_c_chunk @ W_UV).view(-1, N, V)
+
+     chunk_o, chunk_lse = scaled_dot_product_attention(
+         torch.cat([q_nope, q_pe], dim=-1),
+         torch.cat([cache_k_nope_chunk,
+                    cache_k_pe_chunk.unsqueeze(1).expand(-1, N, -1)],
+                   dim=-1),
+         cache_v_chunk,
+         causal=False,
+         return_softmax_lse=True
+     )
+
+     curr_o, curr_lse = merge_attn_states(
+         suffix_output=curr_o,
+         suffix_lse=curr_lse,
+         prefix_output=chunk_o,
+         prefix_lse=chunk_lse,
+     )
+
+ return curr_o @ W_O
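+
+ (Editor's note: the snippet below is an illustrative, self-contained sketch,
+ not part of this package. It shows the log-sum-exp merge that
+ `merge_attn_states` performs, by checking that attention over a full KV range
+ equals the merge of two chunked results; `attend` and `merge` are
+ hypothetical helpers, not the package's kernels.)
+
+     import torch
+
+     def attend(q, k, v):
+         # Softmax attention over one chunk, also returning the
+         # log-sum-exp of the scores (what FA-style kernels expose).
+         s = q @ k.T
+         lse = torch.logsumexp(s, dim=-1)
+         o = torch.softmax(s, dim=-1) @ v
+         return o, lse
+
+     def merge(o1, lse1, o2, lse2):
+         # Reweight each partial output by its share of the total
+         # softmax mass; this is the math merge_attn_states implements.
+         lse = torch.logaddexp(lse1, lse2)
+         w1 = torch.exp(lse1 - lse).unsqueeze(-1)
+         w2 = torch.exp(lse2 - lse).unsqueeze(-1)
+         return w1 * o1 + w2 * o2, lse
+
+     torch.manual_seed(0)
+     Sq, Skv, D = 3, 10, 8
+     q, k, v = torch.randn(Sq, D), torch.randn(Skv, D), torch.randn(Skv, D)
+
+     o_full, _ = attend(q, k, v)
+     o_pre, lse_pre = attend(q, k[:6], v[:6])  # "prefix" chunk
+     o_suf, lse_suf = attend(q, k[6:], v[6:])  # "suffix" chunk
+     o_merged, _ = merge(o_pre, lse_pre, o_suf, lse_suf)
+     assert torch.allclose(o_full, o_merged, atol=1e-5)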
+ """
+
+ import functools
+ from abc import abstractmethod
+ from collections import defaultdict
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from itertools import accumulate
+ from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
+                     Type, TypeVar)
+
+ import torch
+
+ from vllm import _custom_ops as ops
+ from vllm import envs
+ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
+                                               AttentionMetadata,
+                                               AttentionMetadataBuilder,
+                                               AttentionState, MLAAttentionImpl)
+ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
+                                            compute_slot_mapping_start_idx,
+                                            is_block_tables_empty)
+ from vllm.attention.ops.merge_attn_states import merge_attn_states
+ from vllm.attention.utils.fa_utils import get_flash_attn_version
+ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                                LinearBase,
+                                                UnquantizedLinearMethod)
+ from vllm.multimodal import MultiModalPlaceholderMap
+ from vllm.platforms import current_platform
+ from vllm.triton_utils import HAS_TRITON
+ from vllm.utils import async_tensor_h2d, cdiv, make_tensor_with_pad, round_down
+
+ if HAS_TRITON:
+     from vllm.attention.ops.triton_flash_attention import triton_attention
+ else:
+     triton_attention = None
+
+ try:
+     from vllm.vllm_flash_attn import flash_attn_varlen_func
+     is_vllm_fa = True
+ except ImportError:
+     is_vllm_fa = False
+     try:
+         # For ROCm use upstream flash attention
+         from flash_attn import flash_attn_varlen_func
+     except ImportError:
+         flash_attn_varlen_func = None
+
+ if TYPE_CHECKING:
+     from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                           ModelInputForGPUWithSamplingMetadata)
+
+ is_hip = current_platform.is_rocm()
+
+
+ class MLACommonBackend(AttentionBackend):
+
+     @staticmethod
+     def get_name() -> str:
+         return "TRITON_MLA"
+
+     @staticmethod
+     def get_metadata_cls() -> Type["AttentionMetadata"]:
+         return MLACommonMetadata
+
+     @staticmethod
+     def get_builder_cls() -> Type["MLACommonMetadataBuilder"]:
+         return MLACommonMetadataBuilder
+
+     @staticmethod
+     def get_state_cls() -> Type["MLACommonState"]:
+         return MLACommonState
+
+     @staticmethod
+     def get_kv_cache_shape(
+         num_blocks: int,
+         block_size: int,
+         num_kv_heads: int,  # assumed to be 1 for MLA
+         head_size: int,
+     ) -> Tuple[int, ...]:
+         return (num_blocks, block_size, head_size)
+
+     @staticmethod
+     def swap_blocks(
+         src_kv_cache: torch.Tensor,
+         dst_kv_cache: torch.Tensor,
+         src_to_dst: torch.Tensor,
+     ) -> None:
+         ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+     @staticmethod
+     def copy_blocks(
+         kv_caches: List[torch.Tensor],
+         src_to_dists: torch.Tensor,
+     ) -> None:
+         ops.copy_blocks_mla(kv_caches, src_to_dists)
+
+     @staticmethod
+     def get_supported_head_sizes() -> List[int]:
+         return [576]
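+
+     # Editor's note (illustrative, not in the original source): 576 is the
+     # width of one latent KV-cache entry, Lkv + R = 512 + 64 in DSV3. With
+     # the (num_blocks, block_size, head_size) layout above, a 16-token fp16
+     # block is 16 * 576 * 2 bytes = 18 KiB, shared by all attention heads
+     # (num_kv_heads is collapsed to 1 for MLA).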
+
+
+ T = TypeVar("T", bound="MLACommonMetadata")
+
+
+ class MLACommonState(AttentionState, Generic[T]):
+
+     def __init__(self, runner):
+         self.runner = runner
+         self._is_graph_capturing = False
+
+         scheduler_config = runner.scheduler_config
+         self.model_config = runner.model_config
+         cache_config = runner.cache_config
+
+         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
+         self.enable_prefix_caching = cache_config.enable_prefix_caching
+
+         if self.chunked_prefill_enabled or self.enable_prefix_caching:
+             self.context_chunk_workspace_size = min(
+                 # Make sure there is enough for 8 full-length requests or at
+                 # least 4 pages of cache per request
+                 max(
+                     8 * self.model_config.max_model_len, 4 *
+                     scheduler_config.max_num_seqs * cache_config.block_size),
+                 # For long-context models try not to over-allocate kv-cache
+                 # space, limiting it to 128k tokens,
+                 # which would result in the workspace being:
+                 #   2*(576)*(128*1024) = 144MiB
+                 # (assuming a 576 MLA head dim, and fp16)
+                 # and the fully up-projected context being:
+                 #   2*(192*128)*(128*1024) = 6GiB
+                 # (assuming a 192 QK head dim, 128 heads, and fp16)
+                 128 * 1024)
+             assert self.context_chunk_workspace_size >= \
+                 scheduler_config.max_num_seqs * cache_config.block_size
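+
+             # Editor's check of the figures above (illustrative, not in the
+             # original source), assuming fp16 at 2 bytes per element:
+             #   2 * 576 * (128 * 1024) / 2**20       == 144.0  (MiB)
+             #   2 * 192 * 128 * (128 * 1024) / 2**30 == 6.0    (GiB)
+             # The 6 GiB figure is why the context is up-projected in chunks
+             # bounded by this workspace rather than all at once.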
+
+     @contextmanager
+     def graph_capture(self, max_batch_size: int):
+         self._is_graph_capturing = True
+
+         self._graph_slot_mapping = torch.full((max_batch_size, ),
+                                               PAD_SLOT_ID,
+                                               dtype=torch.long,
+                                               device=self.runner.device)
+         self._graph_seq_lens = torch.ones(max_batch_size,
+                                           dtype=torch.int32,
+                                           device=self.runner.device)
+         self._graph_block_tables = torch.from_numpy(
+             self.runner.graph_block_tables).to(device=self.runner.device)
+
+         self._positions = torch.zeros((max_batch_size, ),
+                                       dtype=torch.long,
+                                       device=self.runner.device)
+
+         yield
+
+         self._is_graph_capturing = False
+         del self._graph_slot_mapping
+         del self._graph_seq_lens
+         del self._graph_block_tables
+         del self._positions
+
+     def graph_clone(self, batch_size: int):
+         assert self._is_graph_capturing
+         return self.__class__(self.runner)
+
+     def graph_capture_get_metadata_for_batch(
+             self,
+             batch_size: int,
+             is_encoder_decoder_model: bool = False) -> T:
+         assert self._is_graph_capturing
+
+         attn_metadata = self.runner.attn_backend.make_metadata(
+             multi_modal_placeholder_index_maps=None,
+             enable_kv_scales_calculation=False,
+             use_cuda_graph=True,
+             num_prefills=0,
+             num_prefill_tokens=0,
+             num_decode_tokens=batch_size,
+             slot_mapping=self._graph_slot_mapping[:batch_size],
+             seq_lens=None,
+             seq_lens_tensor=self._graph_seq_lens[:batch_size],
+             max_query_len=1,
+             max_decode_query_len=1,
+             max_prefill_seq_len=0,
+             max_decode_seq_len=self.runner.max_seq_len_to_capture,
+             query_start_loc=None,
+             seq_start_loc=None,
+             context_lens_tensor=None,
+             block_tables=self._graph_block_tables[:batch_size],
+             head_dim=self.runner.model_config.get_head_size())
+
+         if is_encoder_decoder_model:
+             raise NotImplementedError(
+                 "MLACommonState does not support encoder/decoder yet")
+
+         return attn_metadata
+
+     def get_graph_input_buffers(self,
+                                 attn_metadata,
+                                 is_encoder_decoder_model: bool = False):
+         input_buffers = {
+             "slot_mapping": attn_metadata.slot_mapping,
+             "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor,
+             "block_tables": attn_metadata.decode_metadata.block_tables,
+         }
+         if is_encoder_decoder_model:
+             raise NotImplementedError(
+                 "MLACommonState does not support encoder/decoder yet")
+
+         return input_buffers
+
+     def prepare_graph_input_buffers(self,
+                                     input_buffers,
+                                     attn_metadata,
+                                     is_encoder_decoder_model: bool = False):
+         input_buffers["seq_lens_tensor"].copy_(
+             attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True)
+         input_buffers["block_tables"].copy_(
+             attn_metadata.decode_metadata.block_tables, non_blocking=True)
+         if is_encoder_decoder_model:
+             raise NotImplementedError(
+                 "MLACommonState does not support encoder/decoder yet")
+
+     def begin_forward(self, model_input):
+         if self.chunked_prefill_enabled or self.enable_prefix_caching:
+             if not hasattr(self, "context_chunk_workspace"):
+                 # NOTE: self.runner.device does not return the correct device
+                 # for this process (init_device sets the correct device, but
+                 # only on the Worker). The only way I've figured out to get
+                 # the correct device is to allocate the workspace on the
+                 # first call to begin_forward and use the device of the input
+                 # tokens.
+                 assert model_input.input_tokens is not None
+                 self.context_chunk_workspace = torch.empty(
+                     (self.context_chunk_workspace_size,
+                      self.model_config.get_head_size()),
+                     dtype=self.model_config.dtype,
+                     device=model_input.input_tokens.device,
+                 )
+
+             model_input.attn_metadata.context_chunk_workspace = \
+                 self.context_chunk_workspace
+
+
+ @dataclass
+ class MLACommonMetadata(AttentionMetadata):
+     """Metadata for MLACommon.
+
+     NOTE: Please read the comment at the top of the file before trying to
+     understand this class.
+
+     NOTE: Any python object stored here is not updated when it is
+     cuda-graph replayed. If you have values that need to be changed
+     dynamically, they should be stored in a tensor. The tensor has to be
+     updated from the `CUDAGraphRunner.forward` API.
+     """
+     # Whether or not cuda graph is enabled.
+     # Cuda-graph is currently enabled for decoding only.
+     # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention.
+     use_cuda_graph: bool
+
+     # NOTE(sang): Definition of context_len, query_len, and seq_len.
+     #   |---------- N-1 iteration --------|
+     #   |---------------- N iteration ---------------------|
+     #   |- tokenA -|......................|-- newTokens ---|
+     #   |---------- context_len ----------|
+     #   |-------------------- seq_len ---------------------|
+     #                                     |-- query_len ---|
+
458
+     # (batch_size,). The sequence length per sequence. Sequence length
+     # means the computed tokens + new tokens. None if it is a decoding.
+     seq_lens: Optional[List[int]]
+     # seq_lens stored as a tensor.
+     seq_lens_tensor: Optional[torch.Tensor]
+
+     # Maximum sequence length among prefill batch. 0 if there are decoding
+     # requests only.
+     max_prefill_seq_len: int
+     # Maximum sequence length among decode batch. 0 if there are prefill
+     # requests only.
+     max_decode_seq_len: int
+     # (batch_size,). A tensor of context lengths (tokens that are computed
+     # so far).
+     context_lens_tensor: Optional[torch.Tensor]
+
+     # (batch_size, max_blocks_per_seq).
+     # Block addresses per sequence. (Seq id -> list of physical blocks)
+     # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+     # in the kv cache. Each block can contain up to block_size tokens.
+     # The 2nd dimension is padded up to max_blocks_per_seq if it is
+     # cuda-graph captured.
+     block_tables: Optional[torch.Tensor]
+
+     # Maximum query length in the batch.
+     max_query_len: Optional[int] = None
+
+     # Max number of query tokens among requests in the batch.
+     max_decode_query_len: Optional[int] = None
+
+     # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+     # the batch, used to index into subquery. E.g., if the subquery length
+     # is [4, 6], it is [0, 4, 10].
+     query_start_loc: Optional[torch.Tensor] = None
+     # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+     # the batch, used to index into sequence. E.g., if the sequence length is
+     # [4, 6], it is [0, 4, 10].
+     seq_start_loc: Optional[torch.Tensor] = None
+
+     _cached_prefill_metadata: Optional[Any] = None
+     _cached_decode_metadata: Optional[Any] = None
+
+     num_prefill_tokens: int
+
+     # The dimension of the attention heads
+     head_dim: Optional[int] = None
+
+     # Used when chunked prefill is enabled to simulate worst-case workspace
+     # allocations, hopefully to avoid going OOM
+     is_profile_run: bool = False
+
+     # New for MLA (compared to FlashAttention)
+     # For chunked prefill
+     context_chunk_cu_seq_lens: Optional[torch.Tensor] = None
+     context_chunk_starts: Optional[torch.Tensor] = None
+     context_chunk_seq_tot: Optional[List[int]] = None
+     context_chunk_max_seq_lens: Optional[List[int]] = None
+     # Set by MLAAttentionState in `begin_forward` so it doesn't get
+     # broadcasted
+     context_chunk_workspace: Optional[torch.Tensor] = None
+
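The diagram and the `query_start_loc`/`seq_start_loc` comments above are easiest to check with a small worked example (hypothetical lengths):

```python
from itertools import accumulate

context_lens = [10, 3]  # tokens computed in earlier iterations
query_lens = [4, 6]     # new tokens this iteration
seq_lens = [c + q for c, q in zip(context_lens, query_lens)]
assert seq_lens == [14, 9]  # seq_len = context_len + query_len

# Cumulative start locations, with one extra leading zero:
assert list(accumulate(query_lens, initial=0)) == [0, 4, 10]
assert list(accumulate(seq_lens, initial=0)) == [0, 14, 23]
```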
+     def __post_init__(self):
+         supported_head_sizes = MLACommonBackend.get_supported_head_sizes()
+         if self.head_dim is not None and self.head_dim \
+                 not in supported_head_sizes:
+             raise ValueError(
+                 f"Only {supported_head_sizes} are supported for head_dim, "
+                 f"received {self.head_dim}.")
+
+     @property
+     def prefill_metadata(self):
+         if self.num_prefills == 0:
+             return None
+
+         if self._cached_prefill_metadata is not None:
+             return self._cached_prefill_metadata
+
+         assert self.seq_lens is not None
+         assert self.seq_lens_tensor is not None
+
+         # Compute some attn_metadata fields which default to None
+         query_start_loc = (None if self.query_start_loc is None else
+                            self.query_start_loc[:self.num_prefills + 1])
+         slot_mapping = (None if self.slot_mapping is None else
+                         self.slot_mapping[:self.num_prefill_tokens])
+         seq_lens = (None if self.seq_lens is None else
+                     self.seq_lens[:self.num_prefills])
+         seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                            self.seq_lens_tensor[:self.num_prefills])
+         seq_start_loc = (None if self.seq_start_loc is None else
+                          self.seq_start_loc[:self.num_prefills + 1])
+         context_lens_tensor = (None if self.context_lens_tensor is None else
+                                self.context_lens_tensor[:self.num_prefills])
+         block_tables = (None if self.block_tables is None else
+                         self.block_tables[:self.num_prefills])
+
+         self._cached_prefill_metadata = self.__class__(
+             # Required by ModelRunner
+             use_cuda_graph=False,  # Not Attention Related
+             # Required by Attention Metadata
+             num_prefills=self.num_prefills,
+             num_prefill_tokens=self.num_prefill_tokens,
+             num_decode_tokens=0,
+             slot_mapping=slot_mapping,
+             # Required by Attention Metadata (not used)
+             multi_modal_placeholder_index_maps=None,
+             enable_kv_scales_calculation=False,
+             # MLACommonMetadata
+             seq_lens=seq_lens,
+             seq_lens_tensor=seq_lens_tensor,
+             max_query_len=self.max_query_len,
+             max_prefill_seq_len=self.max_prefill_seq_len,
+             max_decode_query_len=0,
+             max_decode_seq_len=0,
+             query_start_loc=query_start_loc,
+             seq_start_loc=seq_start_loc,
+             context_lens_tensor=context_lens_tensor,
+             block_tables=block_tables,
+             head_dim=self.head_dim,
+             is_profile_run=self.is_profile_run,
+             # MLACommonMetadata Chunk prefill specific
+             context_chunk_cu_seq_lens=self.context_chunk_cu_seq_lens,
+             context_chunk_starts=self.context_chunk_starts,
+             context_chunk_seq_tot=self.context_chunk_seq_tot,
+             context_chunk_max_seq_lens=self.context_chunk_max_seq_lens,
+         )
+         return self._cached_prefill_metadata
+
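These prefix slices are valid only because the V0 batch is laid out as [prefill tokens | decode tokens]: token-indexed tensors are cut at `num_prefill_tokens`, sequence-indexed tensors at `num_prefills`. A small sketch of the two index spaces (hypothetical sizes):

```python
import torch

num_prefills, num_prefill_tokens, num_decode_tokens = 2, 10, 3

# Token-indexed: 10 prefill tokens followed by 3 decode tokens
slot_mapping = torch.arange(num_prefill_tokens + num_decode_tokens)
# Sequence-indexed: 2 prefill sequences followed by 3 decode sequences
seq_lens_tensor = torch.tensor([7, 3, 20, 21, 22])

assert slot_mapping[:num_prefill_tokens].numel() == 10
assert seq_lens_tensor[:num_prefills].tolist() == [7, 3]
```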
+     @property
+     def decode_metadata(self):
+         if self.num_decode_tokens == 0:
+             return None
+
+         if self._cached_decode_metadata is not None:
+             return self._cached_decode_metadata
+         assert self.seq_lens_tensor is not None
+
+         # Compute some attn_metadata fields which default to None
+         slot_mapping = (None if self.slot_mapping is None else
+                         self.slot_mapping[self.num_prefill_tokens:])
+         seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                            self.seq_lens_tensor[self.num_prefills:])
+         block_tables = (None if self.block_tables is None else
+                         self.block_tables[self.num_prefills:])
+
+         self._cached_decode_metadata = self.__class__(
+             # Required by ModelRunner
+             use_cuda_graph=self.use_cuda_graph,  # Not Attention Related
+             # Required by Attention Metadata
+             num_prefills=0,
+             num_prefill_tokens=0,
+             num_decode_tokens=self.num_decode_tokens,
+             slot_mapping=slot_mapping,
+             # Required by Attention Metadata (not used)
+             multi_modal_placeholder_index_maps=None,
+             enable_kv_scales_calculation=False,
+             # MLACommonMetadata
+             seq_lens=None,
+             seq_lens_tensor=seq_lens_tensor,
+             max_decode_query_len=self.max_decode_query_len,
+             max_query_len=self.max_query_len,
+             max_prefill_seq_len=0,
+             max_decode_seq_len=self.max_decode_seq_len,
+             # Batch may be composed of prefills|decodes; adjust query start
+             # indices to refer to the start of decodes. E.g. in tokens:
+             # [3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
+             query_start_loc=(self.query_start_loc[self.num_prefills:] -
+                              self.query_start_loc[self.num_prefills])
+             if self.query_start_loc is not None else None,
+             seq_start_loc=self.seq_start_loc[self.num_prefills:]
+             if self.seq_start_loc is not None else None,
+             context_lens_tensor=None,
+             block_tables=block_tables,
+             head_dim=self.head_dim,
+             is_profile_run=self.is_profile_run)
+         return self._cached_decode_metadata
+
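The `query_start_loc` rebasing can be checked against the numbers in the inline comment, treating the six decode tokens as one group the way the comment does:

```python
import torch

num_prefills = 1
query_start_loc = torch.tensor([0, 3, 9])  # 3 prefill tokens, 6 decode tokens

decode_qsl = (query_start_loc[num_prefills:] -
              query_start_loc[num_prefills])
assert decode_qsl.tolist() == [0, 6]  # [3, 9] => [0, 6], as the comment says
```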
+     def advance_step(self,
+                      model_input: "ModelInputForGPUWithSamplingMetadata",
+                      sampled_token_ids: Optional[torch.Tensor],
+                      block_size: int,
+                      num_seqs: int,
+                      num_queries: int,
+                      turn_prefills_into_decodes: bool = False):
+         """
+         Update metadata in-place to advance one decode step.
+         """
+         # When using cudagraph, num_seqs is padded to the next captured
+         # batch size, but num_queries tracks the actual number of requests
+         # in the batch. For --enforce-eager mode, num_seqs == num_queries.
+         if num_seqs != num_queries:
+             assert num_seqs > num_queries
+
+         if turn_prefills_into_decodes:
+             # When Multi-Step is enabled with Chunked-Prefill, prefills and
+             # decodes are scheduled together. In the first step, all the
+             # prefills turn into decodes. This update reflects that
+             # conversion.
+             assert self.num_decode_tokens + self.num_prefills == num_seqs
+             self.num_decode_tokens += self.num_prefills
+             self.num_prefills = 0
+             self.num_prefill_tokens = 0
+             self.max_prefill_seq_len = 0
+             self.max_query_len = 1
+
+             self.slot_mapping = self.slot_mapping[:num_seqs]
+         else:
+             assert self.seq_lens is not None
+             assert self.max_decode_seq_len == max(self.seq_lens)
+
+         assert self.num_prefills == 0
+         assert self.num_prefill_tokens == 0
+         assert self.num_decode_tokens == num_seqs
+         assert self.slot_mapping.shape == (num_seqs, )
+
+         assert self.seq_lens is not None
+         assert len(self.seq_lens) == num_seqs
+         assert self.seq_lens_tensor is not None
+         assert self.seq_lens_tensor.shape == (num_seqs, )
+         assert self.max_query_len == 1
+         assert self.max_prefill_seq_len == 0
+
+         assert self.query_start_loc is not None
+         assert self.query_start_loc.shape == (num_queries + 1, )
+         assert self.seq_start_loc is not None
+         assert self.seq_start_loc.shape == (num_seqs + 1, )
+
+         assert self.context_lens_tensor is not None
+         assert self.context_lens_tensor.shape == (num_queries, )
+
+         assert self.block_tables is not None
+         assert self.block_tables.shape[0] == num_seqs
+
+         # Update query lengths. Note that we update only queries and not
+         # seqs, since tensors may be padded due to the captured cuda graph
+         # batch size.
+         for i in range(num_queries):
+             self.seq_lens[i] += 1
+         self.max_decode_seq_len = max(self.seq_lens)
+
+         self._ops_advance_step(num_seqs=num_seqs,
+                                num_queries=num_queries,
+                                block_size=block_size,
+                                input_tokens=model_input.input_tokens,
+                                sampled_token_ids=sampled_token_ids,
+                                input_positions=model_input.input_positions)
+
+     def _ops_advance_step(self, num_seqs: int, num_queries: int,
+                           block_size: int, input_tokens: torch.Tensor,
+                           sampled_token_ids: torch.Tensor,
+                           input_positions: torch.Tensor) -> None:
+         # here we use advance_step_flashattn to update the seq_lens_tensor,
+         # slot_mapping and block_tables tensors in-place
+         ops.advance_step_flashattn(num_seqs=num_seqs,
+                                    num_queries=num_queries,
+                                    block_size=block_size,
+                                    input_tokens=input_tokens,
+                                    sampled_token_ids=sampled_token_ids,
+                                    input_positions=input_positions,
+                                    seq_lens=self.seq_lens_tensor,
+                                    slot_mapping=self.slot_mapping,
+                                    block_tables=self.block_tables)
+
+
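For intuition, here is the bookkeeping a single decode step performs for one sequence, sketched in plain Python. This mirrors the in-place updates above and the standard paged-KV slot computation; it is a conceptual sketch, not the `advance_step_flashattn` kernel:

```python
block_size = 16
seq_len = 33              # tokens already in the sequence
block_table = [7, 2, 11]  # physical blocks backing the sequence

new_pos = seq_len         # position of the token generated this step
seq_len += 1              # seq_lens[i] += 1, as in advance_step

# The new token lands in logical block new_pos // block_size at offset
# new_pos % block_size; its flat slot in the paged KV cache is:
slot = block_table[new_pos // block_size] * block_size + new_pos % block_size
assert slot == 11 * block_size + 1  # physical block 11, offset 1
```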
+ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
+     """
+     NOTE: Please read the comment at the top of the file before trying to
+     understand this class
+     """
+     BLOCK_TABLE_EXTENDER: list[list[int]] = []
+
+     def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+         self.input_builder = input_builder
+         self.runner = input_builder.runner
+         self.sliding_window = input_builder.sliding_window
+         self.block_size = input_builder.block_size
+         self.chunked_prefill_enabled = \
+             self.runner.scheduler_config.chunked_prefill_enabled
+         self.enable_prefix_caching = \
+             self.runner.cache_config.enable_prefix_caching
+
+         if self.chunked_prefill_enabled or self.enable_prefix_caching:
+             attn_state = self.input_builder.runner.attn_state
+             self.context_chunk_workspace_size = \
+                 attn_state.context_chunk_workspace_size
+             self.page_size = self.runner.block_size
+
+     def prepare(self):
+         self.slot_mapping: List[int] = []
+         self.prefill_seq_lens: List[int] = []
+         self.context_lens: List[int] = []
+         self.block_tables: List[List[int]] = []
+         self.curr_seq_lens: List[int] = []
+         self.multimodal_placeholder_maps: Dict[
+             str,
+             MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
+         self.num_prefills = 0
+         self.num_prefill_tokens = 0
+         self.num_decode_tokens = 0
+         self.has_prefix_cache_hit = False
+
+     def _add_seq_group(
+             self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
+             chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+         """Add a sequence group to the metadata. Specifically update/append
+         1. context length.
+         2. block table.
+         3. slot mapping.
+         """
+         is_prompt = inter_data.is_prompt
+         block_tables = inter_data.block_tables
+
+         for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len,
+              curr_sliding_window_block) in zip(
+                  inter_data.seq_ids, [len(t) for t in inter_data.input_tokens],
+                  inter_data.orig_seq_lens, inter_data.seq_lens,
+                  inter_data.query_lens, inter_data.context_lens,
+                  inter_data.curr_sliding_window_blocks):
+             self.context_lens.append(context_len)
+             if is_prompt:
+                 self.num_prefills += 1
+                 self.num_prefill_tokens += token_len
+                 self.prefill_seq_lens.append(seq_len)
+             else:
+                 self.num_decode_tokens += query_len
+                 self.curr_seq_lens.append(curr_seq_len)
+
+             # Compute block table.
+             # TODO(sang): Combine chunked prefill and prefix caching by
+             # only allowing multiple of block_size chunk size.
+             # NOTE: This only works for oooooooxxx style attention.
+             block_table = []
+             if prefix_cache_hit:
+                 # NOTE(woosuk): For flash-attn, the block table should
+                 # include the entries for the incoming prefill tokens.
+                 block_table = block_tables[seq_id]
+             elif ((chunked_prefill_enabled or not is_prompt)
+                   and block_tables is not None):
+                 if curr_sliding_window_block == 0:
+                     block_table = block_tables[seq_id]
+                 else:
+                     block_table = block_tables[seq_id][
+                         -curr_sliding_window_block:]
+             self.block_tables.append(block_table)
+
+             # Compute slot mapping.
+             is_profile_run = is_block_tables_empty(block_tables)
+             start_idx = compute_slot_mapping_start_idx(is_prompt, query_len,
+                                                        context_len,
+                                                        self.sliding_window)
+             compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id,
+                                  seq_len, context_len, start_idx,
+                                  self.block_size, inter_data.block_tables)
+
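One detail worth pausing on: for sliding-window attention the builder keeps only the last `curr_sliding_window_block` blocks of the table, since older blocks can never be attended to. A tiny illustration with made-up block ids:

```python
block_table = [4, 9, 1, 7, 3]  # hypothetical physical blocks, oldest first

curr_sliding_window_block = 2
kept = (block_table if curr_sliding_window_block == 0 else
        block_table[-curr_sliding_window_block:])
assert kept == [7, 3]  # only the most recent blocks fall inside the window
```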
+     def _get_graph_runner_block_tables(
+             self, num_seqs: int,
+             block_tables: List[List[int]]) -> torch.Tensor:
+         # The shape of graph_block_tables is
+         # [max batch size, max context len // block size].
+         max_batch_size, max_blocks = self.runner.graph_block_tables.shape
+         assert max_batch_size >= num_seqs
+
+         graph_block_tables = self.runner.graph_block_tables[:num_seqs]
+         for i, block_table in enumerate(block_tables):
+             if block_table:
+                 num_blocks = len(block_table)
+                 if num_blocks <= max_blocks:
+                     graph_block_tables[i, :num_blocks] = block_table
+                 else:
+                     # It may be possible to have more blocks allocated due
+                     # to lookahead slots of multi-step; however, they are
+                     # not used anyway, so they can be safely ignored.
+                     graph_block_tables[
+                         i, :max_blocks] = block_table[:max_blocks]
+
+         return torch.from_numpy(graph_block_tables).to(
+             device=self.runner.device, non_blocking=True)
+
+     def build(self, seq_lens: List[int], query_lens: List[int],
+               cuda_graph_pad_size: int, batch_size: int):
+         """Build attention metadata with on-device tensors.
+
+         Args:
+             seq_lens: The maybe padded sequence lengths of the input sequences.
+             query_lens: The query lengths of the input sequences.
+             cuda_graph_pad_size: The padding size for cuda graph.
+                 -1 if cuda graph is not used.
+             batch_size: The maybe padded batch size.
+         """
+         prefix_cache_hit = any([
+             inter_data.prefix_cache_hit
+             for inter_data in self.input_builder.inter_data_list
+         ])
+
+         for inter_data in self.input_builder.inter_data_list:
+             self._add_seq_group(inter_data,
+                                 self.input_builder.chunked_prefill_enabled,
+                                 prefix_cache_hit)
+
+         device = self.runner.device
+         use_captured_graph = cuda_graph_pad_size != -1
+
+         max_query_len = max(query_lens)
+         decode_query_lens = query_lens[self.num_prefills:]
+         if len(decode_query_lens) > 0:
+             max_decode_query_len = max(decode_query_lens)
+         else:
+             max_decode_query_len = 1
+         max_prefill_seq_len = max(self.prefill_seq_lens, default=0)
+         max_decode_seq_len = max(self.curr_seq_lens, default=0)
+         num_decode_tokens = self.num_decode_tokens
+         query_start_loc = list(accumulate(query_lens, initial=0))
+         seq_start_loc = list(accumulate(seq_lens, initial=0))
+
+         num_seqs = len(seq_lens)
+         if use_captured_graph:
+             self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
+             self.block_tables.extend(self.__class__.BLOCK_TABLE_EXTENDER *
+                                      cuda_graph_pad_size)
+             num_decode_tokens = batch_size - self.num_prefill_tokens
+
+             block_tables = self._get_graph_runner_block_tables(
+                 num_seqs, self.block_tables)
+         else:
+             block_tables = make_tensor_with_pad(
+                 self.block_tables,
+                 pad=0,
+                 dtype=torch.int,
+                 device=device,
+             )
+         assert max_query_len > 0, ("query_lens: {}".format(query_lens))
+
+         assert device is not None
+         context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int,
+                                                device, self.runner.pin_memory)
+         seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device,
+                                            self.runner.pin_memory)
+         slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long,
+                                                device, self.runner.pin_memory)
+         query_start_loc_tensor = async_tensor_h2d(query_start_loc,
+                                                   torch.int32, device,
+                                                   self.runner.pin_memory)
+         seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
+                                                 device, self.runner.pin_memory)
+
+         context_chunk_cu_seq_lens = None
+         context_chunk_starts = None
+         context_chunk_seq_tot = None
+         context_chunk_max_seq_lens = None
+
+         if (self.chunked_prefill_enabled or self.enable_prefix_caching) \
+                 and self.num_prefills > 0 \
+                 and context_lens_tensor is not None \
+                 and context_lens_tensor[:self.num_prefills].max() > 0:
+
+             # NOTE: it is recommended you read the `Chunked Prefill` section
+             # in the comment at the top of the file before trying to
+             # understand the following code
+
+             num_prefills_with_context = \
+                 (context_lens_tensor[:self.num_prefills] > 0).sum().item()
+
+             # currently we allocate an equal amount of workspace for each
+             # prefill in the batch; we could probably use a more advanced
+             # algorithm here and allocate more workspace to prefills with
+             # longer context lengths
+             max_context_chunk = \
+                 self.context_chunk_workspace_size // num_prefills_with_context
+
+             # align max_context_chunk to page_size by rounding down,
+             # currently the `gather_cache` kernel cannot handle
+             # `context_chunk_starts` that are not aligned to page_size
+             max_context_chunk = round_down(max_context_chunk, self.page_size)
+             assert max_context_chunk > 0
+             num_chunks = cdiv(context_lens_tensor.max(), max_context_chunk)
+
+             # if `max_context_chunk = 256`, `num_chunks = 3`, and
+             # `num_prefills_with_context = 4`, create a tensor that looks like
+             # [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]
+             context_chunk_starts = \
+                 torch.arange(num_chunks, device=device, dtype=torch.int32) \
+                 .unsqueeze(1).expand(-1, self.num_prefills) \
+                 * max_context_chunk
+             chunk_ends = torch.min(
+                 context_lens_tensor[:self.num_prefills].unsqueeze(0),
+                 context_chunk_starts + max_context_chunk)
+             chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0)
+             _context_chunk_cu_seq_lens = chunk_seq_lens.cumsum(dim=1).to(
+                 torch.int32)
+             zero = torch.zeros(num_chunks, dtype=torch.int32,
+                                device=device).unsqueeze(-1)
+             context_chunk_cu_seq_lens = \
+                 torch.cat([zero, _context_chunk_cu_seq_lens], dim=1)
+             context_chunk_max_seq_lens = \
+                 chunk_seq_lens.max(dim=1).values.tolist()
+             context_chunk_seq_tot = chunk_seq_lens.sum(dim=1).tolist()
+             assert max(context_chunk_seq_tot) <= \
+                 self.context_chunk_workspace_size
+
+         return self.runner.attn_backend.make_metadata(
+             # Required by ModelRunner
+             use_cuda_graph=use_captured_graph,  # Not Attention Related
+             # Required by Attention Metadata
+             num_prefills=self.num_prefills,
+             slot_mapping=slot_mapping_tensor,
+             num_prefill_tokens=self.num_prefill_tokens,
+             num_decode_tokens=num_decode_tokens,
+             # Required by Attention Metadata (not used)
+             multi_modal_placeholder_index_maps=None,  # Not Attention Related
+             enable_kv_scales_calculation=False,
+             # MLACommonMetadata
+             seq_lens=seq_lens,
+             seq_lens_tensor=seq_lens_tensor,
+             max_query_len=max_query_len,
+             max_decode_query_len=max_decode_query_len,
+             max_prefill_seq_len=max_prefill_seq_len,
+             max_decode_seq_len=max_decode_seq_len,
+             query_start_loc=query_start_loc_tensor,
+             seq_start_loc=seq_start_loc_tensor,
+             context_lens_tensor=context_lens_tensor,
+             block_tables=block_tables,
+             head_dim=self.runner.model_config.get_head_size(),
+             is_profile_run=self.runner.in_profile_run,
+             # MLACommonMetadata Chunk prefill specific
+             context_chunk_cu_seq_lens=context_chunk_cu_seq_lens,
+             context_chunk_starts=context_chunk_starts,
+             context_chunk_seq_tot=context_chunk_seq_tot,
+             context_chunk_max_seq_lens=context_chunk_max_seq_lens,
+         )
+
+
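The chunk arithmetic in `build` can be sanity-checked by reproducing the inline comment's scenario (`max_context_chunk = 256`, `num_chunks = 3`, four prefills with context; the context lengths themselves are made up):

```python
import torch

num_prefills, max_context_chunk, num_chunks = 4, 256, 3
context_lens = torch.tensor([700, 256, 300, 520], dtype=torch.int32)

context_chunk_starts = (torch.arange(num_chunks, dtype=torch.int32)
                        .unsqueeze(1).expand(-1, num_prefills)
                        * max_context_chunk)
# [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512]]

chunk_ends = torch.min(context_lens.unsqueeze(0),
                       context_chunk_starts + max_context_chunk)
chunk_seq_lens = (chunk_ends - context_chunk_starts).clamp(min=0)
# Row i = how much of each prefill's context falls into chunk i:
assert chunk_seq_lens.tolist() == [[256, 256, 256, 256],
                                   [256, 0, 44, 256],
                                   [188, 0, 0, 8]]
```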
+ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
+     """
+     NOTE: Please read the comment at the top of the file before trying to
+     understand this class
+     """
+
+     def __init__(
+             self,
+             num_heads: int,
+             head_size: int,
+             scale: float,
+             num_kv_heads: int,
+             alibi_slopes: Optional[List[float]],
+             sliding_window: Optional[int],
+             kv_cache_dtype: str,
+             blocksparse_params: Optional[Dict[str, Any]],
+             logits_soft_cap: Optional[float],
+             attn_type: str,
+             kv_sharing_target_layer_name: Optional[str],
+             # MLA Specific Arguments
+             q_lora_rank: Optional[int],
+             kv_lora_rank: int,
+             qk_nope_head_dim: int,
+             qk_rope_head_dim: int,
+             qk_head_dim: int,
+             v_head_dim: int,
+             kv_b_proj: ColumnParallelLinear,
+     ) -> None:
+         if kv_sharing_target_layer_name is not None:
+             raise NotImplementedError("KV sharing not supported in V0.")
+         self.num_heads = num_heads
+         self.head_size = head_size
+         self.scale = float(scale)
+         self.num_kv_heads = num_kv_heads
+         self.kv_cache_dtype = kv_cache_dtype
+
+         self.q_lora_rank = q_lora_rank
+         self.kv_lora_rank = kv_lora_rank
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.qk_head_dim = qk_head_dim
+         self.v_head_dim = v_head_dim
+         self.kv_b_proj = kv_b_proj
+
+         self.triton_fa_func = triton_attention
+         # Handle the differences between the flash_attn_varlen from
+         # flash_attn and the one from vllm_flash_attn. The former is used on
+         # ROCm and the latter has an additional parameter to control
+         # FA2 vs FA3.
+         self.flash_attn_varlen_func = flash_attn_varlen_func
+         self.vllm_flash_attn_version = get_flash_attn_version()
+         if self.vllm_flash_attn_version is not None:
+             self.flash_attn_varlen_func = \
+                 functools.partial(flash_attn_varlen_func,
+                                   fa_version=self.vllm_flash_attn_version)
+
+         # For MLA the v head dim is smaller than the qk head dim, so we pad
+         # out v with 0s to match the qk head dim for attention backends that
+         # do not support different head dims.
+         # We don't need to pad v if we are on a Hopper system with FA3.
+         self._pad_v = self.vllm_flash_attn_version is None or not (
+             self.vllm_flash_attn_version == 3
+             and current_platform.get_device_capability()[0] == 9)
+
+     def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale,
+                                          return_softmax_lse, **kwargs):
+         maybe_padded_v = v
+         if self._pad_v:
+             maybe_padded_v = torch.nn.functional.pad(
+                 v, [0, q.shape[-1] - v.shape[-1]], value=0)
+
+         if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN \
+                 and not return_softmax_lse:
+             attn_out = self.triton_fa_func(
+                 q,
+                 k,
+                 maybe_padded_v,
+                 None,  # output
+                 kwargs["cu_seqlens_q"],
+                 kwargs["cu_seqlens_k"],
+                 kwargs["max_seqlen_q"],
+                 kwargs["max_seqlen_k"],
+                 kwargs["causal"],
+                 softmax_scale,
+                 None,  # bias
+             )
+         elif is_vllm_fa:
+             attn_out = self.flash_attn_varlen_func(
+                 q=q,
+                 k=k,
+                 v=maybe_padded_v,
+                 return_softmax_lse=return_softmax_lse,
+                 softmax_scale=softmax_scale,
+                 **kwargs,
+             )
+         else:
+             # Use return_attn_probs instead of return_softmax_lse for ROCm
+             attn_out = self.flash_attn_varlen_func(
+                 q=q,
+                 k=k,
+                 v=maybe_padded_v,
+                 return_attn_probs=return_softmax_lse,
+                 softmax_scale=softmax_scale,
+                 **kwargs,
+             )
+
+         # Unpack the output if there are multiple results:
+         # triton always returns (output, softmax_lse),
+         # vllm_flash_attn returns (output, softmax_lse) when
+         # `return_softmax_lse = True`, and
+         # flash_attn (ROCm) returns (output, softmax_lse, ...) when
+         # `return_attn_probs = True`.
+         rest = None
+         if isinstance(attn_out, tuple):
+             attn_out, *rest = attn_out
+
+         # Remain consistent with old `flash_attn_varlen_func` where there
+         # is only one output tensor if `return_softmax_lse` is False.
+         if return_softmax_lse:
+             assert rest is not None
+             return attn_out, rest[0]
+         return attn_out
+
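Why zero-padding `v` is safe: the attention output is a convex combination of value rows, so padded columns come out exactly zero and can be sliced off afterwards (which `_forward_prefill` does). A quick check of the invariant, with `scaled_dot_product_attention` standing in for the FA kernels:

```python
import torch
import torch.nn.functional as F

B, D_qk, D_v = 4, 8, 6
q, k = torch.randn(1, B, D_qk), torch.randn(1, B, D_qk)
v = torch.randn(1, B, D_v)

v_padded = F.pad(v, [0, D_qk - D_v], value=0)  # as self._pad_v does
out = F.scaled_dot_product_attention(q, k, v_padded)

assert torch.all(out[..., D_v:] == 0)  # padded columns stay zero
assert torch.allclose(out[..., :D_v],
                      F.scaled_dot_product_attention(q, k, v), atol=1e-6)
```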
+     def _v_up_proj(self, x):
+         # Convert from (B, N, L) to (N, B, L)
+         x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
+         # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
+         x = torch.bmm(x, self.W_UV)
+         # Convert from (N, B, V) to (B, N * V)
+         return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim)
+
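The transpose/bmm/transpose dance in `_v_up_proj` is a per-head matmul; an einsum spelling makes the shapes easier to audit (B = tokens, N = heads, L = kv_lora_rank, V = v_head_dim; the sizes here are arbitrary):

```python
import torch

B, N, L, V = 5, 8, 64, 32
x = torch.randn(B, N, L)
W_UV = torch.randn(N, L, V)

out_bmm = torch.bmm(x.transpose(0, 1), W_UV).transpose(0, 1).reshape(B, N * V)
out_ein = torch.einsum("bnl,nlv->bnv", x, W_UV).reshape(B, N * V)
assert torch.allclose(out_bmm, out_ein, atol=1e-4)
```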
+     def process_weights_after_loading(self, act_dtype: torch.dtype):
+
+         def get_layer_weight(layer):
+             WEIGHT_NAMES = ("weight", "qweight", "weight_packed")
+             for attr in WEIGHT_NAMES:
+                 if hasattr(layer, attr):
+                     return getattr(layer, attr)
+             raise AttributeError(
+                 f"Layer '{layer}' has no recognized weight attribute:"
+                 f" {WEIGHT_NAMES}.")
+
+         def get_and_maybe_dequant_weights(layer: LinearBase):
+             if not isinstance(layer.quant_method, UnquantizedLinearMethod):
+                 # NOTE: This should only be used offline, since it's O(N^3)
+                 eye = torch.eye(layer.input_size_per_partition,
+                                 dtype=act_dtype,
+                                 device=get_layer_weight(layer).device)
+                 dequant_weights = layer.quant_method.apply(layer,
+                                                            eye,
+                                                            bias=None)
+                 del eye
+                 # standardize to (output, input)
+                 return dequant_weights.T
+             return layer.weight
+
+         # we currently do not have quantized bmm's, which are needed for
+         # `W_UV` and `W_UK_T`, so we just store fp16/bf16 copies and perform
+         # the bmm's in 16-bit; the extra memory overhead of this is fairly low
+         kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
+         assert kv_b_proj_weight.shape == (
+             self.kv_lora_rank,
+             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), (
+                 f"{kv_b_proj_weight.shape=}, "
+                 f"{self.kv_lora_rank=}, "
+                 f"{self.num_heads=}, "
+                 f"{self.qk_nope_head_dim=}, "
+                 f"{self.v_head_dim=}")
+         kv_b_proj_weight = kv_b_proj_weight.view(
+             self.kv_lora_rank,
+             self.num_heads,
+             self.qk_nope_head_dim + self.v_head_dim,
+         )
+
+         W_UK, W_UV = kv_b_proj_weight.split(
+             [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+         # Convert from (L, N, V) to (N, L, V)
+         self.W_UV = W_UV.transpose(0, 1)
+         # Convert from (L, N, P) to (N, P, L)
+         self.W_UK_T = W_UK.permute(1, 2, 0)
+
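The identity-matrix trick in `get_and_maybe_dequant_weights` is worth spelling out: a linear layer computes `y = x @ W^T`, so feeding it the identity returns `W^T`, i.e. a dequantized copy of the weights, which is why it costs O(N^3) and is flagged as offline-only. A sketch with a plain `nn.Linear` standing in for a quantized layer:

```python
import torch
import torch.nn as nn

layer = nn.Linear(in_features=8, out_features=4, bias=False)

eye = torch.eye(layer.in_features)
dequant_weights_t = layer(eye)  # I @ W^T == W^T, shape (in, out)

# standardize to (output, input), as the code above does with .T
assert torch.allclose(dequant_weights_t.T, layer.weight)
```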
+     def _compute_prefill_context(
+         self,
+         q: torch.Tensor,
+         kv_c_and_k_pe_cache: torch.Tensor,
+         attn_metadata: MLACommonMetadata,
+     ):
+         prefill_metadata = attn_metadata.prefill_metadata
+         assert prefill_metadata is not None
+         assert prefill_metadata.context_chunk_seq_tot is not None
+         assert prefill_metadata.context_chunk_cu_seq_lens is not None
+         assert prefill_metadata.context_chunk_starts is not None
+         assert prefill_metadata.context_chunk_max_seq_lens is not None
+         assert prefill_metadata.context_lens_tensor is not None
+
+         output = None
+         iters = len(prefill_metadata.context_chunk_seq_tot)
+
+         # Fetch from attn_metadata directly, since it is late-bound by
+         # MLAAttentionState; grabbing it directly from `attn_metadata`
+         # avoids any weirdness around prefill_metadata caching
+         assert attn_metadata.context_chunk_workspace is not None
+         workspace = attn_metadata.context_chunk_workspace
+
+         for i in range(iters):
+             toks = prefill_metadata.context_chunk_seq_tot[i]
+
+             ops.gather_cache(
+                 src_cache=kv_c_and_k_pe_cache,
+                 dst=workspace,
+                 block_table=prefill_metadata.block_tables,
+                 cu_seq_lens=prefill_metadata.context_chunk_cu_seq_lens[i],
+                 batch_size=prefill_metadata.num_prefills,
+                 seq_starts=prefill_metadata.context_chunk_starts[i],
+             )
+
+             kv_c_normed = workspace[:toks][..., :self.kv_lora_rank]
+             k_pe = workspace[:toks][..., self.kv_lora_rank:].unsqueeze(1)
+
+             kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
+                 -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+             k_nope, v = kv_nope.split(
+                 [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+             k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
+                           dim=-1)
+
+             attn_output, attn_softmax_lse = \
+                 self._flash_attn_varlen_diff_headdims(
+                     q=q,
+                     k=k,
+                     v=v,
+                     cu_seqlens_q=prefill_metadata.query_start_loc,
+                     cu_seqlens_k=prefill_metadata.context_chunk_cu_seq_lens[i],
+                     max_seqlen_q=prefill_metadata.max_query_len,
+                     max_seqlen_k=prefill_metadata.context_chunk_max_seq_lens[i],
+                     softmax_scale=self.scale,
+                     causal=False,  # Context is unmasked
+                     return_softmax_lse=True,
+                 )
+
+             if output is None:
+                 output = attn_output
+                 output_lse = attn_softmax_lse
+             else:
+                 output_tmp = torch.empty_like(output)
+                 output_lse_tmp = torch.empty_like(output_lse)
+                 merge_attn_states(
+                     output=output_tmp,
+                     output_lse=output_lse_tmp,
+                     prefix_output=output,
+                     prefix_lse=output_lse,
+                     suffix_output=attn_output,
+                     suffix_lse=attn_softmax_lse,
+                 )
+                 output = output_tmp
+                 output_lse = output_lse_tmp
+
+         return output, output_lse
+
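The per-chunk partial attentions are combined with the standard log-sum-exp merge, which is what `merge_attn_states` computes per query and head: each partial output is weighted by its share of the total softmax mass. A scalar-level sketch of the formula (not the kernel):

```python
import torch

def merge(o1, l1, o2, l2):
    m = torch.maximum(l1, l2)  # subtract the max for numerical stability
    w1, w2 = torch.exp(l1 - m), torch.exp(l2 - m)
    return (w1 * o1 + w2 * o2) / (w1 + w2), m + torch.log(w1 + w2)

# Check against attention computed over the concatenated key chunks
s1, s2 = torch.randn(3), torch.randn(5)  # scores for two key chunks
v1, v2 = torch.randn(3), torch.randn(5)  # 1-d values, single query
o1 = (torch.softmax(s1, 0) * v1).sum()
o2 = (torch.softmax(s2, 0) * v2).sum()
merged, _ = merge(o1, s1.logsumexp(0), o2, s2.logsumexp(0))
full = (torch.softmax(torch.cat([s1, s2]), 0) * torch.cat([v1, v2])).sum()
assert torch.allclose(merged, full, atol=1e-5)
```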
+     def _forward_prefill(
+         self,
+         q: torch.Tensor,
+         kv_c_normed: torch.Tensor,
+         k_pe: torch.Tensor,
+         kv_c_and_k_pe_cache: torch.Tensor,
+         attn_metadata: MLACommonMetadata,
+     ) -> torch.Tensor:
+
+         prefill_metadata = attn_metadata.prefill_metadata
+         assert prefill_metadata is not None
+
+         has_context = prefill_metadata.context_lens_tensor is not None \
+             and prefill_metadata.context_lens_tensor.max() > 0
+
+         kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
+             -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+         k_nope, v = kv_nope.split(
+             [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+         k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
+
+         output = self._flash_attn_varlen_diff_headdims(
+             q=q,
+             k=k,
+             v=v,
+             cu_seqlens_q=prefill_metadata.query_start_loc,
+             cu_seqlens_k=prefill_metadata.query_start_loc,
+             max_seqlen_q=prefill_metadata.max_prefill_seq_len,
+             max_seqlen_k=prefill_metadata.max_prefill_seq_len,
+             softmax_scale=self.scale,
+             causal=True,
+             return_softmax_lse=has_context,
+         )
+
+         if has_context:
+             # ROCm flash_attn_varlen_func will return 3 objects instead of 2
+             suffix_output, suffix_lse = output
+             context_output, context_lse = self._compute_prefill_context(
+                 q, kv_c_and_k_pe_cache, attn_metadata)
+
+             output = torch.empty_like(suffix_output)
+             merge_attn_states(
+                 output=output,
+                 prefix_output=context_output,
+                 prefix_lse=context_lse,
+                 suffix_output=suffix_output,
+                 suffix_lse=suffix_lse,
+             )
+
+         # unpad if necessary
+         if self._pad_v:
+             output = output[..., :v.shape[-1]]
+
+         return output.flatten(start_dim=-2)
+
+     @abstractmethod
+     def _forward_decode(
+         self,
+         ql_nope: torch.Tensor,
+         q_pe: torch.Tensor,
+         kv_c_and_k_pe_cache: torch.Tensor,
+         attn_metadata: T,
+     ) -> torch.Tensor:
+         raise NotImplementedError
+
+     def forward(
+         self,
+         layer: AttentionLayer,
+         q: torch.Tensor,  # query in unified attn
+         k_c_normed: torch.Tensor,  # key in unified attn
+         k_pe: torch.Tensor,  # value in unified attn
+         kv_cache: torch.Tensor,
+         attn_metadata: T,
+         output: Optional[torch.Tensor] = None,
+     ) -> torch.Tensor:
+         if output is not None:
+             raise NotImplementedError(
+                 "output is not yet supported for MLAImplBase")
+
+         if attn_metadata.is_profile_run and \
+                 attn_metadata.context_chunk_workspace is not None:
+             # During the profile run, try to simulate the worst-case output
+             # size for `self.kv_b_proj(kv_c_normed)` in
+             # `_compute_prefill_context`, since this can be large
+             _ = torch.empty(
+                 (attn_metadata.context_chunk_workspace.shape[0],
+                  self.num_heads, self.qk_nope_head_dim + self.v_head_dim),
+                 device=k_c_normed.device,
+                 dtype=k_c_normed.dtype,
+             )
+
+         has_decode = attn_metadata.decode_metadata is not None
+         has_prefill = attn_metadata.prefill_metadata is not None
+
+         num_prefill_tokens: int = attn_metadata.num_prefill_tokens
+         q = q.view(-1, self.num_heads, self.qk_head_dim)
+
+         decode_q = q[num_prefill_tokens:]
+
+         prefill_q = q[:num_prefill_tokens]
+         prefill_k_pe = k_pe[:num_prefill_tokens]
+         prefill_k_c_normed = k_c_normed[:num_prefill_tokens]
+
+         # write the latent and rope to kv cache
+         if kv_cache.numel() > 0:
+             ops.concat_and_cache_mla(
+                 k_c_normed,
+                 k_pe.squeeze(1),
+                 kv_cache,
+                 attn_metadata.slot_mapping.flatten(),
+                 kv_cache_dtype=self.kv_cache_dtype,
+                 scale=layer._k_scale,
+             )
+
+         output = torch.empty(attn_metadata.num_prefill_tokens +
+                              attn_metadata.num_decode_tokens,
+                              self.v_head_dim * self.num_heads,
+                              device=q.device,
+                              dtype=q.dtype)
+         if has_prefill:
+             output[:num_prefill_tokens] = self._forward_prefill(
+                 prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
+                 attn_metadata)
+
+         if has_decode:
+             decode_q_nope, decode_q_pe = decode_q.split(
+                 [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+             # Convert from (B, N, P) to (N, B, P)
+             decode_q_nope = decode_q_nope.transpose(0, 1)
+             # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
+             decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T)
+             # Convert from (N, B, L) to (B, N, L)
+             decode_ql_nope = decode_ql_nope.transpose(0, 1)
+
+             output[num_prefill_tokens:] = self._forward_decode(
+                 decode_ql_nope, decode_q_pe, kv_cache, attn_metadata)
+
+         return output
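The decode-path `torch.bmm(decode_q_nope, self.W_UK_T)` above is the MLA weight-absorption step: rather than up-projecting every cached latent with `W_UK` to materialize keys, the query is projected into the latent space once, since q · (c @ W_UK) == (q @ W_UK^T) · c. A per-head sketch of the equivalence (small, arbitrary sizes):

```python
import torch

P, L = 16, 64                # qk_nope_head_dim, kv_lora_rank
W_UK = torch.randn(L, P)     # one head's up-projection block of kv_b_proj
q_nope = torch.randn(P)
c = torch.randn(L)           # one cached compressed latent

score_naive = q_nope @ (c @ W_UK)  # materialize the key, then dot
ql_nope = q_nope @ W_UK.T          # what bmm with W_UK_T does per head
score_absorbed = ql_nope @ c       # attend directly in latent space
assert torch.allclose(score_naive, score_absorbed, atol=1e-3)
```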