vllm-cpu 0.9.2.post2 (cp311-cp311-manylinux_2_17_aarch64.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
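Since a wheel is an ordinary zip archive, a file listing like the one below can be reproduced locally with nothing but Python's standard library. The sketch below is illustrative only and is not part of the diff tooling; the wheel filename and local path are assumptions, so point it at wherever the wheel was actually downloaded.

```python
# Minimal sketch: enumerate the members of a downloaded wheel, which is
# what the "Files changed" listing below summarizes per file.
import zipfile

# Assumed canonical filename for the wheel named above; adjust the path
# to the actual download location.
WHEEL_PATH = "vllm_cpu-0.9.2.post2-cp311-cp311-manylinux_2_17_aarch64.whl"

with zipfile.ZipFile(WHEEL_PATH) as wheel:
    for info in wheel.infolist():
        # Print each archive member with its uncompressed size.
        print(f"{info.filename}  {info.file_size} bytes")
```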
Files changed (1236)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +214 -0
  3. vllm/_custom_ops.py +1915 -0
  4. vllm/_ipex_ops.py +350 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +139 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +325 -0
  20. vllm/attention/backends/blocksparse_attn.py +465 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1506 -0
  23. vllm/attention/backends/flash_attn.py +1008 -0
  24. vllm/attention/backends/flashinfer.py +1107 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +318 -0
  27. vllm/attention/backends/ipex_attn.py +403 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1391 -0
  30. vllm/attention/backends/pallas.py +356 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +1015 -0
  34. vllm/attention/backends/torch_sdpa.py +707 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +807 -0
  38. vllm/attention/layer.py +481 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +903 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/pallas_kv_cache_update.py +120 -0
  52. vllm/attention/ops/prefix_prefill.py +902 -0
  53. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  54. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  55. vllm/attention/ops/triton_decode_attention.py +674 -0
  56. vllm/attention/ops/triton_flash_attention.py +984 -0
  57. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  58. vllm/attention/ops/triton_unified_attention.py +738 -0
  59. vllm/attention/selector.py +214 -0
  60. vllm/attention/utils/fa_utils.py +72 -0
  61. vllm/beam_search.py +87 -0
  62. vllm/benchmarks/__init__.py +0 -0
  63. vllm/benchmarks/datasets.py +1441 -0
  64. vllm/benchmarks/endpoint_request_func.py +393 -0
  65. vllm/benchmarks/latency.py +168 -0
  66. vllm/benchmarks/serve.py +1063 -0
  67. vllm/benchmarks/throughput.py +609 -0
  68. vllm/benchmarks/utils.py +70 -0
  69. vllm/collect_env.py +820 -0
  70. vllm/compilation/__init__.py +0 -0
  71. vllm/compilation/activation_quant_fusion.py +89 -0
  72. vllm/compilation/backends.py +610 -0
  73. vllm/compilation/base_piecewise_backend.py +72 -0
  74. vllm/compilation/collective_fusion.py +127 -0
  75. vllm/compilation/compiler_interface.py +564 -0
  76. vllm/compilation/counter.py +41 -0
  77. vllm/compilation/cuda_piecewise_backend.py +218 -0
  78. vllm/compilation/decorators.py +250 -0
  79. vllm/compilation/fix_functionalization.py +191 -0
  80. vllm/compilation/fusion.py +645 -0
  81. vllm/compilation/fusion_attn.py +166 -0
  82. vllm/compilation/fx_utils.py +84 -0
  83. vllm/compilation/inductor_pass.py +115 -0
  84. vllm/compilation/monitor.py +39 -0
  85. vllm/compilation/multi_output_match.py +109 -0
  86. vllm/compilation/noop_elimination.py +165 -0
  87. vllm/compilation/pass_manager.py +82 -0
  88. vllm/compilation/sequence_parallelism.py +482 -0
  89. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  90. vllm/compilation/vllm_inductor_pass.py +70 -0
  91. vllm/compilation/wrapper.py +135 -0
  92. vllm/config.py +4913 -0
  93. vllm/connections.py +174 -0
  94. vllm/core/__init__.py +0 -0
  95. vllm/core/block/__init__.py +0 -0
  96. vllm/core/block/block_table.py +399 -0
  97. vllm/core/block/common.py +371 -0
  98. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  99. vllm/core/block/interfaces.py +319 -0
  100. vllm/core/block/naive_block.py +466 -0
  101. vllm/core/block/prefix_caching_block.py +1135 -0
  102. vllm/core/block/utils.py +28 -0
  103. vllm/core/block_manager.py +525 -0
  104. vllm/core/evictor.py +157 -0
  105. vllm/core/interfaces.py +139 -0
  106. vllm/core/placeholder_block_space_manager.py +103 -0
  107. vllm/core/scheduler.py +2126 -0
  108. vllm/device_allocator/__init__.py +0 -0
  109. vllm/device_allocator/cumem.py +281 -0
  110. vllm/distributed/__init__.py +6 -0
  111. vllm/distributed/communication_op.py +41 -0
  112. vllm/distributed/device_communicators/__init__.py +0 -0
  113. vllm/distributed/device_communicators/all2all.py +264 -0
  114. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  115. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  116. vllm/distributed/device_communicators/cuda_communicator.py +194 -0
  117. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  118. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  119. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  120. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  121. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  122. vllm/distributed/device_communicators/pynccl.py +218 -0
  123. vllm/distributed/device_communicators/pynccl_wrapper.py +349 -0
  124. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  125. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  126. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  127. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  128. vllm/distributed/eplb/__init__.py +8 -0
  129. vllm/distributed/eplb/eplb_state.py +432 -0
  130. vllm/distributed/eplb/rebalance_algo.py +234 -0
  131. vllm/distributed/eplb/rebalance_execute.py +307 -0
  132. vllm/distributed/kv_events.py +356 -0
  133. vllm/distributed/kv_transfer/README.md +29 -0
  134. vllm/distributed/kv_transfer/__init__.py +12 -0
  135. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  137. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  138. vllm/distributed/kv_transfer/kv_connector/factory.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  140. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  141. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  142. vllm/distributed/kv_transfer/kv_connector/utils.py +109 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1103 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +485 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +533 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +265 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +389 -0
  153. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  154. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  155. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  156. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  158. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  159. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  160. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  161. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  162. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  163. vllm/distributed/parallel_state.py +1385 -0
  164. vllm/distributed/tpu_distributed_utils.py +178 -0
  165. vllm/distributed/utils.py +536 -0
  166. vllm/engine/__init__.py +0 -0
  167. vllm/engine/arg_utils.py +1801 -0
  168. vllm/engine/async_llm_engine.py +1200 -0
  169. vllm/engine/async_timeout.py +173 -0
  170. vllm/engine/llm_engine.py +2101 -0
  171. vllm/engine/metrics.py +629 -0
  172. vllm/engine/metrics_types.py +94 -0
  173. vllm/engine/multiprocessing/__init__.py +148 -0
  174. vllm/engine/multiprocessing/client.py +681 -0
  175. vllm/engine/multiprocessing/engine.py +460 -0
  176. vllm/engine/output_processor/__init__.py +0 -0
  177. vllm/engine/output_processor/interfaces.py +75 -0
  178. vllm/engine/output_processor/multi_step.py +216 -0
  179. vllm/engine/output_processor/single_step.py +145 -0
  180. vllm/engine/output_processor/stop_checker.py +131 -0
  181. vllm/engine/output_processor/util.py +28 -0
  182. vllm/engine/protocol.py +326 -0
  183. vllm/entrypoints/__init__.py +0 -0
  184. vllm/entrypoints/api_server.py +178 -0
  185. vllm/entrypoints/chat_utils.py +1278 -0
  186. vllm/entrypoints/cli/__init__.py +12 -0
  187. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  188. vllm/entrypoints/cli/benchmark/base.py +25 -0
  189. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  190. vllm/entrypoints/cli/benchmark/main.py +58 -0
  191. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  192. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  193. vllm/entrypoints/cli/collect_env.py +36 -0
  194. vllm/entrypoints/cli/main.py +71 -0
  195. vllm/entrypoints/cli/openai.py +201 -0
  196. vllm/entrypoints/cli/run_batch.py +69 -0
  197. vllm/entrypoints/cli/serve.py +265 -0
  198. vllm/entrypoints/cli/types.py +29 -0
  199. vllm/entrypoints/launcher.py +147 -0
  200. vllm/entrypoints/llm.py +1599 -0
  201. vllm/entrypoints/logger.py +50 -0
  202. vllm/entrypoints/openai/__init__.py +0 -0
  203. vllm/entrypoints/openai/api_server.py +1495 -0
  204. vllm/entrypoints/openai/cli_args.py +331 -0
  205. vllm/entrypoints/openai/logits_processors.py +90 -0
  206. vllm/entrypoints/openai/protocol.py +2096 -0
  207. vllm/entrypoints/openai/run_batch.py +473 -0
  208. vllm/entrypoints/openai/serving_chat.py +1258 -0
  209. vllm/entrypoints/openai/serving_classification.py +160 -0
  210. vllm/entrypoints/openai/serving_completion.py +618 -0
  211. vllm/entrypoints/openai/serving_embedding.py +201 -0
  212. vllm/entrypoints/openai/serving_engine.py +988 -0
  213. vllm/entrypoints/openai/serving_models.py +315 -0
  214. vllm/entrypoints/openai/serving_pooling.py +234 -0
  215. vllm/entrypoints/openai/serving_score.py +431 -0
  216. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  217. vllm/entrypoints/openai/serving_transcription.py +132 -0
  218. vllm/entrypoints/openai/speech_to_text.py +395 -0
  219. vllm/entrypoints/openai/tool_parsers/__init__.py +25 -0
  220. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  221. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  222. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  223. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  224. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  225. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  226. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  227. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  228. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  229. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +369 -0
  230. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  231. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  232. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  233. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  234. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +466 -0
  235. vllm/entrypoints/score_utils.py +50 -0
  236. vllm/entrypoints/ssl.py +75 -0
  237. vllm/entrypoints/utils.py +262 -0
  238. vllm/env_override.py +41 -0
  239. vllm/envs.py +1029 -0
  240. vllm/executor/__init__.py +0 -0
  241. vllm/executor/executor_base.py +401 -0
  242. vllm/executor/mp_distributed_executor.py +244 -0
  243. vllm/executor/msgspec_utils.py +30 -0
  244. vllm/executor/multiproc_worker_utils.py +313 -0
  245. vllm/executor/ray_distributed_executor.py +701 -0
  246. vllm/executor/ray_utils.py +399 -0
  247. vllm/executor/uniproc_executor.py +139 -0
  248. vllm/forward_context.py +185 -0
  249. vllm/inputs/__init__.py +41 -0
  250. vllm/inputs/data.py +331 -0
  251. vllm/inputs/parse.py +151 -0
  252. vllm/inputs/preprocess.py +924 -0
  253. vllm/inputs/registry.py +245 -0
  254. vllm/jsontree.py +80 -0
  255. vllm/logger.py +212 -0
  256. vllm/logging_utils/__init__.py +8 -0
  257. vllm/logging_utils/dump_input.py +81 -0
  258. vllm/logging_utils/formatter.py +18 -0
  259. vllm/logits_process.py +119 -0
  260. vllm/lora/__init__.py +0 -0
  261. vllm/lora/fully_sharded_layers.py +355 -0
  262. vllm/lora/layers.py +1285 -0
  263. vllm/lora/lora.py +199 -0
  264. vllm/lora/models.py +818 -0
  265. vllm/lora/ops/__init__.py +0 -0
  266. vllm/lora/ops/torch_ops/__init__.py +16 -0
  267. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  268. vllm/lora/ops/triton_ops/__init__.py +12 -0
  269. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  270. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  271. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  272. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  273. vllm/lora/ops/triton_ops/utils.py +120 -0
  274. vllm/lora/ops/xla_ops/__init__.py +7 -0
  275. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  276. vllm/lora/peft_helper.py +136 -0
  277. vllm/lora/punica_wrapper/__init__.py +10 -0
  278. vllm/lora/punica_wrapper/punica_base.py +485 -0
  279. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  280. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  281. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  284. vllm/lora/punica_wrapper/utils.py +164 -0
  285. vllm/lora/request.py +99 -0
  286. vllm/lora/resolver.py +85 -0
  287. vllm/lora/utils.py +240 -0
  288. vllm/lora/worker_manager.py +256 -0
  289. vllm/model_executor/__init__.py +16 -0
  290. vllm/model_executor/custom_op.py +208 -0
  291. vllm/model_executor/guided_decoding/__init__.py +181 -0
  292. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  293. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  294. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  295. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  296. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  297. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  298. vllm/model_executor/guided_decoding/utils.py +242 -0
  299. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  300. vllm/model_executor/layers/__init__.py +0 -0
  301. vllm/model_executor/layers/activation.py +420 -0
  302. vllm/model_executor/layers/fused_moe/__init__.py +78 -0
  303. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +298 -0
  304. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +140 -0
  305. vllm/model_executor/layers/fused_moe/config.py +456 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  475. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +215 -0
  476. vllm/model_executor/layers/fused_moe/cutlass_moe.py +645 -0
  477. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +250 -0
  478. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +231 -0
  479. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +183 -0
  480. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1021 -0
  481. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +234 -0
  482. vllm/model_executor/layers/fused_moe/fused_moe.py +1734 -0
  483. vllm/model_executor/layers/fused_moe/layer.py +1528 -0
  484. vllm/model_executor/layers/fused_moe/modular_kernel.py +598 -0
  485. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +224 -0
  486. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  487. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  488. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  489. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +233 -0
  490. vllm/model_executor/layers/fused_moe/prepare_finalize.py +66 -0
  491. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +429 -0
  492. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +136 -0
  493. vllm/model_executor/layers/fused_moe/utils.py +144 -0
  494. vllm/model_executor/layers/layernorm.py +287 -0
  495. vllm/model_executor/layers/lightning_attn.py +652 -0
  496. vllm/model_executor/layers/linear.py +1547 -0
  497. vllm/model_executor/layers/logits_processor.py +197 -0
  498. vllm/model_executor/layers/mamba/__init__.py +0 -0
  499. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  500. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  501. vllm/model_executor/layers/mamba/mamba_mixer2.py +731 -0
  502. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  503. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  504. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  505. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  506. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  507. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  508. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  509. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  510. vllm/model_executor/layers/pooler.py +473 -0
  511. vllm/model_executor/layers/quantization/__init__.py +160 -0
  512. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  513. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  514. vllm/model_executor/layers/quantization/awq.py +228 -0
  515. vllm/model_executor/layers/quantization/awq_marlin.py +523 -0
  516. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  517. vllm/model_executor/layers/quantization/base_config.py +164 -0
  518. vllm/model_executor/layers/quantization/bitblas.py +462 -0
  519. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  520. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  521. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +694 -0
  522. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1613 -0
  523. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  524. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  525. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  526. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  527. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  528. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +149 -0
  529. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  530. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  531. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  532. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  533. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  534. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  535. vllm/model_executor/layers/quantization/deepgemm.py +83 -0
  536. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  537. vllm/model_executor/layers/quantization/experts_int8.py +204 -0
  538. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  539. vllm/model_executor/layers/quantization/fp8.py +950 -0
  540. vllm/model_executor/layers/quantization/gguf.py +577 -0
  541. vllm/model_executor/layers/quantization/gptq.py +278 -0
  542. vllm/model_executor/layers/quantization/gptq_bitblas.py +446 -0
  543. vllm/model_executor/layers/quantization/gptq_marlin.py +679 -0
  544. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  545. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  546. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  547. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  548. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  549. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  550. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  551. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  552. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  553. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +132 -0
  554. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  555. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  556. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  557. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  558. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  559. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  560. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  561. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  562. vllm/model_executor/layers/quantization/marlin.py +263 -0
  563. vllm/model_executor/layers/quantization/modelopt.py +747 -0
  564. vllm/model_executor/layers/quantization/moe_wna16.py +457 -0
  565. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  566. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  567. vllm/model_executor/layers/quantization/qqq.py +275 -0
  568. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  569. vllm/model_executor/layers/quantization/quark/quark.py +437 -0
  570. vllm/model_executor/layers/quantization/quark/quark_moe.py +245 -0
  571. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  572. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  573. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  574. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +157 -0
  575. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  576. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  577. vllm/model_executor/layers/quantization/rtn.py +289 -0
  578. vllm/model_executor/layers/quantization/schema.py +86 -0
  579. vllm/model_executor/layers/quantization/torchao.py +212 -0
  580. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  581. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  582. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  583. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/fp8_utils.py +653 -0
  787. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  788. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  789. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  790. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  791. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  792. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  793. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  794. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  795. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  796. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  797. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  798. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +146 -0
  799. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  800. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  801. vllm/model_executor/layers/rejection_sampler.py +406 -0
  802. vllm/model_executor/layers/resampler.py +270 -0
  803. vllm/model_executor/layers/rotary_embedding.py +2025 -0
  804. vllm/model_executor/layers/sampler.py +1204 -0
  805. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  806. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  807. vllm/model_executor/layers/utils.py +116 -0
  808. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  809. vllm/model_executor/model_loader/__init__.py +77 -0
  810. vllm/model_executor/model_loader/base_loader.py +43 -0
  811. vllm/model_executor/model_loader/bitsandbytes_loader.py +613 -0
  812. vllm/model_executor/model_loader/default_loader.py +282 -0
  813. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  814. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  815. vllm/model_executor/model_loader/neuron.py +476 -0
  816. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  817. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  818. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  819. vllm/model_executor/model_loader/tensorizer.py +602 -0
  820. vllm/model_executor/model_loader/tensorizer_loader.py +127 -0
  821. vllm/model_executor/model_loader/tpu.py +113 -0
  822. vllm/model_executor/model_loader/utils.py +315 -0
  823. vllm/model_executor/model_loader/weight_utils.py +782 -0
  824. vllm/model_executor/models/__init__.py +30 -0
  825. vllm/model_executor/models/adapters.py +375 -0
  826. vllm/model_executor/models/aimv2.py +246 -0
  827. vllm/model_executor/models/arctic.py +559 -0
  828. vllm/model_executor/models/aria.py +670 -0
  829. vllm/model_executor/models/aya_vision.py +486 -0
  830. vllm/model_executor/models/baichuan.py +474 -0
  831. vllm/model_executor/models/bamba.py +558 -0
  832. vllm/model_executor/models/bart.py +938 -0
  833. vllm/model_executor/models/bert.py +513 -0
  834. vllm/model_executor/models/bert_with_rope.py +617 -0
  835. vllm/model_executor/models/blip.py +339 -0
  836. vllm/model_executor/models/blip2.py +728 -0
  837. vllm/model_executor/models/bloom.py +373 -0
  838. vllm/model_executor/models/chameleon.py +1146 -0
  839. vllm/model_executor/models/chatglm.py +478 -0
  840. vllm/model_executor/models/clip.py +407 -0
  841. vllm/model_executor/models/commandr.py +471 -0
  842. vllm/model_executor/models/config.py +200 -0
  843. vllm/model_executor/models/constant_size_cache.py +137 -0
  844. vllm/model_executor/models/dbrx.py +472 -0
  845. vllm/model_executor/models/deepseek.py +486 -0
  846. vllm/model_executor/models/deepseek_mtp.py +281 -0
  847. vllm/model_executor/models/deepseek_v2.py +935 -0
  848. vllm/model_executor/models/deepseek_vl2.py +660 -0
  849. vllm/model_executor/models/dots1.py +536 -0
  850. vllm/model_executor/models/eagle.py +261 -0
  851. vllm/model_executor/models/ernie45.py +43 -0
  852. vllm/model_executor/models/ernie45_moe.py +583 -0
  853. vllm/model_executor/models/exaone.py +551 -0
  854. vllm/model_executor/models/fairseq2_llama.py +154 -0
  855. vllm/model_executor/models/falcon.py +510 -0
  856. vllm/model_executor/models/falcon_h1.py +708 -0
  857. vllm/model_executor/models/florence2.py +1113 -0
  858. vllm/model_executor/models/fuyu.py +406 -0
  859. vllm/model_executor/models/gemma.py +427 -0
  860. vllm/model_executor/models/gemma2.py +427 -0
  861. vllm/model_executor/models/gemma3.py +535 -0
  862. vllm/model_executor/models/gemma3_mm.py +729 -0
  863. vllm/model_executor/models/gemma3n.py +811 -0
  864. vllm/model_executor/models/glm.py +23 -0
  865. vllm/model_executor/models/glm4.py +305 -0
  866. vllm/model_executor/models/glm4_1v.py +1590 -0
  867. vllm/model_executor/models/glm4v.py +657 -0
  868. vllm/model_executor/models/gpt2.py +382 -0
  869. vllm/model_executor/models/gpt_bigcode.py +335 -0
  870. vllm/model_executor/models/gpt_j.py +339 -0
  871. vllm/model_executor/models/gpt_neox.py +332 -0
  872. vllm/model_executor/models/granite.py +493 -0
  873. vllm/model_executor/models/granite_speech.py +790 -0
  874. vllm/model_executor/models/granitemoe.py +437 -0
  875. vllm/model_executor/models/granitemoehybrid.py +653 -0
  876. vllm/model_executor/models/granitemoeshared.py +341 -0
  877. vllm/model_executor/models/gritlm.py +224 -0
  878. vllm/model_executor/models/grok1.py +546 -0
  879. vllm/model_executor/models/h2ovl.py +549 -0
  880. vllm/model_executor/models/hunyuan_v1_moe.py +897 -0
  881. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  882. vllm/model_executor/models/idefics3.py +786 -0
  883. vllm/model_executor/models/interfaces.py +681 -0
  884. vllm/model_executor/models/interfaces_base.py +164 -0
  885. vllm/model_executor/models/intern_vit.py +480 -0
  886. vllm/model_executor/models/internlm2.py +455 -0
  887. vllm/model_executor/models/internlm2_ve.py +147 -0
  888. vllm/model_executor/models/internvl.py +1432 -0
  889. vllm/model_executor/models/jais.py +373 -0
  890. vllm/model_executor/models/jamba.py +592 -0
  891. vllm/model_executor/models/keye.py +1736 -0
  892. vllm/model_executor/models/kimi_vl.py +585 -0
  893. vllm/model_executor/models/llama.py +644 -0
  894. vllm/model_executor/models/llama4.py +531 -0
  895. vllm/model_executor/models/llama_eagle.py +165 -0
  896. vllm/model_executor/models/llama_eagle3.py +263 -0
  897. vllm/model_executor/models/llava.py +887 -0
  898. vllm/model_executor/models/llava_next.py +604 -0
  899. vllm/model_executor/models/llava_next_video.py +492 -0
  900. vllm/model_executor/models/llava_onevision.py +985 -0
  901. vllm/model_executor/models/mamba.py +273 -0
  902. vllm/model_executor/models/mamba2.py +320 -0
  903. vllm/model_executor/models/mamba_cache.py +76 -0
  904. vllm/model_executor/models/medusa.py +219 -0
  905. vllm/model_executor/models/mimo.py +192 -0
  906. vllm/model_executor/models/mimo_mtp.py +285 -0
  907. vllm/model_executor/models/minicpm.py +592 -0
  908. vllm/model_executor/models/minicpm3.py +230 -0
  909. vllm/model_executor/models/minicpm_eagle.py +391 -0
  910. vllm/model_executor/models/minicpmo.py +772 -0
  911. vllm/model_executor/models/minicpmv.py +1307 -0
  912. vllm/model_executor/models/minimax_cache.py +36 -0
  913. vllm/model_executor/models/minimax_text_01.py +1301 -0
  914. vllm/model_executor/models/minimax_vl_01.py +374 -0
  915. vllm/model_executor/models/mistral3.py +624 -0
  916. vllm/model_executor/models/mixtral.py +488 -0
  917. vllm/model_executor/models/mixtral_quant.py +453 -0
  918. vllm/model_executor/models/mllama.py +1682 -0
  919. vllm/model_executor/models/mllama4.py +947 -0
  920. vllm/model_executor/models/mlp_speculator.py +206 -0
  921. vllm/model_executor/models/modernbert.py +339 -0
  922. vllm/model_executor/models/module_mapping.py +72 -0
  923. vllm/model_executor/models/molmo.py +1576 -0
  924. vllm/model_executor/models/moonvit.py +630 -0
  925. vllm/model_executor/models/mpt.py +331 -0
  926. vllm/model_executor/models/nemotron.py +508 -0
  927. vllm/model_executor/models/nemotron_h.py +588 -0
  928. vllm/model_executor/models/nemotron_nas.py +484 -0
  929. vllm/model_executor/models/nvlm_d.py +216 -0
  930. vllm/model_executor/models/olmo.py +389 -0
  931. vllm/model_executor/models/olmo2.py +414 -0
  932. vllm/model_executor/models/olmoe.py +468 -0
  933. vllm/model_executor/models/opt.py +412 -0
  934. vllm/model_executor/models/orion.py +349 -0
  935. vllm/model_executor/models/ovis.py +577 -0
  936. vllm/model_executor/models/paligemma.py +419 -0
  937. vllm/model_executor/models/persimmon.py +344 -0
  938. vllm/model_executor/models/phi.py +356 -0
  939. vllm/model_executor/models/phi3.py +19 -0
  940. vllm/model_executor/models/phi3_small.py +465 -0
  941. vllm/model_executor/models/phi3v.py +733 -0
  942. vllm/model_executor/models/phi4mm.py +1258 -0
  943. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  944. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  945. vllm/model_executor/models/phimoe.py +674 -0
  946. vllm/model_executor/models/pixtral.py +1329 -0
  947. vllm/model_executor/models/plamo2.py +738 -0
  948. vllm/model_executor/models/prithvi_geospatial_mae.py +240 -0
  949. vllm/model_executor/models/qwen.py +362 -0
  950. vllm/model_executor/models/qwen2.py +501 -0
  951. vllm/model_executor/models/qwen2_5_omni_thinker.py +923 -0
  952. vllm/model_executor/models/qwen2_5_vl.py +1175 -0
  953. vllm/model_executor/models/qwen2_audio.py +420 -0
  954. vllm/model_executor/models/qwen2_moe.py +540 -0
  955. vllm/model_executor/models/qwen2_rm.py +122 -0
  956. vllm/model_executor/models/qwen2_vl.py +1513 -0
  957. vllm/model_executor/models/qwen3.py +325 -0
  958. vllm/model_executor/models/qwen3_moe.py +541 -0
  959. vllm/model_executor/models/qwen_vl.py +796 -0
  960. vllm/model_executor/models/registry.py +634 -0
  961. vllm/model_executor/models/roberta.py +271 -0
  962. vllm/model_executor/models/siglip.py +524 -0
  963. vllm/model_executor/models/skyworkr1v.py +961 -0
  964. vllm/model_executor/models/smolvlm.py +52 -0
  965. vllm/model_executor/models/solar.py +506 -0
  966. vllm/model_executor/models/stablelm.py +343 -0
  967. vllm/model_executor/models/starcoder2.py +356 -0
  968. vllm/model_executor/models/tarsier.py +652 -0
  969. vllm/model_executor/models/telechat2.py +140 -0
  970. vllm/model_executor/models/teleflm.py +79 -0
  971. vllm/model_executor/models/transformers.py +509 -0
  972. vllm/model_executor/models/ultravox.py +670 -0
  973. vllm/model_executor/models/utils.py +744 -0
  974. vllm/model_executor/models/vision.py +147 -0
  975. vllm/model_executor/models/whisper.py +886 -0
  976. vllm/model_executor/models/zamba2.py +1036 -0
  977. vllm/model_executor/parameter.py +459 -0
  978. vllm/model_executor/pooling_metadata.py +72 -0
  979. vllm/model_executor/sampling_metadata.py +597 -0
  980. vllm/model_executor/utils.py +80 -0
  981. vllm/multimodal/__init__.py +33 -0
  982. vllm/multimodal/audio.py +116 -0
  983. vllm/multimodal/base.py +219 -0
  984. vllm/multimodal/hasher.py +91 -0
  985. vllm/multimodal/image.py +103 -0
  986. vllm/multimodal/inputs.py +878 -0
  987. vllm/multimodal/parse.py +499 -0
  988. vllm/multimodal/processing.py +1948 -0
  989. vllm/multimodal/profiling.py +283 -0
  990. vllm/multimodal/registry.py +331 -0
  991. vllm/multimodal/utils.py +492 -0
  992. vllm/multimodal/video.py +227 -0
  993. vllm/outputs.py +516 -0
  994. vllm/platforms/__init__.py +291 -0
  995. vllm/platforms/cpu.py +281 -0
  996. vllm/platforms/cuda.py +568 -0
  997. vllm/platforms/hpu.py +106 -0
  998. vllm/platforms/interface.py +551 -0
  999. vllm/platforms/neuron.py +150 -0
  1000. vllm/platforms/rocm.py +453 -0
  1001. vllm/platforms/tpu.py +206 -0
  1002. vllm/platforms/xpu.py +192 -0
  1003. vllm/plugins/__init__.py +94 -0
  1004. vllm/plugins/lora_resolvers/README.md +15 -0
  1005. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1006. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1007. vllm/pooling_params.py +64 -0
  1008. vllm/profiler/__init__.py +0 -0
  1009. vllm/profiler/layerwise_profile.py +375 -0
  1010. vllm/profiler/utils.py +148 -0
  1011. vllm/prompt_adapter/__init__.py +0 -0
  1012. vllm/prompt_adapter/layers.py +83 -0
  1013. vllm/prompt_adapter/models.py +358 -0
  1014. vllm/prompt_adapter/request.py +37 -0
  1015. vllm/prompt_adapter/utils.py +98 -0
  1016. vllm/prompt_adapter/worker_manager.py +179 -0
  1017. vllm/py.typed +2 -0
  1018. vllm/reasoning/__init__.py +15 -0
  1019. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  1020. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1021. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1022. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1023. vllm/sampling_params.py +602 -0
  1024. vllm/scalar_type.py +347 -0
  1025. vllm/scripts.py +15 -0
  1026. vllm/sequence.py +1568 -0
  1027. vllm/spec_decode/__init__.py +0 -0
  1028. vllm/spec_decode/batch_expansion.py +506 -0
  1029. vllm/spec_decode/draft_model_runner.py +349 -0
  1030. vllm/spec_decode/interfaces.py +99 -0
  1031. vllm/spec_decode/medusa_worker.py +138 -0
  1032. vllm/spec_decode/metrics.py +213 -0
  1033. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1034. vllm/spec_decode/mqa_scorer.py +160 -0
  1035. vllm/spec_decode/multi_step_worker.py +423 -0
  1036. vllm/spec_decode/ngram_worker.py +196 -0
  1037. vllm/spec_decode/proposer_worker_base.py +59 -0
  1038. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1039. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1040. vllm/spec_decode/target_model_runner.py +45 -0
  1041. vllm/spec_decode/top1_proposer.py +275 -0
  1042. vllm/spec_decode/util.py +277 -0
  1043. vllm/test_utils.py +130 -0
  1044. vllm/third_party/__init__.py +0 -0
  1045. vllm/third_party/pynvml.py +6140 -0
  1046. vllm/tracing.py +131 -0
  1047. vllm/transformers_utils/__init__.py +24 -0
  1048. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1049. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1050. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1051. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1052. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1053. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1054. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1055. vllm/transformers_utils/config.py +922 -0
  1056. vllm/transformers_utils/configs/__init__.py +57 -0
  1057. vllm/transformers_utils/configs/arctic.py +207 -0
  1058. vllm/transformers_utils/configs/chatglm.py +72 -0
  1059. vllm/transformers_utils/configs/cohere2.py +195 -0
  1060. vllm/transformers_utils/configs/dbrx.py +280 -0
  1061. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1062. vllm/transformers_utils/configs/eagle.py +85 -0
  1063. vllm/transformers_utils/configs/exaone.py +190 -0
  1064. vllm/transformers_utils/configs/falcon.py +90 -0
  1065. vllm/transformers_utils/configs/jais.py +238 -0
  1066. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1067. vllm/transformers_utils/configs/medusa.py +63 -0
  1068. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1069. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1070. vllm/transformers_utils/configs/mllama.py +31 -0
  1071. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1072. vllm/transformers_utils/configs/moonvit.py +33 -0
  1073. vllm/transformers_utils/configs/mpt.py +180 -0
  1074. vllm/transformers_utils/configs/nemotron.py +205 -0
  1075. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1076. vllm/transformers_utils/configs/nvlm_d.py +31 -0
  1077. vllm/transformers_utils/configs/ovis.py +184 -0
  1078. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1079. vllm/transformers_utils/configs/solar.py +247 -0
  1080. vllm/transformers_utils/configs/telechat2.py +64 -0
  1081. vllm/transformers_utils/configs/ultravox.py +108 -0
  1082. vllm/transformers_utils/detokenizer.py +168 -0
  1083. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1084. vllm/transformers_utils/processor.py +221 -0
  1085. vllm/transformers_utils/processors/__init__.py +8 -0
  1086. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1087. vllm/transformers_utils/processors/ovis.py +420 -0
  1088. vllm/transformers_utils/s3_utils.py +162 -0
  1089. vllm/transformers_utils/tokenizer.py +302 -0
  1090. vllm/transformers_utils/tokenizer_base.py +149 -0
  1091. vllm/transformers_utils/tokenizer_group.py +120 -0
  1092. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1093. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1094. vllm/transformers_utils/utils.py +99 -0
  1095. vllm/triton_utils/__init__.py +14 -0
  1096. vllm/triton_utils/importing.py +94 -0
  1097. vllm/usage/__init__.py +0 -0
  1098. vllm/usage/usage_lib.py +259 -0
  1099. vllm/utils/__init__.py +3008 -0
  1100. vllm/v1/__init__.py +0 -0
  1101. vllm/v1/attention/__init__.py +0 -0
  1102. vllm/v1/attention/backends/__init__.py +0 -0
  1103. vllm/v1/attention/backends/cpu_attn.py +184 -0
  1104. vllm/v1/attention/backends/flash_attn.py +757 -0
  1105. vllm/v1/attention/backends/flashinfer.py +680 -0
  1106. vllm/v1/attention/backends/flex_attention.py +491 -0
  1107. vllm/v1/attention/backends/mamba_attn.py +192 -0
  1108. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1109. vllm/v1/attention/backends/mla/common.py +978 -0
  1110. vllm/v1/attention/backends/mla/cutlass_mla.py +98 -0
  1111. vllm/v1/attention/backends/mla/flashmla.py +180 -0
  1112. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +241 -0
  1113. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1114. vllm/v1/attention/backends/pallas.py +320 -0
  1115. vllm/v1/attention/backends/rocm_aiter_fa.py +609 -0
  1116. vllm/v1/attention/backends/triton_attn.py +449 -0
  1117. vllm/v1/attention/backends/utils.py +310 -0
  1118. vllm/v1/core/__init__.py +0 -0
  1119. vllm/v1/core/block_pool.py +349 -0
  1120. vllm/v1/core/encoder_cache_manager.py +254 -0
  1121. vllm/v1/core/kv_cache_coordinator.py +369 -0
  1122. vllm/v1/core/kv_cache_manager.py +398 -0
  1123. vllm/v1/core/kv_cache_utils.py +999 -0
  1124. vllm/v1/core/sched/__init__.py +0 -0
  1125. vllm/v1/core/sched/interface.py +150 -0
  1126. vllm/v1/core/sched/output.py +157 -0
  1127. vllm/v1/core/sched/request_queue.py +224 -0
  1128. vllm/v1/core/sched/scheduler.py +1115 -0
  1129. vllm/v1/core/sched/utils.py +36 -0
  1130. vllm/v1/core/single_type_kv_cache_manager.py +444 -0
  1131. vllm/v1/engine/__init__.py +179 -0
  1132. vllm/v1/engine/async_llm.py +626 -0
  1133. vllm/v1/engine/coordinator.py +278 -0
  1134. vllm/v1/engine/core.py +1046 -0
  1135. vllm/v1/engine/core_client.py +1049 -0
  1136. vllm/v1/engine/detokenizer.py +292 -0
  1137. vllm/v1/engine/exceptions.py +17 -0
  1138. vllm/v1/engine/llm_engine.py +322 -0
  1139. vllm/v1/engine/logprobs.py +200 -0
  1140. vllm/v1/engine/mm_input_cache.py +91 -0
  1141. vllm/v1/engine/output_processor.py +477 -0
  1142. vllm/v1/engine/parallel_sampling.py +133 -0
  1143. vllm/v1/engine/processor.py +422 -0
  1144. vllm/v1/engine/utils.py +546 -0
  1145. vllm/v1/executor/__init__.py +0 -0
  1146. vllm/v1/executor/abstract.py +113 -0
  1147. vllm/v1/executor/multiproc_executor.py +532 -0
  1148. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1149. vllm/v1/kv_cache_interface.py +223 -0
  1150. vllm/v1/metrics/__init__.py +0 -0
  1151. vllm/v1/metrics/loggers.py +557 -0
  1152. vllm/v1/metrics/prometheus.py +82 -0
  1153. vllm/v1/metrics/ray_wrappers.py +131 -0
  1154. vllm/v1/metrics/reader.py +246 -0
  1155. vllm/v1/metrics/stats.py +240 -0
  1156. vllm/v1/outputs.py +124 -0
  1157. vllm/v1/pool/__init__.py +0 -0
  1158. vllm/v1/pool/metadata.py +17 -0
  1159. vllm/v1/request.py +229 -0
  1160. vllm/v1/sample/__init__.py +0 -0
  1161. vllm/v1/sample/logits_processor.py +517 -0
  1162. vllm/v1/sample/metadata.py +43 -0
  1163. vllm/v1/sample/ops/__init__.py +0 -0
  1164. vllm/v1/sample/ops/bad_words.py +39 -0
  1165. vllm/v1/sample/ops/penalties.py +43 -0
  1166. vllm/v1/sample/ops/topk_topp_sampler.py +296 -0
  1167. vllm/v1/sample/rejection_sampler.py +631 -0
  1168. vllm/v1/sample/sampler.py +226 -0
  1169. vllm/v1/sample/tpu/__init__.py +0 -0
  1170. vllm/v1/sample/tpu/metadata.py +124 -0
  1171. vllm/v1/sample/tpu/sampler.py +145 -0
  1172. vllm/v1/serial_utils.py +315 -0
  1173. vllm/v1/spec_decode/__init__.py +0 -0
  1174. vllm/v1/spec_decode/eagle.py +441 -0
  1175. vllm/v1/spec_decode/medusa.py +64 -0
  1176. vllm/v1/spec_decode/metadata.py +62 -0
  1177. vllm/v1/spec_decode/metrics.py +178 -0
  1178. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1179. vllm/v1/spec_decode/utils.py +41 -0
  1180. vllm/v1/structured_output/__init__.py +227 -0
  1181. vllm/v1/structured_output/backend_guidance.py +245 -0
  1182. vllm/v1/structured_output/backend_types.py +134 -0
  1183. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1184. vllm/v1/structured_output/request.py +86 -0
  1185. vllm/v1/structured_output/utils.py +175 -0
  1186. vllm/v1/utils.py +377 -0
  1187. vllm/v1/worker/__init__.py +0 -0
  1188. vllm/v1/worker/block_table.py +142 -0
  1189. vllm/v1/worker/cpu_model_runner.py +91 -0
  1190. vllm/v1/worker/cpu_worker.py +153 -0
  1191. vllm/v1/worker/gpu_input_batch.py +757 -0
  1192. vllm/v1/worker/gpu_model_runner.py +2739 -0
  1193. vllm/v1/worker/gpu_worker.py +408 -0
  1194. vllm/v1/worker/lora_model_runner_mixin.py +177 -0
  1195. vllm/v1/worker/tpu_input_batch.py +585 -0
  1196. vllm/v1/worker/tpu_model_runner.py +1849 -0
  1197. vllm/v1/worker/tpu_worker.py +315 -0
  1198. vllm/v1/worker/utils.py +112 -0
  1199. vllm/v1/worker/worker_base.py +65 -0
  1200. vllm/v1/worker/xpu_model_runner.py +33 -0
  1201. vllm/v1/worker/xpu_worker.py +165 -0
  1202. vllm/version.py +41 -0
  1203. vllm/vllm_flash_attn/.gitkeep +0 -0
  1204. vllm/worker/__init__.py +0 -0
  1205. vllm/worker/cache_engine.py +145 -0
  1206. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1207. vllm/worker/cpu_model_runner.py +671 -0
  1208. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1209. vllm/worker/cpu_worker.py +452 -0
  1210. vllm/worker/enc_dec_model_runner.py +555 -0
  1211. vllm/worker/hpu_model_runner.py +2320 -0
  1212. vllm/worker/hpu_worker.py +484 -0
  1213. vllm/worker/model_runner.py +2178 -0
  1214. vllm/worker/model_runner_base.py +282 -0
  1215. vllm/worker/multi_step_hpu_worker.py +123 -0
  1216. vllm/worker/multi_step_model_runner.py +911 -0
  1217. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1218. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1219. vllm/worker/multi_step_tpu_worker.py +108 -0
  1220. vllm/worker/multi_step_worker.py +197 -0
  1221. vllm/worker/neuron_model_runner.py +460 -0
  1222. vllm/worker/neuron_worker.py +193 -0
  1223. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1224. vllm/worker/pooling_model_runner.py +211 -0
  1225. vllm/worker/tpu_model_runner.py +909 -0
  1226. vllm/worker/tpu_worker.py +337 -0
  1227. vllm/worker/utils.py +53 -0
  1228. vllm/worker/worker.py +577 -0
  1229. vllm/worker/worker_base.py +646 -0
  1230. vllm/worker/xpu_model_runner.py +606 -0
  1231. vllm/worker/xpu_worker.py +186 -0
  1232. vllm_cpu-0.9.2.post2.dist-info/METADATA +339 -0
  1233. vllm_cpu-0.9.2.post2.dist-info/RECORD +1236 -0
  1234. vllm_cpu-0.9.2.post2.dist-info/WHEEL +5 -0
  1235. vllm_cpu-0.9.2.post2.dist-info/entry_points.txt +5 -0
  1236. vllm_cpu-0.9.2.post2.dist-info/top_level.txt +1 -0
vllm/v1/worker/gpu_model_runner.py
@@ -0,0 +1,2739 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import copy
5
+ import gc
6
+ import time
7
+ import weakref
8
+ from contextlib import contextmanager
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ import torch.distributed
14
+ import torch.nn as nn
15
+ from tqdm import tqdm
16
+
17
+ import vllm.envs as envs
18
+ from vllm.attention import AttentionType, get_attn_backend
19
+ from vllm.attention.backends.abstract import AttentionBackend
20
+ from vllm.attention.layer import Attention
21
+ from vllm.compilation.counter import compilation_counter
22
+ from vllm.config import (CompilationLevel, VllmConfig,
23
+ get_layers_from_vllm_config)
24
+ from vllm.distributed.eplb.eplb_state import EplbState
25
+ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
26
+ has_kv_transfer_group)
27
+ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
28
+ from vllm.distributed.parallel_state import (
29
+ get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
30
+ prepare_communication_buffer_for_model)
31
+ from vllm.forward_context import (DPMetadata, get_forward_context,
32
+ set_forward_context)
33
+ from vllm.logger import init_logger
34
+ from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
35
+ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
36
+ from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
37
+ from vllm.model_executor.models.interfaces import (has_step_pooler,
38
+ is_mixture_of_experts)
39
+ from vllm.multimodal import MULTIMODAL_REGISTRY
40
+ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
41
+ from vllm.multimodal.utils import group_mm_inputs_by_modality
42
+ from vllm.pooling_params import PoolingParams
43
+ from vllm.sampling_params import SamplingType
44
+ from vllm.sequence import IntermediateTensors
45
+ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
46
+ GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
47
+ check_use_alibi, get_dtype_size,
48
+ is_pin_memory_available, round_up)
49
+ from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend
50
+ from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
51
+ CommonAttentionMetadata)
52
+ from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
53
+ from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
54
+ KVCacheConfig, KVCacheSpec, MambaSpec,
55
+ SlidingWindowSpec)
56
+ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
57
+ ModelRunnerOutput)
58
+ from vllm.v1.pool.metadata import PoolingMetadata
59
+ from vllm.v1.sample.metadata import SamplingMetadata
60
+ from vllm.v1.sample.rejection_sampler import RejectionSampler
61
+ from vllm.v1.sample.sampler import Sampler
62
+ from vllm.v1.spec_decode.eagle import EagleProposer
63
+ from vllm.v1.spec_decode.medusa import MedusaProposer
64
+ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
65
+ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
66
+ from vllm.v1.utils import bind_kv_cache
67
+ from vllm.v1.worker.block_table import BlockTable
68
+ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
69
+ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
70
+
71
+ from ..sample.logits_processor import LogitsProcessorManager
72
+ from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,
73
+ sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
74
+
75
+ if TYPE_CHECKING:
76
+ import xgrammar as xgr
77
+ import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501
78
+
79
+ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
80
+ from vllm.v1.core.sched.output import SchedulerOutput
81
+ else:
82
+ xgr = LazyLoader("xgr", globals(), "xgrammar")
83
+ xgr_torch_compile = LazyLoader(
84
+ "xgr_torch_compile", globals(),
85
+ "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")
86
+
87
+ logger = init_logger(__name__)
88
+
89
+
90
+ class GPUModelRunner(LoRAModelRunnerMixin):
91
+
92
+ def __init__(
93
+ self,
94
+ vllm_config: VllmConfig,
95
+ device: torch.device,
96
+ ):
97
+ self.vllm_config = vllm_config
98
+ self.model_config = vllm_config.model_config
99
+ self.cache_config = vllm_config.cache_config
100
+ self.compilation_config = vllm_config.compilation_config
101
+ self.lora_config = vllm_config.lora_config
102
+ self.load_config = vllm_config.load_config
103
+ self.parallel_config = vllm_config.parallel_config
104
+ self.scheduler_config = vllm_config.scheduler_config
105
+ self.speculative_config = vllm_config.speculative_config
106
+ self.prompt_adapter_config = vllm_config.prompt_adapter_config
107
+ self.observability_config = vllm_config.observability_config
108
+
109
+ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
110
+ set_cpu_offload_max_bytes(
111
+ int(self.cache_config.cpu_offload_gb * 1024**3))
112
+
113
+ model_config = self.model_config
114
+ cache_config = self.cache_config
115
+ scheduler_config = self.scheduler_config
116
+ parallel_config = self.parallel_config
117
+ self.device = device
118
+ self.pin_memory = is_pin_memory_available()
119
+ self.dtype = self.model_config.dtype
120
+ if cache_config.cache_dtype == "auto":
121
+ self.kv_cache_dtype = self.dtype
122
+ else:
123
+ self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
124
+ cache_config.cache_dtype]
125
+
126
+ self.is_multimodal_model = model_config.is_multimodal_model
127
+ self.is_pooling_model = model_config.pooler_config is not None
128
+ self.max_model_len = model_config.max_model_len
129
+ self.max_num_tokens = scheduler_config.max_num_batched_tokens
130
+ self.max_num_reqs = scheduler_config.max_num_seqs
131
+
132
+ # Model-related.
133
+ self.num_query_heads = model_config.get_num_attention_heads(
134
+ parallel_config)
135
+ self.hidden_size = model_config.get_hidden_size()
136
+ self.attention_chunk_size = model_config.attention_chunk_size
137
+
138
+ self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
139
+
140
+ # Multi-modal data support
141
+ self.mm_registry = MULTIMODAL_REGISTRY
142
+ self.uses_mrope = model_config.uses_mrope
143
+
144
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
145
+ model_config=model_config,
146
+ scheduler_config=scheduler_config,
147
+ mm_registry=self.mm_registry,
148
+ )
149
+ self.max_num_encoder_input_tokens = encoder_compute_budget
150
+ self.encoder_cache_size = encoder_cache_size
151
+
152
+ # Sampler
153
+ self.sampler = Sampler()
154
+
155
+ self.eplb_state: Optional[EplbState] = None
156
+ """
157
+ State of the expert parallelism load balancer.
158
+
159
+ Will be lazily initialized when the model is loaded.
160
+ """
161
+
162
+ # Lazy initializations
163
+ # self.model: nn.Module # Set after load_model
164
+ # Initialize in initialize_kv_cache
165
+ self.kv_caches: list[torch.Tensor] = []
166
+ self.attn_metadata_builders: list[AttentionMetadataBuilder] = []
167
+ self.attn_backends: list[type[AttentionBackend]] = []
168
+ # self.kv_cache_config: KVCacheConfig
169
+
170
+ # req_id -> (input_id -> encoder_output)
171
+ self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
172
+
173
+ self.use_aux_hidden_state_outputs = False
174
+ # Set up speculative decoding.
175
+ # NOTE(Jiayi): currently we put the entire draft model on
176
+ # the last PP rank. This is not ideal if there are many
177
+ # layers in the draft model.
178
+ if self.speculative_config and get_pp_group().is_last_rank:
179
+ if self.speculative_config.method == "ngram":
180
+ self.drafter = NgramProposer(self.vllm_config)
181
+ elif self.speculative_config.use_eagle():
182
+ self.drafter = EagleProposer(self.vllm_config, self.device,
183
+ self) # type: ignore
184
+ if self.speculative_config.method == "eagle3":
185
+ self.use_aux_hidden_state_outputs = True
186
+ elif self.speculative_config.method == "medusa":
187
+ self.drafter = MedusaProposer(
188
+ vllm_config=self.vllm_config,
189
+ device=self.device) # type: ignore
190
+ else:
191
+ raise ValueError("Unknown speculative decoding method: "
192
+ f"{self.speculative_config.method}")
193
+ self.rejection_sampler = RejectionSampler()
194
+
195
+ # Request states.
196
+ self.requests: dict[str, CachedRequestState] = {}
197
+
198
+ # Input Batch
199
+ # NOTE(Chen): Ideally, we should initialize the input batch inside
200
+ # `initialize_kv_cache` based on the kv cache config. However, as in
201
+ # https://github.com/vllm-project/vllm/pull/18298, due to some unknown
202
+ # reasons, we have to initialize the input batch before `load_model`,
203
+ # quantization + weight offloading will fail otherwise. As a temporary
204
+ # solution, we initialize the input batch here, and re-initialize it
205
+ # in `initialize_kv_cache` if the block_sizes here is different from
206
+ # the block_sizes in the kv cache config.
207
+ self.input_batch = InputBatch(
208
+ max_num_reqs=self.max_num_reqs,
209
+ max_model_len=self.max_model_len,
210
+ max_num_batched_tokens=self.max_num_tokens,
211
+ device=self.device,
212
+ pin_memory=self.pin_memory,
213
+ vocab_size=self.model_config.get_vocab_size(),
214
+ block_sizes=[self.cache_config.block_size],
215
+ is_spec_decode=bool(self.vllm_config.speculative_config),
216
+ )
217
+
218
+ self.use_cuda_graph = (
219
+ self.vllm_config.compilation_config.level
220
+ == CompilationLevel.PIECEWISE
221
+ and self.vllm_config.compilation_config.use_cudagraph
222
+ and not self.model_config.enforce_eager)
223
+ # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
224
+ # The convention is different.
225
+ # self.cudagraph_batch_sizes sorts in ascending order.
226
+ # The batch sizes in the config are in descending order.
227
+ self.cudagraph_batch_sizes = list(
228
+ reversed(self.compilation_config.cudagraph_capture_sizes))
229
+
230
+ self.full_cuda_graph = self.compilation_config.full_cuda_graph
231
+
232
+ # Cache the device properties.
233
+ self._init_device_properties()
234
+
235
+ # Persistent buffers for CUDA graphs.
236
+ self.input_ids = torch.zeros(self.max_num_tokens,
237
+ dtype=torch.int32,
238
+ device=self.device)
239
+ self.positions = torch.zeros(self.max_num_tokens,
240
+ dtype=torch.int64,
241
+ device=self.device)
242
+ self.query_start_loc = torch.zeros(self.max_num_reqs + 1,
243
+ dtype=torch.int32,
244
+ device=self.device)
245
+ self.seq_lens = torch.zeros(self.max_num_reqs,
246
+ dtype=torch.int32,
247
+ device=self.device)
248
+ self.slot_mapping = torch.zeros(self.max_num_tokens,
249
+ dtype=torch.int64,
250
+ device=self.device)
251
+
252
+ # None in the first PP rank. The rest are set after load_model.
253
+ self.intermediate_tensors: Optional[IntermediateTensors] = None
254
+
255
+ # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
256
+ if self.uses_mrope:
257
+ # NOTE: `mrope_positions` is implemented with one additional dummy
258
+ # position on purpose to make it non-contiguous so that it can work
259
+ # with torch compile.
260
+ # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
261
+
262
+ # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
263
+ # the modality of inputs. For text-only inputs, each dimension has
264
+ # identical position IDs, making M-RoPE functionally equivalent to
265
+ # 1D-RoPE.
266
+ # See page 5 of https://arxiv.org/abs/2409.12191
267
+ self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1),
268
+ dtype=torch.int64,
269
+ device=self.device)
270
+ self.mrope_positions_cpu = torch.zeros(
271
+ (3, self.max_num_tokens + 1),
272
+ dtype=torch.int64,
273
+ device="cpu",
274
+ pin_memory=self.pin_memory)
275
+ self.mrope_positions_np = self.mrope_positions_cpu.numpy()
276
+
277
+ # Only relevant for models using ALiBi (e.g, MPT)
278
+ self.use_alibi = check_use_alibi(model_config)
279
+
280
+ self.inputs_embeds = torch.zeros(
281
+ (self.max_num_tokens, self.hidden_size),
282
+ dtype=self.dtype,
283
+ device=self.device)
284
+
285
+ # OPTIMIZATION: Cache the tensors rather than creating them every step.
286
+ # Keep in int64 to avoid overflow with long context
287
+ self.arange_np = np.arange(max(self.max_num_reqs + 1,
288
+ self.max_model_len,
289
+ self.max_num_tokens),
290
+ dtype=np.int64)
291
+ # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
292
+ # a faster version of creating a new tensor every time. Thus, we should
293
+ # not make any assumptions about the values in these tensors.
294
+ self.input_ids_cpu = torch.zeros(self.max_num_tokens,
295
+ dtype=torch.int32,
296
+ device="cpu",
297
+ pin_memory=self.pin_memory)
298
+ self.positions_cpu = torch.zeros(self.max_num_tokens,
299
+ dtype=torch.int64,
300
+ device="cpu",
301
+ pin_memory=self.pin_memory)
302
+ self.positions_np = self.positions_cpu.numpy()
303
+ self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1,
304
+ dtype=torch.int32,
305
+ device="cpu",
306
+ pin_memory=self.pin_memory)
307
+ self.query_start_loc_np = self.query_start_loc_cpu.numpy()
308
+ self.seq_lens_cpu = torch.zeros(self.max_num_reqs,
309
+ dtype=torch.int32,
310
+ device="cpu",
311
+ pin_memory=self.pin_memory)
312
+ self.seq_lens_np = self.seq_lens_cpu.numpy()
313
+
314
+ # Layer pairings for cross-layer KV sharing.
315
+ # If an Attention layer `layer_name` is in the keys of this dict, it
316
+ # means this layer will perform attention using the keys and values
317
+ # from the KV cache of `shared_kv_cache_layers[layer_name]`.
318
+ self.shared_kv_cache_layers: dict[str, str] = {}
319
+
320
+ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
321
+ """
322
+ Update the order of requests in the batch based on the attention
323
+ backend's needs. For example, some attention backends (namely MLA) may
324
+ want to separate requests based on whether the attention computation will be
325
+ compute-bound or memory-bound.
326
+
327
+ Args:
328
+ scheduler_output: The scheduler output.
329
+ """
330
+ self.attn_metadata_builders[0].reorder_batch(self.input_batch,
331
+ scheduler_output)
332
+
333
+ # For models with multiple KV cache groups, the groups should agree on
334
+ # the same order of requests. We ensure this by only allowing the first
335
+ # group to reorder the batch and asserting that all other groups do not
336
+ # reorder the batch.
337
+ # TODO(tdoublep): make this more flexible so that any group can
338
+ # re-order the batch (not only the first).
339
+ # TODO(tdoublep): verify this during engine init instead of at runtime
340
+ for i in range(1, len(self.kv_cache_config.kv_cache_groups)):
341
+ batch_reordered = self.attn_metadata_builders[i].reorder_batch(
342
+ self.input_batch, scheduler_output)
343
+ assert not batch_reordered
344
+
345
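# Hypothetical toy version of the reordering described in the docstring above:
# group "decode-like" requests (one scheduled token, memory-bound) ahead of
# "prefill-like" ones (many tokens, compute-bound). The real policy is owned by
# the attention metadata builders (e.g. MLA) and may differ; this is only an
# illustration with made-up names.
def toy_reorder(req_ids: list[str], num_scheduled: dict[str, int]) -> list[str]:
    decode = [r for r in req_ids if num_scheduled[r] == 1]
    prefill = [r for r in req_ids if num_scheduled[r] > 1]
    return decode + prefill

assert toy_reorder(["a", "b", "c"], {"a": 17, "b": 1, "c": 1}) == ["b", "c", "a"]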
+ # Note: used for model runner override.
346
+ def _init_device_properties(self) -> None:
347
+ """Initialize attributes from torch.cuda.get_device_properties
348
+ """
349
+ self.device_properties = torch.cuda.get_device_properties(self.device)
350
+ self.num_sms = self.device_properties.multi_processor_count
351
+
352
+ # Note: used for model runner override.
353
+ def _sync_device(self) -> None:
354
+ torch.cuda.synchronize()
355
+
356
+ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
357
+ """Update the cached states and the persistent batch with the scheduler
358
+ output.
359
+
360
+ The updated states are used by the `_prepare_inputs` function to create
361
+ the input GPU tensors for the model.
362
+
363
+ The SamplingMetadata is updated and copied to the GPU if there is a
364
+ new/resumed/paused/finished request in the batch.
365
+ """
366
+ # Remove finished requests from the cached states.
367
+ for req_id in scheduler_output.finished_req_ids:
368
+ self.requests.pop(req_id, None)
369
+ self.encoder_cache.pop(req_id, None)
370
+ # Remove the finished requests from the persistent batch.
371
+ # NOTE(woosuk): There could be an edge case where finished_req_ids and
372
+ # scheduled_req_ids overlap. This happens when a request is aborted and
373
+ # then resubmitted with the same ID. In this case, we treat them as two
374
+ # distinct requests - clearing the cached states for the first request
375
+ # and handling the second as a new request.
376
+ for req_id in scheduler_output.finished_req_ids:
377
+ self.input_batch.remove_request(req_id)
378
+
379
+ # Free the cached encoder outputs.
380
+ for req_id, input_id in scheduler_output.free_encoder_input_ids:
381
+ encoder_outputs = self.encoder_cache.get(req_id)
382
+ if encoder_outputs is not None:
383
+ encoder_outputs.pop(input_id, None)
384
+ if not encoder_outputs:
385
+ self.encoder_cache.pop(req_id, None)
386
+
387
+ # Remove the unscheduled requests from the persistent batch.
388
+ # NOTE(woosuk): The unscheduled requests are either preempted requests
389
+ # or running requests that are not scheduled in this step. We remove
390
+ # them from the persistent batch but keep their cached states since
391
+ # they will be scheduled again sometime in the future.
392
+ scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
393
+ cached_req_ids = self.input_batch.req_id_to_index.keys()
394
+ unscheduled_req_ids = cached_req_ids - scheduled_req_ids
395
+ # NOTE(woosuk): The persistent batch optimization assumes that
396
+ # consecutive batches contain mostly the same requests. If batches
397
+ # have low request overlap (e.g., alternating between two distinct
398
+ # sets of requests), this optimization becomes very inefficient.
399
+ for req_id in unscheduled_req_ids:
400
+ self.input_batch.remove_request(req_id)
401
+
402
+ req_ids_to_add: list[str] = []
403
+ # Add new requests to the cached states.
404
+ for new_req_data in scheduler_output.scheduled_new_reqs:
405
+ req_id = new_req_data.req_id
406
+ sampling_params = new_req_data.sampling_params
407
+ pooling_params = new_req_data.pooling_params
408
+ if sampling_params and \
409
+ sampling_params.sampling_type == SamplingType.RANDOM_SEED:
410
+ generator = torch.Generator(device=self.device)
411
+ generator.manual_seed(sampling_params.seed)
412
+ else:
413
+ generator = None
414
+
415
+ self.requests[req_id] = CachedRequestState(
416
+ req_id=req_id,
417
+ prompt_token_ids=new_req_data.prompt_token_ids,
418
+ mm_inputs=new_req_data.mm_inputs,
419
+ mm_positions=new_req_data.mm_positions,
420
+ sampling_params=sampling_params,
421
+ pooling_params=pooling_params,
422
+ generator=generator,
423
+ block_ids=new_req_data.block_ids,
424
+ num_computed_tokens=new_req_data.num_computed_tokens,
425
+ output_token_ids=[],
426
+ lora_request=new_req_data.lora_request,
427
+ )
428
+
429
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
430
+ if self.uses_mrope:
431
+ image_grid_thw = []
432
+ video_grid_thw = []
433
+ second_per_grid_ts = []
434
+ audio_feature_lengths = []
435
+ use_audio_in_video = False
436
+ for mm_input in self.requests[req_id].mm_inputs:
437
+ if mm_input.get("image_grid_thw") is not None:
438
+ image_grid_thw.extend(
439
+ mm_input["image_grid_thw"].tolist())
440
+ if mm_input.get("video_grid_thw") is not None:
441
+ video_grid_thw.extend(
442
+ mm_input["video_grid_thw"].tolist())
443
+ if mm_input.get("second_per_grid_ts") is not None:
444
+ second_per_grid_ts.extend(
445
+ mm_input["second_per_grid_ts"])
446
+ if mm_input.get("audio_feature_lengths") is not None:
447
+ audio_feature_lengths.extend(
448
+ mm_input["audio_feature_lengths"])
449
+ if mm_input.get("use_audio_in_video") is True:
450
+ use_audio_in_video = True
451
+
452
+ hf_config = self.model_config.hf_config
453
+
454
+ self.requests[req_id].mrope_positions, \
455
+ self.requests[req_id].mrope_position_delta = \
456
+ MRotaryEmbedding.get_input_positions_tensor(
457
+ self.requests[req_id].prompt_token_ids,
458
+ hf_config=hf_config,
459
+ image_grid_thw=image_grid_thw,
460
+ video_grid_thw=video_grid_thw,
461
+ second_per_grid_ts=second_per_grid_ts,
462
+ audio_feature_lengths=audio_feature_lengths,
463
+ use_audio_in_video=use_audio_in_video,
464
+ )
465
+
466
+ req_ids_to_add.append(req_id)
467
+
468
+ # Update the states of the running/resumed requests.
469
+ is_last_rank = get_pp_group().is_last_rank
470
+ req_data = scheduler_output.scheduled_cached_reqs
471
+ for i, req_id in enumerate(req_data.req_ids):
472
+ req_state = self.requests[req_id]
473
+ num_computed_tokens = req_data.num_computed_tokens[i]
474
+ new_block_ids = req_data.new_block_ids[i]
475
+ resumed_from_preemption = req_data.resumed_from_preemption[i]
476
+
477
+ # Update the cached states.
478
+ req_state.num_computed_tokens = num_computed_tokens
479
+
480
+ if not is_last_rank:
481
+ # When using PP, the scheduler sends the sampled tokens back,
482
+ # because there's no direct communication between the first-
483
+ # stage worker and the last-stage worker.
484
+ new_token_ids = req_data.new_token_ids[i]
485
+ # Add the sampled token(s) from the previous step (if any).
486
+ # This doesn't include "unverified" tokens like spec tokens.
487
+ num_new_tokens = (num_computed_tokens + len(new_token_ids) -
488
+ req_state.num_tokens)
489
+ if num_new_tokens == 1:
490
+ # Avoid slicing list in most common case.
491
+ req_state.output_token_ids.append(new_token_ids[-1])
492
+ elif num_new_tokens > 0:
493
+ req_state.output_token_ids.extend(
494
+ new_token_ids[-num_new_tokens:])
495
+
496
+ # Update the block IDs.
497
+ if not resumed_from_preemption:
498
+ # Append the new blocks to the existing block IDs.
499
+ for block_ids, new_ids in zip(req_state.block_ids,
500
+ new_block_ids):
501
+ block_ids.extend(new_ids)
502
+ else:
503
+ # The request is resumed from preemption.
504
+ # Replace the existing block IDs with the new ones.
505
+ req_state.block_ids = new_block_ids
506
+
507
+ req_index = self.input_batch.req_id_to_index.get(req_id)
508
+ if req_index is None:
509
+ # The request is not in the persistent batch.
510
+ # The request was either preempted and resumed later, or was not
511
+ # scheduled in the previous step and needs to be added again.
512
+ req_ids_to_add.append(req_id)
513
+ continue
514
+
515
+ # Update the persistent batch.
516
+ self.input_batch.num_computed_tokens_cpu[req_index] = (
517
+ num_computed_tokens)
518
+ self.input_batch.block_table.append_row(new_block_ids, req_index)
519
+
520
+ # For the last rank, we don't need to update the token_ids_cpu
521
+ # because the sampled tokens are already cached.
522
+ if not is_last_rank:
523
+ # Add new_token_ids to token_ids_cpu.
524
+ start_token_index = num_computed_tokens
525
+ end_token_index = num_computed_tokens + len(new_token_ids)
526
+ self.input_batch.token_ids_cpu[
527
+ req_index,
528
+ start_token_index:end_token_index] = new_token_ids
529
+ self.input_batch.num_tokens_no_spec[
530
+ req_index] = end_token_index
531
+ self.input_batch.num_tokens[req_index] = end_token_index
532
+
533
+ # Add spec_token_ids to token_ids_cpu.
534
+ spec_token_ids = (
535
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
536
+ if spec_token_ids:
537
+ num_spec_tokens = len(spec_token_ids)
538
+ start_index = self.input_batch.num_tokens_no_spec[req_index]
539
+ end_token_index = start_index + num_spec_tokens
540
+ self.input_batch.token_ids_cpu[
541
+ req_index, start_index:end_token_index] = spec_token_ids
542
+ # NOTE(woosuk): `num_tokens` here may include spec tokens.
543
+ self.input_batch.num_tokens[req_index] += num_spec_tokens
544
+
545
+ # Add the new or resumed requests to the persistent batch.
546
+ # The smaller empty indices are filled first.
547
+ for req_id in req_ids_to_add:
548
+ req_state = self.requests[req_id]
549
+ self.input_batch.add_request(req_state)
550
+
551
+ # Condense the batched states if there are gaps left by removed requests
552
+ self.input_batch.condense()
553
+ # Allow the attention backend to reorder the batch if beneficial
554
+ self._may_reorder_batch(scheduler_output)
555
+ # Refresh batch metadata with any pending updates.
556
+ self.input_batch.refresh_metadata()
557
+
558
+ def _get_cumsum_and_arange(
559
+ self,
560
+ num_tokens: np.ndarray,
561
+ cumsum_dtype: Optional[np.dtype] = None,
562
+ ) -> tuple[np.ndarray, np.ndarray]:
563
+ """Get the cumulative sum and batched arange of the given array.
564
+ # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
565
+ # Equivalent to but faster than:
566
+ # np.concatenate([np.arange(n) for n in num_tokens])
567
+ """
568
+ # Step 1. [2, 5, 3] -> [2, 7, 10]
569
+ cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
570
+ total_num_tokens = cu_num_tokens[-1]
571
+ # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
572
+ cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
573
+ # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
574
+ arange = self.arange_np[:total_num_tokens] - cumsums_offsets
575
+
576
+ return cu_num_tokens, arange
577
+
578
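# A minimal, standalone NumPy sketch of the cumsum + batched-arange trick
# described in the docstring above. It replaces the runner's pre-allocated
# `arange_np` cache with a plain np.arange and is illustrative only.
import numpy as np

def cumsum_and_batched_arange(num_tokens: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # [2, 5, 3] -> [2, 7, 10]
    cu = np.cumsum(num_tokens)
    total = int(cu[-1])
    # [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
    offsets = np.repeat(cu - num_tokens, num_tokens)
    # arange(10) - offsets -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    arange = np.arange(total) - offsets
    return cu, arange

# Reproduces the values given in the docstring example.
cu, ar = cumsum_and_batched_arange(np.array([2, 5, 3]))
assert cu.tolist() == [2, 7, 10]
assert ar.tolist() == [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]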
+ def _prepare_inputs(
579
+ self,
580
+ scheduler_output: "SchedulerOutput",
581
+ ) -> tuple[dict[str, Any], bool, torch.Tensor,
582
+ Optional[SpecDecodeMetadata], np.ndarray]:
583
+ """
584
+ :return: tuple[
585
+ attn_metadata: layer-to-attention_metadata mapping,
586
+ attention_cuda_graphs: whether attention can run in cudagraph,
587
+ logits_indices, spec_decode_metadata, num_scheduled_tokens
588
+ ]
589
+ """
590
+ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
591
+ assert total_num_scheduled_tokens > 0
592
+ num_reqs = self.input_batch.num_reqs
593
+ assert num_reqs > 0
594
+
595
+ # OPTIMIZATION: Start copying the block table first.
596
+ # This way, we can overlap the copy with the following CPU operations.
597
+ self.input_batch.block_table.commit(num_reqs)
598
+
599
+ # Get the number of scheduled tokens for each request.
600
+ req_ids = self.input_batch.req_ids
601
+ tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
602
+ num_scheduled_tokens = np.array(tokens, dtype=np.int32)
603
+ max_num_scheduled_tokens = max(tokens)
604
+
605
+ # Get request indices.
606
+ # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
607
+ req_indices = np.repeat(self.arange_np[:num_reqs],
608
+ num_scheduled_tokens)
609
+
610
+ # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
611
+ # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
612
+ cu_num_tokens, arange = self._get_cumsum_and_arange(
613
+ num_scheduled_tokens)
614
+
615
+ # Get positions.
616
+ positions_np = self.positions_np[:total_num_scheduled_tokens]
617
+ np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
618
+ arange,
619
+ out=positions_np)
620
+
621
+ # Calculate M-RoPE positions.
622
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
623
+ if self.uses_mrope:
624
+ self._calc_mrope_positions(scheduler_output)
625
+
626
+ # Get token indices.
627
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
628
+ # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
629
+ # where M is the max_model_len.
630
+ token_indices = (positions_np +
631
+ req_indices * self.input_batch.token_ids_cpu.shape[1])
632
+
633
+ # NOTE(woosuk): We use torch.index_select instead of np.take here
634
+ # because torch.index_select is much faster than np.take for large
635
+ # tensors.
636
+ torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
637
+ 0,
638
+ torch.from_numpy(token_indices),
639
+ out=self.input_ids_cpu[:total_num_scheduled_tokens])
640
+
641
+ # Calculate the slot mapping for each KV cache group.
642
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
643
+ self.kv_cache_config.kv_cache_groups):
644
+ block_size = kv_cache_group_spec.kv_cache_spec.block_size
645
+ block_table: BlockTable = self.input_batch.block_table[
646
+ kv_cache_group_id]
647
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
648
+ # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
649
+ # where K is the max_num_blocks_per_req and the block size is 2.
650
+ # NOTE(woosuk): We can't simply use `token_indices // block_size`
651
+ # here because M (max_model_len) is not necessarily divisible by
652
+ # block_size.
653
+ block_table_indices = (
654
+ req_indices * block_table.max_num_blocks_per_req +
655
+ positions_np // block_size)
656
+ block_table_cpu = block_table.get_cpu_tensor()
657
+ block_numbers = block_table_cpu.flatten(
658
+ )[block_table_indices].numpy()
659
+ block_offsets = positions_np % block_size
660
+ np.add(
661
+ block_numbers * block_size,
662
+ block_offsets,
663
+ out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
664
+
665
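# Standalone NumPy sketch of the slot-mapping arithmetic in the loop above,
# using a toy block table with block_size = 2 to match the comment's example.
# All values here are assumptions for illustration, not part of the diff.
import numpy as np

block_size = 2
max_num_blocks_per_req = 4                      # "K" in the comment above
# Positions of the scheduled tokens per request (num_computed_tokens = 0 here).
req_indices = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2])
positions = np.array([0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
# Toy block table: row r holds the physical block ids of request r.
block_table = np.array([[10, 11, 0, 0],
                        [20, 21, 22, 0],
                        [30, 31, 0, 0]])

block_table_indices = req_indices * max_num_blocks_per_req + positions // block_size
block_numbers = block_table.flatten()[block_table_indices]
slot_mapping = block_numbers * block_size + positions % block_size
# Request 0 -> slots 20, 21; request 1 -> 40..44; request 2 -> 60, 61, 62.
assert slot_mapping.tolist() == [20, 21, 40, 41, 42, 43, 44, 60, 61, 62]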
+ # Prepare the attention metadata.
666
+ self.query_start_loc_np[0] = 0
667
+ self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
668
+
669
+ self.seq_lens_np[:num_reqs] = (
670
+ self.input_batch.num_computed_tokens_cpu[:num_reqs] +
671
+ num_scheduled_tokens)
672
+
673
+ # Copy the tensors to the GPU.
674
+ self.input_ids[:total_num_scheduled_tokens].copy_(
675
+ self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
676
+ if self.uses_mrope:
677
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
678
+ self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
679
+ self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
680
+ non_blocking=True)
681
+ else:
682
+ # Common case (1D positions)
683
+ self.positions[:total_num_scheduled_tokens].copy_(
684
+ self.positions_cpu[:total_num_scheduled_tokens],
685
+ non_blocking=True)
686
+
687
+ self.query_start_loc[:num_reqs + 1].copy_(
688
+ self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
689
+ self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
690
+ non_blocking=True)
691
+
692
+ # Fill the unused seq_lens entries with 0.
693
+ self.seq_lens[num_reqs:].fill_(0)
694
+ # Note: pad query_start_loc to be non-decreasing, as kernels
695
+ # like FlashAttention require that
696
+ self.query_start_loc[num_reqs + 1:].fill_(
697
+ self.query_start_loc_cpu[num_reqs].item())
698
+
699
+ query_start_loc = self.query_start_loc[:num_reqs + 1]
700
+ seq_lens = self.seq_lens[:num_reqs]
701
+
702
+ common_attn_metadata = CommonAttentionMetadata(
703
+ query_start_loc=query_start_loc,
704
+ seq_lens=seq_lens,
705
+ num_reqs=num_reqs,
706
+ num_actual_tokens=total_num_scheduled_tokens,
707
+ max_query_len=max_num_scheduled_tokens,
708
+ )
709
+
710
+ attn_metadata: dict[str, Any] = {}
711
+ # Prepare the attention metadata for each KV cache group and make layers
712
+ # in the same group share the same metadata.
713
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
714
+ self.kv_cache_config.kv_cache_groups):
715
+
716
+ # Prepare for cascade attention if enabled & beneficial.
717
+ common_prefix_len = 0
718
+ builder = self.attn_metadata_builders[kv_cache_group_id]
719
+ if self.cascade_attn_enabled:
720
+ common_prefix_len = self._compute_cascade_attn_prefix_len(
721
+ num_scheduled_tokens,
722
+ scheduler_output.
723
+ num_common_prefix_blocks[kv_cache_group_id],
724
+ kv_cache_group_spec.kv_cache_spec,
725
+ builder,
726
+ )
727
+
728
+ attn_metadata_i = (builder.build(
729
+ common_prefix_len=common_prefix_len,
730
+ common_attn_metadata=common_attn_metadata,
731
+ ))
732
+
733
+ for layer_name in kv_cache_group_spec.layer_names:
734
+ attn_metadata[layer_name] = attn_metadata_i
735
+
736
+ attention_cuda_graphs = all(
737
+ b.can_run_in_cudagraph(common_attn_metadata)
738
+ for b in self.attn_metadata_builders)
739
+
740
+ use_spec_decode = len(
741
+ scheduler_output.scheduled_spec_decode_tokens) > 0
742
+ if not use_spec_decode:
743
+ # NOTE(woosuk): Due to chunked prefills, the batch may contain
744
+ # partial requests. While we should not sample any token
745
+ # from these partial requests, we do so for simplicity.
746
+ # We will ignore the sampled tokens from the partial requests.
747
+ # TODO: Support prompt logprobs.
748
+ logits_indices = query_start_loc[1:] - 1
749
+ spec_decode_metadata = None
750
+ else:
751
+ # Get the number of draft tokens for each request.
752
+ # Iterate over the dictionary rather than all requests since not all
753
+ # requests have draft tokens.
754
+ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
755
+ for req_id, draft_token_ids in (
756
+ scheduler_output.scheduled_spec_decode_tokens.items()):
757
+ req_idx = self.input_batch.req_id_to_index[req_id]
758
+ num_draft_tokens[req_idx] = len(draft_token_ids)
759
+
760
+ spec_decode_metadata = self._calc_spec_decode_metadata(
761
+ num_draft_tokens, cu_num_tokens)
762
+ logits_indices = spec_decode_metadata.logits_indices
763
+
764
+ # Hot-Swap lora model
765
+ if self.lora_config:
766
+ self.set_active_loras(self.input_batch, num_scheduled_tokens)
767
+
768
+ return (attn_metadata, attention_cuda_graphs, logits_indices,
769
+ spec_decode_metadata, num_scheduled_tokens)
770
+
771
+ def _compute_cascade_attn_prefix_len(
772
+ self,
773
+ num_scheduled_tokens: np.ndarray,
774
+ num_common_prefix_blocks: int,
775
+ kv_cache_spec: KVCacheSpec,
776
+ attn_metadata_builder: AttentionMetadataBuilder,
777
+ ) -> int:
778
+ """Compute the length of the common prefix for cascade attention.
779
+
780
+ NOTE(woosuk): The common prefix length returned by this function
781
+ represents the length used specifically for cascade attention, not the
782
+ actual number of tokens shared between requests. When cascade attention
783
+ is disabled (use_cascade=False), this function returns 0 even if
784
+ requests share common tokens. Additionally, the common prefix length is
785
+ truncated to a multiple of the block size and may be further truncated
786
+ due to implementation details explained below.
787
+
788
+ Args:
789
+ num_scheduled_tokens: Number of tokens scheduled per request.
790
+ num_common_prefix_blocks: Number of shared KV cache blocks.
791
+
792
+ Returns:
793
+ int: Length of common prefix in tokens.
794
+ """
795
+ common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size
796
+ if common_prefix_len == 0:
797
+ # Common case.
798
+ return 0
799
+
800
+ # NOTE(woosuk): Cascade attention uses two attention kernels: one
801
+ # for the common prefix and the other for the rest. For the first
802
+ # kernel, we concatenate all the query tokens (possibly from
803
+ # different requests) and treat them as if they are from the same
804
+ # request. Then, we use bi-directional attention to process the
805
+ # common prefix in the KV cache. Importantly, this means that the
806
+ # first kernel does not do any masking.
807
+
808
+ # Consider the following example:
809
+ # Request 1's input query: [D, E, X]
810
+ # Request 1's kv cache: [A, B, C, D, E, X]
811
+ # Request 1's num_computed_tokens: 3 (i.e., [A, B, C])
812
+ # Request 2's input query: [E, Y]
813
+ # Request 2's kv cache: [A, B, C, D, E, Y]
814
+ # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D])
815
+
816
+ # If we use [A, B, C, D, E] as the common prefix, then the
817
+ # first kernel will compute the bi-directional attention between
818
+ # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E].
819
+ # However, this is wrong because D in Request 1 should not attend to
820
+ # E in the common prefix (i.e., we need masking).
821
+ # To avoid this, [A, B, C, D] should be the common prefix.
822
+ # That is, the common prefix should be capped by the minimum
823
+ # num_computed_tokens among the requests, and plus one to include
824
+ # the first token of the query.
825
+
826
+ # In practice, we use [A, B, C] as the common prefix, instead of
827
+ # [A, B, C, D] (i.e., the common prefix is capped by the minimum
828
+ # num_computed_tokens, without plus one).
829
+ # This is because of an implementation detail: We want to always
830
+ # use two kernels for cascade attention. Let's imagine:
831
+ # Request 3's input query: [D]
832
+ # Request 3's kv cache: [A, B, C, D]
833
+ # Request 3's num_computed_tokens: 3 (i.e., [A, B, C])
834
+ # If we use [A, B, C, D] as the common prefix for Request 1-3,
835
+ # then Request 3 will be processed only by the first kernel,
836
+ # and the second kernel will get an empty input. While this is not
837
+ # a fundamental problem, our current implementation does not support
838
+ # this case.
839
+ num_reqs = len(num_scheduled_tokens)
840
+ common_prefix_len = min(
841
+ common_prefix_len,
842
+ self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
843
+ # common_prefix_len should be a multiple of the block size.
844
+ common_prefix_len = (common_prefix_len // kv_cache_spec.block_size *
845
+ kv_cache_spec.block_size)
846
+ use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or
847
+ (isinstance(kv_cache_spec, FullAttentionSpec)
848
+ and kv_cache_spec.sliding_window is not None))
849
+ assert isinstance(kv_cache_spec, AttentionSpec)
850
+ use_cascade = attn_metadata_builder.use_cascade_attention(
851
+ common_prefix_len=common_prefix_len,
852
+ query_lens=num_scheduled_tokens,
853
+ num_query_heads=self.num_query_heads,
854
+ num_kv_heads=kv_cache_spec.num_kv_heads,
855
+ use_alibi=self.use_alibi,
856
+ use_sliding_window=use_sliding_window,
857
+ num_sms=self.num_sms,
858
+ )
859
+ return common_prefix_len if use_cascade else 0
860
+
861
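# Minimal sketch of the capping and truncation described in the comments above:
# the candidate common prefix is capped by the minimum num_computed_tokens
# across requests, then rounded down to a multiple of the block size. Numbers
# below are illustrative assumptions, not values from this diff.
import numpy as np

block_size = 16
num_common_prefix_blocks = 8                      # scheduler-reported shared blocks
num_computed_tokens = np.array([100, 67, 120])    # per-request progress

common_prefix_len = num_common_prefix_blocks * block_size                    # 128
common_prefix_len = min(common_prefix_len, int(num_computed_tokens.min()))   # 67
common_prefix_len = common_prefix_len // block_size * block_size             # 64
assert common_prefix_len == 64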
+ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
862
+ mrope_pos_ptr = 0
863
+ for index, req_id in enumerate(self.input_batch.req_ids):
864
+ req = self.requests[req_id]
865
+ assert req.mrope_positions is not None
866
+
867
+ num_computed_tokens = \
868
+ self.input_batch.num_computed_tokens_cpu[index]
869
+ num_scheduled_tokens = \
870
+ scheduler_output.num_scheduled_tokens[req_id]
871
+ num_prompt_tokens = len(req.prompt_token_ids)
872
+
873
+ if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
874
+ prompt_part_len = max(0,
875
+ num_prompt_tokens - num_computed_tokens)
876
+ completion_part_len = max(
877
+ 0, num_scheduled_tokens - prompt_part_len)
878
+ else:
879
+ prompt_part_len = num_scheduled_tokens
880
+ completion_part_len = 0
881
+
882
+ assert num_scheduled_tokens == prompt_part_len + completion_part_len
883
+
884
+ if prompt_part_len > 0:
885
+ # prompt's mrope_positions are pre-computed
886
+ dst_start = mrope_pos_ptr
887
+ dst_end = mrope_pos_ptr + prompt_part_len
888
+ src_start = num_computed_tokens
889
+ src_end = num_computed_tokens + prompt_part_len
890
+
891
+ self.mrope_positions_cpu[:, dst_start:dst_end] = \
892
+ req.mrope_positions[:,src_start:src_end]
893
+
894
+ mrope_pos_ptr += prompt_part_len
895
+
896
+ if completion_part_len > 0:
897
+ # compute completion's mrope_positions on-the-fly
898
+ dst_start = mrope_pos_ptr
899
+ dst_end = mrope_pos_ptr + completion_part_len
900
+
901
+ MRotaryEmbedding.get_next_input_positions_tensor(
902
+ out=self.mrope_positions_np,
903
+ out_offset=dst_start,
904
+ mrope_position_delta=req.mrope_position_delta,
905
+ context_len=num_computed_tokens + prompt_part_len,
906
+ num_new_tokens=completion_part_len,
907
+ )
908
+
909
+ mrope_pos_ptr += completion_part_len
910
+
911
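# Small sketch of the prompt/completion split computed in _calc_mrope_positions
# above: of the tokens scheduled this step, the part still inside the prompt
# reuses pre-computed M-RoPE positions while the rest is generated on the fly.
# Illustrative only; the helper name is hypothetical.
def split_scheduled_tokens(num_computed: int, num_scheduled: int,
                           num_prompt: int) -> tuple[int, int]:
    if num_computed + num_scheduled > num_prompt:
        prompt_part = max(0, num_prompt - num_computed)
        completion_part = max(0, num_scheduled - prompt_part)
    else:
        prompt_part, completion_part = num_scheduled, 0
    assert prompt_part + completion_part == num_scheduled
    return prompt_part, completion_part

# Chunked prefill crossing the prompt boundary: a 10-token prompt with 6 tokens
# computed and 7 scheduled -> 4 prompt tokens + 3 completion tokens.
assert split_scheduled_tokens(6, 7, 10) == (4, 3)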
+ def _calc_spec_decode_metadata(
912
+ self,
913
+ num_draft_tokens: np.ndarray,
914
+ cu_num_scheduled_tokens: np.ndarray,
915
+ ) -> SpecDecodeMetadata:
916
+ # Inputs:
917
+ # cu_num_scheduled_tokens: [ 4, 104, 107, 207, 209]
918
+ # num_draft_tokens: [ 3, 0, 2, 0, 1]
919
+ # Outputs:
920
+ # cu_num_draft_tokens: [ 3, 3, 5, 5, 6]
921
+ # logits_indices: [ 0, 1, 2, 3, 103, 104, 105, 106,
922
+ # 206, 207, 208]
923
+ # target_logits_indices: [ 0, 1, 2, 5, 6, 9]
924
+ # bonus_logits_indices: [ 3, 4, 7, 8, 10]
925
+
926
+ # Compute the logits indices.
927
+ # [4, 1, 3, 1, 2]
928
+ num_sampled_tokens = num_draft_tokens + 1
929
+
930
+ # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
931
+ # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
932
+ cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
933
+ num_sampled_tokens, cumsum_dtype=np.int32)
934
+ # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
935
+ logits_indices = np.repeat(
936
+ cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
937
+ # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
938
+ logits_indices += arange
939
+
940
+ # Compute the bonus logits indices.
941
+ bonus_logits_indices = cu_num_sampled_tokens - 1
942
+
943
+ # Compute the draft logits indices.
944
+ # cu_num_draft_tokens: [3, 3, 5, 5, 6]
945
+ # arange: [0, 1, 2, 0, 1, 0]
946
+ cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
947
+ num_draft_tokens, cumsum_dtype=np.int32)
948
+ # [0, 0, 0, 5, 5, 9]
949
+ target_logits_indices = np.repeat(
950
+ cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
951
+ # [0, 1, 2, 5, 6, 9]
952
+ target_logits_indices += arange
953
+
954
+ # TODO: Optimize the CPU -> GPU copy.
955
+ cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
956
+ self.device, non_blocking=True)
957
+ logits_indices = torch.from_numpy(logits_indices).to(self.device,
958
+ non_blocking=True)
959
+ target_logits_indices = torch.from_numpy(target_logits_indices).to(
960
+ self.device, non_blocking=True)
961
+ bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
962
+ self.device, non_blocking=True)
963
+
964
+ # Compute the draft token ids.
965
+ # draft_token_indices: [ 1, 2, 3, 105, 106, 208]
966
+ draft_token_ids = self.input_ids[logits_indices]
967
+ draft_token_ids = draft_token_ids[target_logits_indices + 1]
968
+
969
+ metadata = SpecDecodeMetadata(
970
+ draft_token_ids=draft_token_ids,
971
+ num_draft_tokens=num_draft_tokens.tolist(),
972
+ cu_num_draft_tokens=cu_num_draft_tokens,
973
+ target_logits_indices=target_logits_indices,
974
+ bonus_logits_indices=bonus_logits_indices,
975
+ logits_indices=logits_indices,
976
+ )
977
+ return metadata
978
+
979
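# NumPy sketch that re-derives the worked example in the comments of
# _calc_spec_decode_metadata above (CPU only, no device copies). It is not part
# of the diff; it only verifies the documented index arrays.
import numpy as np

def batched_arange(counts: np.ndarray) -> np.ndarray:
    cu = np.cumsum(counts)
    return np.arange(int(cu[-1])) - np.repeat(cu - counts, counts)

cu_num_scheduled = np.array([4, 104, 107, 207, 209])
num_draft = np.array([3, 0, 2, 0, 1])
num_sampled = num_draft + 1

cu_num_sampled = np.cumsum(num_sampled)
logits_indices = (np.repeat(cu_num_scheduled - num_sampled, num_sampled)
                  + batched_arange(num_sampled))
bonus_logits_indices = cu_num_sampled - 1
target_logits_indices = (np.repeat(cu_num_sampled - num_sampled, num_draft)
                         + batched_arange(num_draft))

assert logits_indices.tolist() == [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
assert bonus_logits_indices.tolist() == [3, 4, 7, 8, 10]
assert target_logits_indices.tolist() == [0, 1, 2, 5, 6, 9]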
+ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
980
+ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
981
+ if not scheduled_encoder_inputs:
982
+ return
983
+
984
+ # Batch the multi-modal inputs.
985
+ mm_inputs = list[MultiModalKwargs]()
986
+ req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
987
+ for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
988
+ req_state = self.requests[req_id]
989
+
990
+ for mm_input_id in encoder_input_ids:
991
+ mm_inputs.append(req_state.mm_inputs[mm_input_id])
992
+ req_ids_pos.append(
993
+ (req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
994
+
995
+ # Batch mm inputs as much as we can: if a request in the batch has
996
+ # multiple modalities or a different modality than the previous one,
997
+ # we process it separately to preserve item order.
998
+ # FIXME(ywang96): This is a hacky way to deal with multiple modalities
999
+ # in the same batch while still being able to benefit from batching
1000
+ # multimodal inputs. The proper solution should be reordering the
1001
+ # encoder outputs.
1002
+ grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
1003
+
1004
+ encoder_outputs = []
1005
+ for grouped_mm_inputs in grouped_mm_inputs_list:
1006
+ batched_mm_inputs = MultiModalKwargs.batch(
1007
+ grouped_mm_inputs, pin_memory=self.pin_memory)
1008
+ batched_mm_inputs = MultiModalKwargs.as_kwargs(
1009
+ batched_mm_inputs,
1010
+ device=self.device,
1011
+ )
1012
+
1013
+ # Run the encoder.
1014
+ # `curr_group_outputs` is either of the following:
1015
+ # 1. A tensor of shape (num_items, feature_size, hidden_size)
1016
+ # in case feature_size is fixed across all multimodal items.
1017
+ # 2. A list or tuple (length: num_items) of tensors, each of shape
1018
+ # (feature_size, hidden_size) in case the feature size is dynamic
1019
+ # depending on the input multimodal items.
1020
+ curr_group_outputs = self.model.get_multimodal_embeddings(
1021
+ **batched_mm_inputs)
1022
+
1023
+ sanity_check_mm_encoder_outputs(
1024
+ curr_group_outputs,
1025
+ expected_num_items=len(grouped_mm_inputs),
1026
+ )
1027
+
1028
+ for output in curr_group_outputs:
1029
+ encoder_outputs.append(output)
1030
+
1031
+ # Cache the encoder outputs.
1032
+ for (req_id, input_id, pos_info), output in zip(
1033
+ req_ids_pos,
1034
+ encoder_outputs,
1035
+ ):
1036
+ if req_id not in self.encoder_cache:
1037
+ self.encoder_cache[req_id] = {}
1038
+
1039
+ self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
1040
+ output,
1041
+ is_embed=pos_info.is_embed,
1042
+ )
1043
+
1044
+ def _gather_mm_embeddings(
1045
+ self,
1046
+ scheduler_output: "SchedulerOutput",
1047
+ ) -> list[torch.Tensor]:
1048
+ mm_embeds: list[torch.Tensor] = []
1049
+ for req_id in self.input_batch.req_ids:
1050
+ num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
1051
+ req_id]
1052
+ req_state = self.requests[req_id]
1053
+ num_computed_tokens = req_state.num_computed_tokens
1054
+ mm_positions = req_state.mm_positions
1055
+ for i, pos_info in enumerate(mm_positions):
1056
+ start_pos = pos_info.offset
1057
+ num_encoder_tokens = pos_info.length
1058
+
1059
+ # The encoder output is needed if the two ranges overlap:
1060
+ # [num_computed_tokens,
1061
+ # num_computed_tokens + num_scheduled_tokens) and
1062
+ # [start_pos, start_pos + num_encoder_tokens)
1063
+ if start_pos >= num_computed_tokens + num_scheduled_tokens:
1064
+ # The encoder output is not needed in this step.
1065
+ break
1066
+ if start_pos + num_encoder_tokens <= num_computed_tokens:
1067
+ # The encoder output is already processed and stored
1068
+ # in the decoder's KV cache.
1069
+ continue
1070
+
1071
+ start_idx = max(num_computed_tokens - start_pos, 0)
1072
+ end_idx = min(
1073
+ num_computed_tokens - start_pos + num_scheduled_tokens,
1074
+ num_encoder_tokens)
1075
+ assert start_idx < end_idx
1076
+ assert req_id in self.encoder_cache
1077
+ assert i in self.encoder_cache[req_id]
1078
+ encoder_output = self.encoder_cache[req_id][i]
1079
+
1080
+ if (is_embed := pos_info.is_embed) is not None:
1081
+ is_embed = is_embed[start_idx:end_idx]
1082
+
1083
+ mm_embeds_item = gather_mm_placeholders(
1084
+ encoder_output[start_idx:end_idx],
1085
+ is_embed=is_embed,
1086
+ )
1087
+ mm_embeds.append(mm_embeds_item)
1088
+ return mm_embeds
1089
+
1090
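# Minimal sketch of the range-overlap test used in _gather_mm_embeddings above:
# an encoder output is needed only if [num_computed, num_computed + num_scheduled)
# overlaps [start_pos, start_pos + num_encoder_tokens). The helper name and
# example numbers are assumptions for illustration.
def needed_slice(num_computed: int, num_scheduled: int,
                 start_pos: int, num_encoder_tokens: int):
    if start_pos >= num_computed + num_scheduled:
        return None          # placeholder starts after this step's window
    if start_pos + num_encoder_tokens <= num_computed:
        return None          # placeholder already fully in the KV cache
    start_idx = max(num_computed - start_pos, 0)
    end_idx = min(num_computed - start_pos + num_scheduled, num_encoder_tokens)
    return start_idx, end_idx

# An image placeholder covering positions [8, 20) while this step covers
# [10, 16) -> rows [2, 8) of that image's encoder output are needed.
assert needed_slice(10, 6, 8, 12) == (2, 8)
assert needed_slice(10, 6, 20, 12) is None
assert needed_slice(30, 6, 8, 12) is None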
+ def get_model(self) -> nn.Module:
1091
+ return self.model
1092
+
1093
+ def apply_grammar_bitmask(
1094
+ self,
1095
+ scheduler_output: "SchedulerOutput",
1096
+ logits: torch.Tensor,
1097
+ ):
1098
+ grammar_bitmask = scheduler_output.grammar_bitmask
1099
+ if grammar_bitmask is None:
1100
+ return
1101
+
1102
+ # We receive the structured output bitmask from the scheduler,
1103
+ # compacted to contain bitmasks only for structured output requests.
1104
+ # The order of the requests in the bitmask is not guaranteed to be the
1105
+ # same as the order of the requests in the gpu runner's batch. We need
1106
+ # to sort the bitmask to match the order of the requests used here.
1107
+
1108
+ # Get the batch indices of the structured output requests.
1109
+ # Keep track of the number of speculative tokens scheduled for every
1110
+ # request in the batch, as the logit indices are offset by this amount.
1111
+ struct_out_req_batch_indices: dict[str, int] = {}
1112
+ cumulative_offset = 0
1113
+ seq = sorted(self.input_batch.req_id_to_index.items(),
1114
+ key=lambda x: x[1])
1115
+ for req_id, batch_index in seq:
1116
+ logit_index = batch_index + cumulative_offset
1117
+ cumulative_offset += len(
1118
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1119
+ if req_id in scheduler_output.structured_output_request_ids:
1120
+ struct_out_req_batch_indices[req_id] = logit_index
1121
+
1122
+ out_indices = []
1123
+
1124
+ # Reorder the bitmask to match the order of the requests in the batch.
1125
+ sorted_bitmask = np.zeros_like(grammar_bitmask,
1126
+ shape=(logits.shape[0],
1127
+ grammar_bitmask.shape[1]))
1128
+ cumulative_index = 0
1129
+ seq = sorted(scheduler_output.structured_output_request_ids.items(),
1130
+ key=lambda x: x[1])
1131
+ for req_id, _ in seq:
1132
+ logit_index = struct_out_req_batch_indices[req_id]
1133
+ num_spec_tokens = len(
1134
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1135
+ for i in range(1 + num_spec_tokens):
1136
+ sorted_bitmask[logit_index + i] = \
1137
+ grammar_bitmask[cumulative_index + i]
1138
+ out_indices.append(logit_index + i)
1139
+ cumulative_index += 1 + num_spec_tokens
1140
+ grammar_bitmask = sorted_bitmask
1141
+
1142
+ # Serialization of np.ndarray is much more efficient than a tensor,
1143
+ # so we receive it in that format.
1144
+ grammar_bitmask = torch.from_numpy(grammar_bitmask)
1145
+
1146
+ # Force use of the torch.compile implementation from xgrammar to work
1147
+ # around issues with the Triton kernel in concurrent structured output
1148
+ # scenarios. See PR #19565 and issues #19493, #18376 for details.
1149
+ xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
1150
+ logits,
1151
+ grammar_bitmask.to(self.device, non_blocking=True),
1152
+ indices=out_indices,
1153
+ )
1154
+
1155
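# Simplified stand-in for the bitmask application above: given a boolean
# "allowed token" mask per structured-output row, disallowed logits are set to
# -inf in place. The real path uses xgrammar's packed int32 bitmask and its
# torch.compile kernel; this sketch only illustrates the effect it has.
import torch

def apply_allowed_mask(logits: torch.Tensor, allowed: torch.Tensor,
                       row_indices: list[int]) -> None:
    rows = torch.tensor(row_indices, dtype=torch.long)
    logits[rows] = logits[rows].masked_fill(~allowed, float("-inf"))

logits = torch.zeros(4, 8)
allowed = torch.zeros(2, 8, dtype=torch.bool)
allowed[0, [1, 3]] = True          # batch row 0 may only emit tokens 1 and 3
allowed[1, [2]] = True             # batch row 2 may only emit token 2
apply_allowed_mask(logits, allowed, row_indices=[0, 2])
assert torch.isinf(logits[0, 0]) and logits[0, 1] == 0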
+ def sync_and_slice_intermediate_tensors(
1156
+ self, num_tokens: int, intermediate_tensors: IntermediateTensors,
1157
+ sync_self: bool) -> IntermediateTensors:
1158
+
1159
+ assert self.intermediate_tensors is not None
1160
+
1161
+ tp = self.vllm_config.parallel_config.tensor_parallel_size
1162
+ enabled_sp = self.compilation_config.pass_config. \
1163
+ enable_sequence_parallelism
1164
+ if enabled_sp:
1165
+ # When sequence parallelism is enabled, we always pad num_tokens
1166
+ # to be a multiple of tensor_parallel_size (tp) earlier
1167
+ assert num_tokens % tp == 0
1168
+ is_residual_scattered = tp > 1 and enabled_sp \
1169
+ and num_tokens % tp == 0
1170
+
1171
+ # When sequence parallelism is enabled, the "residual" tensor is sharded
1172
+ # across tensor parallel ranks, so each rank only needs its own slice.
1173
+ if sync_self:
1174
+ assert intermediate_tensors is not None
1175
+ for k, v in intermediate_tensors.items():
1176
+ is_scattered = k == "residual" and is_residual_scattered
1177
+ copy_len = num_tokens // tp if is_scattered else \
1178
+ num_tokens
1179
+ self.intermediate_tensors[k][:copy_len].copy_(
1180
+ v[:copy_len], non_blocking=True)
1181
+
1182
+ return IntermediateTensors({
1183
+ k:
1184
+ v[:num_tokens // tp]
1185
+ if k == "residual" and is_residual_scattered else v[:num_tokens]
1186
+ for k, v in self.intermediate_tensors.items()
1187
+ })
1188
+
1189
+ def eplb_step(self,
1190
+ is_dummy: bool = False,
1191
+ is_profile: bool = False) -> None:
1192
+ """
1193
+ Step for the EPLB (Expert Parallelism Load Balancing) state.
1194
+ """
1195
+ if not self.parallel_config.enable_eplb:
1196
+ return
1197
+
1198
+ assert self.eplb_state is not None
1199
+ assert is_mixture_of_experts(self.model)
1200
+ self.eplb_state.step(
1201
+ self.model,
1202
+ is_dummy,
1203
+ is_profile,
1204
+ log_stats=self.parallel_config.eplb_log_balancedness,
1205
+ )
1206
+
1207
+ def get_dp_padding(self,
1208
+ num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
1209
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
1210
+ dp_rank = self.vllm_config.parallel_config.data_parallel_rank
1211
+
1212
+ # For DP: Don't pad when setting enforce_eager.
1213
+ # This lets us set enforce_eager on the prefiller in a P/D setup and
1214
+ # still use CUDA graphs (enabled by this padding) on the decoder.
1215
+ #
1216
+ # TODO(tms) : There are many cases where padding is enabled for
1217
+ # prefills, causing unnecessary and excessive padding of activations.
1218
+
1219
+ if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
1220
+ # Early exit.
1221
+ return 0, None
1222
+
1223
+ num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
1224
+ num_tokens, dp_size, dp_rank)
1225
+ max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
1226
+ num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
1227
+ dp_size,
1228
+ device="cpu",
1229
+ dtype=torch.int32)
1230
+ return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
1231
+
1232
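# Small sketch of the DP padding rule implemented in get_dp_padding above:
# every data-parallel rank pads its token count up to the max across ranks so
# that all ranks run the same-sized forward pass. Values are illustrative.
import torch

def dp_padding(num_tokens: int, num_tokens_per_rank: list[int]) -> tuple[int, torch.Tensor]:
    max_tokens = max(num_tokens_per_rank)
    num_tokens_after_padding = torch.tensor([max_tokens] * len(num_tokens_per_rank),
                                            dtype=torch.int32)
    return max_tokens - num_tokens, num_tokens_after_padding

# This rank schedules 96 tokens while another DP rank schedules 160 -> pad by 64.
pad, after = dp_padding(96, [96, 160, 128, 32])
assert pad == 64 and after.tolist() == [160, 160, 160, 160]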
+ def _pool(
1233
+ self,
1234
+ hidden_states: torch.Tensor,
1235
+ num_scheduled_tokens: int,
1236
+ num_scheduled_tokens_np: np.ndarray,
1237
+ finished_sending: Optional[set[str]],
1238
+ finished_recving: Optional[set[str]],
1239
+ ) -> ModelRunnerOutput:
1240
+ assert self.input_batch.num_reqs ==\
1241
+ len(self.input_batch.pooling_params), \
1242
+ "Either all or none of the requests in" \
1243
+ " a batch must be pooling requests"
1244
+
1245
+ extracted_hidden_states = list(
1246
+ torch.split(hidden_states[:num_scheduled_tokens],
1247
+ num_scheduled_tokens_np.tolist()))
1248
+
1249
+ pooling_metadata = self.input_batch.pooling_metadata
1250
+
1251
+ raw_pooler_output = self.model.pooler(
1252
+ hidden_states=extracted_hidden_states,
1253
+ pooling_metadata=pooling_metadata)
1254
+
1255
+ pooler_output: list[Optional[torch.Tensor]] = []
1256
+ seq_lens = self.seq_lens[:self.input_batch.num_reqs]
1257
+ for raw_output, seq_len, prompt_len in zip(
1258
+ raw_pooler_output, seq_lens, pooling_metadata.prompt_lens):
1259
+
1260
+ if seq_len == prompt_len:
1261
+ pooler_output.append(raw_output.data.cpu())
1262
+ else:
1263
+ pooler_output.append(None)
1264
+
1265
+ return ModelRunnerOutput(
1266
+ req_ids=self.input_batch.req_ids,
1267
+ req_id_to_index=self.input_batch.req_id_to_index,
1268
+ sampled_token_ids=[],
1269
+ spec_token_ids=None,
1270
+ logprobs=None,
1271
+ prompt_logprobs_dict={},
1272
+ pooler_output=pooler_output,
1273
+ finished_sending=finished_sending,
1274
+ finished_recving=finished_recving,
1275
+ )
1276
+
1277
+ @torch.inference_mode()
1278
+ def execute_model(
1279
+ self,
1280
+ scheduler_output: "SchedulerOutput",
1281
+ intermediate_tensors: Optional[IntermediateTensors] = None,
1282
+ ) -> Union[ModelRunnerOutput, IntermediateTensors]:
1283
+ self._update_states(scheduler_output)
1284
+ if not scheduler_output.total_num_scheduled_tokens:
1285
+ if not has_kv_transfer_group():
1286
+ # Return empty ModelRunnerOutput if there's no work to do.
1287
+ return EMPTY_MODEL_RUNNER_OUTPUT
1288
+
1289
+ return self.kv_connector_no_forward(scheduler_output)
1290
+
1291
+ # Prepare the decoder inputs.
1292
+ (attn_metadata, attention_cuda_graphs, logits_indices,
1293
+ spec_decode_metadata,
1294
+ num_scheduled_tokens_np) = (self._prepare_inputs(scheduler_output))
1295
+ num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
1296
+ if (self.use_cuda_graph
1297
+ and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
1298
+ # Use piecewise CUDA graphs.
1299
+ # Add padding to the batch size.
1300
+ num_input_tokens = self.vllm_config.pad_for_cudagraph(
1301
+ num_scheduled_tokens)
1302
+ else:
1303
+ # Eager mode.
1304
+ # Pad tokens to a multiple of tensor_parallel_size when
1305
+ # collective fusion is enabled for sequence parallelism (SP)
1306
+ tp_size = self.vllm_config.parallel_config.tensor_parallel_size
1307
+ if self.compilation_config.pass_config. \
1308
+ enable_sequence_parallelism and tp_size > 1:
1309
+ num_input_tokens = round_up(num_scheduled_tokens, tp_size)
1310
+ else:
1311
+ num_input_tokens = num_scheduled_tokens
1312
+
1313
+ # Padding for DP
1314
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
1315
+ num_input_tokens += num_pad
1316
+
1317
+ # _prepare_inputs may reorder the batch, so we must gather the
1318
+ # multimodal outputs after that to ensure the correct order
1319
+ if self.is_multimodal_model:
1320
+ # Run the multimodal encoder if any.
1321
+ self._execute_mm_encoder(scheduler_output)
1322
+ mm_embeds = self._gather_mm_embeddings(scheduler_output)
1323
+ else:
1324
+ mm_embeds = []
1325
+
1326
+ if self.is_multimodal_model and get_pp_group().is_first_rank:
1327
+ # NOTE(woosuk): To unify token ids and soft tokens (vision
1328
+ # embeddings), we always use embeddings (rather than token ids)
1329
+ # as input to the multimodal model, even when the input is text.
1330
+ input_ids = self.input_ids[:num_scheduled_tokens]
1331
+ if mm_embeds:
1332
+ inputs_embeds = self.model.get_input_embeddings(
1333
+ input_ids, mm_embeds)
1334
+ else:
1335
+ inputs_embeds = self.model.get_input_embeddings(input_ids)
1336
+ # TODO(woosuk): Avoid the copy. Optimize.
1337
+ self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
1338
+ inputs_embeds = self.inputs_embeds[:num_input_tokens]
1339
+ input_ids = None
1340
+ else:
1341
+ # For text-only models, we use token ids as input.
1342
+ # While it is possible to use embeddings as input just like the
1343
+ # multimodal models, it is not desirable for performance since
1344
+ # then the embedding layer is not included in the CUDA graph.
1345
+ input_ids = self.input_ids[:num_input_tokens]
1346
+ inputs_embeds = None
1347
+ if self.uses_mrope:
1348
+ positions = self.mrope_positions[:, :num_input_tokens]
1349
+ else:
1350
+ positions = self.positions[:num_input_tokens]
1351
+
1352
+ if get_pp_group().is_first_rank:
1353
+ intermediate_tensors = None
1354
+ else:
1355
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
1356
+ num_input_tokens, intermediate_tensors, True)
1357
+
1358
+ # Some attention backends only support CUDA Graphs in pure decode.
1359
+ # If attention doesn't support CUDA Graphs for this batch, but we
1360
+ # compiled with full CUDA graphs, we have to skip them entirely.
1361
+ skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
1362
+
1363
+ # Run the model.
1364
+ # Use persistent buffers for CUDA graphs.
1365
+ with set_forward_context(
1366
+ attn_metadata,
1367
+ self.vllm_config,
1368
+ num_tokens=num_input_tokens,
1369
+ num_tokens_across_dp=num_tokens_across_dp,
1370
+ skip_cuda_graphs=skip_cuda_graphs,
1371
+ ):
1372
+ self.maybe_setup_kv_connector(scheduler_output)
1373
+
1374
+ model_output = self.model(
1375
+ input_ids=input_ids,
1376
+ positions=positions,
1377
+ intermediate_tensors=intermediate_tensors,
1378
+ inputs_embeds=inputs_embeds,
1379
+ )
1380
+
1381
+ self.maybe_wait_for_kv_save()
1382
+ finished_sending, finished_recving = (
1383
+ self.get_finished_kv_transfers(scheduler_output))
1384
+
1385
+ if self.use_aux_hidden_state_outputs:
1386
+ hidden_states, aux_hidden_states = model_output
1387
+ else:
1388
+ hidden_states = model_output
1389
+ aux_hidden_states = None
1390
+
1391
+ # Broadcast PP output for external_launcher (torchrun)
1392
+ # to make sure we are synced across pp ranks
1393
+ # TODO: Support overlapping micro-batches
1394
+ # https://github.com/vllm-project/vllm/issues/18019
1395
+ broadcast_pp_output = \
1396
+ self.parallel_config.distributed_executor_backend \
1397
+ == "external_launcher" and len(get_pp_group().ranks) > 0
1398
+ if not get_pp_group().is_last_rank:
1399
+ # For mid-pipeline stages, return the hidden states.
1400
+ if not broadcast_pp_output:
1401
+ return hidden_states
1402
+ assert isinstance(hidden_states, IntermediateTensors)
1403
+ get_pp_group().send_tensor_dict(hidden_states.tensors,
1404
+ all_gather_group=get_tp_group())
1405
+ logits = None
1406
+ else:
1407
+ if self.input_batch.pooling_params:
1408
+ return self._pool(hidden_states, num_scheduled_tokens,
1409
+ num_scheduled_tokens_np, finished_sending,
1410
+ finished_recving)
1411
+
1412
+ sample_hidden_states = hidden_states[logits_indices]
1413
+ logits = self.model.compute_logits(sample_hidden_states, None)
1414
+ if broadcast_pp_output:
1415
+ model_output_broadcast_data = {
1416
+ "logits": logits.contiguous(),
1417
+ } if logits is not None else {}
1418
+ model_output_broadcast_data = get_pp_group().broadcast_tensor_dict(
1419
+ model_output_broadcast_data, src=len(get_pp_group().ranks) - 1)
1420
+ assert model_output_broadcast_data is not None
1421
+ logits = model_output_broadcast_data["logits"]
1422
+
1423
+ # Apply structured output bitmasks if present
1424
+ if scheduler_output.grammar_bitmask is not None:
1425
+ self.apply_grammar_bitmask(scheduler_output, logits)
1426
+
1427
+ # Sample the next token and get logprobs if needed.
1428
+ sampling_metadata = self.input_batch.sampling_metadata
1429
+ if spec_decode_metadata is None:
1430
+ sampler_output = self.sampler(
1431
+ logits=logits,
1432
+ sampling_metadata=sampling_metadata,
1433
+ )
1434
+ else:
1435
+ # When indexing with a tensor (bonus_logits_indices), PyTorch
1436
+ # creates a new tensor with separate storage from the original
1437
+ # logits tensor. This means any in-place operations on bonus_logits
1438
+ # won't affect the original logits tensor.
1439
+ assert logits is not None
1440
+ bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
1441
+ sampler_output = self.sampler(
1442
+ logits=bonus_logits,
1443
+ sampling_metadata=sampling_metadata,
1444
+ )
1445
+ bonus_token_ids = sampler_output.sampled_token_ids
1446
+
1447
+ # Just like `bonus_logits`, `target_logits` is a new tensor with
1448
+ # separate storage from the original `logits` tensor. Therefore,
1449
+ # it is safe to update `target_logits` in place.
1450
+ target_logits = logits[spec_decode_metadata.target_logits_indices]
1451
+ output_token_ids = self.rejection_sampler(
1452
+ spec_decode_metadata,
1453
+ None, # draft_probs
1454
+ target_logits,
1455
+ bonus_token_ids,
1456
+ sampling_metadata,
1457
+ )
1458
+ sampler_output.sampled_token_ids = output_token_ids
1459
+
1460
+ num_nans_in_logits = {}
1461
+ if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
1462
+ num_nans_in_logits = self._get_nans_in_logits(logits)
1463
+
1464
+ # TODO(woosuk): The following loop can be slow since it iterates over
1465
+ # the requests one by one. Optimize.
1466
+ discard_sampled_tokens_req_indices = []
1467
+ for i, req_id in enumerate(self.input_batch.req_ids):
1468
+ req_state = self.requests[req_id]
1469
+ seq_len = (req_state.num_computed_tokens +
1470
+ scheduler_output.num_scheduled_tokens[req_id])
1471
+ if seq_len < req_state.num_tokens:
1472
+ # Ignore the sampled token for partial prefills.
1473
+ # Rewind the generator state as if the token was not sampled.
1474
+ # This relies on cuda-specific torch-internal impl details
1475
+ generator = self.input_batch.generators.get(i)
1476
+ if generator is not None:
1477
+ generator.set_offset(generator.get_offset() - 4)
1478
+ # Record the index of the request that should not be sampled,
1479
+ # so that we could clear the sampled tokens before returning.
1480
+ discard_sampled_tokens_req_indices.append(i)
1481
+
1482
+ # NOTE: GPU -> CPU Sync happens here.
1483
+ # Move as many CPU operations as possible before this sync point.
1484
+ logprobs_tensors = sampler_output.logprobs_tensors
1485
+ logprobs_lists = logprobs_tensors.tolists() \
1486
+ if logprobs_tensors is not None else None
1487
+
1488
+ # Compute prompt logprobs if needed.
1489
+ prompt_logprobs_dict = self._get_prompt_logprobs_dict(
1490
+ hidden_states[:num_scheduled_tokens],
1491
+ scheduler_output,
1492
+ )
1493
+
1494
+ # Get the valid generated tokens.
1495
+ sampled_token_ids = sampler_output.sampled_token_ids
1496
+ max_gen_len = sampled_token_ids.shape[-1]
1497
+ if max_gen_len == 1:
1498
+ # No spec decode tokens.
1499
+ valid_sampled_token_ids = sampled_token_ids.tolist()
1500
+ else:
1501
+ # Includes spec decode tokens.
1502
+ valid_sampled_token_ids = self.rejection_sampler.parse_output(
1503
+ sampled_token_ids,
1504
+ self.input_batch.vocab_size,
1505
+ )
1506
+ # Mask out the sampled tokens that should not be sampled.
1507
+ for i in discard_sampled_tokens_req_indices:
1508
+ valid_sampled_token_ids[i].clear()
1509
+
1510
+ # Cache the sampled tokens in the model runner, so that the scheduler
1511
+ # doesn't need to send them back.
1512
+ # NOTE(woosuk): As an exception, when using PP, the scheduler sends
1513
+ # the sampled tokens back, because there's no direct communication
1514
+ # between the first-stage worker and the last-stage worker.
1515
+ for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
1516
+ if not sampled_ids:
1517
+ continue
1518
+
1519
+ start_idx = self.input_batch.num_tokens_no_spec[req_idx]
1520
+ end_idx = start_idx + len(sampled_ids)
1521
+ assert end_idx <= self.max_model_len, (
1522
+ "Sampled token IDs exceed the max model length. "
1523
+ f"Total number of tokens: {end_idx} > max_model_len: "
1524
+ f"{self.max_model_len}")
1525
+
1526
+ self.input_batch.token_ids_cpu[req_idx,
1527
+ start_idx:end_idx] = sampled_ids
1528
+ self.input_batch.num_tokens_no_spec[req_idx] = end_idx
1529
+ self.input_batch.num_tokens[req_idx] = end_idx
1530
+ req_id = self.input_batch.req_ids[req_idx]
1531
+ req_state = self.requests[req_id]
1532
+ req_state.output_token_ids.extend(sampled_ids)
1533
+
1534
+ if not self.speculative_config:
1535
+ # Speculative decoding is not enabled.
1536
+ spec_token_ids = None
1537
+ else:
1538
+ spec_token_ids = self.propose_draft_token_ids(
1539
+ scheduler_output,
1540
+ valid_sampled_token_ids,
1541
+ sampling_metadata,
1542
+ hidden_states,
1543
+ sample_hidden_states,
1544
+ aux_hidden_states,
1545
+ spec_decode_metadata,
1546
+ attn_metadata,
1547
+ )
1548
+
1549
+ # Clear KVConnector state after all KVs are generated.
1550
+ if has_kv_transfer_group():
1551
+ get_kv_transfer_group().clear_connector_metadata()
1552
+
1553
+ self.eplb_step()
1554
+
1555
+ return ModelRunnerOutput(
1556
+ req_ids=self.input_batch.req_ids,
1557
+ req_id_to_index=self.input_batch.req_id_to_index,
1558
+ sampled_token_ids=valid_sampled_token_ids,
1559
+ spec_token_ids=spec_token_ids,
1560
+ logprobs=logprobs_lists,
1561
+ prompt_logprobs_dict=prompt_logprobs_dict,
1562
+ pooler_output=[],
1563
+ finished_sending=finished_sending,
1564
+ finished_recving=finished_recving,
1565
+ num_nans_in_logits=num_nans_in_logits,
1566
+ )
1567
+
1568
+ def propose_draft_token_ids(
1569
+ self,
1570
+ scheduler_output: "SchedulerOutput",
1571
+ sampled_token_ids: list[list[int]],
1572
+ sampling_metadata: SamplingMetadata,
1573
+ hidden_states: torch.Tensor,
1574
+ sample_hidden_states: torch.Tensor,
1575
+ aux_hidden_states: Optional[torch.Tensor],
1576
+ spec_decode_metadata: Optional[SpecDecodeMetadata],
1577
+ attn_metadata: dict[str, Any],
1578
+ ) -> list[list[int]]:
1579
+ num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
1580
+ if self.speculative_config.method == "ngram":
1581
+ assert isinstance(self.drafter, NgramProposer)
1582
+ spec_token_ids = self.propose_ngram_draft_token_ids(
1583
+ sampled_token_ids)
1584
+ elif self.speculative_config.method == "medusa":
1585
+ assert isinstance(self.drafter, MedusaProposer)
1586
+ if sample_hidden_states.shape[0] == len(sampled_token_ids):
1587
+ # The input to the target model does not include draft tokens.
1588
+ hidden_states = sample_hidden_states
1589
+ else:
1590
+ indices = []
1591
+ offset = 0
1592
+ for num_draft, tokens in zip(
1593
+ spec_decode_metadata.num_draft_tokens,
1594
+ sampled_token_ids):
1595
+ indices.append(offset + len(tokens) - 1)
1596
+ offset += num_draft + 1
1597
+ indices = torch.tensor(indices, device=self.device)
1598
+ hidden_states = sample_hidden_states[indices]
1599
+
1600
+ spec_token_ids = self.drafter.propose(
1601
+ target_hidden_states=hidden_states,
1602
+ sampling_metadata=sampling_metadata,
1603
+ )
1604
+ elif self.speculative_config.use_eagle():
1605
+ assert isinstance(self.drafter, EagleProposer)
1606
+ # TODO(woosuk): Refactor the loop.
1607
+ next_token_ids: list[int] = []
1608
+ for i, token_ids in enumerate(sampled_token_ids):
1609
+ if token_ids:
1610
+ # Common case.
1611
+ next_token_id = token_ids[-1]
1612
+ else:
1613
+ # Partial prefill (rare case).
1614
+ # Get the next token id from the request state.
1615
+ req_id = self.input_batch.req_ids[i]
1616
+ req_state = self.requests[req_id]
1617
+ seq_len = (req_state.num_computed_tokens +
1618
+ scheduler_output.num_scheduled_tokens[req_id])
1619
+ next_token_id = req_state.get_token_id(seq_len)
1620
+ next_token_ids.append(next_token_id)
1621
+ next_token_ids = torch.tensor(next_token_ids,
1622
+ dtype=torch.int32,
1623
+ device=self.device)
1624
+ # At this moment, we assume all eagle layers belong to the same KV
1625
+ # cache group, thus using the same attention metadata.
1626
+ eagle_attn_metadata = attn_metadata[
1627
+ self.drafter.attn_layer_names[0]]
1628
+
1629
+ # NOTE: deepseek_mtp uses MLA which does not have `block_table`
1630
+ if hasattr(eagle_attn_metadata, "block_table"):
1631
+ block_table = eagle_attn_metadata.block_table
1632
+ else:
1633
+ block_table = None
1634
+
1635
+ if spec_decode_metadata is None:
1636
+ # input_ids can be None for multimodal models.
1637
+ target_token_ids = self.input_ids[:num_scheduled_tokens]
1638
+ # TODO(woosuk): Support M-RoPE.
1639
+ target_positions = self.positions[:num_scheduled_tokens]
1640
+ if self.use_aux_hidden_state_outputs:
1641
+ target_hidden_states = torch.cat(
1642
+ [h[:num_scheduled_tokens] for h in aux_hidden_states],
1643
+ dim=-1)
1644
+ else:
1645
+ target_hidden_states = hidden_states[:num_scheduled_tokens]
1646
+ target_slot_mapping = eagle_attn_metadata.slot_mapping
1647
+ cu_num_tokens = eagle_attn_metadata.query_start_loc
1648
+ else:
1649
+ # TODO(woosuk): Refactor this.
1650
+ num_draft_tokens = spec_decode_metadata.num_draft_tokens
1651
+ num_rejected_tokens = [
1652
+ n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
1653
+ for i, n in enumerate(num_draft_tokens)
1654
+ ]
1655
+ num_rejected_tokens_tensor = async_tensor_h2d(
1656
+ num_rejected_tokens,
1657
+ dtype=torch.int32,
1658
+ target_device=self.device,
1659
+ pin_memory=True)
1660
+ num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)
1661
+ cu_num_tokens, token_indices = self.drafter.prepare_inputs(
1662
+ eagle_attn_metadata.query_start_loc,
1663
+ num_rejected_tokens_tensor,
1664
+ num_tokens,
1665
+ )
1666
+ target_token_ids = self.input_ids[token_indices]
1667
+ # TODO(woosuk): Support M-RoPE.
1668
+ target_positions = self.positions[token_indices]
1669
+ if self.use_aux_hidden_state_outputs:
1670
+ target_hidden_states = torch.cat(
1671
+ [h[token_indices] for h in aux_hidden_states], dim=-1)
1672
+ else:
1673
+ target_hidden_states = hidden_states[token_indices]
1674
+ target_slot_mapping = eagle_attn_metadata.slot_mapping[
1675
+ token_indices]
1676
+ draft_token_ids = self.drafter.propose(
1677
+ target_token_ids=target_token_ids,
1678
+ target_positions=target_positions,
1679
+ target_hidden_states=target_hidden_states,
1680
+ target_slot_mapping=target_slot_mapping,
1681
+ next_token_ids=next_token_ids,
1682
+ cu_num_tokens=cu_num_tokens,
1683
+ block_table=block_table,
1684
+ sampling_metadata=sampling_metadata,
1685
+ )
1686
+ spec_token_ids = draft_token_ids.tolist()
1687
+ return spec_token_ids
1688
+
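The rejection bookkeeping above works as follows: a request that had n draft tokens scheduled gets n + 1 verified slots (the drafts plus one bonus token), so the number of rejected drafts is n + 1 minus the number of tokens actually sampled. A minimal sketch of that arithmetic with invented numbers; only the formula mirrors the code above:

    # Hedged illustration of the rejected-draft-token count used above.
    # The token values are made up for the example.
    num_draft_tokens = [3, 0, 2]                     # drafts scheduled per request
    sampled_token_ids = [[11, 12], [7], [5, 6, 9]]   # tokens accepted (incl. bonus) per request

    num_rejected_tokens = [
        n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
        for i, n in enumerate(num_draft_tokens)
    ]
    # Request 0: 3 drafts -> 4 verified slots, 2 accepted -> 2 rejected.
    # Request 1: no drafts -> 0 rejected (pure decode step).
    # Request 2: 2 drafts -> 3 verified slots, 3 accepted -> 0 rejected.
    assert num_rejected_tokens == [2, 0, 0]
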
1689
+ def kv_connector_no_forward(
1690
+ self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
1691
+ # Still run KV send/recv even though there is no forward pass to execute.
1692
+ with set_forward_context(None, self.vllm_config):
1693
+ self.maybe_setup_kv_connector(scheduler_output)
1694
+ finished_sending, finished_recving = (
1695
+ self.get_finished_kv_transfers(scheduler_output))
1696
+
1697
+ if not finished_sending and not finished_recving:
1698
+ return EMPTY_MODEL_RUNNER_OUTPUT
1699
+
1700
+ output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
1701
+ output.finished_sending = finished_sending
1702
+ output.finished_recving = finished_recving
1703
+ return output
1704
+
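`kv_connector_no_forward` above shallow-copies the module-level EMPTY_MODEL_RUNNER_OUTPUT sentinel before attaching the finished-transfer sets, so the shared constant itself is never mutated. A minimal standalone sketch of that pattern, using a hypothetical stand-in dataclass rather than the real ModelRunnerOutput:

    import copy
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class _Output:  # hypothetical stand-in for ModelRunnerOutput
        finished_sending: Optional[set] = None
        finished_recving: Optional[set] = None

    _EMPTY = _Output()  # shared sentinel, must stay pristine

    def no_forward_result(sending, recving):
        if not sending and not recving:
            return _EMPTY              # fast path: return the sentinel unchanged
        out = copy.copy(_EMPTY)        # shallow copy before mutating
        out.finished_sending = sending
        out.finished_recving = recving
        return out

    result = no_forward_result({"req-1"}, None)
    assert result.finished_sending == {"req-1"}
    assert _EMPTY.finished_sending is None   # sentinel untouched
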
1705
+ @staticmethod
1706
+ def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
1707
+ # Bind the KVConnector metadata from the scheduler output before the forward pass.
1708
+ if has_kv_transfer_group():
1709
+ kv_connector = get_kv_transfer_group()
1710
+ assert isinstance(kv_connector, KVConnectorBase_V1)
1711
+ assert scheduler_output.kv_connector_metadata is not None
1712
+ kv_connector.bind_connector_metadata(
1713
+ scheduler_output.kv_connector_metadata)
1714
+
1715
+ # Background KV cache transfers happen here.
1716
+ # These transfers are designed to be async and the requests
1717
+ # involved may be disjoint from the running requests.
1718
+ # Do this here to save a collective_rpc.
1719
+ kv_connector.start_load_kv(get_forward_context())
1720
+
1721
+ @staticmethod
1722
+ def maybe_wait_for_kv_save() -> None:
1723
+ if has_kv_transfer_group():
1724
+ get_kv_transfer_group().wait_for_save()
1725
+
1726
+ @staticmethod
1727
+ def get_finished_kv_transfers(
1728
+ scheduler_output: "SchedulerOutput",
1729
+ ) -> tuple[Optional[set[str]], Optional[set[str]]]:
1730
+ if has_kv_transfer_group():
1731
+ return get_kv_transfer_group().get_finished(
1732
+ scheduler_output.finished_req_ids)
1733
+ return None, None
1734
+
1735
+ def propose_ngram_draft_token_ids(
1736
+ self,
1737
+ sampled_token_ids: list[list[int]],
1738
+ ) -> list[list[int]]:
1739
+ # TODO(woosuk): Optimize.
1740
+ draft_token_ids: list[list[int]] = []
1741
+ for i, sampled_ids in enumerate(sampled_token_ids):
1742
+ num_sampled_ids = len(sampled_ids)
1743
+ if not num_sampled_ids:
1744
+ # Skip speculative decoding.
1745
+ draft_token_ids.append([])
1746
+ continue
1747
+
1748
+ # Skip requests that require sampling parameters that are not
1749
+ # supported with speculative decoding.
1750
+ req_id = self.input_batch.req_ids[i]
1751
+ if req_id in self.input_batch.spec_decode_unsupported_reqs:
1752
+ draft_token_ids.append([])
1753
+ continue
1754
+
1755
+ num_tokens = self.input_batch.num_tokens_no_spec[i]
1756
+ if num_tokens >= self.max_model_len:
1757
+ # Skip requests that have already reached the max model length.
1758
+ draft_token_ids.append([])
1759
+ continue
1760
+
1761
+ drafter_output = self.drafter.propose(
1762
+ self.input_batch.token_ids_cpu[i, :num_tokens])
1763
+ if drafter_output is None or len(drafter_output) == 0:
1764
+ draft_token_ids.append([])
1765
+ else:
1766
+ draft_token_ids.append(drafter_output.tolist())
1767
+ return draft_token_ids
1768
+
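`propose_ngram_draft_token_ids` above delegates the actual proposal to `self.drafter.propose(...)` over the request's CPU token history. As a rough illustration of what an n-gram style drafter can do (a generic prompt-lookup sketch, not the package's NgramProposer implementation), one can search for the most recent earlier occurrence of the trailing n-gram and propose the tokens that followed it:

    import numpy as np

    def ngram_propose(token_ids: np.ndarray, n: int = 2, k: int = 3):
        """Toy prompt-lookup drafter: if the last n tokens appeared earlier,
        propose up to k tokens that followed that earlier occurrence."""
        if len(token_ids) <= n:
            return []
        tail = token_ids[-n:]
        # Scan backwards over earlier positions for a matching n-gram.
        for start in range(len(token_ids) - n - 1, -1, -1):
            if np.array_equal(token_ids[start:start + n], tail):
                follow = token_ids[start + n:start + n + k]
                return follow.tolist()
        return []

    history = np.array([5, 8, 2, 3, 9, 4, 8, 2])
    print(ngram_propose(history))  # -> [3, 9, 4]: tokens that followed the earlier "8, 2"
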
1769
+ def load_model(self) -> None:
1770
+ logger.info("Starting to load model %s...", self.model_config.model)
1771
+ with DeviceMemoryProfiler() as m: # noqa: SIM117
1772
+ time_before_load = time.perf_counter()
1773
+ model_loader = get_model_loader(self.load_config)
1774
+ if not hasattr(self, "model"):
1775
+ logger.info("Loading model from scratch...")
1776
+ self.model = model_loader.load_model(
1777
+ vllm_config=self.vllm_config,
1778
+ model_config=self.model_config)
1779
+ else:
1780
+ logger.info(
1781
+ "Model was already initialized. Loading weights inplace..."
1782
+ )
1783
+ model_loader.load_weights(self.model,
1784
+ model_config=self.model_config)
1785
+ if has_step_pooler(self.model):
1786
+ self.input_batch.logits_processing_needs_token_ids = True
1787
+ if self.lora_config:
1788
+ self.model = self.load_lora_model(self.model,
1789
+ self.model_config,
1790
+ self.scheduler_config,
1791
+ self.lora_config,
1792
+ self.device)
1793
+ if hasattr(self, "drafter"):
1794
+ logger.info("Loading drafter model...")
1795
+ self.drafter.load_model(self.model)
1796
+ if self.use_aux_hidden_state_outputs:
1797
+ self.model.set_aux_hidden_state_layers(
1798
+ self.model.get_eagle3_aux_hidden_state_layers())
1799
+ time_after_load = time.perf_counter()
1800
+ self.model_memory_usage = m.consumed_memory
1801
+ logger.info("Model loading took %.4f GiB and %.6f seconds",
1802
+ self.model_memory_usage / GiB_bytes,
1803
+ time_after_load - time_before_load)
1804
+ prepare_communication_buffer_for_model(self.model)
1805
+
1806
+ if is_mixture_of_experts(
1807
+ self.model) and self.parallel_config.enable_eplb:
1808
+ logger.info("EPLB is enabled for model %s.",
1809
+ self.model_config.model)
1810
+ self.eplb_state = EplbState.build(
1811
+ self.model,
1812
+ self.device,
1813
+ self.parallel_config,
1814
+ )
1815
+
1816
+ def save_tensorized_model(
1817
+ self,
1818
+ tensorizer_config: "TensorizerConfig",
1819
+ ) -> None:
1820
+ TensorizerLoader.save_model(
1821
+ self.model,
1822
+ tensorizer_config=tensorizer_config,
1823
+ )
1824
+
1825
+ def _get_prompt_logprobs_dict(
1826
+ self,
1827
+ hidden_states: torch.Tensor,
1828
+ scheduler_output: "SchedulerOutput",
1829
+ ) -> dict[str, Optional[LogprobsTensors]]:
1830
+ num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
1831
+ if not num_prompt_logprobs_dict:
1832
+ return {}
1833
+
1834
+ in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
1835
+ prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
1836
+
1837
+ # Since prompt logprobs are a rare feature, prioritize a simple,
1838
+ # maintainable loop over optimal performance.
1839
+ completed_prefill_reqs = []
1840
+ for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items():
1841
+
1842
+ num_tokens = scheduler_output.num_scheduled_tokens[req_id]
1843
+
1844
+ # Get metadata for this request.
1845
+ request = self.requests[req_id]
1846
+ num_prompt_tokens = len(request.prompt_token_ids)
1847
+ prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
1848
+ self.device, non_blocking=True)
1849
+
1850
+ # Set up target LogprobsTensors object.
1851
+ logprobs_tensors = in_progress_dict.get(req_id)
1852
+ if not logprobs_tensors:
1853
+ # Create empty logprobs CPU tensors for the entire prompt.
1854
+ # If chunked, we'll copy in slice by slice.
1855
+ logprobs_tensors = LogprobsTensors.empty_cpu(
1856
+ num_prompt_tokens - 1, num_prompt_logprobs + 1)
1857
+ in_progress_dict[req_id] = logprobs_tensors
1858
+
1859
+ # Determine number of logits to retrieve.
1860
+ start_idx = request.num_computed_tokens
1861
+ start_tok = start_idx + 1
1862
+ num_remaining_tokens = num_prompt_tokens - start_tok
1863
+ if num_tokens <= num_remaining_tokens:
1864
+ # This is a chunk; more tokens remain.
1865
+ # In the == case, there are no more prompt logprobs to produce
1866
+ # but we want to defer returning them to the next step where we
1867
+ # have new generated tokens to return.
1868
+ num_logits = num_tokens
1869
+ else:
1870
+ # This is the last chunk of prompt tokens to return.
1871
+ num_logits = num_remaining_tokens
1872
+ completed_prefill_reqs.append(req_id)
1873
+ prompt_logprobs_dict[req_id] = logprobs_tensors
1874
+
1875
+ if num_logits <= 0:
1876
+ # This can happen for the final chunk if we prefilled exactly
1877
+ # (num_prompt_tokens - 1) tokens for this request in the prior
1878
+ # step. There are no more prompt logprobs to produce.
1879
+ continue
1880
+
1881
+ # Get the logits corresponding to this req's prompt tokens.
1882
+ # If this is a partial request (i.e. chunked prefill),
1883
+ # then a prompt logprob is generated for each index.
1884
+ req_idx = self.input_batch.req_id_to_index[req_id]
1885
+ offset = self.query_start_loc_np[req_idx].item()
1886
+ prompt_hidden_states = hidden_states[offset:offset + num_logits]
1887
+ logits = self.model.compute_logits(prompt_hidden_states, None)
1888
+
1889
+ # Get the "target" tokens for each index. For prompt at index i,
1890
+ # the token at prompt index i+1 is the "sampled" token we want
1891
+ # to gather the logprob for.
1892
+ tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits]
1893
+
1894
+ # Compute prompt logprobs.
1895
+ logprobs = self.sampler.compute_logprobs(logits)
1896
+ token_ids, logprobs, ranks = self.sampler.gather_logprobs(
1897
+ logprobs, num_prompt_logprobs, tgt_token_ids)
1898
+
1899
+ # Transfer GPU->CPU async.
1900
+ chunk_slice = slice(start_idx, start_idx + num_logits)
1901
+ logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
1902
+ token_ids, non_blocking=True)
1903
+ logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
1904
+ non_blocking=True)
1905
+ logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
1906
+ ranks, non_blocking=True)
1907
+
1908
+ # Remove requests that have completed prefill from the batch
1909
+ # num_prompt_logprobs_dict.
1910
+ for req_id in completed_prefill_reqs:
1911
+ del num_prompt_logprobs_dict[req_id]
1912
+ del in_progress_dict[req_id]
1913
+
1914
+ # Must synchronize the non-blocking GPU->CPU transfers.
1915
+ if prompt_logprobs_dict:
1916
+ self._sync_device()
1917
+
1918
+ return prompt_logprobs_dict
1919
+
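The chunked prompt-logprob path above is built around an off-by-one shift: the logits at prompt position i are scored against the token at position i + 1 (hence start_tok = start_idx + 1). A small self-contained sketch of that shifted gather, independent of the sampler helpers used in the method:

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    vocab, num_prompt_tokens = 10, 5
    prompt_token_ids = torch.randint(0, vocab, (num_prompt_tokens,))

    # Logits for positions 0..n-2 predict the tokens at positions 1..n-1.
    logits = torch.randn(num_prompt_tokens - 1, vocab)
    logprobs = F.log_softmax(logits, dim=-1)

    tgt_token_ids = prompt_token_ids[1:]          # the "start_tok" shift
    tgt_logprobs = logprobs.gather(-1, tgt_token_ids.unsqueeze(-1)).squeeze(-1)
    print(tgt_logprobs.shape)  # torch.Size([4]): one logprob per prompt token after the first
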
1920
+ def _get_nans_in_logits(
1921
+ self,
1922
+ logits: Optional[torch.Tensor],
1923
+ ) -> dict[str, int]:
1924
+ try:
1925
+ if logits is None:
1926
+ return {req_id: 0 for req_id in self.input_batch.req_ids}
1927
+
1928
+ num_nans_in_logits = {}
1929
+ num_nans_for_index = logits.isnan().sum(dim=-1).cpu().numpy()
1930
+ for req_id in self.input_batch.req_ids:
1931
+ req_index = self.input_batch.req_id_to_index[req_id]
1932
+ num_nans_in_logits[req_id] = (
1933
+ int(num_nans_for_index[req_index])
1934
+ if num_nans_for_index is not None
1935
+ and req_index < logits.shape[0] else 0)
1936
+ return num_nans_in_logits
1937
+ except IndexError:
1938
+ return {}
1939
+
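`_get_nans_in_logits` above reduces the per-token logits to a per-request NaN count via `logits.isnan().sum(dim=-1)`. A minimal reproduction of that reduction on a toy tensor:

    import torch

    logits = torch.tensor([
        [0.1, float("nan"), 0.3],            # request 0: one NaN
        [0.5, 0.6, 0.7],                     # request 1: clean
        [float("nan"), float("nan"), 1.0],   # request 2: two NaNs
    ])
    num_nans_for_index = logits.isnan().sum(dim=-1).cpu().numpy()
    print(num_nans_for_index.tolist())  # [1, 0, 2]
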
1940
+ @contextmanager
1941
+ def maybe_randomize_inputs(self, input_ids: torch.Tensor):
1942
+ """
1943
+ Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
1944
+ This is to help balance expert-selection
1945
+ - during profile_run
1946
+ - during DP rank dummy run
1947
+ """
1948
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
1949
+ randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
1950
+ if not randomize_inputs:
1951
+ yield
1952
+ else:
1953
+ import functools
1954
+
1955
+ @functools.cache
1956
+ def rand_input_ids() -> torch.Tensor:
1957
+ return torch.randint_like(
1958
+ self.input_ids,
1959
+ low=0,
1960
+ high=self.model_config.get_vocab_size(),
1961
+ dtype=input_ids.dtype)
1962
+
1963
+ logger.debug("Randomizing dummy data for DP Rank")
1964
+ input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
1965
+ non_blocking=True)
1966
+ yield
1967
+ input_ids.fill_(0)
1968
+
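`maybe_randomize_inputs` above temporarily fills the persistent input_ids buffer with cached random token ids (so data-parallel ranks running dummy batches do not all select the same experts) and zeroes the buffer again on exit. A cut-down sketch of the same context-manager pattern, with a plain CPU buffer and an invented vocabulary size standing in for the runner's state:

    import functools
    from contextlib import contextmanager
    import torch

    buffer = torch.zeros(16, dtype=torch.int64)   # stand-in for the persistent input_ids buffer
    VOCAB_SIZE = 32000                            # illustrative value

    @functools.cache
    def rand_input_ids() -> torch.Tensor:
        # Generated once and reused, mirroring the functools.cache trick above.
        return torch.randint_like(buffer, low=0, high=VOCAB_SIZE)

    @contextmanager
    def randomize(input_ids: torch.Tensor):
        input_ids.copy_(rand_input_ids()[:input_ids.size(0)])
        try:
            yield
        finally:
            input_ids.fill_(0)   # restore the zeroed buffer afterwards

    with randomize(buffer[:8]):
        assert buffer[:8].equal(rand_input_ids()[:8])   # randomized slice in use
    assert int(buffer.sum()) == 0                       # zeroed again on exit
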
1969
+ @torch.inference_mode()
1970
+ def _dummy_run(
1971
+ self,
1972
+ num_tokens: int,
1973
+ capture_attn_cudagraph: bool = False,
1974
+ skip_eplb: bool = False,
1975
+ is_profile: bool = False,
1976
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1977
+
1978
+ # Padding for DP
1979
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
1980
+ num_tokens += num_pad
1981
+
1982
+ # Set num_scheduled_tokens based on num_tokens and max_num_seqs
1983
+ # for dummy run with LoRA so that the num_reqs collectively
1984
+ # has num_tokens in total.
1985
+ assert num_tokens <= self.scheduler_config.max_num_batched_tokens
1986
+ max_num_reqs = self.scheduler_config.max_num_seqs
1987
+ num_reqs = min(num_tokens, max_num_reqs)
1988
+ min_tokens_per_req = num_tokens // num_reqs
1989
+ num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
1990
+ num_scheduled_tokens_list[-1] += num_tokens % num_reqs
1991
+ assert sum(num_scheduled_tokens_list) == num_tokens
1992
+ assert len(num_scheduled_tokens_list) == num_reqs
1993
+ num_scheduled_tokens = np.array(num_scheduled_tokens_list,
1994
+ dtype=np.int32)
1995
+
1996
+ attn_metadata: Optional[dict[str, Any]] = None
1997
+ if capture_attn_cudagraph:
1998
+ attn_metadata = {}
1999
+
2000
+ query_start_loc = self.query_start_loc[:num_reqs + 1]
2001
+ # Make sure max_model_len is used at the graph capture time.
2002
+ self.seq_lens_np[:num_reqs] = self.max_model_len
2003
+ self.seq_lens_np[num_reqs:] = 0
2004
+ self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
2005
+ non_blocking=True)
2006
+ seq_lens = self.seq_lens[:num_reqs]
2007
+
2008
+ common_attn_metadata = CommonAttentionMetadata(
2009
+ query_start_loc=query_start_loc,
2010
+ seq_lens=seq_lens,
2011
+ num_reqs=num_reqs,
2012
+ num_actual_tokens=num_tokens,
2013
+ max_query_len=num_tokens,
2014
+ )
2015
+
2016
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
2017
+ self.kv_cache_config.kv_cache_groups):
2018
+
2019
+ attn_metadata_i = self.attn_metadata_builders[
2020
+ kv_cache_group_id].build_for_cudagraph_capture(
2021
+ common_attn_metadata)
2022
+ for layer_name in kv_cache_group_spec.layer_names:
2023
+ attn_metadata[layer_name] = attn_metadata_i
2024
+
2025
+ with self.maybe_dummy_run_with_lora(self.lora_config,
2026
+ num_scheduled_tokens):
2027
+ model = self.model
2028
+ if self.is_multimodal_model:
2029
+ input_ids = None
2030
+ inputs_embeds = self.inputs_embeds[:num_tokens]
2031
+ else:
2032
+ input_ids = self.input_ids[:num_tokens]
2033
+ inputs_embeds = None
2034
+ if self.uses_mrope:
2035
+ positions = self.mrope_positions[:, :num_tokens]
2036
+ else:
2037
+ positions = self.positions[:num_tokens]
2038
+
2039
+ if get_pp_group().is_first_rank:
2040
+ intermediate_tensors = None
2041
+ else:
2042
+ if self.intermediate_tensors is None:
2043
+ self.intermediate_tensors = (
2044
+ self.model.make_empty_intermediate_tensors(
2045
+ batch_size=self.max_num_tokens,
2046
+ dtype=self.model_config.dtype,
2047
+ device=self.device))
2048
+
2049
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
2050
+ num_tokens, None, False)
2051
+
2052
+ with self.maybe_randomize_inputs(input_ids), set_forward_context(
2053
+ attn_metadata,
2054
+ self.vllm_config,
2055
+ num_tokens=num_tokens,
2056
+ num_tokens_across_dp=num_tokens_across_dp):
2057
+ outputs = model(
2058
+ input_ids=input_ids,
2059
+ positions=positions,
2060
+ intermediate_tensors=intermediate_tensors,
2061
+ inputs_embeds=inputs_embeds,
2062
+ )
2063
+ if self.use_aux_hidden_state_outputs:
2064
+ hidden_states, _ = outputs
2065
+ else:
2066
+ hidden_states = outputs
2067
+
2068
+ if self.speculative_config and self.speculative_config.use_eagle():
2069
+ assert isinstance(self.drafter, EagleProposer)
2070
+ self.drafter.dummy_run(num_tokens)
2071
+
2072
+ # This is necessary to avoid blocking DP.
2073
+ # For dummy runs, we typically skip EPLB since we don't have any real
2074
+ # requests to process.
2075
+ # However, in DP settings, there may be cases when some DP ranks do
2076
+ # not have any requests to process, so they're executing dummy batches.
2077
+ # In such cases, we still have to trigger EPLB to make sure
2078
+ # ranks execute the rearrangement in synchronization.
2079
+ if not skip_eplb:
2080
+ self.eplb_step(is_dummy=True, is_profile=is_profile)
2081
+
2082
+ logit_indices = np.cumsum(num_scheduled_tokens) - 1
2083
+ return hidden_states, hidden_states[logit_indices]
2084
+
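`_dummy_run` above spreads num_tokens over at most max_num_seqs dummy requests by giving every request the floor of the average and adding the remainder to the last one; `np.cumsum(num_scheduled_tokens) - 1` then gives the index of each request's final token, which is where logits are read. A standalone sketch of both steps with illustrative numbers:

    import numpy as np

    num_tokens, max_num_reqs = 35, 8
    num_reqs = min(num_tokens, max_num_reqs)

    min_tokens_per_req = num_tokens // num_reqs
    num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
    num_scheduled_tokens_list[-1] += num_tokens % num_reqs
    assert sum(num_scheduled_tokens_list) == num_tokens   # [4, 4, 4, 4, 4, 4, 4, 7]

    num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
    logit_indices = np.cumsum(num_scheduled_tokens) - 1
    print(logit_indices.tolist())  # [3, 7, 11, 15, 19, 23, 27, 34]: last-token index per request
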
2085
+ @torch.inference_mode()
2086
+ def _dummy_sampler_run(
2087
+ self,
2088
+ hidden_states: torch.Tensor,
2089
+ ) -> torch.Tensor:
2090
+ # The dummy hidden states may contain special values,
2091
+ # like `inf` or `nan`.
2092
+ # To avoid breaking the sampler, we use a random tensor here instead.
2093
+ hidden_states = torch.rand_like(hidden_states)
2094
+
2095
+ logits = self.model.compute_logits(hidden_states, None)
2096
+ num_reqs = logits.size(0)
2097
+
2098
+ dummy_tensors = lambda v: torch.full(
2099
+ (num_reqs, ), v, device=self.device)
2100
+
2101
+ dummy_metadata = SamplingMetadata(
2102
+ temperature=dummy_tensors(0.5),
2103
+ all_greedy=False,
2104
+ all_random=False,
2105
+ top_p=dummy_tensors(0.9),
2106
+ top_k=dummy_tensors(logits.size(1) - 1),
2107
+ generators={},
2108
+ max_num_logprobs=None,
2109
+ no_penalties=True,
2110
+ prompt_token_ids=None,
2111
+ frequency_penalties=dummy_tensors(0.1),
2112
+ presence_penalties=dummy_tensors(0.1),
2113
+ repetition_penalties=dummy_tensors(0.1),
2114
+ output_token_ids=[[] for _ in range(num_reqs)],
2115
+ allowed_token_ids_mask=None,
2116
+ bad_words_token_ids={},
2117
+ logitsprocs=LogitsProcessorManager(),
2118
+ )
2119
+ try:
2120
+ sampler_output = self.sampler(logits=logits,
2121
+ sampling_metadata=dummy_metadata)
2122
+ except RuntimeError as e:
2123
+ if 'out of memory' in str(e):
2124
+ raise RuntimeError(
2125
+ "CUDA out of memory occurred when warming up sampler with "
2126
+ f"{num_reqs} dummy requests. Please try lowering "
2127
+ "`max_num_seqs` or `gpu_memory_utilization` when "
2128
+ "initializing the engine.") from e
2129
+ else:
2130
+ raise e
2131
+ if self.speculative_config:
2132
+ draft_token_ids = [[0] for _ in range(num_reqs)]
2133
+ dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
2134
+ draft_token_ids, self.device)
2135
+
2136
+ num_tokens = sum(len(ids) for ids in draft_token_ids)
2137
+ # draft_probs = torch.randn(
2138
+ # num_tokens, logits.shape[-1], device=self.device,
2139
+ # dtype=logits.dtype)
2140
+ draft_probs = None
2141
+ target_logits = torch.randn(num_tokens,
2142
+ logits.shape[-1],
2143
+ device=self.device,
2144
+ dtype=logits.dtype)
2145
+ # NOTE(woosuk): Here, we should use int32 because the sampler uses
2146
+ # int32 for bonus_token_ids. If the dtype mismatches, re-compilation
2147
+ # will occur at runtime.
2148
+ bonus_token_ids = torch.zeros(num_reqs,
2149
+ device=self.device,
2150
+ dtype=torch.int32)
2151
+ self.rejection_sampler(
2152
+ dummy_spec_decode_metadata,
2153
+ draft_probs,
2154
+ target_logits,
2155
+ bonus_token_ids,
2156
+ dummy_metadata,
2157
+ )
2158
+ return sampler_output
2159
+
2160
+ @torch.inference_mode()
2161
+ def _dummy_pooler_run(
2162
+ self,
2163
+ hidden_states: torch.Tensor,
2164
+ ) -> torch.Tensor:
2165
+
2166
+ num_tokens = hidden_states.shape[0]
2167
+ max_num_reqs = self.scheduler_config.max_num_seqs
2168
+ num_reqs = min(num_tokens, max_num_reqs)
2169
+ min_tokens_per_req = num_tokens // num_reqs
2170
+ num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
2171
+ num_scheduled_tokens_list[-1] += num_tokens % num_reqs
2172
+ assert sum(num_scheduled_tokens_list) == num_tokens
2173
+ assert len(num_scheduled_tokens_list) == num_reqs
2174
+
2175
+ hidden_states_list = list(
2176
+ torch.split(hidden_states, num_scheduled_tokens_list))
2177
+
2178
+ req_num_tokens = num_tokens // num_reqs
2179
+
2180
+ dummy_metadata = PoolingMetadata(
2181
+ prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
2182
+ device=self.device),
2183
+ prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
2184
+ dtype=torch.int32,
2185
+ device=self.device),
2186
+ pooling_params=[PoolingParams()] * num_reqs)
2187
+
2188
+ try:
2189
+ pooler_output = self.model.pooler(hidden_states=hidden_states_list,
2190
+ pooling_metadata=dummy_metadata)
2191
+ except RuntimeError as e:
2192
+ if 'out of memory' in str(e):
2193
+ raise RuntimeError(
2194
+ "CUDA out of memory occurred when warming up pooler with "
2195
+ f"{num_reqs} dummy requests. Please try lowering "
2196
+ "`max_num_seqs` or `gpu_memory_utilization` when "
2197
+ "initializing the engine.") from e
2198
+ else:
2199
+ raise e
2200
+ return pooler_output
2201
+
2202
+ def profile_run(self) -> None:
2203
+ # Profile with multimodal encoder & encoder cache.
2204
+ # TODO: handle encoder-decoder models once we support them.
2205
+ if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
2206
+ and self.encoder_cache_size > 0):
2207
+
2208
+ # NOTE: Currently the model is profiled with a single non-text
2209
+ # modality with the max possible input tokens even when
2210
+ # it supports multiple.
2211
+ max_tokens_by_modality_dict = self.mm_registry \
2212
+ .get_max_tokens_per_item_by_nonzero_modality(self.model_config)
2213
+ dummy_data_modality, max_tokens_per_mm_item = max(
2214
+ max_tokens_by_modality_dict.items(), key=lambda item: item[1])
2215
+
2216
+ # Check how many items of this modality can be supported by
2217
+ # the encoder budget.
2218
+ encoder_budget = min(self.max_num_encoder_input_tokens,
2219
+ self.encoder_cache_size)
2220
+
2221
+ max_num_mm_items_encoder_budget = cdiv(encoder_budget,
2222
+ max_tokens_per_mm_item)
2223
+
2224
+ # Check how many items of this modality can be supported by
2225
+ # the decoder budget.
2226
+ max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
2227
+ self.model_config)[dummy_data_modality]
2228
+
2229
+ # NOTE: We do not consider max_num_batched_tokens on purpose
2230
+ # because the multimodal embeddings can be generated in advance
2231
+ # and chunked prefilled.
2232
+ max_num_mm_items_decoder_budget = self.max_num_reqs * \
2233
+ max_mm_items_per_req
2234
+
2235
+ max_num_mm_items = min(max_num_mm_items_encoder_budget,
2236
+ max_num_mm_items_decoder_budget)
2237
+
2238
+ logger.info(
2239
+ "Encoder cache will be initialized with a budget of %s tokens,"
2240
+ " and profiled with %s %s items of the maximum feature size.",
2241
+ encoder_budget, max_num_mm_items, dummy_data_modality)
2242
+
2243
+ # Create dummy batch of multimodal inputs.
2244
+ dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
2245
+ model_config=self.model_config,
2246
+ seq_len=self.max_num_tokens,
2247
+ mm_counts={
2248
+ dummy_data_modality: 1
2249
+ },
2250
+ ).multi_modal_data
2251
+
2252
+ batched_dummy_mm_inputs = MultiModalKwargs.batch(
2253
+ [dummy_mm_kwargs] * max_num_mm_items,
2254
+ pin_memory=self.pin_memory)
2255
+ batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
2256
+ batched_dummy_mm_inputs,
2257
+ device=self.device,
2258
+ )
2259
+
2260
+ # Run multimodal encoder.
2261
+ dummy_encoder_outputs = self.model.get_multimodal_embeddings(
2262
+ **batched_dummy_mm_inputs)
2263
+
2264
+ sanity_check_mm_encoder_outputs(
2265
+ dummy_encoder_outputs,
2266
+ expected_num_items=max_num_mm_items,
2267
+ )
2268
+
2269
+ # Cache the dummy encoder outputs.
2270
+ self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
2271
+
2272
+ # Add `is_profile` here to pre-allocate communication buffers
2273
+ hidden_states, last_hidden_states \
2274
+ = self._dummy_run(self.max_num_tokens, is_profile=True)
2275
+ if get_pp_group().is_last_rank:
2276
+ if self.is_pooling_model:
2277
+ output = self._dummy_pooler_run(hidden_states)
2278
+ else:
2279
+ output = self._dummy_sampler_run(last_hidden_states)
2280
+ else:
2281
+ output = None
2282
+ self._sync_device()
2283
+ del hidden_states, output
2284
+ self.encoder_cache.clear()
2285
+ gc.collect()
2286
+
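The multimodal branch of `profile_run` above caps the number of dummy multimodal items by two budgets: the encoder budget (the smaller of the encoder-input token limit and the encoder cache size, divided by the per-item token cost with ceiling division) and the decoder budget (max requests times the per-request item limit). A small arithmetic sketch of that min() with invented values; the local cdiv here merely stands in for the helper imported by the module:

    def cdiv(a: int, b: int) -> int:
        """Ceiling division, as used for the encoder budget above."""
        return -(-a // b)

    # Illustrative values only; the real ones come from the model/scheduler config.
    max_num_encoder_input_tokens = 8192
    encoder_cache_size = 4096
    max_tokens_per_mm_item = 576        # e.g. one image's worth of encoder tokens
    max_num_reqs = 16
    max_mm_items_per_req = 2

    encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)
    max_num_mm_items_encoder_budget = cdiv(encoder_budget, max_tokens_per_mm_item)
    max_num_mm_items_decoder_budget = max_num_reqs * max_mm_items_per_req
    max_num_mm_items = min(max_num_mm_items_encoder_budget,
                           max_num_mm_items_decoder_budget)
    print(max_num_mm_items)  # 8: ceil(4096 / 576) = 8, below the decoder budget of 32
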
2287
+ def capture_model(self) -> None:
2288
+ if not self.use_cuda_graph:
2289
+ logger.warning(
2290
+ "Skipping CUDA graph capture. To turn on CUDA graph capture, "
2291
+ "set -O %s and ensure `use_cudagraph` was not manually set to "
2292
+ "False", CompilationLevel.PIECEWISE)
2293
+ return
2294
+
2295
+ compilation_counter.num_gpu_runner_capture_triggers += 1
2296
+
2297
+ start_time = time.perf_counter()
2298
+ start_free_gpu_memory = torch.cuda.mem_get_info()[0]
2299
+
2300
+ # Trigger CUDA graph capture for specific shapes.
2301
+ # Capture the large shapes first so that the smaller shapes
2302
+ # can reuse the memory pool allocated for the large shapes.
2303
+ with graph_capture(device=self.device):
2304
+ full_cg = self.full_cuda_graph
2305
+ # Only rank 0 should print progress bar during capture
2306
+ compilation_cases = reversed(self.cudagraph_batch_sizes)
2307
+ if is_global_first_rank():
2308
+ compilation_cases = tqdm(list(compilation_cases),
2309
+ desc="Capturing CUDA graph shapes")
2310
+ for num_tokens in compilation_cases:
2311
+ # We skip EPLB here since we don't want to record dummy metrics
2312
+ for _ in range(
2313
+ self.compilation_config.cudagraph_num_of_warmups):
2314
+ self._dummy_run(num_tokens,
2315
+ capture_attn_cudagraph=full_cg,
2316
+ skip_eplb=True)
2317
+ self._dummy_run(num_tokens,
2318
+ capture_attn_cudagraph=full_cg,
2319
+ skip_eplb=True)
2320
+
2321
+ end_time = time.perf_counter()
2322
+ end_free_gpu_memory = torch.cuda.mem_get_info()[0]
2323
+ elapsed_time = end_time - start_time
2324
+ cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
2325
+ # This usually takes 5~20 seconds.
2326
+ logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
2327
+ elapsed_time, cuda_graph_size / (1 << 30))
2328
+
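`capture_model` above walks the capture sizes from largest to smallest so that smaller graphs can reuse the memory pool allocated for the larger ones, and runs warmup passes before each real capture. For background, the underlying capture-and-replay pattern in stock PyTorch (plain torch.cuda.CUDAGraph usage, not vLLM's graph_capture helper, and it requires a CUDA device) looks roughly like this:

    import torch

    device = "cuda"
    model = torch.nn.Linear(128, 128).to(device)
    static_input = torch.zeros(8, 128, device=device)

    # Warm up on a side stream before capture, as PyTorch recommends.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_input)
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_output = model(static_input)

    # Replays re-run the recorded kernels on whatever data is in static_input.
    static_input.copy_(torch.randn(8, 128, device=device))
    g.replay()
    print(static_output.shape)  # torch.Size([8, 128])
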
2329
+ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
2330
+ """
2331
+ Initialize the attention backends and attention metadata builders.
2332
+ """
2333
+ assert len(self.attn_backends) == 0 and len(
2334
+ self.attn_metadata_builders
2335
+ ) == 0, "Attention backends are already initialized"
2336
+ for i, kv_cache_group_spec in enumerate(
2337
+ kv_cache_config.kv_cache_groups):
2338
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
2339
+ if isinstance(kv_cache_spec, AttentionSpec):
2340
+ attn_backend_i = get_attn_backend(
2341
+ kv_cache_spec.head_size,
2342
+ self.dtype,
2343
+ kv_cache_spec.dtype,
2344
+ kv_cache_spec.block_size,
2345
+ self.model_config.is_attention_free,
2346
+ use_mla=kv_cache_spec.use_mla,
2347
+ )
2348
+ if attn_backend_i is None:
2349
+ error_msg = (f"Error with get_attn_backend: "
2350
+ f"{kv_cache_spec.head_size=}, "
2351
+ f"{self.dtype=}, {kv_cache_spec.dtype=}, "
2352
+ f"{kv_cache_spec.block_size=}, "
2353
+ f"{self.model_config.is_attention_free=}, "
2354
+ f"{kv_cache_spec.use_mla=}")
2355
+ logger.error(error_msg)
2356
+ raise NotImplementedError(
2357
+ "Non-Attention backend is not supported by V1 "
2358
+ "GPUModelRunner.")
2359
+ elif isinstance(kv_cache_spec, MambaSpec):
2360
+ attn_backend_i = Mamba2AttentionBackend
2361
+ else:
2362
+ raise ValueError(
2363
+ f"Unknown KV cache spec type: {type(kv_cache_spec)}")
2364
+
2365
+ block_table_i = self.input_batch.block_table[i]
2366
+ attn_metadata_builder_i = attn_backend_i.get_builder_cls()(
2367
+ weakref.proxy(self),
2368
+ kv_cache_spec,
2369
+ block_table_i,
2370
+ )
2371
+
2372
+ if (self.full_cuda_graph
2373
+ and not attn_metadata_builder_i.full_cudagraph_supported):
2374
+ raise ValueError(
2375
+ f"Full CUDAGraph not supported for "
2376
+ f"{attn_backend_i.__name__}. Turn off CompilationConfig."
2377
+ f"full_cuda_graph or use a different attention backend.")
2378
+
2379
+ self.attn_backends.append(attn_backend_i)
2380
+ self.attn_metadata_builders.append(attn_metadata_builder_i)
2381
+
2382
+ def may_reinitialize_input_batch(self,
2383
+ kv_cache_config: KVCacheConfig) -> None:
2384
+ """
2385
+ Re-initialize the input batch if the block sizes are different from
2386
+ `[self.cache_config.block_size]`. This usually happens when there
2387
+ are multiple KV cache groups.
2388
+
2389
+ Args:
2390
+ kv_cache_config: The KV cache configuration.
2391
+ """
2392
+ block_sizes = [
2393
+ kv_cache_group.kv_cache_spec.block_size
2394
+ for kv_cache_group in kv_cache_config.kv_cache_groups
2395
+ ]
2396
+ if block_sizes != [self.cache_config.block_size]:
2397
+ assert self.cache_config.cpu_offload_gb == 0, (
2398
+ "Cannot re-initialize the input batch when CPU weight "
2399
+ "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
2400
+ "for more details.")
2401
+ self.input_batch = InputBatch(
2402
+ max_num_reqs=self.max_num_reqs,
2403
+ max_model_len=self.max_model_len,
2404
+ max_num_batched_tokens=self.max_num_tokens,
2405
+ device=self.device,
2406
+ pin_memory=self.pin_memory,
2407
+ vocab_size=self.model_config.get_vocab_size(),
2408
+ block_sizes=block_sizes,
2409
+ is_spec_decode=bool(self.vllm_config.speculative_config),
2410
+ )
2411
+
2412
+ def _allocate_kv_cache_tensors(
2413
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
2414
+ """
2415
+ Initializes the KV cache buffer with the correct size. The buffer needs
2416
+ to be reshaped to the desired shape before being used by the models.
2417
+
2418
+ Args:
2419
+ kv_cache_config: The KV cache config
2420
+ Returns:
2421
+ dict[str, torch.Tensor]: A map between layer names to their
2422
+ corresponding memory buffer for KV cache.
2423
+ """
2424
+ kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
2425
+ for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
2426
+ tensor = torch.zeros(kv_cache_tensor.size,
2427
+ dtype=torch.int8,
2428
+ device=self.device)
2429
+ for layer_name in kv_cache_tensor.shared_by:
2430
+ kv_cache_raw_tensors[layer_name] = tensor
2431
+
2432
+ layer_names = set()
2433
+ for group in kv_cache_config.kv_cache_groups:
2434
+ layer_names.update(group.layer_names)
2435
+ assert layer_names == set(kv_cache_raw_tensors.keys(
2436
+ )), "Some layers are not correctly initialized"
2437
+ return kv_cache_raw_tensors
2438
+
2439
+ def _reshape_kv_cache_tensors(
2440
+ self,
2441
+ kv_cache_config: KVCacheConfig,
2442
+ kv_cache_raw_tensors: dict[str, torch.Tensor],
2443
+ ) -> dict[str, torch.Tensor]:
2444
+ """
2445
+ Reshape the KV cache tensors to the desired shape and dtype.
2446
+
2447
+ Args:
2448
+ kv_cache_config: The KV cache config
2449
+ kv_cache_raw_tensors: The KV cache buffer of each layer, with
2450
+ correct size but uninitialized shape.
2451
+ Returns:
2452
+ Dict[str, torch.Tensor]: A map between layer names to their
2453
+ corresponding memory buffer for KV cache.
2454
+ """
2455
+ kv_caches: dict[str, torch.Tensor] = {}
2456
+ has_attn, has_mamba = False, False
2457
+ for i, kv_cache_group_spec in enumerate(
2458
+ kv_cache_config.kv_cache_groups):
2459
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
2460
+ for layer_name in kv_cache_group_spec.layer_names:
2461
+ raw_tensor = kv_cache_raw_tensors[layer_name]
2462
+ assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
2463
+ num_blocks = (raw_tensor.numel() //
2464
+ kv_cache_spec.page_size_bytes)
2465
+ if isinstance(kv_cache_spec, AttentionSpec):
2466
+ has_attn = True
2467
+ kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
2468
+ num_blocks, kv_cache_spec.block_size,
2469
+ kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
2470
+ dtype = kv_cache_spec.dtype
2471
+ try:
2472
+ kv_cache_stride_order = self.attn_backends[
2473
+ i].get_kv_cache_stride_order()
2474
+ assert len(kv_cache_stride_order) == len(
2475
+ kv_cache_shape)
2476
+ except (AttributeError, NotImplementedError):
2477
+ kv_cache_stride_order = tuple(
2478
+ range(len(kv_cache_shape)))
2479
+ # The allocation respects the backend-defined stride order
2480
+ # to ensure the semantics remain consistent for each
2481
+ # backend. We first obtain the generic kv cache shape and
2482
+ # then permute it according to the stride order which could
2483
+ # result in a non-contiguous tensor.
2484
+ kv_cache_shape = tuple(kv_cache_shape[i]
2485
+ for i in kv_cache_stride_order)
2486
+ # Maintain original KV shape view.
2487
+ inv_order = [
2488
+ kv_cache_stride_order.index(i)
2489
+ for i in range(len(kv_cache_stride_order))
2490
+ ]
2491
+ kv_caches[layer_name] = kv_cache_raw_tensors[
2492
+ layer_name].view(dtype).view(kv_cache_shape).permute(
2493
+ *inv_order)
2494
+ elif isinstance(kv_cache_spec, MambaSpec):
2495
+ has_mamba = True
2496
+ raw_tensor = kv_cache_raw_tensors[layer_name]
2497
+ dtype = kv_cache_spec.dtype
2498
+ num_element_per_page = (kv_cache_spec.page_size_bytes //
2499
+ get_dtype_size(dtype))
2500
+ state_tensors = []
2501
+ storage_offset = 0
2502
+ for shape in kv_cache_spec.shapes:
2503
+ target_shape = (num_blocks, *shape)
2504
+ stride = torch.empty(target_shape).stride()
2505
+ target_stride = (num_element_per_page, *stride[1:])
2506
+ tensor = torch.as_strided(
2507
+ raw_tensor.view(dtype),
2508
+ size=target_shape,
2509
+ stride=target_stride,
2510
+ storage_offset=storage_offset,
2511
+ )
2512
+ state_tensors.append(tensor)
2513
+ storage_offset += stride[0]
2514
+
2515
+ kv_caches[layer_name] = state_tensors
2516
+ else:
2517
+ raise NotImplementedError
2518
+
2519
+ if has_attn and has_mamba:
2520
+ self._verify_hybrid_attention_mamba_layout(kv_cache_config,
2521
+ kv_cache_raw_tensors)
2522
+
2523
+ return kv_caches
2524
+
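For attention layers, `_reshape_kv_cache_tensors` above reinterprets the raw int8 buffer in the cache dtype, views it in the backend's preferred stride order, and permutes it back so callers still see the generic logical shape. A compact sketch of that view/permute round trip, using a made-up shape and a hypothetical stride order of (1, 0, 2):

    import torch

    generic_shape = (4, 2, 8)       # made-up "generic" kv cache shape
    stride_order = (1, 0, 2)        # hypothetical backend-preferred order

    raw = torch.zeros(4 * 2 * 8 * 2, dtype=torch.int8)   # float16 needs 2 bytes per element

    # Allocate in the backend's preferred order ...
    permuted_shape = tuple(generic_shape[i] for i in stride_order)      # (2, 4, 8)
    inv_order = [stride_order.index(i) for i in range(len(stride_order))]

    # ... then expose the generic logical shape via an inverse permute.
    kv_cache = raw.view(torch.float16).view(permuted_shape).permute(*inv_order)
    print(kv_cache.shape)            # torch.Size([4, 2, 8]): generic view preserved
    print(kv_cache.is_contiguous())  # False: physically laid out in backend order
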
2525
+ def _verify_hybrid_attention_mamba_layout(
2526
+ self, kv_cache_config: KVCacheConfig,
2527
+ kv_cache_raw_tensors: dict[str, torch.Tensor]) -> None:
2528
+ """
2529
+ Verify that the KV cache memory layout is compatible for
2530
+ models with both attention and mamba KV cache groups.
2531
+
2532
+ Args:
2533
+ kv_cache_config: The KV cache config
2534
+ kv_cache_raw_tensors: The KV cache buffer of each layer.
2535
+ """
2536
+
2537
+ for i, kv_cache_group_spec in enumerate(
2538
+ kv_cache_config.kv_cache_groups):
2539
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
2540
+ for layer_name in kv_cache_group_spec.layer_names:
2541
+ raw_tensor = kv_cache_raw_tensors[layer_name]
2542
+ num_blocks = (raw_tensor.numel() //
2543
+ kv_cache_spec.page_size_bytes)
2544
+ if isinstance(kv_cache_spec, AttentionSpec):
2545
+ kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
2546
+ num_blocks, kv_cache_spec.block_size,
2547
+ kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
2548
+ if kv_cache_shape[0] != num_blocks or kv_cache_shape[
2549
+ 1] != 2:
2550
+ raise ValueError(
2551
+ "Hybrid models in V1 require an attention "
2552
+ "backend with kv_cache_shape="
2553
+ "(num_blocks, 2, ...). Please try setting "
2554
+ "VLLM_ATTENTION_BACKEND=FLASHINFER")
2555
+
2556
+ def initialize_kv_cache_tensors(
2557
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
2558
+ """
2559
+ Initialize the memory buffer for KV cache.
2560
+
2561
+ Args:
2562
+ kv_cache_config: The KV cache config
2563
+ Returns:
2564
+ Dict[str, torch.Tensor]: A map between layer names to their
2565
+ corresponding memory buffer for KV cache.
2566
+ """
2567
+ # Initialize the memory buffer for KV cache
2568
+ kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
2569
+ # Change the memory buffer to the desired shape
2570
+ kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
2571
+ kv_cache_raw_tensors)
2572
+
2573
+ # Setup `kv_cache_config` and `kv_caches` for models
2574
+ # with cross-layer KV sharing
2575
+ if self.shared_kv_cache_layers:
2576
+ initialize_kv_cache_for_kv_sharing(
2577
+ self.shared_kv_cache_layers,
2578
+ kv_cache_config.kv_cache_groups,
2579
+ kv_caches,
2580
+ )
2581
+
2582
+ bind_kv_cache(kv_caches,
2583
+ self.compilation_config.static_forward_context,
2584
+ self.kv_caches)
2585
+ return kv_caches
2586
+
2587
+ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
2588
+ """
2589
+ Initialize KV cache based on `kv_cache_config`.
2590
+ Args:
2591
+ kv_cache_config: Configuration for the KV cache, including the KV
2592
+ cache size of each layer
2593
+ """
2594
+ self.kv_cache_config = kv_cache_config
2595
+ self.may_reinitialize_input_batch(kv_cache_config)
2596
+ self.initialize_attn_backend(kv_cache_config)
2597
+ kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
2598
+
2599
+ if self.speculative_config and self.speculative_config.use_eagle():
2600
+ assert isinstance(self.drafter, EagleProposer)
2601
+ # validate all draft model layers belong to the same kv cache
2602
+ # group
2603
+ self.drafter.validate_same_kv_cache_group(kv_cache_config)
2604
+
2605
+ if has_kv_transfer_group():
2606
+ get_kv_transfer_group().register_kv_caches(kv_caches)
2607
+
2608
+ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
2609
+ """
2610
+ Generates the KVCacheSpec by parsing the kv cache format from each
2611
+ Attention module in the static forward context.
2612
+ Returns:
2613
+ KVCacheSpec: A dictionary mapping layer names to their KV cache
2614
+ format. Layers that do not need KV cache are not included.
2615
+ """
2616
+
2617
+ block_size = self.vllm_config.cache_config.block_size
2618
+ use_mla = self.vllm_config.model_config.use_mla
2619
+ kv_cache_spec: dict[str, KVCacheSpec] = {}
2620
+ attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
2621
+ for layer_name, attn_module in attn_layers.items():
2622
+ if (kv_tgt_layer :=
2623
+ attn_module.kv_sharing_target_layer_name) is not None:
2624
+ # The layer doesn't need its own KV cache and will use that of
2625
+ # the target layer. We skip creating a KVCacheSpec for it, so
2626
+ # that KV cache management logic will act as this layer does
2627
+ # not exist, and doesn't allocate KV cache for the layer. This
2628
+ # enables the memory saving of cross-layer kv sharing, allowing
2629
+ # a given amount of memory to accommodate longer context lengths
2630
+ # or enable more requests to be processed simultaneously.
2631
+ self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
2632
+ continue
2633
+
2634
+ # TODO: Support other attention modules, e.g., cross-attention
2635
+ if attn_module.attn_type == AttentionType.DECODER:
2636
+ if attn_module.sliding_window is not None:
2637
+ kv_cache_spec[layer_name] = SlidingWindowSpec(
2638
+ block_size=block_size,
2639
+ num_kv_heads=attn_module.num_kv_heads,
2640
+ head_size=attn_module.head_size,
2641
+ dtype=self.kv_cache_dtype,
2642
+ sliding_window=attn_module.sliding_window,
2643
+ use_mla=use_mla)
2644
+ else:
2645
+ kv_cache_spec[layer_name] = FullAttentionSpec(
2646
+ block_size=block_size,
2647
+ num_kv_heads=attn_module.num_kv_heads,
2648
+ head_size=attn_module.head_size,
2649
+ dtype=self.kv_cache_dtype,
2650
+ use_mla=use_mla)
2651
+ elif attn_module.attn_type in (AttentionType.ENCODER,
2652
+ AttentionType.ENCODER_ONLY):
2653
+ # encoder-only attention does not need KV cache.
2654
+ continue
2655
+ elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
2656
+ raise NotImplementedError
2657
+ else:
2658
+ raise ValueError(
2659
+ f"Unknown attention type: {attn_module.attn_type}")
2660
+
2661
+ mamba_layers = get_layers_from_vllm_config(self.vllm_config,
2662
+ MambaMixer2)
2663
+ if len(mamba_layers) > 0:
2664
+ if self.vllm_config.speculative_config is not None:
2665
+ raise NotImplementedError(
2666
+ "Mamba with speculative decoding is not supported yet.")
2667
+ if not self.vllm_config.model_config.enforce_eager:
2668
+ raise NotImplementedError(
2669
+ "Mamba with cuda graph is not supported yet.")
2670
+ if self.vllm_config.cache_config.enable_prefix_caching:
2671
+ raise NotImplementedError(
2672
+ "Prefix caching is not supported for Mamba yet.")
2673
+ max_model_len = self.vllm_config.model_config.max_model_len
2674
+
2675
+ page_size_padded = self._maybe_pad_mamba_page_size(
2676
+ attn_layers, mamba_layers, kv_cache_spec, max_model_len,
2677
+ block_size)
2678
+
2679
+ # Set block_size to max_model_len, so that mamba model will always
2680
+ # have only one block in the KV cache.
2681
+ for layer_name, mamba_module in mamba_layers.items():
2682
+ kv_cache_spec[layer_name] = MambaSpec(
2683
+ shapes=mamba_module.get_state_shape(),
2684
+ dtype=self.kv_cache_dtype,
2685
+ block_size=max_model_len,
2686
+ page_size_padded=page_size_padded)
2687
+
2688
+ return kv_cache_spec
2689
+
2690
+ def _maybe_pad_mamba_page_size(
2691
+ self,
2692
+ attn_layers: dict[str, Attention],
2693
+ mamba_layers: dict[str, MambaMixer2],
2694
+ kv_cache_spec: dict[str, KVCacheSpec],
2695
+ max_model_len: int,
2696
+ block_size: int,
2697
+ ) -> Optional[int]:
2698
+ """
2699
+ Ensure that page size of attention KV cache groups is greater than or
2700
+ equal to the mamba KV cache groups. If not, we suggest to the user
2701
+ how to set the attention block size to ensure that it is.
2702
+
2703
+ If the attention page size is strictly greater than the mamba page size,
2704
+ we pad the mamba page size to make them equal.
2705
+
2706
+ Args:
2707
+ attn_layers: Attention layers
2708
+ mamba_layers: Mamba layers
2709
+ kv_cache_spec: KV cache spec (populated with attention layers)
2710
+
2711
+ Returns:
2712
+ Optional[int]: Mamba page size with padding (None if no padding).
2713
+ """
2714
+
2715
+ if len(attn_layers) == 0:
2716
+ return None
2717
+
2718
+ attn_layer_name = next(iter(attn_layers))
2719
+ attn_page_size = kv_cache_spec[attn_layer_name].page_size_bytes
2720
+ mamba_layer_name = next(iter(mamba_layers))
2721
+ mamba_page_size = MambaSpec(
2722
+ shapes=mamba_layers[mamba_layer_name].get_state_shape(),
2723
+ dtype=self.kv_cache_dtype,
2724
+ block_size=max_model_len).page_size_bytes
2725
+ if attn_page_size < mamba_page_size:
2726
+ # attention page size (for 16 tokens)
2727
+ attn_page_size_16 = 16 * attn_page_size // block_size
2728
+ # some attention backends (e.g. FA) only support setting
2729
+ # block size to multiple of 16, so let's suggest a value
2730
+ # that would work (note: FA is currently not compatible
2731
+ # with mamba layers, use FlashInfer instead).
2732
+ suggest_attn_block_size = 16 * cdiv(mamba_page_size,
2733
+ attn_page_size_16)
2734
+ raise ValueError(
2735
+ "Attention block size should be increased to at least "
2736
+ f"{suggest_attn_block_size} in order to match "
2737
+ "the mamba page size")
2738
+
2739
+ return attn_page_size
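The error path in `_maybe_pad_mamba_page_size` above computes the smallest multiple-of-16 attention block size whose page would be at least as large as the mamba page: it derives the attention page size per 16 tokens and then applies ceiling division against the mamba page size. A worked sketch of that arithmetic with invented page sizes:

    def cdiv(a: int, b: int) -> int:
        return -(-a // b)

    # Illustrative numbers only (not measured from the package).
    block_size = 16              # current attention block size, in tokens
    attn_page_size = 32_768      # bytes per attention block of `block_size` tokens
    mamba_page_size = 270_336    # bytes per mamba state page

    attn_page_size_16 = 16 * attn_page_size // block_size          # bytes per 16 tokens
    suggest_attn_block_size = 16 * cdiv(mamba_page_size, attn_page_size_16)
    print(suggest_attn_block_size)  # 144: smallest multiple of 16 whose page covers the mamba page
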