vllm-cpu-avx512vnni 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of vllm-cpu-avx512vnni might be problematic.
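Since this is the CPU-only build variant targeting AVX-512 VNNI on manylinux_2_17 x86_64 (per the wheel tags in the title), it is only useful on hosts whose CPU exposes that instruction set. The snippet below is a minimal sketch, not part of the package: it checks for the avx512_vnni flag that Linux reports in /proc/cpuinfo, and the helper name is illustrative.

    # Minimal sketch (not from the package): confirm the host CPU advertises
    # AVX-512 VNNI before installing the avx512vnni CPU wheel.
    # Assumes a Linux x86_64 host, where /proc/cpuinfo lists "avx512_vnni"
    # among the CPU flags.
    from pathlib import Path

    def has_avx512_vnni() -> bool:
        # The "flags" lines in /proc/cpuinfo are whitespace-separated tokens.
        return "avx512_vnni" in Path("/proc/cpuinfo").read_text().split()

    if __name__ == "__main__":
        print("AVX-512 VNNI supported:", has_avx512_vnni())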

Files changed (1395)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2022 -0
  5. vllm/_ipex_ops.py +404 -0
  6. vllm/_version.py +34 -0
  7. vllm/adapter_commons/__init__.py +0 -0
  8. vllm/adapter_commons/layers.py +16 -0
  9. vllm/adapter_commons/models.py +106 -0
  10. vllm/adapter_commons/request.py +26 -0
  11. vllm/adapter_commons/utils.py +93 -0
  12. vllm/adapter_commons/worker_manager.py +39 -0
  13. vllm/assets/__init__.py +0 -0
  14. vllm/assets/audio.py +45 -0
  15. vllm/assets/base.py +41 -0
  16. vllm/assets/image.py +50 -0
  17. vllm/assets/video.py +138 -0
  18. vllm/attention/__init__.py +19 -0
  19. vllm/attention/backends/__init__.py +0 -0
  20. vllm/attention/backends/abstract.py +348 -0
  21. vllm/attention/backends/differential_flash_attn.py +935 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1499 -0
  23. vllm/attention/backends/flash_attn.py +933 -0
  24. vllm/attention/backends/flashmla.py +238 -0
  25. vllm/attention/backends/mla/__init__.py +0 -0
  26. vllm/attention/backends/mla/common.py +1310 -0
  27. vllm/attention/backends/placeholder_attn.py +340 -0
  28. vllm/attention/backends/rocm_aiter_mla.py +410 -0
  29. vllm/attention/backends/rocm_flash_attn.py +953 -0
  30. vllm/attention/backends/triton_mla.py +111 -0
  31. vllm/attention/backends/utils.py +610 -0
  32. vllm/attention/backends/xformers.py +805 -0
  33. vllm/attention/layer.py +552 -0
  34. vllm/attention/layers/__init__.py +0 -0
  35. vllm/attention/layers/chunked_local_attention.py +91 -0
  36. vllm/attention/layers/cross_attention.py +159 -0
  37. vllm/attention/layers/encoder_only_attention.py +86 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  40. vllm/attention/ops/common.py +139 -0
  41. vllm/attention/ops/flashmla.py +123 -0
  42. vllm/attention/ops/merge_attn_states.py +43 -0
  43. vllm/attention/ops/paged_attn.py +261 -0
  44. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  45. vllm/attention/ops/prefix_prefill.py +928 -0
  46. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  47. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  48. vllm/attention/ops/triton_decode_attention.py +676 -0
  49. vllm/attention/ops/triton_flash_attention.py +984 -0
  50. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  51. vllm/attention/ops/triton_unified_attention.py +854 -0
  52. vllm/attention/selector.py +243 -0
  53. vllm/attention/utils/__init__.py +0 -0
  54. vllm/attention/utils/fa_utils.py +85 -0
  55. vllm/attention/utils/kv_sharing_utils.py +33 -0
  56. vllm/beam_search.py +87 -0
  57. vllm/benchmarks/__init__.py +0 -0
  58. vllm/benchmarks/datasets.py +2651 -0
  59. vllm/benchmarks/latency.py +170 -0
  60. vllm/benchmarks/lib/__init__.py +3 -0
  61. vllm/benchmarks/lib/endpoint_request_func.py +510 -0
  62. vllm/benchmarks/lib/ready_checker.py +72 -0
  63. vllm/benchmarks/lib/utils.py +80 -0
  64. vllm/benchmarks/serve.py +1247 -0
  65. vllm/benchmarks/throughput.py +696 -0
  66. vllm/collect_env.py +823 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/activation_quant_fusion.py +193 -0
  69. vllm/compilation/backends.py +641 -0
  70. vllm/compilation/base_static_graph.py +51 -0
  71. vllm/compilation/collective_fusion.py +1190 -0
  72. vllm/compilation/compiler_interface.py +572 -0
  73. vllm/compilation/counter.py +47 -0
  74. vllm/compilation/cuda_graph.py +193 -0
  75. vllm/compilation/cuda_piecewise_backend.py +117 -0
  76. vllm/compilation/decorators.py +316 -0
  77. vllm/compilation/fix_functionalization.py +208 -0
  78. vllm/compilation/fusion.py +600 -0
  79. vllm/compilation/fusion_attn.py +303 -0
  80. vllm/compilation/fx_utils.py +84 -0
  81. vllm/compilation/inductor_pass.py +136 -0
  82. vllm/compilation/monitor.py +57 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +165 -0
  85. vllm/compilation/pass_manager.py +88 -0
  86. vllm/compilation/sequence_parallelism.py +484 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +50 -0
  89. vllm/compilation/wrapper.py +138 -0
  90. vllm/config/__init__.py +3921 -0
  91. vllm/config/cache.py +214 -0
  92. vllm/config/compilation.py +580 -0
  93. vllm/config/kv_events.py +50 -0
  94. vllm/config/kv_transfer.py +111 -0
  95. vllm/config/load.py +113 -0
  96. vllm/config/lora.py +132 -0
  97. vllm/config/parallel.py +446 -0
  98. vllm/config/scheduler.py +304 -0
  99. vllm/config/utils.py +29 -0
  100. vllm/connections.py +174 -0
  101. vllm/core/__init__.py +0 -0
  102. vllm/core/block/__init__.py +0 -0
  103. vllm/core/block/block_table.py +399 -0
  104. vllm/core/block/common.py +371 -0
  105. vllm/core/block/cpu_gpu_block_allocator.py +439 -0
  106. vllm/core/block/interfaces.py +319 -0
  107. vllm/core/block/naive_block.py +466 -0
  108. vllm/core/block/prefix_caching_block.py +1135 -0
  109. vllm/core/block/utils.py +28 -0
  110. vllm/core/block_manager.py +523 -0
  111. vllm/core/evictor.py +157 -0
  112. vllm/core/interfaces.py +139 -0
  113. vllm/core/placeholder_block_space_manager.py +103 -0
  114. vllm/core/scheduler.py +2028 -0
  115. vllm/device_allocator/__init__.py +0 -0
  116. vllm/device_allocator/cumem.py +286 -0
  117. vllm/distributed/__init__.py +6 -0
  118. vllm/distributed/communication_op.py +41 -0
  119. vllm/distributed/device_communicators/__init__.py +0 -0
  120. vllm/distributed/device_communicators/all2all.py +259 -0
  121. vllm/distributed/device_communicators/all_reduce_utils.py +292 -0
  122. vllm/distributed/device_communicators/base_device_communicator.py +277 -0
  123. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  124. vllm/distributed/device_communicators/cuda_communicator.py +294 -0
  125. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  126. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  127. vllm/distributed/device_communicators/pynccl.py +290 -0
  128. vllm/distributed/device_communicators/pynccl_wrapper.py +382 -0
  129. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  130. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  131. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  132. vllm/distributed/device_communicators/symm_mem.py +136 -0
  133. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  134. vllm/distributed/device_communicators/xpu_communicator.py +69 -0
  135. vllm/distributed/eplb/__init__.py +8 -0
  136. vllm/distributed/eplb/eplb_state.py +619 -0
  137. vllm/distributed/eplb/rebalance_algo.py +234 -0
  138. vllm/distributed/eplb/rebalance_execute.py +424 -0
  139. vllm/distributed/kv_events.py +362 -0
  140. vllm/distributed/kv_transfer/README.md +29 -0
  141. vllm/distributed/kv_transfer/__init__.py +13 -0
  142. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  143. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  145. vllm/distributed/kv_transfer/kv_connector/factory.py +108 -0
  146. vllm/distributed/kv_transfer/kv_connector/utils.py +246 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/base.py +356 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +266 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1319 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +484 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +542 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +266 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +414 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  158. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  159. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  160. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  161. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  162. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  163. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  164. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  165. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  166. vllm/distributed/parallel_state.py +1489 -0
  167. vllm/distributed/tpu_distributed_utils.py +178 -0
  168. vllm/distributed/utils.py +536 -0
  169. vllm/engine/__init__.py +0 -0
  170. vllm/engine/arg_utils.py +1857 -0
  171. vllm/engine/async_llm_engine.py +1044 -0
  172. vllm/engine/async_timeout.py +173 -0
  173. vllm/engine/llm_engine.py +1849 -0
  174. vllm/engine/metrics.py +577 -0
  175. vllm/engine/metrics_types.py +84 -0
  176. vllm/engine/multiprocessing/__init__.py +145 -0
  177. vllm/engine/multiprocessing/client.py +643 -0
  178. vllm/engine/multiprocessing/engine.py +470 -0
  179. vllm/engine/output_processor/__init__.py +0 -0
  180. vllm/engine/output_processor/interfaces.py +61 -0
  181. vllm/engine/output_processor/single_step.py +145 -0
  182. vllm/engine/output_processor/stop_checker.py +131 -0
  183. vllm/engine/output_processor/util.py +28 -0
  184. vllm/engine/protocol.py +343 -0
  185. vllm/entrypoints/__init__.py +0 -0
  186. vllm/entrypoints/api_server.py +178 -0
  187. vllm/entrypoints/chat_utils.py +1535 -0
  188. vllm/entrypoints/cli/__init__.py +12 -0
  189. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  190. vllm/entrypoints/cli/benchmark/base.py +25 -0
  191. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  192. vllm/entrypoints/cli/benchmark/main.py +58 -0
  193. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  194. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  195. vllm/entrypoints/cli/collect_env.py +36 -0
  196. vllm/entrypoints/cli/main.py +60 -0
  197. vllm/entrypoints/cli/openai.py +214 -0
  198. vllm/entrypoints/cli/run_batch.py +69 -0
  199. vllm/entrypoints/cli/serve.py +232 -0
  200. vllm/entrypoints/cli/types.py +29 -0
  201. vllm/entrypoints/constants.py +10 -0
  202. vllm/entrypoints/context.py +444 -0
  203. vllm/entrypoints/harmony_utils.py +431 -0
  204. vllm/entrypoints/launcher.py +168 -0
  205. vllm/entrypoints/llm.py +1579 -0
  206. vllm/entrypoints/logger.py +79 -0
  207. vllm/entrypoints/openai/__init__.py +0 -0
  208. vllm/entrypoints/openai/api_server.py +2011 -0
  209. vllm/entrypoints/openai/cli_args.py +281 -0
  210. vllm/entrypoints/openai/logits_processors.py +90 -0
  211. vllm/entrypoints/openai/protocol.py +2590 -0
  212. vllm/entrypoints/openai/run_batch.py +497 -0
  213. vllm/entrypoints/openai/serving_chat.py +1591 -0
  214. vllm/entrypoints/openai/serving_classification.py +176 -0
  215. vllm/entrypoints/openai/serving_completion.py +688 -0
  216. vllm/entrypoints/openai/serving_embedding.py +632 -0
  217. vllm/entrypoints/openai/serving_engine.py +996 -0
  218. vllm/entrypoints/openai/serving_models.py +288 -0
  219. vllm/entrypoints/openai/serving_pooling.py +277 -0
  220. vllm/entrypoints/openai/serving_responses.py +1690 -0
  221. vllm/entrypoints/openai/serving_score.py +479 -0
  222. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  223. vllm/entrypoints/openai/serving_transcription.py +136 -0
  224. vllm/entrypoints/openai/speech_to_text.py +388 -0
  225. vllm/entrypoints/openai/tool_parsers/__init__.py +51 -0
  226. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  227. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  228. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  229. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  230. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  231. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  232. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +418 -0
  233. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  234. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  235. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  236. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  237. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  238. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  239. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  240. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  241. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +73 -0
  242. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  243. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  244. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  245. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  246. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  247. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  248. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  249. vllm/entrypoints/renderer.py +395 -0
  250. vllm/entrypoints/score_utils.py +232 -0
  251. vllm/entrypoints/ssl.py +75 -0
  252. vllm/entrypoints/tool.py +139 -0
  253. vllm/entrypoints/tool_server.py +195 -0
  254. vllm/entrypoints/utils.py +328 -0
  255. vllm/env_override.py +23 -0
  256. vllm/envs.py +1354 -0
  257. vllm/executor/__init__.py +0 -0
  258. vllm/executor/executor_base.py +378 -0
  259. vllm/executor/mp_distributed_executor.py +244 -0
  260. vllm/executor/msgspec_utils.py +35 -0
  261. vllm/executor/multiproc_worker_utils.py +279 -0
  262. vllm/executor/ray_distributed_executor.py +699 -0
  263. vllm/executor/ray_utils.py +410 -0
  264. vllm/executor/uniproc_executor.py +152 -0
  265. vllm/forward_context.py +273 -0
  266. vllm/inputs/__init__.py +44 -0
  267. vllm/inputs/data.py +356 -0
  268. vllm/inputs/parse.py +151 -0
  269. vllm/inputs/preprocess.py +973 -0
  270. vllm/inputs/registry.py +251 -0
  271. vllm/logger.py +229 -0
  272. vllm/logging_utils/__init__.py +8 -0
  273. vllm/logging_utils/dump_input.py +81 -0
  274. vllm/logging_utils/formatter.py +79 -0
  275. vllm/logits_process.py +119 -0
  276. vllm/logprobs.py +28 -0
  277. vllm/lora/__init__.py +0 -0
  278. vllm/lora/layers/__init__.py +34 -0
  279. vllm/lora/layers/base.py +69 -0
  280. vllm/lora/layers/base_linear.py +184 -0
  281. vllm/lora/layers/column_parallel_linear.py +622 -0
  282. vllm/lora/layers/logits_processor.py +247 -0
  283. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  284. vllm/lora/layers/replicated_linear.py +61 -0
  285. vllm/lora/layers/row_parallel_linear.py +201 -0
  286. vllm/lora/layers/utils.py +60 -0
  287. vllm/lora/layers/vocal_parallel_embedding.py +172 -0
  288. vllm/lora/lora.py +199 -0
  289. vllm/lora/models.py +792 -0
  290. vllm/lora/ops/__init__.py +0 -0
  291. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  292. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  293. vllm/lora/ops/torch_ops/__init__.py +16 -0
  294. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  295. vllm/lora/ops/triton_ops/__init__.py +12 -0
  296. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  297. vllm/lora/ops/triton_ops/lora_expand_op.py +291 -0
  298. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  299. vllm/lora/ops/triton_ops/lora_shrink_op.py +245 -0
  300. vllm/lora/ops/triton_ops/utils.py +126 -0
  301. vllm/lora/ops/xla_ops/__init__.py +7 -0
  302. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  303. vllm/lora/peft_helper.py +127 -0
  304. vllm/lora/punica_wrapper/__init__.py +10 -0
  305. vllm/lora/punica_wrapper/punica_base.py +458 -0
  306. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  307. vllm/lora/punica_wrapper/punica_gpu.py +279 -0
  308. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  309. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  310. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  311. vllm/lora/punica_wrapper/utils.py +136 -0
  312. vllm/lora/request.py +99 -0
  313. vllm/lora/resolver.py +85 -0
  314. vllm/lora/utils.py +246 -0
  315. vllm/lora/worker_manager.py +256 -0
  316. vllm/model_executor/__init__.py +16 -0
  317. vllm/model_executor/custom_op.py +194 -0
  318. vllm/model_executor/layers/__init__.py +0 -0
  319. vllm/model_executor/layers/activation.py +575 -0
  320. vllm/model_executor/layers/attention_layer_base.py +23 -0
  321. vllm/model_executor/layers/fla/__init__.py +8 -0
  322. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  323. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  324. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  325. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  326. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  327. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  328. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  329. vllm/model_executor/layers/fla/ops/index.py +39 -0
  330. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  331. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  332. vllm/model_executor/layers/fla/ops/op.py +39 -0
  333. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  334. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  335. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  336. vllm/model_executor/layers/fused_moe/__init__.py +80 -0
  337. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +304 -0
  338. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +164 -0
  339. vllm/model_executor/layers/fused_moe/config.py +497 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  560. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +297 -0
  561. vllm/model_executor/layers/fused_moe/cutlass_moe.py +996 -0
  562. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +370 -0
  563. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  564. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +280 -0
  565. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +229 -0
  566. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +243 -0
  567. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +97 -0
  568. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1042 -0
  569. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +240 -0
  570. vllm/model_executor/layers/fused_moe/fused_moe.py +2081 -0
  571. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +247 -0
  572. vllm/model_executor/layers/fused_moe/layer.py +1951 -0
  573. vllm/model_executor/layers/fused_moe/modular_kernel.py +892 -0
  574. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  575. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  576. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  577. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  578. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +321 -0
  579. vllm/model_executor/layers/fused_moe/prepare_finalize.py +72 -0
  580. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +431 -0
  581. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  582. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  583. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +171 -0
  584. vllm/model_executor/layers/fused_moe/trtllm_moe.py +197 -0
  585. vllm/model_executor/layers/fused_moe/utils.py +270 -0
  586. vllm/model_executor/layers/layernorm.py +381 -0
  587. vllm/model_executor/layers/lightning_attn.py +661 -0
  588. vllm/model_executor/layers/linear.py +1567 -0
  589. vllm/model_executor/layers/logits_processor.py +199 -0
  590. vllm/model_executor/layers/mamba/__init__.py +0 -0
  591. vllm/model_executor/layers/mamba/abstract.py +45 -0
  592. vllm/model_executor/layers/mamba/linear_attn.py +432 -0
  593. vllm/model_executor/layers/mamba/mamba2_metadata.py +186 -0
  594. vllm/model_executor/layers/mamba/mamba_mixer.py +517 -0
  595. vllm/model_executor/layers/mamba/mamba_mixer2.py +803 -0
  596. vllm/model_executor/layers/mamba/mamba_utils.py +202 -0
  597. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  598. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +982 -0
  599. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  600. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  601. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  602. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +574 -0
  603. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  604. vllm/model_executor/layers/mamba/ops/ssd_combined.py +248 -0
  605. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +248 -0
  606. vllm/model_executor/layers/mamba/short_conv.py +270 -0
  607. vllm/model_executor/layers/mla.py +158 -0
  608. vllm/model_executor/layers/pooler.py +732 -0
  609. vllm/model_executor/layers/quantization/__init__.py +157 -0
  610. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  611. vllm/model_executor/layers/quantization/awq.py +228 -0
  612. vllm/model_executor/layers/quantization/awq_marlin.py +548 -0
  613. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  614. vllm/model_executor/layers/quantization/base_config.py +164 -0
  615. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  616. vllm/model_executor/layers/quantization/bitsandbytes.py +621 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +795 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1651 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  625. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +161 -0
  626. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  627. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  628. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  629. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +156 -0
  630. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  631. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  632. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +227 -0
  633. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +135 -0
  634. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +21 -0
  635. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  636. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  637. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  638. vllm/model_executor/layers/quantization/deepgemm.py +81 -0
  639. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  640. vllm/model_executor/layers/quantization/experts_int8.py +215 -0
  641. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  642. vllm/model_executor/layers/quantization/fp8.py +1179 -0
  643. vllm/model_executor/layers/quantization/gguf.py +597 -0
  644. vllm/model_executor/layers/quantization/gptq.py +300 -0
  645. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  646. vllm/model_executor/layers/quantization/gptq_marlin.py +700 -0
  647. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  648. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  649. vllm/model_executor/layers/quantization/inc.py +61 -0
  650. vllm/model_executor/layers/quantization/input_quant_fp8.py +103 -0
  651. vllm/model_executor/layers/quantization/ipex_quant.py +410 -0
  652. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  653. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  654. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  655. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  656. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  657. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  658. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  659. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  660. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  661. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  662. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  663. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  664. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  665. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +163 -0
  666. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  667. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  668. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  669. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  670. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  671. vllm/model_executor/layers/quantization/modelopt.py +1548 -0
  672. vllm/model_executor/layers/quantization/moe_wna16.py +473 -0
  673. vllm/model_executor/layers/quantization/mxfp4.py +951 -0
  674. vllm/model_executor/layers/quantization/petit.py +306 -0
  675. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  676. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  677. vllm/model_executor/layers/quantization/quark/quark.py +431 -0
  678. vllm/model_executor/layers/quantization/quark/quark_moe.py +434 -0
  679. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  680. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  681. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +112 -0
  682. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  683. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  684. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  685. vllm/model_executor/layers/quantization/rtn.py +456 -0
  686. vllm/model_executor/layers/quantization/schema.py +86 -0
  687. vllm/model_executor/layers/quantization/torchao.py +214 -0
  688. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  689. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  690. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  691. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  903. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +85 -0
  904. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +258 -0
  905. vllm/model_executor/layers/quantization/utils/fp8_utils.py +795 -0
  906. vllm/model_executor/layers/quantization/utils/gptq_utils.py +96 -0
  907. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  908. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  909. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  910. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  911. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  912. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  913. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  914. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  915. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +132 -0
  916. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  917. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  918. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  919. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  920. vllm/model_executor/layers/quantization/utils/quant_utils.py +627 -0
  921. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  922. vllm/model_executor/layers/resampler.py +270 -0
  923. vllm/model_executor/layers/rotary_embedding/__init__.py +190 -0
  924. vllm/model_executor/layers/rotary_embedding/base.py +156 -0
  925. vllm/model_executor/layers/rotary_embedding/common.py +105 -0
  926. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +140 -0
  927. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  928. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  929. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  930. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  931. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  932. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  933. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  934. vllm/model_executor/layers/rotary_embedding/mrope.py +1140 -0
  935. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  936. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  937. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  938. vllm/model_executor/layers/sampler.py +1198 -0
  939. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  940. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  941. vllm/model_executor/layers/utils.py +196 -0
  942. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  943. vllm/model_executor/model_loader/__init__.py +138 -0
  944. vllm/model_executor/model_loader/base_loader.py +52 -0
  945. vllm/model_executor/model_loader/bitsandbytes_loader.py +787 -0
  946. vllm/model_executor/model_loader/default_loader.py +278 -0
  947. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  948. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  949. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  950. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  951. vllm/model_executor/model_loader/tensorizer.py +743 -0
  952. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  953. vllm/model_executor/model_loader/tpu.py +114 -0
  954. vllm/model_executor/model_loader/utils.py +271 -0
  955. vllm/model_executor/model_loader/weight_utils.py +946 -0
  956. vllm/model_executor/models/__init__.py +30 -0
  957. vllm/model_executor/models/adapters.py +542 -0
  958. vllm/model_executor/models/aimv2.py +246 -0
  959. vllm/model_executor/models/apertus.py +582 -0
  960. vllm/model_executor/models/arcee.py +423 -0
  961. vllm/model_executor/models/arctic.py +560 -0
  962. vllm/model_executor/models/aria.py +662 -0
  963. vllm/model_executor/models/aya_vision.py +470 -0
  964. vllm/model_executor/models/baichuan.py +475 -0
  965. vllm/model_executor/models/bailing_moe.py +529 -0
  966. vllm/model_executor/models/bamba.py +582 -0
  967. vllm/model_executor/models/bart.py +1343 -0
  968. vllm/model_executor/models/bert.py +613 -0
  969. vllm/model_executor/models/bert_with_rope.py +687 -0
  970. vllm/model_executor/models/blip.py +339 -0
  971. vllm/model_executor/models/blip2.py +716 -0
  972. vllm/model_executor/models/bloom.py +374 -0
  973. vllm/model_executor/models/chameleon.py +1141 -0
  974. vllm/model_executor/models/chatglm.py +479 -0
  975. vllm/model_executor/models/clip.py +407 -0
  976. vllm/model_executor/models/cohere2_vision.py +484 -0
  977. vllm/model_executor/models/commandr.py +467 -0
  978. vllm/model_executor/models/config.py +434 -0
  979. vllm/model_executor/models/constant_size_cache.py +137 -0
  980. vllm/model_executor/models/dbrx.py +473 -0
  981. vllm/model_executor/models/deepseek.py +491 -0
  982. vllm/model_executor/models/deepseek_eagle.py +241 -0
  983. vllm/model_executor/models/deepseek_mtp.py +282 -0
  984. vllm/model_executor/models/deepseek_v2.py +1058 -0
  985. vllm/model_executor/models/deepseek_vl2.py +661 -0
  986. vllm/model_executor/models/donut.py +387 -0
  987. vllm/model_executor/models/dots1.py +547 -0
  988. vllm/model_executor/models/ernie45.py +43 -0
  989. vllm/model_executor/models/ernie45_moe.py +608 -0
  990. vllm/model_executor/models/ernie45_vl.py +1510 -0
  991. vllm/model_executor/models/ernie45_vl_moe.py +728 -0
  992. vllm/model_executor/models/ernie_mtp.py +287 -0
  993. vllm/model_executor/models/exaone.py +552 -0
  994. vllm/model_executor/models/exaone4.py +535 -0
  995. vllm/model_executor/models/fairseq2_llama.py +154 -0
  996. vllm/model_executor/models/falcon.py +511 -0
  997. vllm/model_executor/models/falcon_h1.py +739 -0
  998. vllm/model_executor/models/florence2.py +1107 -0
  999. vllm/model_executor/models/fuyu.py +401 -0
  1000. vllm/model_executor/models/gemma.py +428 -0
  1001. vllm/model_executor/models/gemma2.py +425 -0
  1002. vllm/model_executor/models/gemma3.py +542 -0
  1003. vllm/model_executor/models/gemma3_mm.py +723 -0
  1004. vllm/model_executor/models/gemma3n.py +830 -0
  1005. vllm/model_executor/models/gemma3n_mm.py +767 -0
  1006. vllm/model_executor/models/glm.py +23 -0
  1007. vllm/model_executor/models/glm4.py +305 -0
  1008. vllm/model_executor/models/glm4_1v.py +1669 -0
  1009. vllm/model_executor/models/glm4_moe.py +703 -0
  1010. vllm/model_executor/models/glm4_moe_mtp.py +306 -0
  1011. vllm/model_executor/models/glm4v.py +654 -0
  1012. vllm/model_executor/models/gpt2.py +383 -0
  1013. vllm/model_executor/models/gpt_bigcode.py +346 -0
  1014. vllm/model_executor/models/gpt_j.py +340 -0
  1015. vllm/model_executor/models/gpt_neox.py +333 -0
  1016. vllm/model_executor/models/gpt_oss.py +687 -0
  1017. vllm/model_executor/models/granite.py +498 -0
  1018. vllm/model_executor/models/granite_speech.py +799 -0
  1019. vllm/model_executor/models/granitemoe.py +541 -0
  1020. vllm/model_executor/models/granitemoehybrid.py +684 -0
  1021. vllm/model_executor/models/granitemoeshared.py +342 -0
  1022. vllm/model_executor/models/gritlm.py +262 -0
  1023. vllm/model_executor/models/grok1.py +550 -0
  1024. vllm/model_executor/models/h2ovl.py +536 -0
  1025. vllm/model_executor/models/hunyuan_v1.py +937 -0
  1026. vllm/model_executor/models/hyperclovax_vision.py +1206 -0
  1027. vllm/model_executor/models/idefics2_vision_model.py +416 -0
  1028. vllm/model_executor/models/idefics3.py +758 -0
  1029. vllm/model_executor/models/interfaces.py +854 -0
  1030. vllm/model_executor/models/interfaces_base.py +195 -0
  1031. vllm/model_executor/models/intern_vit.py +481 -0
  1032. vllm/model_executor/models/internlm2.py +453 -0
  1033. vllm/model_executor/models/internlm2_ve.py +148 -0
  1034. vllm/model_executor/models/interns1.py +832 -0
  1035. vllm/model_executor/models/interns1_vit.py +418 -0
  1036. vllm/model_executor/models/internvl.py +1423 -0
  1037. vllm/model_executor/models/jais.py +374 -0
  1038. vllm/model_executor/models/jamba.py +630 -0
  1039. vllm/model_executor/models/jina_vl.py +144 -0
  1040. vllm/model_executor/models/keye.py +1684 -0
  1041. vllm/model_executor/models/keye_vl1_5.py +601 -0
  1042. vllm/model_executor/models/kimi_vl.py +620 -0
  1043. vllm/model_executor/models/lfm2.py +558 -0
  1044. vllm/model_executor/models/llama.py +671 -0
  1045. vllm/model_executor/models/llama4.py +732 -0
  1046. vllm/model_executor/models/llama4_eagle.py +241 -0
  1047. vllm/model_executor/models/llama_eagle.py +171 -0
  1048. vllm/model_executor/models/llama_eagle3.py +292 -0
  1049. vllm/model_executor/models/llava.py +872 -0
  1050. vllm/model_executor/models/llava_next.py +572 -0
  1051. vllm/model_executor/models/llava_next_video.py +479 -0
  1052. vllm/model_executor/models/llava_onevision.py +945 -0
  1053. vllm/model_executor/models/mamba.py +310 -0
  1054. vllm/model_executor/models/mamba2.py +346 -0
  1055. vllm/model_executor/models/mamba_cache.py +83 -0
  1056. vllm/model_executor/models/medusa.py +219 -0
  1057. vllm/model_executor/models/midashenglm.py +788 -0
  1058. vllm/model_executor/models/mimo.py +191 -0
  1059. vllm/model_executor/models/mimo_mtp.py +273 -0
  1060. vllm/model_executor/models/minicpm.py +593 -0
  1061. vllm/model_executor/models/minicpm3.py +230 -0
  1062. vllm/model_executor/models/minicpm_eagle.py +391 -0
  1063. vllm/model_executor/models/minicpmo.py +804 -0
  1064. vllm/model_executor/models/minicpmv.py +1786 -0
  1065. vllm/model_executor/models/minimax_cache.py +36 -0
  1066. vllm/model_executor/models/minimax_text_01.py +1027 -0
  1067. vllm/model_executor/models/minimax_vl_01.py +431 -0
  1068. vllm/model_executor/models/mistral3.py +628 -0
  1069. vllm/model_executor/models/mixtral.py +494 -0
  1070. vllm/model_executor/models/mllama.py +1697 -0
  1071. vllm/model_executor/models/mllama4.py +1079 -0
  1072. vllm/model_executor/models/mlp_speculator.py +206 -0
  1073. vllm/model_executor/models/modernbert.py +374 -0
  1074. vllm/model_executor/models/module_mapping.py +72 -0
  1075. vllm/model_executor/models/molmo.py +1569 -0
  1076. vllm/model_executor/models/moonvit.py +663 -0
  1077. vllm/model_executor/models/motif.py +345 -0
  1078. vllm/model_executor/models/mpt.py +332 -0
  1079. vllm/model_executor/models/nano_nemotron_vl.py +1395 -0
  1080. vllm/model_executor/models/nemotron.py +509 -0
  1081. vllm/model_executor/models/nemotron_h.py +633 -0
  1082. vllm/model_executor/models/nemotron_nas.py +484 -0
  1083. vllm/model_executor/models/nemotron_vl.py +655 -0
  1084. vllm/model_executor/models/nvlm_d.py +203 -0
  1085. vllm/model_executor/models/olmo.py +406 -0
  1086. vllm/model_executor/models/olmo2.py +428 -0
  1087. vllm/model_executor/models/olmoe.py +485 -0
  1088. vllm/model_executor/models/opt.py +413 -0
  1089. vllm/model_executor/models/orion.py +350 -0
  1090. vllm/model_executor/models/ovis.py +572 -0
  1091. vllm/model_executor/models/ovis2_5.py +644 -0
  1092. vllm/model_executor/models/paligemma.py +414 -0
  1093. vllm/model_executor/models/persimmon.py +345 -0
  1094. vllm/model_executor/models/phi.py +357 -0
  1095. vllm/model_executor/models/phi3.py +19 -0
  1096. vllm/model_executor/models/phi3v.py +701 -0
  1097. vllm/model_executor/models/phi4_multimodal.py +1478 -0
  1098. vllm/model_executor/models/phi4flash.py +737 -0
  1099. vllm/model_executor/models/phi4mm.py +1281 -0
  1100. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1101. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1102. vllm/model_executor/models/phimoe.py +681 -0
  1103. vllm/model_executor/models/pixtral.py +1348 -0
  1104. vllm/model_executor/models/plamo2.py +1126 -0
  1105. vllm/model_executor/models/qwen.py +363 -0
  1106. vllm/model_executor/models/qwen2.py +526 -0
  1107. vllm/model_executor/models/qwen2_5_omni_thinker.py +985 -0
  1108. vllm/model_executor/models/qwen2_5_vl.py +1256 -0
  1109. vllm/model_executor/models/qwen2_audio.py +492 -0
  1110. vllm/model_executor/models/qwen2_moe.py +558 -0
  1111. vllm/model_executor/models/qwen2_rm.py +122 -0
  1112. vllm/model_executor/models/qwen2_vl.py +1512 -0
  1113. vllm/model_executor/models/qwen3.py +344 -0
  1114. vllm/model_executor/models/qwen3_moe.py +704 -0
  1115. vllm/model_executor/models/qwen3_next.py +1298 -0
  1116. vllm/model_executor/models/qwen3_next_mtp.py +285 -0
  1117. vllm/model_executor/models/qwen_vl.py +795 -0
  1118. vllm/model_executor/models/registry.py +891 -0
  1119. vllm/model_executor/models/roberta.py +252 -0
  1120. vllm/model_executor/models/rvl.py +103 -0
  1121. vllm/model_executor/models/seed_oss.py +488 -0
  1122. vllm/model_executor/models/siglip.py +524 -0
  1123. vllm/model_executor/models/siglip2navit.py +688 -0
  1124. vllm/model_executor/models/skyworkr1v.py +914 -0
  1125. vllm/model_executor/models/smolvlm.py +44 -0
  1126. vllm/model_executor/models/solar.py +506 -0
  1127. vllm/model_executor/models/stablelm.py +344 -0
  1128. vllm/model_executor/models/starcoder2.py +357 -0
  1129. vllm/model_executor/models/step3_text.py +521 -0
  1130. vllm/model_executor/models/step3_vl.py +1091 -0
  1131. vllm/model_executor/models/swin.py +475 -0
  1132. vllm/model_executor/models/tarsier.py +649 -0
  1133. vllm/model_executor/models/telechat2.py +151 -0
  1134. vllm/model_executor/models/teleflm.py +79 -0
  1135. vllm/model_executor/models/terratorch.py +294 -0
  1136. vllm/model_executor/models/transformers.py +883 -0
  1137. vllm/model_executor/models/ultravox.py +667 -0
  1138. vllm/model_executor/models/utils.py +770 -0
  1139. vllm/model_executor/models/vision.py +125 -0
  1140. vllm/model_executor/models/voxtral.py +789 -0
  1141. vllm/model_executor/models/whisper.py +966 -0
  1142. vllm/model_executor/models/zamba2.py +1056 -0
  1143. vllm/model_executor/parameter.py +599 -0
  1144. vllm/model_executor/sampling_metadata.py +597 -0
  1145. vllm/model_executor/utils.py +97 -0
  1146. vllm/model_executor/warmup/__init__.py +0 -0
  1147. vllm/model_executor/warmup/deep_gemm_warmup.py +223 -0
  1148. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1149. vllm/multimodal/__init__.py +35 -0
  1150. vllm/multimodal/audio.py +116 -0
  1151. vllm/multimodal/base.py +219 -0
  1152. vllm/multimodal/cache.py +507 -0
  1153. vllm/multimodal/hasher.py +110 -0
  1154. vllm/multimodal/image.py +130 -0
  1155. vllm/multimodal/inputs.py +979 -0
  1156. vllm/multimodal/parse.py +496 -0
  1157. vllm/multimodal/processing.py +1921 -0
  1158. vllm/multimodal/profiling.py +313 -0
  1159. vllm/multimodal/registry.py +375 -0
  1160. vllm/multimodal/utils.py +754 -0
  1161. vllm/multimodal/video.py +312 -0
  1162. vllm/outputs.py +517 -0
  1163. vllm/platforms/__init__.py +263 -0
  1164. vllm/platforms/cpu.py +353 -0
  1165. vllm/platforms/cuda.py +731 -0
  1166. vllm/platforms/interface.py +599 -0
  1167. vllm/platforms/rocm.py +504 -0
  1168. vllm/platforms/tpu.py +236 -0
  1169. vllm/platforms/xpu.py +243 -0
  1170. vllm/plugins/__init__.py +72 -0
  1171. vllm/plugins/io_processors/__init__.py +68 -0
  1172. vllm/plugins/io_processors/interface.py +67 -0
  1173. vllm/plugins/lora_resolvers/README.md +16 -0
  1174. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1175. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1176. vllm/pooling_params.py +183 -0
  1177. vllm/profiler/__init__.py +0 -0
  1178. vllm/profiler/layerwise_profile.py +375 -0
  1179. vllm/profiler/utils.py +148 -0
  1180. vllm/py.typed +2 -0
  1181. vllm/ray/__init__.py +0 -0
  1182. vllm/ray/lazy_utils.py +22 -0
  1183. vllm/ray/ray_env.py +72 -0
  1184. vllm/reasoning/__init__.py +25 -0
  1185. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1186. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1187. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1188. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1189. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1190. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1191. vllm/reasoning/mistral_reasoning_parser.py +47 -0
  1192. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1193. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1194. vllm/sampling_params.py +577 -0
  1195. vllm/scalar_type.py +349 -0
  1196. vllm/scripts.py +15 -0
  1197. vllm/sequence.py +1465 -0
  1198. vllm/tasks.py +11 -0
  1199. vllm/test_utils.py +130 -0
  1200. vllm/third_party/__init__.py +0 -0
  1201. vllm/third_party/pynvml.py +6140 -0
  1202. vllm/tracing.py +136 -0
  1203. vllm/transformers_utils/__init__.py +24 -0
  1204. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1205. vllm/transformers_utils/chat_templates/registry.py +71 -0
  1206. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1207. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1208. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1209. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1210. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1211. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1212. vllm/transformers_utils/config.py +1043 -0
  1213. vllm/transformers_utils/config_parser_base.py +20 -0
  1214. vllm/transformers_utils/configs/__init__.py +55 -0
  1215. vllm/transformers_utils/configs/arctic.py +207 -0
  1216. vllm/transformers_utils/configs/chatglm.py +72 -0
  1217. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1218. vllm/transformers_utils/configs/eagle.py +84 -0
  1219. vllm/transformers_utils/configs/falcon.py +90 -0
  1220. vllm/transformers_utils/configs/jais.py +238 -0
  1221. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1222. vllm/transformers_utils/configs/medusa.py +63 -0
  1223. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1224. vllm/transformers_utils/configs/mistral.py +165 -0
  1225. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1226. vllm/transformers_utils/configs/moonvit.py +33 -0
  1227. vllm/transformers_utils/configs/nemotron.py +205 -0
  1228. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1229. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1230. vllm/transformers_utils/configs/ovis.py +176 -0
  1231. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1232. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1233. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1234. vllm/transformers_utils/configs/speculators/base.py +91 -0
  1235. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1236. vllm/transformers_utils/configs/ultravox.py +120 -0
  1237. vllm/transformers_utils/detokenizer.py +169 -0
  1238. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1239. vllm/transformers_utils/dynamic_module.py +60 -0
  1240. vllm/transformers_utils/processor.py +245 -0
  1241. vllm/transformers_utils/processors/__init__.py +16 -0
  1242. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1243. vllm/transformers_utils/processors/ovis.py +420 -0
  1244. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1245. vllm/transformers_utils/runai_utils.py +99 -0
  1246. vllm/transformers_utils/s3_utils.py +90 -0
  1247. vllm/transformers_utils/tokenizer.py +293 -0
  1248. vllm/transformers_utils/tokenizer_base.py +149 -0
  1249. vllm/transformers_utils/tokenizer_group.py +132 -0
  1250. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1251. vllm/transformers_utils/tokenizers/mistral.py +520 -0
  1252. vllm/transformers_utils/utils.py +99 -0
  1253. vllm/triton_utils/__init__.py +16 -0
  1254. vllm/triton_utils/importing.py +95 -0
  1255. vllm/usage/__init__.py +0 -0
  1256. vllm/usage/usage_lib.py +259 -0
  1257. vllm/utils/__init__.py +3438 -0
  1258. vllm/utils/deep_gemm.py +212 -0
  1259. vllm/utils/flashinfer.py +372 -0
  1260. vllm/utils/jsontree.py +90 -0
  1261. vllm/utils/tensor_schema.py +236 -0
  1262. vllm/v1/__init__.py +0 -0
  1263. vllm/v1/attention/__init__.py +0 -0
  1264. vllm/v1/attention/backends/__init__.py +0 -0
  1265. vllm/v1/attention/backends/cpu_attn.py +922 -0
  1266. vllm/v1/attention/backends/flash_attn.py +800 -0
  1267. vllm/v1/attention/backends/flashinfer.py +1128 -0
  1268. vllm/v1/attention/backends/flex_attention.py +796 -0
  1269. vllm/v1/attention/backends/gdn_attn.py +320 -0
  1270. vllm/v1/attention/backends/linear_attn.py +68 -0
  1271. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1272. vllm/v1/attention/backends/mamba2_attn.py +224 -0
  1273. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1274. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1275. vllm/v1/attention/backends/mla/common.py +1608 -0
  1276. vllm/v1/attention/backends/mla/cutlass_mla.py +301 -0
  1277. vllm/v1/attention/backends/mla/flashattn_mla.py +273 -0
  1278. vllm/v1/attention/backends/mla/flashinfer_mla.py +110 -0
  1279. vllm/v1/attention/backends/mla/flashmla.py +213 -0
  1280. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1281. vllm/v1/attention/backends/mla/triton_mla.py +175 -0
  1282. vllm/v1/attention/backends/pallas.py +413 -0
  1283. vllm/v1/attention/backends/rocm_aiter_fa.py +548 -0
  1284. vllm/v1/attention/backends/short_conv_attn.py +82 -0
  1285. vllm/v1/attention/backends/tree_attn.py +450 -0
  1286. vllm/v1/attention/backends/triton_attn.py +430 -0
  1287. vllm/v1/attention/backends/utils.py +834 -0
  1288. vllm/v1/attention/backends/xformers.py +437 -0
  1289. vllm/v1/core/__init__.py +0 -0
  1290. vllm/v1/core/block_pool.py +330 -0
  1291. vllm/v1/core/encoder_cache_manager.py +333 -0
  1292. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1293. vllm/v1/core/kv_cache_manager.py +398 -0
  1294. vllm/v1/core/kv_cache_utils.py +1169 -0
  1295. vllm/v1/core/sched/__init__.py +0 -0
  1296. vllm/v1/core/sched/async_scheduler.py +47 -0
  1297. vllm/v1/core/sched/interface.py +158 -0
  1298. vllm/v1/core/sched/output.py +162 -0
  1299. vllm/v1/core/sched/request_queue.py +224 -0
  1300. vllm/v1/core/sched/scheduler.py +1287 -0
  1301. vllm/v1/core/sched/utils.py +69 -0
  1302. vllm/v1/core/single_type_kv_cache_manager.py +670 -0
  1303. vllm/v1/cudagraph_dispatcher.py +121 -0
  1304. vllm/v1/engine/__init__.py +202 -0
  1305. vllm/v1/engine/async_llm.py +757 -0
  1306. vllm/v1/engine/coordinator.py +357 -0
  1307. vllm/v1/engine/core.py +1245 -0
  1308. vllm/v1/engine/core_client.py +1333 -0
  1309. vllm/v1/engine/detokenizer.py +300 -0
  1310. vllm/v1/engine/exceptions.py +17 -0
  1311. vllm/v1/engine/llm_engine.py +332 -0
  1312. vllm/v1/engine/logprobs.py +201 -0
  1313. vllm/v1/engine/output_processor.py +558 -0
  1314. vllm/v1/engine/parallel_sampling.py +133 -0
  1315. vllm/v1/engine/processor.py +524 -0
  1316. vllm/v1/engine/utils.py +857 -0
  1317. vllm/v1/executor/__init__.py +0 -0
  1318. vllm/v1/executor/abstract.py +126 -0
  1319. vllm/v1/executor/multiproc_executor.py +683 -0
  1320. vllm/v1/executor/ray_distributed_executor.py +109 -0
  1321. vllm/v1/kv_cache_interface.py +275 -0
  1322. vllm/v1/metrics/__init__.py +0 -0
  1323. vllm/v1/metrics/loggers.py +717 -0
  1324. vllm/v1/metrics/prometheus.py +82 -0
  1325. vllm/v1/metrics/ray_wrappers.py +133 -0
  1326. vllm/v1/metrics/reader.py +246 -0
  1327. vllm/v1/metrics/stats.py +248 -0
  1328. vllm/v1/outputs.py +147 -0
  1329. vllm/v1/pool/__init__.py +0 -0
  1330. vllm/v1/pool/metadata.py +77 -0
  1331. vllm/v1/request.py +237 -0
  1332. vllm/v1/sample/__init__.py +0 -0
  1333. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1334. vllm/v1/sample/logits_processor/builtin.py +273 -0
  1335. vllm/v1/sample/logits_processor/interface.py +97 -0
  1336. vllm/v1/sample/logits_processor/state.py +161 -0
  1337. vllm/v1/sample/metadata.py +43 -0
  1338. vllm/v1/sample/ops/__init__.py +0 -0
  1339. vllm/v1/sample/ops/bad_words.py +39 -0
  1340. vllm/v1/sample/ops/logprobs.py +26 -0
  1341. vllm/v1/sample/ops/penalties.py +43 -0
  1342. vllm/v1/sample/ops/topk_topp_sampler.py +254 -0
  1343. vllm/v1/sample/rejection_sampler.py +623 -0
  1344. vllm/v1/sample/sampler.py +281 -0
  1345. vllm/v1/sample/tpu/__init__.py +0 -0
  1346. vllm/v1/sample/tpu/metadata.py +124 -0
  1347. vllm/v1/sample/tpu/sampler.py +213 -0
  1348. vllm/v1/serial_utils.py +395 -0
  1349. vllm/v1/spec_decode/__init__.py +0 -0
  1350. vllm/v1/spec_decode/eagle.py +740 -0
  1351. vllm/v1/spec_decode/medusa.py +66 -0
  1352. vllm/v1/spec_decode/metadata.py +62 -0
  1353. vllm/v1/spec_decode/metrics.py +191 -0
  1354. vllm/v1/spec_decode/ngram_proposer.py +157 -0
  1355. vllm/v1/spec_decode/utils.py +14 -0
  1356. vllm/v1/structured_output/__init__.py +297 -0
  1357. vllm/v1/structured_output/backend_guidance.py +245 -0
  1358. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1359. vllm/v1/structured_output/backend_outlines.py +320 -0
  1360. vllm/v1/structured_output/backend_types.py +134 -0
  1361. vllm/v1/structured_output/backend_xgrammar.py +323 -0
  1362. vllm/v1/structured_output/request.py +86 -0
  1363. vllm/v1/structured_output/utils.py +373 -0
  1364. vllm/v1/utils.py +382 -0
  1365. vllm/v1/worker/__init__.py +0 -0
  1366. vllm/v1/worker/block_table.py +221 -0
  1367. vllm/v1/worker/cpu_model_runner.py +163 -0
  1368. vllm/v1/worker/cpu_worker.py +183 -0
  1369. vllm/v1/worker/gpu_input_batch.py +821 -0
  1370. vllm/v1/worker/gpu_model_runner.py +3743 -0
  1371. vllm/v1/worker/gpu_worker.py +697 -0
  1372. vllm/v1/worker/kv_connector_model_runner_mixin.py +122 -0
  1373. vllm/v1/worker/lora_model_runner_mixin.py +192 -0
  1374. vllm/v1/worker/tpu_input_batch.py +585 -0
  1375. vllm/v1/worker/tpu_model_runner.py +1947 -0
  1376. vllm/v1/worker/tpu_worker.py +340 -0
  1377. vllm/v1/worker/utils.py +290 -0
  1378. vllm/v1/worker/worker_base.py +65 -0
  1379. vllm/v1/worker/xpu_model_runner.py +53 -0
  1380. vllm/v1/worker/xpu_worker.py +179 -0
  1381. vllm/version.py +41 -0
  1382. vllm/vllm_flash_attn/.gitkeep +0 -0
  1383. vllm/worker/__init__.py +0 -0
  1384. vllm/worker/cache_engine.py +145 -0
  1385. vllm/worker/enc_dec_model_runner.py +553 -0
  1386. vllm/worker/model_runner.py +2016 -0
  1387. vllm/worker/model_runner_base.py +307 -0
  1388. vllm/worker/utils.py +49 -0
  1389. vllm/worker/worker.py +670 -0
  1390. vllm/worker/worker_base.py +651 -0
  1391. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/METADATA +326 -0
  1392. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/RECORD +1395 -0
  1393. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/WHEEL +5 -0
  1394. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/entry_points.txt +5 -0
  1395. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/top_level.txt +1 -0
vllm/v1/worker/gpu_model_runner.py
@@ -0,0 +1,3743 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import gc
5
+ import itertools
6
+ import time
7
+ from collections import defaultdict
8
+ from collections.abc import Iterator
9
+ from contextlib import contextmanager
10
+ from copy import deepcopy
11
+ from typing import TYPE_CHECKING, Any, Optional, Union, cast
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torch.distributed
16
+ import torch.nn as nn
17
+ from tqdm import tqdm
18
+
19
+ import vllm.envs as envs
20
+ from vllm.attention import Attention, AttentionType
21
+ from vllm.attention.backends.abstract import AttentionBackend
22
+ from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
23
+ from vllm.compilation.counter import compilation_counter
24
+ from vllm.compilation.cuda_graph import CUDAGraphWrapper
25
+ from vllm.compilation.monitor import set_cudagraph_capturing_enabled
26
+ from vllm.config import (CompilationLevel, CUDAGraphMode, VllmConfig,
27
+ get_layers_from_vllm_config, update_config)
28
+ from vllm.distributed.eplb.eplb_state import EplbState
29
+ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
30
+ has_kv_transfer_group)
31
+ from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
32
+ from vllm.distributed.parallel_state import (
33
+ get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
34
+ prepare_communication_buffer_for_model)
35
+ from vllm.forward_context import (BatchDescriptor, DPMetadata,
36
+ set_forward_context)
37
+ from vllm.logger import init_logger
38
+ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
39
+ from vllm.model_executor.layers.mamba.abstract import MambaBase
40
+ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
41
+ from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
42
+ from vllm.model_executor.models.interfaces import (is_mixture_of_experts,
43
+ supports_eagle3,
44
+ supports_transcription)
45
+ from vllm.model_executor.models.interfaces_base import (
46
+ VllmModelForPooling, is_pooling_model, is_text_generation_model)
47
+ from vllm.multimodal import MULTIMODAL_REGISTRY
48
+ from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargsItem,
49
+ PlaceholderRange)
50
+ from vllm.multimodal.utils import group_mm_kwargs_by_modality
51
+ from vllm.pooling_params import PoolingParams
52
+ from vllm.sampling_params import SamplingType
53
+ from vllm.sequence import IntermediateTensors, PoolerOutput
54
+ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
55
+ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
56
+ GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size,
57
+ is_pin_memory_available, round_up, supports_dynamo)
58
+ from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
59
+ from vllm.v1.attention.backends.utils import (
60
+ AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
61
+ create_fast_prefill_custom_backend,
62
+ reorder_batch_to_split_decodes_and_prefills)
63
+ from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
64
+ # yapf conflicts with isort for this block
65
+ # yapf: disable
66
+ from vllm.v1.kv_cache_interface import (AttentionSpec,
67
+ ChunkedLocalAttentionSpec,
68
+ CrossAttentionSpec,
69
+ EncoderOnlyAttentionSpec,
70
+ FullAttentionSpec, KVCacheConfig,
71
+ KVCacheGroupSpec, KVCacheSpec,
72
+ MambaSpec, SlidingWindowSpec)
73
+ # yapf: enable
74
+ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
75
+ DraftTokenIds, LogprobsLists, LogprobsTensors,
76
+ ModelRunnerOutput, SamplerOutput)
77
+ from vllm.v1.pool.metadata import PoolingMetadata
78
+ from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
79
+ from vllm.v1.sample.metadata import SamplingMetadata
80
+ from vllm.v1.sample.rejection_sampler import RejectionSampler
81
+ from vllm.v1.sample.sampler import Sampler
82
+ from vllm.v1.spec_decode.eagle import EagleProposer
83
+ from vllm.v1.spec_decode.medusa import MedusaProposer
84
+ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
85
+ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
86
+ from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
87
+ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
88
+ from vllm.v1.worker.kv_connector_model_runner_mixin import (
89
+ KVConnectorModelRunnerMixin, KVConnectorOutput)
90
+ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
91
+
92
+ from .utils import (AttentionGroup, MultiModalBudget,
93
+ add_kv_sharing_layers_to_kv_cache_groups, bind_kv_cache,
94
+ gather_mm_placeholders, sanity_check_mm_encoder_outputs,
95
+ scatter_mm_placeholders)
96
+
97
+ if TYPE_CHECKING:
98
+ import xgrammar as xgr
99
+
100
+ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
101
+ from vllm.v1.core.sched.output import SchedulerOutput
102
+ else:
103
+ xgr = LazyLoader("xgr", globals(), "xgrammar")
104
+
105
+ logger = init_logger(__name__)
106
+
107
+
108
+ # Wrapper for ModelRunnerOutput to support overlapped execution.
109
+ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
110
+
111
+ def __init__(
112
+ self,
113
+ model_runner_output: ModelRunnerOutput,
114
+ sampled_token_ids: torch.Tensor,
115
+ invalid_req_indices: list[int],
116
+ async_output_copy_stream: torch.cuda.Stream,
117
+ ):
118
+ self._model_runner_output = model_runner_output
119
+ self._invalid_req_indices = invalid_req_indices
120
+
121
+ # Event on the copy stream so we can synchronize the non-blocking copy.
122
+ self._async_copy_ready_event = torch.cuda.Event()
123
+
124
+ # Keep a reference to the device tensor to avoid it being
125
+ # deallocated until we finish copying it to the host.
126
+ self._sampled_token_ids = sampled_token_ids
127
+
128
+ # Initiate the copy on a separate stream, but do not synchronize it.
129
+ default_stream = torch.cuda.current_stream()
130
+ with torch.cuda.stream(async_output_copy_stream):
131
+ async_output_copy_stream.wait_stream(default_stream)
132
+ self._sampled_token_ids_cpu = self._sampled_token_ids.to(
133
+ 'cpu', non_blocking=True)
134
+ self._async_copy_ready_event.record()
135
+
136
+ def get_output(self) -> ModelRunnerOutput:
137
+ """Copy the device tensors to the host and return a ModelRunnerOutput.
138
+
139
+ This function blocks until the copy is finished.
140
+ """
141
+ self._async_copy_ready_event.synchronize()
142
+
143
+ # Release the device tensor once the copy has completed
144
+ del self._sampled_token_ids
145
+
146
+ valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
147
+ for i in self._invalid_req_indices:
148
+ valid_sampled_token_ids[i].clear()
149
+
150
+ output = self._model_runner_output
151
+ output.sampled_token_ids = valid_sampled_token_ids
152
+ return output
153
+
154
+
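# Illustrative sketch (hypothetical helper, not from the file above): the
# copy-stream pattern AsyncGPUModelRunnerOutput uses — start a non-blocking
# device-to-host copy on a side stream, record an event, and synchronize only
# when the host data is actually needed.
import torch

def start_async_d2h_copy(device_tensor: torch.Tensor,
                         copy_stream: torch.cuda.Stream):
    done = torch.cuda.Event()
    default_stream = torch.cuda.current_stream()
    with torch.cuda.stream(copy_stream):
        # Order the copy after all pending work on the default stream.
        copy_stream.wait_stream(default_stream)
        host_tensor = device_tensor.to("cpu", non_blocking=True)
        done.record()
    return host_tensor, done

# Later (as in get_output above): done.synchronize(); then read host_tensor.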
155
+ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
156
+
157
+ def __init__(
158
+ self,
159
+ vllm_config: VllmConfig,
160
+ device: torch.device,
161
+ ):
162
+ self.vllm_config = vllm_config
163
+ self.model_config = vllm_config.model_config
164
+ self.cache_config = vllm_config.cache_config
165
+ self.compilation_config = vllm_config.compilation_config
166
+ self.lora_config = vllm_config.lora_config
167
+ self.load_config = vllm_config.load_config
168
+ self.parallel_config = vllm_config.parallel_config
169
+ self.scheduler_config = vllm_config.scheduler_config
170
+ self.speculative_config = vllm_config.speculative_config
171
+ self.observability_config = vllm_config.observability_config
172
+
173
+ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
174
+ set_cpu_offload_max_bytes(
175
+ int(self.cache_config.cpu_offload_gb * 1024**3))
176
+
177
+ model_config = self.model_config
178
+ cache_config = self.cache_config
179
+ scheduler_config = self.scheduler_config
180
+ parallel_config = self.parallel_config
181
+ self.device = device
182
+ self.pin_memory = is_pin_memory_available()
183
+ self.dtype = self.model_config.dtype
184
+ if cache_config.cache_dtype == "auto":
185
+ self.kv_cache_dtype = self.dtype
186
+ else:
187
+ self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
188
+ cache_config.cache_dtype]
189
+
190
+ self.is_pooling_model = (model_config.runner_type == 'pooling')
191
+ self.is_multimodal_raw_input_only_model = (
192
+ model_config.is_multimodal_raw_input_only_model)
193
+
194
+ self.max_model_len = model_config.max_model_len
195
+ self.dcp_world_size = self.parallel_config.decode_context_parallel_size
196
+ self.max_num_tokens = scheduler_config.max_num_batched_tokens
197
+ self.max_num_reqs = scheduler_config.max_num_seqs
198
+
199
+ # Model-related.
200
+ self.num_query_heads = model_config.get_num_attention_heads(
201
+ parallel_config)
202
+ self.hidden_size = model_config.get_hidden_size()
203
+ self.attention_chunk_size = model_config.attention_chunk_size
204
+         # Only relevant for models using ALiBi (e.g., MPT)
205
+ self.use_alibi = check_use_alibi(model_config)
206
+
207
+ self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
208
+
209
+ # Multi-modal data support
210
+ self.mm_registry = MULTIMODAL_REGISTRY
211
+ self.uses_mrope = model_config.uses_mrope
212
+ self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
213
+ model_config)
214
+
215
+ if self.model_config.is_encoder_decoder:
216
+ # Maximum length of the encoder input, only for encoder-decoder
217
+ # models.
218
+ self.max_encoder_len = self.mm_registry.\
219
+ get_encdec_max_encoder_len(model_config)
220
+ else:
221
+ self.max_encoder_len = 0
222
+
223
+ # Sampler
224
+ self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
225
+
226
+ self.eplb_state: Optional[EplbState] = None
227
+ """
228
+ State of the expert parallelism load balancer.
229
+
230
+ Will be lazily initialized when the model is loaded.
231
+ """
232
+
233
+ # Lazy initializations
234
+ # self.model: nn.Module # Set after load_model
235
+ # Initialize in initialize_kv_cache
236
+ self.kv_caches: list[torch.Tensor] = []
237
+ # indexes: [kv_cache_group_id][attn_group]
238
+ self.attn_groups: list[list[AttentionGroup]] = []
239
+ # self.kv_cache_config: KVCacheConfig
240
+
241
+ # mm_hash -> encoder_output
242
+ self.encoder_cache: dict[str, torch.Tensor] = {}
243
+
244
+ self.use_aux_hidden_state_outputs = False
245
+ # Set up speculative decoding.
246
+ # NOTE(Jiayi): currently we put the entire draft model on
247
+ # the last PP rank. This is not ideal if there are many
248
+ # layers in the draft model.
249
+ if self.speculative_config and get_pp_group().is_last_rank:
250
+ if self.speculative_config.method == "ngram":
251
+ self.drafter = NgramProposer(self.vllm_config)
252
+ elif self.speculative_config.use_eagle():
253
+ self.drafter = EagleProposer(self.vllm_config, self.device,
254
+ self) # type: ignore
255
+ if self.speculative_config.method == "eagle3":
256
+ self.use_aux_hidden_state_outputs = True
257
+ elif self.speculative_config.method == "medusa":
258
+ self.drafter = MedusaProposer(
259
+ vllm_config=self.vllm_config,
260
+ device=self.device) # type: ignore
261
+ else:
262
+ raise ValueError("Unknown speculative decoding method: "
263
+ f"{self.speculative_config.method}")
264
+ self.rejection_sampler = RejectionSampler()
265
+
266
+ # Request states.
267
+ self.requests: dict[str, CachedRequestState] = {}
268
+
269
+ # Input Batch
270
+ # NOTE(Chen): Ideally, we should initialize the input batch inside
271
+ # `initialize_kv_cache` based on the kv cache config. However, as in
272
+         # https://github.com/vllm-project/vllm/pull/18298, for unknown reasons
274
+         # we have to initialize the input batch before `load_model`; otherwise,
275
+         # quantization + weight offloading will fail. As a temporary
275
+ # solution, we initialize the input batch here, and re-initialize it
276
+         # in `initialize_kv_cache` if the block_sizes here differ from
277
+ # the block_sizes in the kv cache config.
278
+ self.input_batch = InputBatch(
279
+ max_num_reqs=self.max_num_reqs,
280
+             # We need to use the encoder length for encoder-decoder
281
+ # because of KV cache for cross-attention.
282
+ max_model_len=max(self.max_model_len, self.max_encoder_len),
283
+ max_num_batched_tokens=self.max_num_tokens,
284
+ device=self.device,
285
+ pin_memory=self.pin_memory,
286
+ vocab_size=self.model_config.get_vocab_size(),
287
+ block_sizes=[self.cache_config.block_size],
288
+ is_spec_decode=bool(self.vllm_config.speculative_config),
289
+ logitsprocs=build_logitsprocs(
290
+ self.vllm_config, self.device, self.pin_memory,
291
+ self.is_pooling_model,
292
+ self.vllm_config.model_config.logits_processors),
293
+ is_pooling_model=self.is_pooling_model,
294
+ )
295
+
296
+ self.use_async_scheduling = self.scheduler_config.async_scheduling
297
+ self.async_output_copy_stream = torch.cuda.Stream() if \
298
+ self.use_async_scheduling else None
299
+
300
+ # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
301
+         # Note the differing conventions:
302
+         # self.cudagraph_batch_sizes is sorted in ascending order, while
303
+         # the batch sizes in the config are in descending order.
304
+ if self.compilation_config.cudagraph_capture_sizes and \
305
+ self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
306
+ self.cudagraph_batch_sizes = list(
307
+ reversed(self.compilation_config.cudagraph_capture_sizes))
308
+
309
+ # Cache the device properties.
310
+ self._init_device_properties()
311
+
312
+ # Persistent buffers for CUDA graphs.
313
+ self.input_ids = self._make_buffer(self.max_num_tokens,
314
+ dtype=torch.int32)
315
+ self.positions = self._make_buffer(self.max_num_tokens,
316
+ dtype=torch.int64)
317
+ self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
318
+ dtype=torch.int32)
319
+ self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
320
+ # Because inputs_embeds may be bfloat16 and we don't need a numpy
321
+ # version of this tensor, avoid a RuntimeError by not creating a
322
+ # numpy buffer.
323
+ self.inputs_embeds = self._make_buffer(self.max_num_tokens,
324
+ self.hidden_size,
325
+ dtype=self.dtype,
326
+ numpy=False)
327
+ self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
328
+ dtype=torch.int32)
329
+ self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
330
+ dtype=torch.int64)
331
+
332
+         # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
333
+ if self.uses_mrope:
334
+ # NOTE: `mrope_positions` is implemented with one additional dummy
335
+ # position on purpose to make it non-contiguous so that it can work
336
+ # with torch compile.
337
+ # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
338
+
339
+ # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
340
+ # the modality of inputs. For text-only inputs, each dimension has
341
+ # identical position IDs, making M-RoPE functionally equivalent to
342
+ # 1D-RoPE.
343
+ # See page 5 of https://arxiv.org/abs/2409.12191
344
+ self.mrope_positions = self._make_buffer(
345
+ (3, self.max_num_tokens + 1), dtype=torch.int64)
346
+
347
+ # CUDA event to synchronize use of reused CPU tensors between steps
348
+ # when async scheduling is enabled.
349
+ self.prepare_inputs_event: Optional[torch.cuda.Event] = None
350
+ if self.use_async_scheduling:
351
+ self.prepare_inputs_event = torch.cuda.Event()
352
+ # Start in a completed state.
353
+ self.prepare_inputs_event.record(torch.cuda.default_stream())
354
+
355
+ # None in the first PP rank. The rest are set after load_model.
356
+ self.intermediate_tensors: Optional[IntermediateTensors] = None
357
+
358
+ # OPTIMIZATION: Cache the tensors rather than creating them every step.
359
+ # Keep in int64 to avoid overflow with long context
360
+ self.arange_np = np.arange(max(self.max_num_reqs + 1,
361
+ self.max_model_len,
362
+ self.max_num_tokens),
363
+ dtype=np.int64)
364
+
365
+ # Layer pairings for cross-layer KV sharing.
366
+ # If an Attention layer `layer_name` is in the keys of this dict, it
367
+ # means this layer will perform attention using the keys and values
368
+ # from the KV cache of `shared_kv_cache_layers[layer_name]`.
369
+ self.shared_kv_cache_layers: dict[str, str] = {}
370
+ self.kv_sharing_fast_prefill_eligible_layers: set[str] = set()
371
+
372
+ self.kv_sharing_fast_prefill_logits_indices = None
373
+ if self.cache_config.kv_sharing_fast_prefill:
374
+ self.kv_sharing_fast_prefill_logits_indices = torch.zeros(
375
+ self.max_num_tokens, dtype=torch.int32, device=self.device)
376
+
377
+ self.uniform_decode_query_len = 1 if not self.speculative_config else \
378
+ 1 + self.speculative_config.num_speculative_tokens
379
+
380
+ # Cudagraph dispatcher for runtime cudagraph dispatching.
381
+ self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config)
382
+
383
+ self.mm_budget = MultiModalBudget(
384
+ self.model_config,
385
+ self.scheduler_config,
386
+ self.mm_registry,
387
+ ) if self.supports_mm_inputs else None
388
+
389
+ self.reorder_batch_threshold: Optional[int] = None
390
+
391
+ # Attention layers that are only in the KVCacheConfig of the runner
392
+ # (e.g., KV sharing, encoder-only attention), but not in the
393
+ # KVCacheConfig of the scheduler.
394
+ self.runner_only_attn_layers: set[str] = set()
395
+
396
+ # Cached outputs.
397
+ self._draft_token_ids: Optional[Union[list[list[int]],
398
+ torch.Tensor]] = None
399
+ self.transfer_event = torch.cuda.Event()
400
+ self.sampled_token_ids_pinned_cpu = torch.empty(
401
+ (self.max_model_len, 1),
402
+ dtype=torch.int64,
403
+ device="cpu",
404
+ pin_memory=self.pin_memory)
405
+
406
+ def _make_buffer(self,
407
+ *size: Union[int, torch.SymInt],
408
+ dtype: torch.dtype,
409
+ numpy: bool = True) -> CpuGpuBuffer:
410
+ # Bfloat16 torch tensors cannot be directly cast to a numpy array, so
411
+ # if a bfloat16 buffer is needed without a corresponding numpy array,
412
+ # don't bother instantiating the numpy array.
413
+ return CpuGpuBuffer(*size,
414
+ dtype=dtype,
415
+ device=self.device,
416
+ pin_memory=self.pin_memory,
417
+ with_numpy=numpy)
418
+
419
+ def _init_model_kwargs(self, num_tokens: int):
420
+ model_kwargs = dict[str, Any]()
421
+
422
+ if not self.is_pooling_model:
423
+ return model_kwargs
424
+
425
+ num_reqs = self.input_batch.num_reqs
426
+ pooling_params = self.input_batch.get_pooling_params()
427
+
428
+ token_type_id_requests = dict[int, Any]()
429
+ for i, param in enumerate(pooling_params):
430
+ if param.extra_kwargs is not None and \
431
+ (token_types := param.extra_kwargs.get(
432
+ "compressed_token_type_ids")) is not None:
433
+ token_type_id_requests[i] = token_types
434
+
435
+ if len(token_type_id_requests) == 0:
436
+ return model_kwargs
437
+
438
+ seq_lens = self.seq_lens.gpu[:num_reqs]
439
+ token_type_ids = []
440
+
441
+ for i in range(num_reqs):
442
+ pos = token_type_id_requests.get(i, seq_lens[i])
443
+ ids = (torch.arange(seq_lens[i]) >= pos).int()
444
+ token_type_ids.append(ids)
445
+
446
+ model_kwargs["token_type_ids"] = torch.concat(token_type_ids).to(
447
+ device=self.device)
448
+ return model_kwargs
449
+
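# Illustrative sketch (hypothetical values, not from the file above): how a
# "compressed_token_type_ids" boundary expands into a full token_type_ids
# vector, as _init_model_kwargs does per request.
import torch

seq_len, boundary = 6, 4  # 6 tokens; segment 1 starts at position 4
token_type_ids = (torch.arange(seq_len) >= boundary).int()
# token_type_ids -> tensor([0, 0, 0, 0, 1, 1], dtype=torch.int32)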
450
+ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
451
+ """
452
+ Update the order of requests in the batch based on the attention
453
+ backend's needs. For example, some attention backends (namely MLA) may
454
+ want to separate requests based on if the attention computation will be
455
+ compute-bound or memory-bound.
456
+
457
+ Args:
458
+ scheduler_output: The scheduler output.
459
+ """
460
+         # Attention-free models have zero kv_cache_groups; however, models
461
+ # like Mamba are also attention free but use the kv_cache for
462
+         # keeping their internal state. This is why we check the number
463
+ # of kv_cache groups instead of solely checking
464
+ # for self.model_config.is_attention_free.
465
+ if len(self.kv_cache_config.kv_cache_groups) == 0:
466
+ return
467
+
468
+ if self.reorder_batch_threshold is not None:
469
+ # NOTE(lucas): currently no backend supports the custom masking
470
+ # required for DCP with q_len > 1, so we assert here. Remove this
471
+             # assert once custom mask support is added to FA3.
472
+ if self.dcp_world_size > 1:
473
+ assert self.reorder_batch_threshold == 1, \
474
+                     "DCP does not support reorder_batch_threshold > 1 yet."
475
+ reorder_batch_to_split_decodes_and_prefills(
476
+ self.input_batch,
477
+ scheduler_output,
478
+ decode_threshold=self.reorder_batch_threshold)
479
+
480
+ # Note: used for model runner override.
481
+ def _init_device_properties(self) -> None:
482
+ """Initialize attributes from torch.cuda.get_device_properties
483
+ """
484
+ self.device_properties = torch.cuda.get_device_properties(self.device)
485
+ self.num_sms = self.device_properties.multi_processor_count
486
+
487
+ # Note: used for model runner override.
488
+ def _sync_device(self) -> None:
489
+ torch.cuda.synchronize()
490
+
491
+ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
492
+ """Update the cached states and the persistent batch with the scheduler
493
+ output.
494
+
495
+ The updated states are used by the `_prepare_inputs` function to create
496
+ the input GPU tensors for the model.
497
+
498
+ The SamplingMetadata is updated and copied to the GPU if there is a
499
+ new/resumed/paused/finished request in the batch.
500
+ """
501
+ # Remove finished requests from the cached states.
502
+ for req_id in scheduler_output.finished_req_ids:
503
+ self.requests.pop(req_id, None)
504
+ # Remove the finished requests from the persistent batch.
505
+ # NOTE(woosuk): There could be an edge case where finished_req_ids and
506
+ # scheduled_req_ids overlap. This happens when a request is aborted and
507
+ # then resubmitted with the same ID. In this case, we treat them as two
508
+ # distinct requests - clearing the cached states for the first request
509
+ # and handling the second as a new request.
510
+ for req_id in scheduler_output.finished_req_ids:
511
+ self.input_batch.remove_request(req_id)
512
+
513
+ # Free the cached encoder outputs.
514
+ for mm_hash in scheduler_output.free_encoder_mm_hashes:
515
+ self.encoder_cache.pop(mm_hash, None)
516
+
517
+ # Remove the unscheduled requests from the persistent batch.
518
+ # NOTE(woosuk): The unscheduled requests are either preempted requests
519
+ # or running requests that are not scheduled in this step. We remove
520
+ # them from the persistent batch but keep their cached states since
521
+ # they will be scheduled again sometime in the future.
522
+ scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
523
+ cached_req_ids = self.input_batch.req_id_to_index.keys()
524
+ unscheduled_req_ids = cached_req_ids - scheduled_req_ids
525
+ # NOTE(woosuk): The persistent batch optimization assumes that
526
+ # consecutive batches contain mostly the same requests. If batches
527
+ # have low request overlap (e.g., alternating between two distinct
528
+ # sets of requests), this optimization becomes very inefficient.
529
+ for req_id in unscheduled_req_ids:
530
+ self.input_batch.remove_request(req_id)
531
+
532
+ reqs_to_add: list[CachedRequestState] = []
533
+ # Add new requests to the cached states.
534
+ for new_req_data in scheduler_output.scheduled_new_reqs:
535
+ req_id = new_req_data.req_id
536
+ sampling_params = new_req_data.sampling_params
537
+ pooling_params = new_req_data.pooling_params
538
+
539
+ if sampling_params and \
540
+ sampling_params.sampling_type == SamplingType.RANDOM_SEED:
541
+ generator = torch.Generator(device=self.device)
542
+ generator.manual_seed(sampling_params.seed)
543
+ else:
544
+ generator = None
545
+
546
+ if self.is_pooling_model:
547
+ assert pooling_params is not None
548
+ task = pooling_params.task
549
+ assert task is not None, "You did not set `task` in the API"
550
+
551
+ model = cast(VllmModelForPooling, self.get_model())
552
+ to_update = model.pooler.get_pooling_updates(task)
553
+ to_update.apply(pooling_params)
554
+
555
+ req_state = CachedRequestState(
556
+ req_id=req_id,
557
+ prompt_token_ids=new_req_data.prompt_token_ids,
558
+ mm_kwargs=new_req_data.mm_kwargs,
559
+ mm_positions=new_req_data.mm_positions,
560
+ mm_hashes=new_req_data.mm_hashes,
561
+ sampling_params=sampling_params,
562
+ pooling_params=pooling_params,
563
+ generator=generator,
564
+ block_ids=new_req_data.block_ids,
565
+ num_computed_tokens=new_req_data.num_computed_tokens,
566
+ output_token_ids=[],
567
+ lora_request=new_req_data.lora_request,
568
+ )
569
+ self.requests[req_id] = req_state
570
+
571
+             # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
572
+ if self.uses_mrope:
573
+ self._init_mrope_positions(req_state)
574
+
575
+ reqs_to_add.append(req_state)
576
+
577
+ # Update the states of the running/resumed requests.
578
+ is_last_rank = get_pp_group().is_last_rank
579
+ req_data = scheduler_output.scheduled_cached_reqs
580
+ for i, req_id in enumerate(req_data.req_ids):
581
+ req_state = self.requests[req_id]
582
+ num_computed_tokens = req_data.num_computed_tokens[i]
583
+ new_block_ids = req_data.new_block_ids[i]
584
+ resumed_from_preemption = req_data.resumed_from_preemption[i]
585
+
586
+ # Update the cached states.
587
+ req_state.num_computed_tokens = num_computed_tokens
588
+
589
+ if not is_last_rank:
590
+ # When using PP, the scheduler sends the sampled tokens back,
591
+ # because there's no direct communication between the first-
592
+ # stage worker and the last-stage worker.
593
+ new_token_ids = req_data.new_token_ids[i]
594
+ # Add the sampled token(s) from the previous step (if any).
595
+ # This doesn't include "unverified" tokens like spec tokens.
596
+ num_new_tokens = (num_computed_tokens + len(new_token_ids) -
597
+ req_state.num_tokens)
598
+ if num_new_tokens == 1:
599
+ # Avoid slicing list in most common case.
600
+ req_state.output_token_ids.append(new_token_ids[-1])
601
+ elif num_new_tokens > 0:
602
+ req_state.output_token_ids.extend(
603
+ new_token_ids[-num_new_tokens:])
604
+
605
+ # Update the block IDs.
606
+ if not resumed_from_preemption:
607
+ if new_block_ids is not None:
608
+ # Append the new blocks to the existing block IDs.
609
+ for block_ids, new_ids in zip(req_state.block_ids,
610
+ new_block_ids):
611
+ block_ids.extend(new_ids)
612
+ else:
613
+ assert new_block_ids is not None
614
+ # The request is resumed from preemption.
615
+ # Replace the existing block IDs with the new ones.
616
+ req_state.block_ids = new_block_ids
617
+
618
+ req_index = self.input_batch.req_id_to_index.get(req_id)
619
+ if req_index is None:
620
+ # The request is not in the persistent batch.
621
+ # The request was either preempted and resumed later, or was not
622
+ # scheduled in the previous step and needs to be added again.
623
+ reqs_to_add.append(req_state)
624
+ continue
625
+
626
+ # Update the persistent batch.
627
+ self.input_batch.num_computed_tokens_cpu[req_index] = (
628
+ num_computed_tokens)
629
+ if new_block_ids is not None:
630
+ self.input_batch.block_table.append_row(
631
+ new_block_ids, req_index)
632
+
633
+ # For the last rank, we don't need to update the token_ids_cpu
634
+ # because the sampled tokens are already cached.
635
+ if not is_last_rank:
636
+ # Add new_token_ids to token_ids_cpu.
637
+ start_token_index = num_computed_tokens
638
+ end_token_index = num_computed_tokens + len(new_token_ids)
639
+ self.input_batch.token_ids_cpu[
640
+ req_index,
641
+ start_token_index:end_token_index] = new_token_ids
642
+ self.input_batch.num_tokens_no_spec[
643
+ req_index] = end_token_index
644
+ self.input_batch.num_tokens[req_index] = end_token_index
645
+
646
+ # Add spec_token_ids to token_ids_cpu.
647
+ spec_token_ids = (
648
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
649
+ if spec_token_ids:
650
+ num_spec_tokens = len(spec_token_ids)
651
+ start_index = self.input_batch.num_tokens_no_spec[req_index]
652
+ end_token_index = start_index + num_spec_tokens
653
+ self.input_batch.token_ids_cpu[
654
+ req_index, start_index:end_token_index] = spec_token_ids
655
+ # NOTE(woosuk): `num_tokens` here may include spec tokens.
656
+ self.input_batch.num_tokens[req_index] += num_spec_tokens
657
+
658
+ # Add the new or resumed requests to the persistent batch.
659
+ # The smaller empty indices are filled first.
660
+ for request in reqs_to_add:
661
+ self.input_batch.add_request(request)
662
+
663
+ # Condense the batched states if there are gaps left by removed requests
664
+ self.input_batch.condense()
665
+         # Allow the attention backend to reorder the batch if needed.
666
+ self._may_reorder_batch(scheduler_output)
667
+ # Refresh batch metadata with any pending updates.
668
+ self.input_batch.refresh_metadata()
669
+
670
+ def _update_states_after_model_execute(
671
+ self, output_token_ids: torch.Tensor) -> None:
672
+ """Update the cached states after model execution.
673
+
674
+         This is used for MTP/EAGLE with hybrid models: in linear attention,
675
+ only the last token's state is kept. In MTP/EAGLE, for draft tokens
676
+         the states are kept until we decide how many tokens are accepted for
677
+         each sequence, and a shift is applied during the next iteration
678
+ based on the number of accepted tokens.
679
+ """
680
+ if not self.model_config.is_hybrid or not self.speculative_config:
681
+ return
682
+
683
+ # Find the number of accepted tokens for each sequence.
684
+ num_accepted_tokens = (torch.cat(
685
+ [
686
+ output_token_ids,
687
+ torch.full((output_token_ids.size(0), 1),
688
+ -1,
689
+ device=output_token_ids.device),
690
+ ],
691
+ dim=1) == -1).int().argmax(-1).cpu().numpy()
692
+ for i, num_tokens in enumerate(num_accepted_tokens):
693
+ self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
694
+
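# Illustrative sketch (toy tensors, not from the file above): counting accepted
# tokens per sequence by appending a -1 sentinel column and taking the argmax
# of the first rejected slot, mirroring _update_states_after_model_execute.
import torch

output_token_ids = torch.tensor([[5, 7, -1, -1],
                                 [3, -1, -1, -1]])  # -1 marks rejected slots
sentinel = torch.full((output_token_ids.size(0), 1), -1)
num_accepted = (torch.cat([output_token_ids, sentinel],
                          dim=1) == -1).int().argmax(-1)
# num_accepted -> tensor([2, 1])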
695
+ def _init_mrope_positions(self, req_state: CachedRequestState):
696
+ image_grid_thw = []
697
+ video_grid_thw = []
698
+ second_per_grid_ts = []
699
+ audio_feature_lengths = []
700
+ use_audio_in_video = False
701
+ for mm_item in req_state.mm_kwargs:
702
+ mm_input = mm_item.get_data()
703
+ if (t := mm_input.get("image_grid_thw")) is not None:
704
+ image_grid_thw.append(t.tolist())
705
+ if (t := mm_input.get("video_grid_thw")) is not None:
706
+ video_grid_thw.append(t.tolist())
707
+ if (t := mm_input.get("second_per_grid_ts")) is not None:
708
+ second_per_grid_ts.append(t)
709
+ if (t := mm_input.get("audio_feature_lengths")) is not None:
710
+ audio_feature_lengths.append(t)
711
+ if mm_input.get("use_audio_in_video") is True:
712
+ use_audio_in_video = True
713
+
714
+ req_state.mrope_positions, req_state.mrope_position_delta = \
715
+ MRotaryEmbedding.get_input_positions_tensor(
716
+ req_state.prompt_token_ids,
717
+ hf_config=self.model_config.hf_config,
718
+ image_grid_thw=image_grid_thw,
719
+ video_grid_thw=video_grid_thw,
720
+ second_per_grid_ts=second_per_grid_ts,
721
+ audio_feature_lengths=audio_feature_lengths,
722
+ use_audio_in_video=use_audio_in_video,
723
+ )
724
+
725
+ def _extract_mm_kwargs(
726
+ self,
727
+ scheduler_output: "SchedulerOutput",
728
+ ) -> BatchedTensorInputs:
729
+ if not scheduler_output or not self.is_multimodal_raw_input_only_model:
730
+ return {}
731
+
732
+ mm_kwargs = list[MultiModalKwargsItem]()
733
+ for req in scheduler_output.scheduled_new_reqs:
734
+ mm_kwargs.extend(req.mm_kwargs)
735
+
736
+ # Input all modalities at once
737
+ mm_kwargs_combined: BatchedTensorInputs = {}
738
+ for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
739
+ mm_kwargs,
740
+ device=self.device,
741
+ pin_memory=self.pin_memory,
742
+ ):
743
+ mm_kwargs_combined.update(mm_kwargs_group)
744
+
745
+ return mm_kwargs_combined
746
+
747
+ def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs:
748
+ if not self.is_multimodal_raw_input_only_model:
749
+ return {}
750
+
751
+ mm_budget = self.mm_budget
752
+ assert mm_budget is not None
753
+
754
+ dummy_modality = mm_budget.get_modality_with_max_tokens()
755
+ return self._get_mm_dummy_batch(dummy_modality, num_seqs)
756
+
757
+ def _get_cumsum_and_arange(
758
+ self,
759
+ num_tokens: np.ndarray,
760
+ cumsum_dtype: Optional[np.dtype] = None,
761
+ ) -> tuple[np.ndarray, np.ndarray]:
762
+ """Get the cumulative sum and batched arange of the given array.
763
+ # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
764
+ # Equivalent to but faster than:
765
+ # np.concatenate([np.arange(n) for n in num_tokens])
766
+ """
767
+ # Step 1. [2, 5, 3] -> [2, 7, 10]
768
+ cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
769
+ total_num_tokens = cu_num_tokens[-1]
770
+ # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
771
+ cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
772
+ # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
773
+ arange = self.arange_np[:total_num_tokens] - cumsums_offsets
774
+
775
+ return cu_num_tokens, arange
776
+
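# Illustrative sketch (toy input, not from the file above): the cumsum + repeat
# trick used by _get_cumsum_and_arange, on the example from its docstring.
import numpy as np

num_tokens = np.array([2, 5, 3])
cu_num_tokens = np.cumsum(num_tokens)                        # [2, 7, 10]
offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)  # [0 0 2 2 2 2 2 7 7 7]
arange = np.arange(cu_num_tokens[-1]) - offsets              # [0 1 0 1 2 3 4 0 1 2]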
777
+ def _prepare_input_ids(self, total_num_scheduled_tokens: int,
778
+ cu_num_tokens: np.ndarray) -> None:
779
+ """Prepare the input IDs for the current batch.
780
+
781
+ Carefully handles the `prev_sampled_token_ids` which can be cached
782
+         GPU need to be copied into the corresponding slots of input_ids."""
783
+ GPU need to be copied into the corresponding slots into input_ids."""
784
+
785
+ if self.input_batch.prev_sampled_token_ids is None:
786
+ # Normal scheduling case
787
+ self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
788
+ return
789
+
790
+ # Async scheduling case, where some decode requests from the previous
791
+ # iteration won't have entries in input_ids_cpu and need to be copied
792
+ # on the GPU from prev_sampled_token_ids.
793
+ prev_req_id_to_index = self.input_batch.prev_req_id_to_index
794
+ assert prev_req_id_to_index is not None
795
+ flattened_indices = []
796
+ prev_common_req_indices = []
797
+ indices_match = True
798
+ max_flattened_index = -1
799
+ for req_id, cur_index in self.input_batch.req_id_to_index.items():
800
+ if (prev_index := prev_req_id_to_index.get(req_id)) is not None:
801
+ prev_common_req_indices.append(prev_index)
802
+ # We need to compute the flattened input_ids index of the
803
+ # last token in each common request.
804
+ flattened_index = cu_num_tokens[cur_index].item() - 1
805
+ flattened_indices.append(flattened_index)
806
+ indices_match &= (prev_index == flattened_index)
807
+ max_flattened_index = max(max_flattened_index, flattened_index)
808
+ num_commmon_tokens = len(flattened_indices)
809
+ if num_commmon_tokens < total_num_scheduled_tokens:
810
+ # If not all requests are decodes from the last iteration,
811
+             # we need to copy the input_ids_cpu to the GPU first.
812
+ self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
813
+ if num_commmon_tokens == 0:
814
+ # No requests in common with the previous iteration
815
+ # So input_ids_cpu will have all the input ids.
816
+ return
817
+ if indices_match and max_flattened_index == (num_commmon_tokens - 1):
818
+ # Common-case optimization: the batch is unchanged
819
+ # and no reordering happened.
820
+ # The indices are both the same permutation of 0..N-1 so
821
+ # we can copy directly using a single slice.
822
+ self.input_ids.gpu[:num_commmon_tokens].copy_(
823
+ self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
824
+ 0],
825
+ non_blocking=True)
826
+ return
827
+ # Upload the index tensors asynchronously
828
+ # so the scatter can be non-blocking.
829
+ input_ids_index_tensor = torch.tensor(flattened_indices,
830
+ dtype=torch.int64,
831
+ pin_memory=self.pin_memory).to(
832
+ self.device,
833
+ non_blocking=True)
834
+ prev_common_req_indices_tensor = torch.tensor(
835
+ prev_common_req_indices,
836
+ dtype=torch.int64,
837
+ pin_memory=self.pin_memory).to(self.device, non_blocking=True)
838
+ self.input_ids.gpu.scatter_(
839
+ dim=0,
840
+ index=input_ids_index_tensor,
841
+ src=self.input_batch.prev_sampled_token_ids[
842
+ prev_common_req_indices_tensor, 0])
843
+
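# Illustrative sketch (toy sizes, not from the file above): the scatter used by
# _prepare_input_ids to place the previous iteration's sampled tokens into
# their flattened input_ids slots when async scheduling reuses them.
import torch

input_ids = torch.zeros(10, dtype=torch.int64)
prev_sampled_token_ids = torch.tensor([[11], [22], [33]])  # one token per prev request
flattened_indices = torch.tensor([1, 4, 9])      # last-token slot of each common request
prev_common_req_indices = torch.tensor([0, 1, 2])
input_ids.scatter_(dim=0,
                   index=flattened_indices,
                   src=prev_sampled_token_ids[prev_common_req_indices, 0])
# input_ids -> tensor([0, 11, 0, 0, 22, 0, 0, 0, 0, 33])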
844
+ def _get_encoder_seq_lens(
845
+ self,
846
+ scheduler_output: "SchedulerOutput",
847
+ kv_cache_spec: KVCacheSpec,
848
+ num_reqs: int,
849
+ ) -> Optional[np.ndarray]:
850
+ if not isinstance(kv_cache_spec, CrossAttentionSpec):
851
+ return None
852
+
853
+ # Build encoder_seq_lens array mapping request indices to
854
+ # encoder lengths for inputs scheduled in this batch
855
+ encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32)
856
+ for req_id in scheduler_output.scheduled_encoder_inputs:
857
+ req_index = self.input_batch.req_id_to_index[req_id]
858
+ encoder_seq_lens[req_index] = self.max_encoder_len
859
+
860
+ return encoder_seq_lens
861
+
862
+ def _prepare_inputs(
863
+ self,
864
+ scheduler_output: "SchedulerOutput",
865
+ ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata],
866
+ np.ndarray, Optional[CommonAttentionMetadata], int]:
867
+ """
868
+ :return: tuple[
869
+ attn_metadata: layer-to-attention_metadata mapping,
870
+ logits_indices, spec_decode_metadata
871
+ ]
872
+ """
873
+ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
874
+ assert total_num_scheduled_tokens > 0
875
+ num_reqs = self.input_batch.num_reqs
876
+ assert num_reqs > 0
877
+
878
+ # OPTIMIZATION: Start copying the block table first.
879
+ # This way, we can overlap the copy with the following CPU operations.
880
+ self.input_batch.block_table.commit_block_table(num_reqs)
881
+
882
+ # Get the number of scheduled tokens for each request.
883
+ req_ids = self.input_batch.req_ids
884
+ tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
885
+ num_scheduled_tokens = np.array(tokens, dtype=np.int32)
886
+ max_num_scheduled_tokens = max(tokens)
887
+
888
+ # Get request indices.
889
+ # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
890
+ req_indices = np.repeat(self.arange_np[:num_reqs],
891
+ num_scheduled_tokens)
892
+
893
+ # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
894
+ # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
895
+ cu_num_tokens, arange = self._get_cumsum_and_arange(
896
+ num_scheduled_tokens)
897
+
898
+ # Get positions.
899
+ positions_np = self.positions.np[:total_num_scheduled_tokens]
900
+ np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
901
+ arange,
902
+ out=positions_np)
903
+
904
+ # Calculate M-RoPE positions.
905
+         # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
906
+ if self.uses_mrope:
907
+ self._calc_mrope_positions(scheduler_output)
908
+
909
+ # Get token indices.
910
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
911
+ # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
912
+ # where M is the max_model_len.
913
+ token_indices = (positions_np +
914
+ req_indices * self.input_batch.token_ids_cpu.shape[1])
915
+
916
+ # NOTE(woosuk): We use torch.index_select instead of np.take here
917
+ # because torch.index_select is much faster than np.take for large
918
+ # tensors.
919
+ torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
920
+ 0,
921
+ torch.from_numpy(token_indices),
922
+ out=self.input_ids.cpu[:total_num_scheduled_tokens])
923
+
924
+ self.input_batch.block_table.compute_slot_mapping(
925
+ req_indices, positions_np)
926
+ self.input_batch.block_table.commit_slot_mapping(
927
+ total_num_scheduled_tokens)
928
+
929
+ # Prepare the attention metadata.
930
+ self.query_start_loc.np[0] = 0
931
+ self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
932
+ # Note: pad query_start_loc to be non-decreasing, as kernels
933
+         # like FlashAttention require that.
934
+ self.query_start_loc.np[num_reqs + 1:].fill(cu_num_tokens[-1])
935
+ self.query_start_loc.copy_to_gpu()
936
+ query_start_loc = self.query_start_loc.gpu[:num_reqs + 1]
937
+
938
+ self.seq_lens.np[:num_reqs] = (
939
+ self.input_batch.num_computed_tokens_cpu[:num_reqs] +
940
+ num_scheduled_tokens)
941
+ # Fill unused with 0 for full cuda graph mode.
942
+ self.seq_lens.np[num_reqs:].fill(0)
943
+ self.seq_lens.copy_to_gpu()
944
+ seq_lens = self.seq_lens.gpu[:num_reqs]
945
+ max_seq_len = self.seq_lens.np[:num_reqs].max().item()
946
+
947
+ # Copy the tensors to the GPU.
948
+ self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
949
+
950
+ if self.uses_mrope:
951
+             # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
952
+ self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
953
+ self.mrope_positions.cpu[:, :total_num_scheduled_tokens],
954
+ non_blocking=True)
955
+ else:
956
+ # Common case (1D positions)
957
+ self.positions.copy_to_gpu(total_num_scheduled_tokens)
958
+
959
+ use_spec_decode = len(
960
+ scheduler_output.scheduled_spec_decode_tokens) > 0
961
+ if not use_spec_decode:
962
+ # NOTE(woosuk): Due to chunked prefills, the batch may contain
963
+ # partial requests. While we should not sample any token
964
+ # from these partial requests, we do so for simplicity.
965
+ # We will ignore the sampled tokens from the partial requests.
966
+ # TODO: Support prompt logprobs.
967
+ logits_indices = query_start_loc[1:] - 1
968
+ num_draft_tokens = None
969
+ spec_decode_metadata = None
970
+ else:
971
+ # Get the number of draft tokens for each request.
972
+ # Iterate over the dictionary rather than all requests since not all
973
+ # requests have draft tokens.
974
+ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
975
+ for req_id, draft_token_ids in (
976
+ scheduler_output.scheduled_spec_decode_tokens.items()):
977
+ req_idx = self.input_batch.req_id_to_index[req_id]
978
+ num_draft_tokens[req_idx] = len(draft_token_ids)
979
+
980
+ spec_decode_metadata = self._calc_spec_decode_metadata(
981
+ num_draft_tokens, cu_num_tokens)
982
+ logits_indices = spec_decode_metadata.logits_indices
983
+ self.num_draft_tokens.np[:num_reqs] = num_draft_tokens
984
+ self.num_draft_tokens.np[num_reqs:].fill(0)
985
+ self.num_draft_tokens.copy_to_gpu()
986
+
987
+ logits_indices_padded = None
988
+ if self.cache_config.kv_sharing_fast_prefill:
989
+ logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
990
+ logits_indices)
991
+
992
+ attn_metadata: dict[str, Any] = {}
993
+
994
+ # Used in the below loop.
995
+ query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1]
996
+ seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
997
+ num_computed_tokens_cpu = (
998
+ self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs])
999
+ spec_decode_common_attn_metadata = None
1000
+ if use_spec_decode:
1001
+ self.num_accepted_tokens.np[:num_reqs] = (
1002
+ self.input_batch.num_accepted_tokens_cpu[:num_reqs])
1003
+ self.num_accepted_tokens.np[num_reqs:].fill(1)
1004
+ self.num_accepted_tokens.copy_to_gpu()
1005
+
1006
+ # Prepare the attention metadata for each KV cache group and make layers
1007
+ # in the same group share the same metadata.
1008
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
1009
+ self.kv_cache_config.kv_cache_groups):
1010
+ encoder_seq_lens = self._get_encoder_seq_lens(
1011
+ scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)
1012
+
1013
+ if isinstance(kv_cache_group_spec.kv_cache_spec,
1014
+ EncoderOnlyAttentionSpec):
1015
+ # Encoder-only layers do not have KV cache, so we need to
1016
+ # create a dummy block table and slot mapping for them.
1017
+ blk_table_tensor = torch.zeros(
1018
+ (num_reqs, 1),
1019
+ dtype=torch.int32,
1020
+ device=self.device,
1021
+ )
1022
+ slot_mapping = torch.zeros(
1023
+ (total_num_scheduled_tokens, ),
1024
+ dtype=torch.int64,
1025
+ device=self.device,
1026
+ )
1027
+ num_common_prefix_blocks = 0
1028
+ else:
1029
+ blk_table = self.input_batch.block_table[kv_cache_group_id]
1030
+ blk_table_tensor = blk_table.get_device_tensor()[:num_reqs]
1031
+ slot_mapping = blk_table.slot_mapping[:
1032
+ total_num_scheduled_tokens]
1033
+
1034
+ # Fill unused with -1. Needed for reshape_and_cache in full cuda
1035
+ # graph mode.
1036
+ blk_table.slot_mapping[total_num_scheduled_tokens:].fill_(-1)
1037
+ num_common_prefix_blocks = (
1038
+ scheduler_output.
1039
+ num_common_prefix_blocks[kv_cache_group_id])
1040
+
1041
+ common_attn_metadata = CommonAttentionMetadata(
1042
+ query_start_loc=query_start_loc,
1043
+ query_start_loc_cpu=query_start_loc_cpu,
1044
+ seq_lens=seq_lens,
1045
+ seq_lens_cpu=seq_lens_cpu,
1046
+ num_computed_tokens_cpu=num_computed_tokens_cpu,
1047
+ num_reqs=num_reqs,
1048
+ num_actual_tokens=total_num_scheduled_tokens,
1049
+ max_query_len=max_num_scheduled_tokens,
1050
+ max_seq_len=max_seq_len,
1051
+ block_table_tensor=blk_table_tensor,
1052
+ slot_mapping=slot_mapping,
1053
+ logits_indices_padded=logits_indices_padded,
1054
+ num_logits_indices=logits_indices.size(0),
1055
+ causal=True,
1056
+ encoder_seq_lens=encoder_seq_lens,
1057
+ )
1058
+
1059
+ if self.speculative_config and \
1060
+ spec_decode_common_attn_metadata is None:
1061
+ spec_decode_common_attn_metadata = common_attn_metadata
1062
+
1063
+ for attn_group in self.attn_groups[kv_cache_group_id]:
1064
+ # Prepare for cascade attention if enabled & beneficial.
1065
+ common_prefix_len = 0
1066
+ builder = attn_group.metadata_builder
1067
+ if self.cascade_attn_enabled:
1068
+ common_prefix_len = self._compute_cascade_attn_prefix_len(
1069
+ num_scheduled_tokens,
1070
+ num_common_prefix_blocks,
1071
+ kv_cache_group_spec.kv_cache_spec,
1072
+ builder,
1073
+ )
1074
+
1075
+ extra_attn_metadata_args = {}
1076
+ if use_spec_decode and isinstance(builder,
1077
+ GDNAttentionMetadataBuilder):
1078
+ extra_attn_metadata_args = dict(
1079
+ num_accepted_tokens=self.num_accepted_tokens.
1080
+ gpu[:num_reqs],
1081
+ num_draft_tokens=self.num_draft_tokens.gpu[:num_reqs],
1082
+ )
1083
+
1084
+ attn_metadata_i = builder.build(
1085
+ common_prefix_len=common_prefix_len,
1086
+ common_attn_metadata=common_attn_metadata,
1087
+ **extra_attn_metadata_args)
1088
+
1089
+ for layer_name in attn_group.layer_names:
1090
+ attn_metadata[layer_name] = attn_metadata_i
1091
+
1092
+         # Hot-swap the LoRA model.
1093
+ if self.lora_config:
1094
+ self.set_active_loras(self.input_batch, num_scheduled_tokens)
1095
+
1096
+ return (attn_metadata, logits_indices, spec_decode_metadata,
1097
+ num_scheduled_tokens, spec_decode_common_attn_metadata,
1098
+ max_num_scheduled_tokens)
1099
+
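# Illustrative sketch (toy sizes, not from the file above): the flattened gather
# _prepare_inputs performs to pull this step's token ids out of the
# (num_reqs, max_model_len) token table, here with max_model_len = 4.
import numpy as np
import torch

token_ids_cpu = torch.tensor([[10, 11, 12, 13],
                              [20, 21, 22, 23]])
num_scheduled = np.array([2, 3])
num_computed = np.array([1, 0])
req_indices = np.repeat(np.arange(2), num_scheduled)            # [0, 0, 1, 1, 1]
arange = np.concatenate([np.arange(n) for n in num_scheduled])  # [0, 1, 0, 1, 2]
positions = num_computed[req_indices] + arange                  # [1, 2, 0, 1, 2]
token_indices = positions + req_indices * token_ids_cpu.shape[1]
input_ids = torch.index_select(token_ids_cpu.flatten(), 0,
                               torch.from_numpy(token_indices))
# input_ids -> tensor([11, 12, 20, 21, 22])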
1100
+ def _compute_cascade_attn_prefix_len(
1101
+ self,
1102
+ num_scheduled_tokens: np.ndarray,
1103
+ num_common_prefix_blocks: int,
1104
+ kv_cache_spec: KVCacheSpec,
1105
+ attn_metadata_builder: AttentionMetadataBuilder,
1106
+ ) -> int:
1107
+ """Compute the length of the common prefix for cascade attention.
1108
+
1109
+ NOTE(woosuk): The common prefix length returned by this function
1110
+ represents the length used specifically for cascade attention, not the
1111
+ actual number of tokens shared between requests. When cascade attention
1112
+ is disabled (use_cascade=False), this function returns 0 even if
1113
+ requests share common tokens. Additionally, the common prefix length is
1114
+ truncated to a multiple of the block size and may be further truncated
1115
+ due to implementation details explained below.
1116
+
1117
+ Args:
1118
+ num_scheduled_tokens: Number of tokens scheduled per request.
1119
+ num_common_prefix_blocks: Number of shared KV cache blocks.
1120
+
1121
+ Returns:
1122
+ int: Length of common prefix in tokens.
1123
+ """
1124
+ common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size
1125
+ if common_prefix_len == 0:
1126
+ # Common case.
1127
+ return 0
1128
+
1129
+ # NOTE(woosuk): Cascade attention uses two attention kernels: one
1130
+ # for the common prefix and the other for the rest. For the first
1131
+ # kernel, we concatenate all the query tokens (possibly from
1132
+ # different requests) and treat them as if they are from the same
1133
+ # request. Then, we use bi-directional attention to process the
1134
+ # common prefix in the KV cache. Importantly, this means that the
1135
+ # first kernel does not do any masking.
1136
+
1137
+ # Consider the following example:
1138
+ # Request 1's input query: [D, E, X]
1139
+ # Request 1's kv cache: [A, B, C, D, E, X]
1140
+ # Request 1's num_computed_tokens: 3 (i.e., [A, B, C])
1141
+ # Request 2's input query: [E, Y]
1142
+ # Request 2's kv cache: [A, B, C, D, E, Y]
1143
+ # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D])
1144
+
1145
+ # If we use [A, B, C, D, E] as the common prefix, then the
1146
+ # first kernel will compute the bi-directional attention between
1147
+ # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E].
1148
+ # However, this is wrong because D in Request 1 should not attend to
1149
+ # E in the common prefix (i.e., we need masking).
1150
+ # To avoid this, [A, B, C, D] should be the common prefix.
1151
+ # That is, the common prefix should be capped by the minimum
1152
+ # num_computed_tokens among the requests, and plus one to include
1153
+ # the first token of the query.
1154
+
1155
+ # In practice, we use [A, B, C] as the common prefix, instead of
1156
+ # [A, B, C, D] (i.e., the common prefix is capped by the minimum
1157
+ # num_computed_tokens, without plus one).
1158
+ # This is because of an implementation detail: We want to always
1159
+ # use two kernels for cascade attention. Let's imagine:
1160
+ # Request 3's input query: [D]
1161
+ # Request 3's kv cache: [A, B, C, D]
1162
+ # Request 3's num_computed_tokens: 3 (i.e., [A, B, C])
1163
+ # If we use [A, B, C, D] as the common prefix for Request 1-3,
1164
+ # then Request 3 will be processed only by the first kernel,
1165
+ # and the second kernel will get an empty input. While this is not
1166
+ # a fundamental problem, our current implementation does not support
1167
+ # this case.
1168
+ num_reqs = len(num_scheduled_tokens)
1169
+ common_prefix_len = min(
1170
+ common_prefix_len,
1171
+ self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
1172
+ # common_prefix_len should be a multiple of the block size.
1173
+ common_prefix_len = (common_prefix_len // kv_cache_spec.block_size *
1174
+ kv_cache_spec.block_size)
1175
+ use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or
1176
+ (isinstance(kv_cache_spec, FullAttentionSpec)
1177
+ and kv_cache_spec.sliding_window is not None))
1178
+ use_local_attention = (
1179
+ isinstance(kv_cache_spec, ChunkedLocalAttentionSpec)
1180
+ or (isinstance(kv_cache_spec, FullAttentionSpec)
1181
+ and kv_cache_spec.attention_chunk_size is not None))
1182
+ assert isinstance(kv_cache_spec, AttentionSpec)
1183
+ use_cascade = attn_metadata_builder.use_cascade_attention(
1184
+ common_prefix_len=common_prefix_len,
1185
+ query_lens=num_scheduled_tokens,
1186
+ num_query_heads=self.num_query_heads,
1187
+ num_kv_heads=kv_cache_spec.num_kv_heads,
1188
+ use_alibi=self.use_alibi,
1189
+ use_sliding_window=use_sliding_window,
1190
+ use_local_attention=use_local_attention,
1191
+ num_sms=self.num_sms,
1192
+ )
1193
+ return common_prefix_len if use_cascade else 0
1194
+
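# Illustrative sketch (toy numbers, not from the file above): how the cascade
# common prefix is capped by the smallest num_computed_tokens and rounded down
# to a block-size multiple, per the reasoning in _compute_cascade_attn_prefix_len.
import numpy as np

block_size = 16
num_common_prefix_blocks = 4
num_computed_tokens = np.array([35, 40, 50])

common_prefix_len = num_common_prefix_blocks * block_size              # 64
common_prefix_len = min(common_prefix_len, num_computed_tokens.min())  # 35
common_prefix_len = common_prefix_len // block_size * block_size       # 32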
1195
+ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
1196
+ mrope_pos_ptr = 0
1197
+ for index, req_id in enumerate(self.input_batch.req_ids):
1198
+ req = self.requests[req_id]
1199
+ assert req.mrope_positions is not None
1200
+
1201
+ num_computed_tokens = \
1202
+ self.input_batch.num_computed_tokens_cpu[index]
1203
+ num_scheduled_tokens = \
1204
+ scheduler_output.num_scheduled_tokens[req_id]
1205
+ num_prompt_tokens = len(req.prompt_token_ids)
1206
+
1207
+ if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
1208
+ prompt_part_len = max(0,
1209
+ num_prompt_tokens - num_computed_tokens)
1210
+ completion_part_len = max(
1211
+ 0, num_scheduled_tokens - prompt_part_len)
1212
+ else:
1213
+ prompt_part_len = num_scheduled_tokens
1214
+ completion_part_len = 0
1215
+
1216
+ assert num_scheduled_tokens == prompt_part_len + completion_part_len
1217
+
1218
+ if prompt_part_len > 0:
1219
+ # prompt's mrope_positions are pre-computed
1220
+ dst_start = mrope_pos_ptr
1221
+ dst_end = mrope_pos_ptr + prompt_part_len
1222
+ src_start = num_computed_tokens
1223
+ src_end = num_computed_tokens + prompt_part_len
1224
+
1225
+ self.mrope_positions.cpu[:, dst_start:dst_end] = (
1226
+ req.mrope_positions[:, src_start:src_end])
1227
+ mrope_pos_ptr += prompt_part_len
1228
+
1229
+ if completion_part_len > 0:
1230
+ # compute completion's mrope_positions on-the-fly
1231
+ dst_start = mrope_pos_ptr
1232
+ dst_end = mrope_pos_ptr + completion_part_len
1233
+
1234
+ MRotaryEmbedding.get_next_input_positions_tensor(
1235
+ out=self.mrope_positions.np,
1236
+ out_offset=dst_start,
1237
+ mrope_position_delta=req.mrope_position_delta,
1238
+ context_len=num_computed_tokens + prompt_part_len,
1239
+ num_new_tokens=completion_part_len,
1240
+ )
1241
+
1242
+ mrope_pos_ptr += completion_part_len
1243
+
1244
+ def _calc_spec_decode_metadata(
1245
+ self,
1246
+ num_draft_tokens: np.ndarray,
1247
+ cu_num_scheduled_tokens: np.ndarray,
1248
+ ) -> SpecDecodeMetadata:
1249
+ # Inputs:
1250
+ # cu_num_scheduled_tokens: [ 4, 104, 107, 207, 209]
1251
+ # num_draft_tokens: [ 3, 0, 2, 0, 1]
1252
+ # Outputs:
1253
+ # cu_num_draft_tokens: [ 3, 3, 5, 5, 6]
1254
+ # logits_indices: [ 0, 1, 2, 3, 103, 104, 105, 106,
1255
+ # 206, 207, 208]
1256
+ # target_logits_indices: [ 0, 1, 2, 5, 6, 9]
1257
+ # bonus_logits_indices: [ 3, 4, 7, 8, 10]
1258
+
1259
+ # Compute the logits indices.
1260
+ # [4, 1, 3, 1, 2]
1261
+ num_sampled_tokens = num_draft_tokens + 1
1262
+
1263
+ # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
1264
+ # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
1265
+ cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
1266
+ num_sampled_tokens, cumsum_dtype=np.int32)
1267
+ # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
1268
+ logits_indices = np.repeat(
1269
+ cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
1270
+ # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
1271
+ logits_indices += arange
1272
+
1273
+ # Compute the bonus logits indices.
1274
+ bonus_logits_indices = cu_num_sampled_tokens - 1
1275
+
1276
+ # Compute the draft logits indices.
1277
+ # cu_num_draft_tokens: [3, 3, 5, 5, 6]
1278
+ # arange: [0, 1, 2, 0, 1, 0]
1279
+ cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
1280
+ num_draft_tokens, cumsum_dtype=np.int32)
1281
+ # [0, 0, 0, 5, 5, 9]
1282
+ target_logits_indices = np.repeat(
1283
+ cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
1284
+ # [0, 1, 2, 5, 6, 9]
1285
+ target_logits_indices += arange
1286
+
1287
+ # TODO: Optimize the CPU -> GPU copy.
1288
+ cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
1289
+ self.device, non_blocking=True)
1290
+ logits_indices = torch.from_numpy(logits_indices).to(self.device,
1291
+ non_blocking=True)
1292
+ target_logits_indices = torch.from_numpy(target_logits_indices).to(
1293
+ self.device, non_blocking=True)
1294
+ bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
1295
+ self.device, non_blocking=True)
1296
+
1297
+ # Compute the draft token ids.
1298
+ # draft_token_indices: [ 1, 2, 3, 105, 106, 208]
1299
+ draft_token_ids = self.input_ids.gpu[logits_indices]
1300
+ draft_token_ids = draft_token_ids[target_logits_indices + 1]
1301
+
1302
+ metadata = SpecDecodeMetadata(
1303
+ draft_token_ids=draft_token_ids,
1304
+ num_draft_tokens=num_draft_tokens.tolist(),
1305
+ cu_num_draft_tokens=cu_num_draft_tokens,
1306
+ target_logits_indices=target_logits_indices,
1307
+ bonus_logits_indices=bonus_logits_indices,
1308
+ logits_indices=logits_indices,
1309
+ )
1310
+ return metadata
1311
+
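# Illustrative sketch (not from the file above): the index arithmetic of
# _calc_spec_decode_metadata, reproduced on the worked example in its comments.
import numpy as np

cu_num_scheduled_tokens = np.array([4, 104, 107, 207, 209])
num_draft_tokens = np.array([3, 0, 2, 0, 1])
num_sampled_tokens = num_draft_tokens + 1                     # [4, 1, 3, 1, 2]

cu_num_sampled = np.cumsum(num_sampled_tokens)                # [4, 5, 8, 9, 11]
arange = (np.arange(cu_num_sampled[-1]) -
          np.repeat(cu_num_sampled - num_sampled_tokens, num_sampled_tokens))
logits_indices = (np.repeat(cu_num_scheduled_tokens - num_sampled_tokens,
                            num_sampled_tokens) + arange)
# logits_indices -> [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
bonus_logits_indices = cu_num_sampled - 1                     # [3, 4, 7, 8, 10]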
1312
+ def _prepare_kv_sharing_fast_prefill(
1313
+ self,
1314
+ logits_indices: torch.Tensor,
1315
+ ) -> torch.Tensor:
1316
+ assert self.kv_sharing_fast_prefill_logits_indices is not None
1317
+ num_logits = logits_indices.shape[0]
1318
+ assert num_logits > 0
1319
+ self.kv_sharing_fast_prefill_logits_indices[:num_logits].copy_(
1320
+ logits_indices)
1321
+         # There may be leftover indices in logits_indices[num_logits:]
1322
+ # from previous iterations, whose values may be greater than the
1323
+ # batch size in the current iteration. To ensure indices are always
1324
+ # valid, we fill the padded indices with the last index.
1325
+ self.kv_sharing_fast_prefill_logits_indices[num_logits:].fill_(
1326
+ logits_indices[-1].item())
1327
+ if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
1328
+ and num_logits <= self.cudagraph_batch_sizes[-1]):
1329
+ # Use piecewise CUDA graphs.
1330
+ # Add padding to the batch size.
1331
+ num_logits_padded = self.vllm_config.pad_for_cudagraph(num_logits)
1332
+ else:
1333
+ num_logits_padded = num_logits
1334
+ logits_indices_padded = (
1335
+ self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded])
1336
+ return logits_indices_padded
1337
+
1338
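A rough sketch (not the runner's code path) of why the padded tail of the persistent index buffer is filled with the last valid index rather than left as-is:

import torch

buf = torch.tensor([0, 5, 9, 12, 31, 47])   # stale entries from a larger prior batch
logits_indices = torch.tensor([2, 7, 11])   # current step, num_logits == 3

buf[:3] = logits_indices
buf[3:] = logits_indices[-1]                # buf -> [2, 7, 11, 11, 11, 11]
# Any padded prefix of `buf` now contains only in-range indices, so a CUDA
# graph replayed at a padded batch size cannot gather out of bounds.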
+ def _batch_mm_kwargs_from_scheduler(
1339
+ self,
1340
+ scheduler_output: "SchedulerOutput",
1341
+ ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
1342
+ """Batch multimodal kwargs from scheduled encoder inputs.
1343
+
1344
+ Args:
1345
+ scheduler_output: The scheduler output containing scheduled encoder
1346
+ inputs.
1347
+
1348
+ Returns:
1349
+            A tuple of (mm_kwargs, mm_hashes_pos) where:
1350
+ - mm_kwargs: List of multimodal kwargs items to be batched
1351
+ - mm_hashes_pos: List of (mm_hash, position_info) tuples
1352
+ """
1353
+ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
1354
+ if not scheduled_encoder_inputs:
1355
+ return [], []
1356
+ # Batch the multi-modal inputs.
1357
+ mm_kwargs = list[MultiModalKwargsItem]()
1358
+ # list of tuple (mm_hash, position_info)
1359
+ mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
1360
+ for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
1361
+ req_state = self.requests[req_id]
1362
+
1363
+ for mm_input_id in encoder_input_ids:
1364
+ mm_hash = req_state.mm_hashes[mm_input_id]
1365
+ mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
1366
+ mm_hashes_pos.append(
1367
+ (mm_hash, req_state.mm_positions[mm_input_id]))
1368
+
1369
+ return mm_kwargs, mm_hashes_pos
1370
+
1371
+ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
1372
+ # Batch the multi-modal inputs using the helper method.
1373
+ mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
1374
+ scheduler_output)
1375
+
1376
+ if not mm_kwargs:
1377
+ return
1378
+
1379
+ # Batch mm inputs as much as we can: if a request in the batch has
1380
+ # multiple modalities or a different modality than the previous one,
1381
+ # we process it separately to preserve item order.
1382
+ # FIXME(ywang96): This is a hacky way to deal with multiple modalities
1383
+ # in the same batch while still being able to benefit from batching
1384
+ # multimodal inputs. The proper solution should be reordering the
1385
+ # encoder outputs.
1386
+ encoder_outputs = []
1387
+ for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
1388
+ mm_kwargs,
1389
+ device=self.device,
1390
+ pin_memory=self.pin_memory,
1391
+ ):
1392
+ # Run the encoder.
1393
+ # `curr_group_outputs` is either of the following:
1394
+ # 1. A tensor of shape (num_items, feature_size, hidden_size)
1395
+ # in case feature_size is fixed across all multimodal items.
1396
+ # 2. A list or tuple (length: num_items) of tensors, each of shape
1397
+ # (feature_size, hidden_size) in case the feature size is dynamic
1398
+ # depending on the input multimodal items.
1399
+ curr_group_outputs = self.model.get_multimodal_embeddings(
1400
+ **mm_kwargs_group)
1401
+
1402
+ sanity_check_mm_encoder_outputs(
1403
+ curr_group_outputs,
1404
+ expected_num_items=num_items,
1405
+ )
1406
+
1407
+ for output in curr_group_outputs:
1408
+ encoder_outputs.append(output)
1409
+
1410
+ # Cache the encoder outputs by mm_hash
1411
+ for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
1412
+ self.encoder_cache[mm_hash] = scatter_mm_placeholders(
1413
+ output,
1414
+ is_embed=pos_info.is_embed,
1415
+ )
1416
+
1417
+ def _gather_mm_embeddings(
1418
+ self,
1419
+ scheduler_output: "SchedulerOutput",
1420
+ shift_computed_tokens: int = 0,
1421
+ ) -> list[torch.Tensor]:
1422
+ mm_embeds: list[torch.Tensor] = []
1423
+ for req_id in self.input_batch.req_ids:
1424
+ num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
1425
+ req_id]
1426
+ req_state = self.requests[req_id]
1427
+ num_computed_tokens = \
1428
+ req_state.num_computed_tokens + shift_computed_tokens
1429
+ mm_positions = req_state.mm_positions
1430
+ mm_hashes = req_state.mm_hashes
1431
+ for i, pos_info in enumerate(mm_positions):
1432
+ start_pos = pos_info.offset
1433
+ num_encoder_tokens = pos_info.length
1434
+
1435
+ # The encoder output is needed if the two ranges overlap:
1436
+ # [num_computed_tokens,
1437
+ # num_computed_tokens + num_scheduled_tokens) and
1438
+ # [start_pos, start_pos + num_encoder_tokens)
1439
+ if start_pos >= num_computed_tokens + num_scheduled_tokens:
1440
+ # The encoder output is not needed in this step.
1441
+ break
1442
+ if start_pos + num_encoder_tokens <= num_computed_tokens:
1443
+ # The encoder output is already processed and stored
1444
+ # in the decoder's KV cache.
1445
+ continue
1446
+
1447
+ start_idx = max(num_computed_tokens - start_pos, 0)
1448
+ end_idx = min(
1449
+ num_computed_tokens - start_pos + num_scheduled_tokens,
1450
+ num_encoder_tokens,
1451
+ )
1452
+ assert start_idx < end_idx
1453
+
1454
+ mm_hash = mm_hashes[i]
1455
+ encoder_output = self.encoder_cache.get(mm_hash, None)
1456
+ assert encoder_output is not None,\
1457
+ f"Encoder cache miss for {mm_hash}."
1458
+
1459
+ if (is_embed := pos_info.is_embed) is not None:
1460
+ is_embed = is_embed[start_idx:end_idx]
1461
+
1462
+ mm_embeds_item = gather_mm_placeholders(
1463
+ encoder_output[start_idx:end_idx],
1464
+ is_embed=is_embed,
1465
+ )
1466
+ mm_embeds.append(mm_embeds_item)
1467
+ return mm_embeds
1468
+
1469
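The `start_idx` / `end_idx` computation in `_gather_mm_embeddings` is plain interval intersection. A worked example with assumed numbers:

# A placeholder spans prompt positions [20, 50): start_pos=20, length=30.
# This step covers positions [35, 45): num_computed_tokens=35, 10 scheduled.
start_pos, num_encoder_tokens = 20, 30
num_computed_tokens, num_scheduled_tokens = 35, 10

start_idx = max(num_computed_tokens - start_pos, 0)                    # 15
end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens,
              num_encoder_tokens)                                      # 25
# Encoder-output rows 15..24 (prompt positions 35..44) are gathered this step.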
+ def _extract_encoder_inputs(
1470
+ self,
1471
+ scheduler_output: "SchedulerOutput",
1472
+ ) -> dict[str, torch.Tensor]:
1473
+ """Extract encoder inputs for encoder-decoder models.
1474
+
1475
+ This method extracts multimodal input features from scheduled encoder
1476
+ inputs and formats them for the encoder-decoder model forward pass.
1477
+ """
1478
+ # Batch the multi-modal inputs using the helper method.
1479
+ mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output)
1480
+
1481
+ if not mm_kwargs:
1482
+ return {}
1483
+
1484
+ # Group MM kwargs by modality and extract features
1485
+ encoder_features = {}
1486
+ for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
1487
+ mm_kwargs,
1488
+ device=self.device,
1489
+ pin_memory=self.pin_memory,
1490
+ ):
1491
+ # Add the grouped features to encoder_features dict
1492
+ # This allows the model to receive them as kwargs (e.g.,
1493
+ # input_features=...)
1494
+ encoder_features.update(mm_kwargs_group)
1495
+
1496
+ return encoder_features
1497
+
1498
+ def get_model(self) -> nn.Module:
1499
+ # get raw model out of the cudagraph wrapper.
1500
+ if isinstance(self.model, CUDAGraphWrapper):
1501
+ return self.model.unwrap()
1502
+ return self.model
1503
+
1504
+ def get_supported_generation_tasks(self) -> list[GenerationTask]:
1505
+ model = self.get_model()
1506
+ supported_tasks = list[GenerationTask]()
1507
+
1508
+ if is_text_generation_model(model):
1509
+ supported_tasks.append("generate")
1510
+
1511
+ if supports_transcription(model):
1512
+ if model.supports_transcription_only:
1513
+ return ["transcription"]
1514
+
1515
+ supported_tasks.append("transcription")
1516
+
1517
+ return supported_tasks
1518
+
1519
+ def get_supported_pooling_tasks(self) -> list[PoolingTask]:
1520
+ model = self.get_model()
1521
+ if not is_pooling_model(model):
1522
+ return []
1523
+
1524
+ supported_tasks = list(model.pooler.get_supported_tasks())
1525
+
1526
+ if (self.scheduler_config.chunked_prefill_enabled
1527
+ and "encode" in supported_tasks):
1528
+ supported_tasks.remove("encode")
1529
+
1530
+ logger.debug_once("Chunked prefill is not supported with "
1531
+                              "the encode task, which uses ALL pooling. "
1532
+ "Please turn off chunked prefill by "
1533
+ "`--no-enable-chunked-prefill` before using it.")
1534
+
1535
+ if "score" in supported_tasks:
1536
+ num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
1537
+ if num_labels != 1:
1538
+ supported_tasks.remove("score")
1539
+ logger.debug_once(
1540
+ "Score API is only enabled for num_labels == 1.")
1541
+
1542
+ return supported_tasks
1543
+
1544
+ def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
1545
+ tasks = list[SupportedTask]()
1546
+
1547
+ if self.model_config.runner_type == "generate":
1548
+ tasks.extend(self.get_supported_generation_tasks())
1549
+ if self.model_config.runner_type == "pooling":
1550
+ tasks.extend(self.get_supported_pooling_tasks())
1551
+
1552
+ return tuple(tasks)
1553
+
1554
+ def apply_grammar_bitmask(
1555
+ self,
1556
+ scheduler_output: "SchedulerOutput",
1557
+ logits: torch.Tensor,
1558
+ ):
1559
+ grammar_bitmask = scheduler_output.grammar_bitmask
1560
+ if grammar_bitmask is None:
1561
+ return
1562
+
1563
+ # We receive the structured output bitmask from the scheduler,
1564
+ # compacted to contain bitmasks only for structured output requests.
1565
+ # The order of the requests in the bitmask is not guaranteed to be the
1566
+ # same as the order of the requests in the gpu runner's batch. We need
1567
+ # to sort the bitmask to match the order of the requests used here.
1568
+
1569
+ # Get the batch indices of the structured output requests.
1570
+ # Keep track of the number of speculative tokens scheduled for every
1571
+ # request in the batch, as the logit indices are offset by this amount.
1572
+ struct_out_req_batch_indices: dict[str, int] = {}
1573
+ cumulative_offset = 0
1574
+ seq = sorted(self.input_batch.req_id_to_index.items(),
1575
+ key=lambda x: x[1])
1576
+ for req_id, batch_index in seq:
1577
+ logit_index = batch_index + cumulative_offset
1578
+ cumulative_offset += len(
1579
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1580
+ if req_id in scheduler_output.structured_output_request_ids:
1581
+ struct_out_req_batch_indices[req_id] = logit_index
1582
+
1583
+ out_indices = []
1584
+
1585
+ # Reorder the bitmask to match the order of the requests in the batch.
1586
+ sorted_bitmask = np.full(shape=(logits.shape[0],
1587
+ grammar_bitmask.shape[1]),
1588
+ fill_value=-1,
1589
+ dtype=grammar_bitmask.dtype)
1590
+ cumulative_index = 0
1591
+ seq = sorted(scheduler_output.structured_output_request_ids.items(),
1592
+ key=lambda x: x[1])
1593
+ for req_id, _ in seq:
1594
+ logit_index = struct_out_req_batch_indices[req_id]
1595
+ num_spec_tokens = len(
1596
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1597
+ for i in range(1 + num_spec_tokens):
1598
+ sorted_bitmask[logit_index + i] = \
1599
+ grammar_bitmask[cumulative_index + i]
1600
+ out_indices.append(logit_index + i)
1601
+ cumulative_index += 1 + num_spec_tokens
1602
+ grammar_bitmask = sorted_bitmask
1603
+
1604
+        # If the number of out_indices equals the number of logit rows,
1605
+ # we don't need to pass indices to the kernel,
1606
+ # since the bitmask is already aligned with the logits.
1607
+ skip_out_indices = len(out_indices) == logits.shape[0]
1608
+
1609
+ # Serialization of np.ndarray is much more efficient than a tensor,
1610
+ # so we receive it in that format.
1611
+ grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
1612
+
1613
+ xgr.apply_token_bitmask_inplace(
1614
+ logits,
1615
+ grammar_bitmask.to(self.device, non_blocking=True),
1616
+ indices=out_indices if not skip_out_indices else None,
1617
+ )
1618
+
1619
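The logit-index bookkeeping in `apply_grammar_bitmask` can be checked with a toy batch (hypothetical request ids): with speculative decoding, each request owns 1 + num_spec_tokens consecutive logit rows, so a request's first row is its batch index plus the cumulative speculative-token count of the requests before it.

# Batch order: reqA (2 spec tokens), reqB (0), reqC (1).
# Logit rows:  reqA -> 0, 1, 2   reqB -> 3   reqC -> 4, 5
scheduled_spec = {"reqA": [101, 102], "reqB": [], "reqC": [103]}
req_id_to_index = {"reqA": 0, "reqB": 1, "reqC": 2}

cumulative_offset = 0
logit_index = {}
for req_id, batch_index in sorted(req_id_to_index.items(), key=lambda x: x[1]):
    logit_index[req_id] = batch_index + cumulative_offset
    cumulative_offset += len(scheduled_spec[req_id])
assert logit_index == {"reqA": 0, "reqB": 3, "reqC": 4}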
+ def sync_and_slice_intermediate_tensors(
1620
+ self, num_tokens: int, intermediate_tensors: IntermediateTensors,
1621
+ sync_self: bool) -> IntermediateTensors:
1622
+
1623
+ assert self.intermediate_tensors is not None
1624
+
1625
+ tp = self.vllm_config.parallel_config.tensor_parallel_size
1626
+ enabled_sp = self.compilation_config.pass_config. \
1627
+ enable_sequence_parallelism
1628
+ if enabled_sp:
1629
+ # When sequence parallelism is enabled, we always pad num_tokens
1630
+ # to be a multiple of tensor_parallel_size (tp) earlier
1631
+ assert num_tokens % tp == 0
1632
+ is_residual_scattered = tp > 1 and enabled_sp \
1633
+ and num_tokens % tp == 0
1634
+
1635
+ # When sequence parallelism is enabled, the "residual" tensor is sharded
1636
+ # across tensor parallel ranks, so each rank only needs its own slice.
1637
+ if sync_self:
1638
+ assert intermediate_tensors is not None
1639
+ for k, v in intermediate_tensors.items():
1640
+ is_scattered = k == "residual" and is_residual_scattered
1641
+ copy_len = num_tokens // tp if is_scattered else \
1642
+ num_tokens
1643
+ self.intermediate_tensors[k][:copy_len].copy_(
1644
+ v[:copy_len], non_blocking=True)
1645
+
1646
+ return IntermediateTensors({
1647
+ k:
1648
+ v[:num_tokens // tp]
1649
+ if k == "residual" and is_residual_scattered else v[:num_tokens]
1650
+ for k, v in self.intermediate_tensors.items()
1651
+ })
1652
+
1653
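A minimal sketch of the slicing rule at the end of `sync_and_slice_intermediate_tensors`, assuming tp=2 and num_tokens=8: only the "residual" tensor is sharded across TP ranks, so it is sliced to num_tokens // tp while the other intermediate tensors keep all num_tokens rows.

import torch

tp, num_tokens = 2, 8
intermediate = {
    "hidden_states": torch.zeros(num_tokens, 16),
    "residual": torch.zeros(num_tokens // tp, 16),   # already scattered per rank
}
sliced = {
    k: v[:num_tokens // tp] if k == "residual" else v[:num_tokens]
    for k, v in intermediate.items()
}
# sliced["hidden_states"].shape == (8, 16); sliced["residual"].shape == (4, 16)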
+ def eplb_step(self,
1654
+ is_dummy: bool = False,
1655
+ is_profile: bool = False) -> None:
1656
+ """
1657
+ Step for the EPLB (Expert Parallelism Load Balancing) state.
1658
+ """
1659
+ if not self.parallel_config.enable_eplb:
1660
+ return
1661
+
1662
+ assert self.eplb_state is not None
1663
+ model = self.get_model()
1664
+ assert is_mixture_of_experts(model)
1665
+ self.eplb_state.step(
1666
+ model,
1667
+ is_dummy,
1668
+ is_profile,
1669
+ log_stats=self.parallel_config.eplb_config.log_balancedness,
1670
+ )
1671
+
1672
+ def get_dp_padding(self,
1673
+ num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
1674
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
1675
+ dp_rank = self.vllm_config.parallel_config.data_parallel_rank
1676
+
1677
+ # For DP: Don't pad when setting enforce_eager.
1678
+ # This lets us set enforce_eager on the prefiller in a P/D setup and
1679
+ # still use CUDA graphs (enabled by this padding) on the decoder.
1680
+ #
1681
+ # TODO(tms) : There are many cases where padding is enabled for
1682
+ # prefills, causing unnecessary and excessive padding of activations.
1683
+
1684
+ if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
1685
+ # Early exit.
1686
+ return 0, None
1687
+
1688
+ num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
1689
+ num_tokens, dp_size, dp_rank)
1690
+ max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
1691
+ num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
1692
+ dp_size,
1693
+ device="cpu",
1694
+ dtype=torch.int32)
1695
+ return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
1696
+
1697
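The DP padding rule amounts to "pad every rank up to the busiest rank". A toy example with three assumed data-parallel ranks:

import torch

num_tokens_across_dp = torch.tensor([5, 9, 3], dtype=torch.int32)  # per DP rank
max_tokens = int(num_tokens_across_dp.max())                       # 9

# The rank with 5 tokens pads by 4 and the rank with 3 pads by 6, so every
# rank runs the forward pass at the same (CUDA-graph-friendly) size of 9.
num_pad_for_rank0 = max_tokens - 5                                  # 4
num_tokens_after_padding = torch.full((3,), max_tokens, dtype=torch.int32)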
+ def _pool(
1698
+ self,
1699
+ hidden_states: torch.Tensor,
1700
+ num_scheduled_tokens: int,
1701
+ num_scheduled_tokens_np: np.ndarray,
1702
+ kv_connector_output: Optional[KVConnectorOutput],
1703
+ ) -> ModelRunnerOutput:
1704
+ assert self.input_batch.num_reqs ==\
1705
+ len(self.input_batch.pooling_params), \
1706
+ "Either all or none of the requests in" \
1707
+            " a batch must be pooling requests"
1708
+
1709
+ hidden_states = hidden_states[:num_scheduled_tokens]
1710
+ pooling_metadata = self.input_batch.get_pooling_metadata()
1711
+ pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(),
1712
+ device=hidden_states.device)
1713
+ seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs]
1714
+
1715
+        # For pooling models, the D2H copy & synchronization occur in pooler.py:build_output
1716
+ raw_pooler_output = self.model.pooler(
1717
+ hidden_states=hidden_states, pooling_metadata=pooling_metadata)
1718
+
1719
+ pooler_output: list[Optional[torch.Tensor]] = []
1720
+ for raw_output, seq_len, prompt_len in zip(
1721
+ raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):
1722
+
1723
+ output = raw_output.data if seq_len == prompt_len else None
1724
+ pooler_output.append(output)
1725
+
1726
+ return ModelRunnerOutput(
1727
+ req_ids=self.input_batch.req_ids,
1728
+ req_id_to_index=self.input_batch.req_id_to_index,
1729
+ sampled_token_ids=[],
1730
+ logprobs=None,
1731
+ prompt_logprobs_dict={},
1732
+ pooler_output=pooler_output,
1733
+ kv_connector_output=kv_connector_output,
1734
+ )
1735
+
1736
+ def _preprocess(
1737
+ self,
1738
+ scheduler_output: "SchedulerOutput",
1739
+ intermediate_tensors: Optional[IntermediateTensors] = None,
1740
+ ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor],
1741
+ Optional[torch.Tensor], torch.Tensor,
1742
+ Optional[IntermediateTensors], dict[str, Any]]:
1743
+
1744
+ num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
1745
+ if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
1746
+ and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH
1747
+ and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
1748
+ # Use CUDA graphs.
1749
+ # Add padding to the batch size.
1750
+ num_input_tokens = self.vllm_config.pad_for_cudagraph(
1751
+ num_scheduled_tokens)
1752
+ else:
1753
+ # Eager mode.
1754
+ # Pad tokens to multiple of tensor_parallel_size when
1755
+ # enabled collective fusion for SP
1756
+ tp_size = self.vllm_config.parallel_config.tensor_parallel_size
1757
+ if self.compilation_config.pass_config. \
1758
+ enable_sequence_parallelism and tp_size > 1:
1759
+ num_input_tokens = round_up(num_scheduled_tokens, tp_size)
1760
+ else:
1761
+ num_input_tokens = num_scheduled_tokens
1762
+
1763
+ # Padding for DP
1764
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
1765
+ num_input_tokens += num_pad
1766
+
1767
+ # _prepare_inputs may reorder the batch, so we must gather multi
1768
+ # modal outputs after that to ensure the correct order
1769
+ if (self.supports_mm_inputs and get_pp_group().is_first_rank
1770
+ and not self.model_config.is_encoder_decoder):
1771
+ # Run the multimodal encoder if any.
1772
+ self._execute_mm_encoder(scheduler_output)
1773
+ mm_embeds = self._gather_mm_embeddings(scheduler_output)
1774
+
1775
+ # NOTE(woosuk): To unify token ids and soft tokens (vision
1776
+ # embeddings), we always use embeddings (rather than token ids)
1777
+ # as input to the multimodal model, even when the input is text.
1778
+ inputs_embeds_scheduled = self.model.get_input_embeddings(
1779
+ input_ids=self.input_ids.gpu[:num_scheduled_tokens],
1780
+ multimodal_embeddings=mm_embeds or None,
1781
+ )
1782
+
1783
+ # TODO(woosuk): Avoid the copy. Optimize.
1784
+ self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
1785
+ inputs_embeds_scheduled)
1786
+
1787
+ input_ids = None
1788
+ inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
1789
+ model_kwargs = {
1790
+ **self._init_model_kwargs(num_scheduled_tokens),
1791
+ **self._extract_mm_kwargs(scheduler_output),
1792
+ }
1793
+ else:
1794
+ # For text-only models, we use token ids as input.
1795
+ # While it is possible to use embeddings as input just like the
1796
+ # multimodal models, it is not desirable for performance since
1797
+ # then the embedding layer is not included in the CUDA graph.
1798
+ input_ids = self.input_ids.gpu[:num_input_tokens]
1799
+ inputs_embeds = None
1800
+ model_kwargs = self._init_model_kwargs(num_input_tokens)
1801
+ if self.uses_mrope:
1802
+ positions = self.mrope_positions.gpu[:, :num_input_tokens]
1803
+ else:
1804
+ positions = self.positions.gpu[:num_input_tokens]
1805
+
1806
+ if get_pp_group().is_first_rank:
1807
+ intermediate_tensors = None
1808
+ else:
1809
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
1810
+ num_input_tokens, intermediate_tensors, True)
1811
+
1812
+ if (self.model_config.is_encoder_decoder
1813
+ and scheduler_output.scheduled_encoder_inputs):
1814
+ encoder_inputs = self._extract_encoder_inputs(scheduler_output)
1815
+ model_kwargs.update(encoder_inputs)
1816
+
1817
+ return (
1818
+ num_scheduled_tokens,
1819
+ num_input_tokens,
1820
+ num_tokens_across_dp,
1821
+ input_ids,
1822
+ inputs_embeds,
1823
+ positions,
1824
+ intermediate_tensors,
1825
+ model_kwargs,
1826
+ )
1827
+
1828
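For the eager-mode branch of `_preprocess`, the sequence-parallel padding is a simple round-up to a multiple of the tensor-parallel size. A small illustration (the `round_up` below mirrors the usual ceil-to-multiple arithmetic and is written out here only for the example):

def round_up(x: int, multiple: int) -> int:
    return ((x + multiple - 1) // multiple) * multiple

# With sequence-parallel fusion enabled and tp_size=4:
assert round_up(1030, 4) == 1032   # 1030 scheduled tokens -> 1032 input tokens
assert round_up(1032, 4) == 1032   # already aligned, no extra padding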
+ def _sample(
1829
+ self, logits: Optional[torch.Tensor],
1830
+ spec_decode_metadata: Optional[SpecDecodeMetadata]
1831
+ ) -> SamplerOutput:
1832
+ # Sample the next token and get logprobs if needed.
1833
+ sampling_metadata = self.input_batch.sampling_metadata
1834
+ if spec_decode_metadata is None:
1835
+ sampler_output = self.sampler(
1836
+ logits=logits,
1837
+ sampling_metadata=sampling_metadata,
1838
+ )
1839
+ else:
1840
+ # When indexing with a tensor (bonus_logits_indices), PyTorch
1841
+ # creates a new tensor with separate storage from the original
1842
+ # logits tensor. This means any in-place operations on bonus_logits
1843
+ # won't affect the original logits tensor.
1844
+ assert logits is not None
1845
+ bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
1846
+ sampler_output = self.sampler(
1847
+ logits=bonus_logits,
1848
+ sampling_metadata=sampling_metadata,
1849
+ )
1850
+ bonus_token_ids = sampler_output.sampled_token_ids
1851
+
1852
+ # Just like `bonus_logits`, `target_logits` is a new tensor with
1853
+ # separate storage from the original `logits` tensor. Therefore,
1854
+ # it is safe to update `target_logits` in place.
1855
+ target_logits = logits[spec_decode_metadata.target_logits_indices]
1856
+ output_token_ids = self.rejection_sampler(
1857
+ spec_decode_metadata,
1858
+ None, # draft_probs
1859
+ target_logits,
1860
+ bonus_token_ids,
1861
+ sampling_metadata,
1862
+ )
1863
+ sampler_output.sampled_token_ids = output_token_ids
1864
+ self._update_states_after_model_execute(output_token_ids)
1865
+
1866
+ return sampler_output
1867
+
1868
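The in-place-safety comments in `_sample` rely on a PyTorch rule: indexing with an integer tensor (advanced indexing) returns a copy with its own storage, not a view. A tiny check:

import torch

logits = torch.zeros(4, 8)
idx = torch.tensor([1, 3])
bonus = logits[idx]          # advanced indexing -> new storage
bonus.fill_(1.0)             # modifying the copy in place...
assert logits.sum() == 0     # ...leaves the original logits untouched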
+ def _bookkeeping_sync(
1869
+ self, scheduler_output: "SchedulerOutput",
1870
+ sampler_output: SamplerOutput, logits: Optional[torch.Tensor],
1871
+ hidden_states: torch.Tensor, num_scheduled_tokens: int
1872
+ ) -> tuple[
1873
+ dict[str, int],
1874
+ Optional[LogprobsLists],
1875
+ list[list[int]],
1876
+ dict[str, Optional[LogprobsTensors]],
1877
+ list[str],
1878
+ dict[str, int],
1879
+ list[int],
1880
+ ]:
1881
+ num_nans_in_logits = {}
1882
+ if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
1883
+ num_nans_in_logits = self._get_nans_in_logits(logits)
1884
+
1885
+ # TODO(woosuk): The following loop can be slow since it iterates over
1886
+ # the requests one by one. Optimize.
1887
+ discard_sampled_tokens_req_indices = []
1888
+ for i, req_id in enumerate(self.input_batch.req_ids):
1889
+ req_state = self.requests[req_id]
1890
+ seq_len = (req_state.num_computed_tokens +
1891
+ scheduler_output.num_scheduled_tokens[req_id])
1892
+ if seq_len < req_state.num_tokens:
1893
+ # Ignore the sampled token for partial prefills.
1894
+ # Rewind the generator state as if the token was not sampled.
1895
+ # This relies on cuda-specific torch-internal impl details
1896
+ generator = self.input_batch.generators.get(i)
1897
+ if generator is not None:
1898
+ generator.set_offset(generator.get_offset() - 4)
1899
+ # Record the index of the request that should not be sampled,
1900
+ # so that we could clear the sampled tokens before returning.
1901
+ discard_sampled_tokens_req_indices.append(i)
1902
+
1903
+ # Copy some objects so they don't get modified after returning.
1904
+ # This is important when using async scheduling.
1905
+ req_ids_output_copy = self.input_batch.req_ids.copy()
1906
+ req_id_to_index_output_copy = \
1907
+ self.input_batch.req_id_to_index.copy()
1908
+
1909
+ # NOTE: GPU -> CPU Sync happens here.
1910
+ # Move as many CPU operations as possible before this sync point.
1911
+ logprobs_tensors = sampler_output.logprobs_tensors
1912
+ logprobs_lists = logprobs_tensors.tolists() \
1913
+ if logprobs_tensors is not None else None
1914
+
1915
+ # Compute prompt logprobs if needed.
1916
+ prompt_logprobs_dict = self._get_prompt_logprobs_dict(
1917
+ hidden_states[:num_scheduled_tokens],
1918
+ scheduler_output.num_scheduled_tokens,
1919
+ )
1920
+
1921
+ num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
1922
+ sampled_token_ids = sampler_output.sampled_token_ids
1923
+ invalid_req_indices = []
1924
+ if not self.use_async_scheduling:
1925
+ # Get the valid generated tokens.
1926
+ max_gen_len = sampled_token_ids.shape[-1]
1927
+ if max_gen_len == 1:
1928
+ # No spec decode tokens.
1929
+ valid_sampled_token_ids = self._to_list(sampled_token_ids)
1930
+ else:
1931
+ # Includes spec decode tokens.
1932
+ valid_sampled_token_ids = self.rejection_sampler.parse_output(
1933
+ sampled_token_ids,
1934
+ self.input_batch.vocab_size,
1935
+ )
1936
+ # Mask out the sampled tokens that should not be sampled.
1937
+ for i in discard_sampled_tokens_req_indices:
1938
+ valid_sampled_token_ids[i].clear()
1939
+ else:
1940
+ valid_sampled_token_ids = []
1941
+ invalid_req_indices = list(discard_sampled_tokens_req_indices)
1942
+ invalid_req_indices_set = set(invalid_req_indices)
1943
+ assert sampled_token_ids.shape[-1] == 1
1944
+
1945
+ # Cache the sampled tokens on the GPU and avoid CPU sync.
1946
+ # These will be copied into input_ids in the next step
1947
+ # when preparing inputs.
1948
+ self.input_batch.prev_sampled_token_ids = \
1949
+ sampled_token_ids
1950
+ self.input_batch.prev_sampled_token_ids_invalid_indices = \
1951
+ invalid_req_indices_set
1952
+ self.input_batch.prev_req_id_to_index = {
1953
+ req_id: i
1954
+ for i, req_id in enumerate(self.input_batch.req_ids)
1955
+ if i not in invalid_req_indices_set
1956
+ }
1957
+
1958
+ # Cache the sampled tokens in the model runner, so that the scheduler
1959
+ # doesn't need to send them back.
1960
+ # NOTE(woosuk): As an exception, when using PP, the scheduler sends
1961
+ # the sampled tokens back, because there's no direct communication
1962
+ # between the first-stage worker and the last-stage worker.
1963
+ req_ids = self.input_batch.req_ids
1964
+ for req_idx in range(num_sampled_tokens):
1965
+ if self.use_async_scheduling:
1966
+ sampled_ids = [-1] if \
1967
+ req_idx not in invalid_req_indices_set else None
1968
+ else:
1969
+ sampled_ids = valid_sampled_token_ids[req_idx]
1970
+ if not sampled_ids:
1971
+ continue
1972
+
1973
+ start_idx = self.input_batch.num_tokens_no_spec[req_idx]
1974
+ end_idx = start_idx + len(sampled_ids)
1975
+ assert end_idx <= self.max_model_len, (
1976
+ "Sampled token IDs exceed the max model length. "
1977
+ f"Total number of tokens: {end_idx} > max_model_len: "
1978
+ f"{self.max_model_len}")
1979
+
1980
+ self.input_batch.token_ids_cpu[req_idx,
1981
+ start_idx:end_idx] = sampled_ids
1982
+ self.input_batch.num_tokens_no_spec[req_idx] = end_idx
1983
+ self.input_batch.num_tokens[req_idx] = end_idx
1984
+
1985
+ req_id = req_ids[req_idx]
1986
+ req_state = self.requests[req_id]
1987
+ req_state.output_token_ids.extend(sampled_ids)
1988
+
1989
+ return (
1990
+ num_nans_in_logits,
1991
+ logprobs_lists,
1992
+ valid_sampled_token_ids,
1993
+ prompt_logprobs_dict,
1994
+ req_ids_output_copy,
1995
+ req_id_to_index_output_copy,
1996
+ invalid_req_indices,
1997
+ )
1998
+
1999
+ @torch.inference_mode()
2000
+ def execute_model(
2001
+ self,
2002
+ scheduler_output: "SchedulerOutput",
2003
+ intermediate_tensors: Optional[IntermediateTensors] = None,
2004
+ ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
2005
+ with record_function_or_nullcontext("Preprocess"):
2006
+ self._update_states(scheduler_output)
2007
+ if not scheduler_output.total_num_scheduled_tokens:
2008
+ if not has_kv_transfer_group():
2009
+ # Return empty ModelRunnerOutput if there's no work to do.
2010
+ return EMPTY_MODEL_RUNNER_OUTPUT
2011
+ return self.kv_connector_no_forward(scheduler_output,
2012
+ self.vllm_config)
2013
+ if self.cache_config.kv_sharing_fast_prefill:
2014
+ assert not self.input_batch.num_prompt_logprobs, (
2015
+ "--kv-sharing-fast-prefill produces incorrect logprobs for "
2016
+                    "prompt tokens, please disable it when the requests"
2017
+ " need prompt logprobs")
2018
+
2019
+ if self.prepare_inputs_event is not None:
2020
+ # Ensure prior step has finished with reused CPU tensors.
2021
+ self.prepare_inputs_event.synchronize()
2022
+ try:
2023
+ # Prepare the decoder inputs.
2024
+ (attn_metadata, logits_indices, spec_decode_metadata,
2025
+ num_scheduled_tokens_np, spec_decode_common_attn_metadata,
2026
+ max_query_len) = self._prepare_inputs(scheduler_output)
2027
+
2028
+ finally:
2029
+ if self.prepare_inputs_event is not None:
2030
+ self.prepare_inputs_event.record()
2031
+
2032
+ (
2033
+ num_scheduled_tokens,
2034
+ num_input_tokens,
2035
+ num_tokens_across_dp,
2036
+ input_ids,
2037
+ inputs_embeds,
2038
+ positions,
2039
+ intermediate_tensors,
2040
+ model_kwargs,
2041
+ ) = self._preprocess(scheduler_output, intermediate_tensors)
2042
+
2043
+ uniform_decode = (max_query_len
2044
+ == self.uniform_decode_query_len) and (
2045
+ num_scheduled_tokens
2046
+ == self.input_batch.num_reqs * max_query_len)
2047
+ batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
2048
+ uniform_decode=uniform_decode)
2049
+ cudagraph_runtime_mode, batch_descriptor = \
2050
+ self.cudagraph_dispatcher.dispatch(batch_descriptor)
2051
+
2052
+ # Run the model.
2053
+ # Use persistent buffers for CUDA graphs.
2054
+ with (set_forward_context(
2055
+ attn_metadata,
2056
+ self.vllm_config,
2057
+ num_tokens=num_input_tokens,
2058
+ num_tokens_across_dp=num_tokens_across_dp,
2059
+ cudagraph_runtime_mode=cudagraph_runtime_mode,
2060
+ batch_descriptor=batch_descriptor,
2061
+ ), record_function_or_nullcontext("Forward"),
2062
+ self.maybe_get_kv_connector_output(scheduler_output) as
2063
+ kv_connector_output):
2064
+ model_output = self.model(
2065
+ input_ids=input_ids,
2066
+ positions=positions,
2067
+ intermediate_tensors=intermediate_tensors,
2068
+ inputs_embeds=inputs_embeds,
2069
+ **model_kwargs,
2070
+ )
2071
+
2072
+ with record_function_or_nullcontext("Postprocess"):
2073
+ if self.use_aux_hidden_state_outputs:
2074
+ hidden_states, aux_hidden_states = model_output
2075
+ else:
2076
+ hidden_states = model_output
2077
+ aux_hidden_states = None
2078
+
2079
+ # Broadcast PP output for external_launcher (torchrun)
2080
+ # to make sure we are synced across pp ranks
2081
+            # TODO: Support overlapping micro-batches
2082
+ # https://github.com/vllm-project/vllm/issues/18019
2083
+ broadcast_pp_output = \
2084
+ self.parallel_config.distributed_executor_backend \
2085
+ == "external_launcher" and len(get_pp_group().ranks) > 0
2086
+ if not get_pp_group().is_last_rank:
2087
+ # For mid-pipeline stages, return the hidden states.
2088
+ assert isinstance(hidden_states, IntermediateTensors)
2089
+ if not broadcast_pp_output:
2090
+ hidden_states.kv_connector_output = kv_connector_output
2091
+ return hidden_states
2092
+ get_pp_group().send_tensor_dict(
2093
+ hidden_states.tensors, all_gather_group=get_tp_group())
2094
+ logits = None
2095
+ else:
2096
+ if self.is_pooling_model:
2097
+ return self._pool(hidden_states, num_scheduled_tokens,
2098
+ num_scheduled_tokens_np,
2099
+ kv_connector_output)
2100
+
2101
+ sample_hidden_states = hidden_states[logits_indices]
2102
+ logits = self.model.compute_logits(sample_hidden_states, None)
2103
+ if broadcast_pp_output:
2104
+ model_output_broadcast_data = {
2105
+ "logits": logits.contiguous(),
2106
+ } if logits is not None else {}
2107
+ model_output_broadcast_data = get_pp_group(
2108
+ ).broadcast_tensor_dict(model_output_broadcast_data,
2109
+ src=len(get_pp_group().ranks) - 1)
2110
+ assert model_output_broadcast_data is not None
2111
+ logits = model_output_broadcast_data["logits"]
2112
+
2113
+ # Apply structured output bitmasks if present
2114
+ if scheduler_output.grammar_bitmask is not None:
2115
+ self.apply_grammar_bitmask(scheduler_output, logits)
2116
+
2117
+ with record_function_or_nullcontext("Sample"):
2118
+ sampler_output = self._sample(logits, spec_decode_metadata)
2119
+
2120
+ with record_function_or_nullcontext("Bookkeep"):
2121
+ (
2122
+ num_nans_in_logits,
2123
+ logprobs_lists,
2124
+ valid_sampled_token_ids,
2125
+ prompt_logprobs_dict,
2126
+ req_ids_output_copy,
2127
+ req_id_to_index_output_copy,
2128
+ invalid_req_indices,
2129
+ ) = self._bookkeeping_sync(scheduler_output, sampler_output,
2130
+ logits, hidden_states,
2131
+ num_scheduled_tokens)
2132
+
2133
+ if self.speculative_config:
2134
+ assert spec_decode_common_attn_metadata is not None
2135
+ with record_function_or_nullcontext("Draft"):
2136
+ self._draft_token_ids = self.propose_draft_token_ids(
2137
+ scheduler_output,
2138
+ valid_sampled_token_ids,
2139
+ self.input_batch.sampling_metadata,
2140
+ hidden_states,
2141
+ sample_hidden_states,
2142
+ aux_hidden_states,
2143
+ spec_decode_metadata,
2144
+ spec_decode_common_attn_metadata,
2145
+ )
2146
+
2147
+ with record_function_or_nullcontext("EPLB"):
2148
+ self.eplb_step()
2149
+
2150
+ output = ModelRunnerOutput(
2151
+ req_ids=req_ids_output_copy,
2152
+ req_id_to_index=req_id_to_index_output_copy,
2153
+ sampled_token_ids=valid_sampled_token_ids,
2154
+ logprobs=logprobs_lists,
2155
+ prompt_logprobs_dict=prompt_logprobs_dict,
2156
+ pooler_output=[],
2157
+ kv_connector_output=kv_connector_output,
2158
+ num_nans_in_logits=num_nans_in_logits,
2159
+ )
2160
+
2161
+ if not self.use_async_scheduling:
2162
+ return output
2163
+
2164
+ return AsyncGPUModelRunnerOutput(
2165
+ model_runner_output=output,
2166
+ sampled_token_ids=sampler_output.sampled_token_ids,
2167
+ invalid_req_indices=invalid_req_indices,
2168
+ async_output_copy_stream=self.async_output_copy_stream,
2169
+ )
2170
+
2171
+ def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
2172
+ if self._draft_token_ids is None:
2173
+ return None
2174
+ req_ids = self.input_batch.req_ids
2175
+ if isinstance(self._draft_token_ids, torch.Tensor):
2176
+ draft_token_ids = self._draft_token_ids.tolist()
2177
+ else:
2178
+ draft_token_ids = self._draft_token_ids
2179
+ self._draft_token_ids = None
2180
+ return DraftTokenIds(req_ids, draft_token_ids)
2181
+
2182
+ def propose_draft_token_ids(
2183
+ self,
2184
+ scheduler_output: "SchedulerOutput",
2185
+ sampled_token_ids: list[list[int]],
2186
+ sampling_metadata: SamplingMetadata,
2187
+ hidden_states: torch.Tensor,
2188
+ sample_hidden_states: torch.Tensor,
2189
+ aux_hidden_states: Optional[torch.Tensor],
2190
+ spec_decode_metadata: Optional[SpecDecodeMetadata],
2191
+ common_attn_metadata: CommonAttentionMetadata,
2192
+ ) -> Union[list[list[int]], torch.Tensor]:
2193
+ num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
2194
+ if self.speculative_config.method == "ngram":
2195
+ assert isinstance(self.drafter, NgramProposer)
2196
+ draft_token_ids = self.propose_ngram_draft_token_ids(
2197
+ sampled_token_ids)
2198
+ elif self.speculative_config.method == "medusa":
2199
+ assert isinstance(self.drafter, MedusaProposer)
2200
+ if sample_hidden_states.shape[0] == len(sampled_token_ids):
2201
+ # The input to the target model does not include draft tokens.
2202
+ hidden_states = sample_hidden_states
2203
+ else:
2204
+ indices = []
2205
+ offset = 0
2206
+ for num_draft, tokens in zip(
2207
+ spec_decode_metadata.num_draft_tokens,
2208
+ sampled_token_ids):
2209
+ indices.append(offset + len(tokens) - 1)
2210
+ offset += num_draft + 1
2211
+ indices = torch.tensor(indices, device=self.device)
2212
+ hidden_states = sample_hidden_states[indices]
2213
+
2214
+ draft_token_ids = self.drafter.propose(
2215
+ target_hidden_states=hidden_states,
2216
+ sampling_metadata=sampling_metadata,
2217
+ )
2218
+ elif self.speculative_config.use_eagle():
2219
+ assert isinstance(self.drafter, EagleProposer)
2220
+ # TODO(woosuk): Refactor the loop.
2221
+ req_ids = self.input_batch.req_ids
2222
+ next_token_ids: list[int] = []
2223
+ for i, token_ids in enumerate(sampled_token_ids):
2224
+ if token_ids:
2225
+ # Common case.
2226
+ next_token_id = token_ids[-1]
2227
+ else:
2228
+ # Partial prefill (rare case).
2229
+ # Get the next token id from the request state.
2230
+ req_id = req_ids[i]
2231
+ req_state = self.requests[req_id]
2232
+ seq_len = (req_state.num_computed_tokens +
2233
+ scheduler_output.num_scheduled_tokens[req_id])
2234
+ next_token_id = req_state.get_token_id(seq_len)
2235
+ next_token_ids.append(next_token_id)
2236
+ next_token_ids = torch.tensor(next_token_ids,
2237
+ dtype=torch.int32,
2238
+ device=self.device)
2239
+
2240
+ if spec_decode_metadata is None:
2241
+ # input_ids can be None for multimodal models.
2242
+ target_token_ids = self.input_ids.gpu[:num_scheduled_tokens]
2243
+ # TODO(woosuk): Support M-RoPE.
2244
+ target_positions = self.positions.gpu[:num_scheduled_tokens]
2245
+ if self.use_aux_hidden_state_outputs:
2246
+ target_hidden_states = torch.cat(
2247
+ [h[:num_scheduled_tokens] for h in aux_hidden_states],
2248
+ dim=-1)
2249
+ else:
2250
+ target_hidden_states = hidden_states[:num_scheduled_tokens]
2251
+ else:
2252
+ # TODO(woosuk): Refactor this.
2253
+ num_draft_tokens = spec_decode_metadata.num_draft_tokens
2254
+ num_rejected_tokens = [
2255
+ n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
2256
+ for i, n in enumerate(num_draft_tokens)
2257
+ ]
2258
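                # Worked example (assumed numbers): with
                # num_draft_tokens = [3, 0, 2] and sampled lengths [2, 1, 3],
                # "n + 1 - len(sampled)" gives
                # num_rejected_tokens = [3 + 1 - 2, 0, 2 + 1 - 3] = [2, 0, 0].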
+ num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens,
2259
+ dtype=torch.int32)
2260
+ common_attn_metadata, token_indices =\
2261
+ self.drafter.prepare_inputs(
2262
+ common_attn_metadata, num_rejected_tokens_cpu)
2263
+
2264
+ target_token_ids = self.input_ids.gpu[token_indices]
2265
+ # TODO(woosuk): Support M-RoPE.
2266
+ target_positions = self.positions.gpu[token_indices]
2267
+ if self.use_aux_hidden_state_outputs:
2268
+ target_hidden_states = torch.cat(
2269
+ [h[token_indices] for h in aux_hidden_states], dim=-1)
2270
+ else:
2271
+ target_hidden_states = hidden_states[token_indices]
2272
+ mm_embeds = None
2273
+ if self.supports_mm_inputs:
2274
+ mm_embeds = self._gather_mm_embeddings(scheduler_output,
2275
+ shift_computed_tokens=1)
2276
+
2277
+ draft_token_ids = self.drafter.propose(
2278
+ target_token_ids=target_token_ids,
2279
+ target_positions=target_positions,
2280
+ target_hidden_states=target_hidden_states,
2281
+ next_token_ids=next_token_ids,
2282
+ sampling_metadata=sampling_metadata,
2283
+ common_attn_metadata=common_attn_metadata,
2284
+ mm_embeds=mm_embeds,
2285
+ )
2286
+ return draft_token_ids
2287
+
2288
+ def propose_ngram_draft_token_ids(
2289
+ self,
2290
+ sampled_token_ids: list[list[int]],
2291
+ ) -> list[list[int]]:
2292
+ # TODO(woosuk): Optimize.
2293
+ req_ids = self.input_batch.req_ids
2294
+ draft_token_ids: list[list[int]] = []
2295
+ for i, sampled_ids in enumerate(sampled_token_ids):
2296
+ num_sampled_ids = len(sampled_ids)
2297
+ if not num_sampled_ids:
2298
+ # Skip speculative decoding.
2299
+ draft_token_ids.append([])
2300
+ continue
2301
+
2302
+ # Skip requests that require sampling parameters that are not
2303
+ # supported with speculative decoding.
2304
+ req_id = req_ids[i]
2305
+ if req_id in self.input_batch.spec_decode_unsupported_reqs:
2306
+ draft_token_ids.append([])
2307
+ continue
2308
+
2309
+ num_tokens = self.input_batch.num_tokens_no_spec[i]
2310
+ if num_tokens >= self.max_model_len:
2311
+ # Skip requests that have already reached the max model length.
2312
+ draft_token_ids.append([])
2313
+ continue
2314
+
2315
+ drafter_output = self.drafter.propose(
2316
+ self.input_batch.token_ids_cpu[i, :num_tokens])
2317
+ if drafter_output is None or len(drafter_output) == 0:
2318
+ draft_token_ids.append([])
2319
+ else:
2320
+ draft_token_ids.append(drafter_output.tolist())
2321
+ return draft_token_ids
2322
+
2323
+ def update_config(self, overrides: dict[str, Any]) -> None:
2324
+ allowed_config_names = {"load_config", "model_config"}
2325
+ for config_name, config_overrides in overrides.items():
2326
+ assert config_name in allowed_config_names, \
2327
+ f"Config `{config_name}` not supported. " \
2328
+ f"Allowed configs: {allowed_config_names}"
2329
+ config = getattr(self, config_name)
2330
+ new_config = update_config(config, config_overrides)
2331
+ setattr(self, config_name, new_config)
2332
+
2333
+ def load_model(self, eep_scale_up: bool = False) -> None:
2334
+ """
2335
+ Args:
2336
+ eep_scale_up: the model loading is for elastic EP scale up.
2337
+ """
2338
+ logger.info("Starting to load model %s...", self.model_config.model)
2339
+ if eep_scale_up:
2340
+ from vllm.distributed.parallel_state import get_ep_group
2341
+ num_local_physical_experts = torch.empty(1,
2342
+ dtype=torch.int32,
2343
+ device="cpu")
2344
+ torch.distributed.broadcast(num_local_physical_experts,
2345
+ group=get_ep_group().cpu_group,
2346
+ group_src=0)
2347
+ num_local_physical_experts = int(num_local_physical_experts.item())
2348
+ new_ep_size = get_ep_group().world_size
2349
+ global_expert_load, old_global_expert_indices = (
2350
+ EplbState.recv_state())
2351
+ num_logical_experts = global_expert_load.shape[1]
2352
+ self.parallel_config.eplb_config.num_redundant_experts = (
2353
+ num_local_physical_experts * new_ep_size - num_logical_experts)
2354
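            # Worked example (assumed sizes): 8 local physical experts per
            # rank and a new EP world size of 4 give 32 physical slots; with
            # 30 logical experts, 32 - 30 = 2 experts are redundant.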
+ assert old_global_expert_indices.shape[
2355
+ 1] % num_local_physical_experts == 0
2356
+ old_ep_size = old_global_expert_indices.shape[
2357
+ 1] // num_local_physical_experts
2358
+ rank_mapping = {
2359
+ old_ep_rank: old_ep_rank
2360
+ for old_ep_rank in range(old_ep_size)
2361
+ }
2362
+ else:
2363
+ global_expert_load = None
2364
+ old_global_expert_indices = None
2365
+ rank_mapping = None
2366
+
2367
+ with DeviceMemoryProfiler() as m:
2368
+ time_before_load = time.perf_counter()
2369
+ model_loader = get_model_loader(self.load_config)
2370
+ logger.info("Loading model from scratch...")
2371
+ self.model = model_loader.load_model(
2372
+ vllm_config=self.vllm_config, model_config=self.model_config)
2373
+ if self.lora_config:
2374
+ self.model = self.load_lora_model(self.model,
2375
+ self.model_config,
2376
+ self.scheduler_config,
2377
+ self.lora_config,
2378
+ self.device)
2379
+ if hasattr(self, "drafter"):
2380
+ logger.info("Loading drafter model...")
2381
+ self.drafter.load_model(self.model)
2382
+ if self.use_aux_hidden_state_outputs:
2383
+ if supports_eagle3(self.model):
2384
+ self.model.set_aux_hidden_state_layers(
2385
+ self.model.get_eagle3_aux_hidden_state_layers())
2386
+ else:
2387
+ raise RuntimeError(
2388
+ "Model does not support EAGLE3 interface but "
2389
+ "aux_hidden_state_outputs was requested")
2390
+ time_after_load = time.perf_counter()
2391
+ self.model_memory_usage = m.consumed_memory
2392
+ logger.info("Model loading took %.4f GiB and %.6f seconds",
2393
+ self.model_memory_usage / GiB_bytes,
2394
+ time_after_load - time_before_load)
2395
+ prepare_communication_buffer_for_model(self.model)
2396
+
2397
+ if is_mixture_of_experts(
2398
+ self.model) and self.parallel_config.enable_eplb:
2399
+ logger.info("EPLB is enabled for model %s.",
2400
+ self.model_config.model)
2401
+ self.eplb_state = EplbState.build(
2402
+ self.model,
2403
+ self.device,
2404
+ self.parallel_config,
2405
+ global_expert_load,
2406
+ old_global_expert_indices,
2407
+ rank_mapping,
2408
+ )
2409
+
2410
+ if (
2411
+ self.vllm_config.compilation_config.level == \
2412
+ CompilationLevel.DYNAMO_AS_IS and supports_dynamo()
2413
+ ):
2414
+ backend = self.vllm_config.compilation_config.init_backend(
2415
+ self.vllm_config)
2416
+ compilation_counter.dynamo_as_is_count += 1
2417
+ self.model.compile(
2418
+ fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
2419
+ backend=backend)
2420
+ return
2421
+ # for other compilation levels, cudagraph behavior is controlled by
2422
+        # CUDAGraphWrapper and CudagraphDispatcher of vllm.
2423
+
2424
+ # wrap the model with full cudagraph wrapper if needed.
2425
+ if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
2426
+ self.model = CUDAGraphWrapper(self.model,
2427
+ self.vllm_config,
2428
+ runtime_mode=CUDAGraphMode.FULL)
2429
+
2430
+ def reload_weights(self) -> None:
2431
+ assert getattr(self, "model", None) is not None, \
2432
+ "Cannot reload weights before model is loaded."
2433
+ model_loader = get_model_loader(self.load_config)
2434
+ logger.info("Reloading weights inplace...")
2435
+ model = self.get_model()
2436
+ model_loader.load_weights(model, model_config=self.model_config)
2437
+
2438
+ def save_tensorized_model(
2439
+ self,
2440
+ tensorizer_config: "TensorizerConfig",
2441
+ ) -> None:
2442
+ model = self.get_model()
2443
+ TensorizerLoader.save_model(
2444
+ model,
2445
+ tensorizer_config=tensorizer_config,
2446
+ model_config=self.model_config,
2447
+ )
2448
+
2449
+ def _get_prompt_logprobs_dict(
2450
+ self,
2451
+ hidden_states: torch.Tensor,
2452
+ num_scheduled_tokens: dict[str, int],
2453
+ ) -> dict[str, Optional[LogprobsTensors]]:
2454
+ num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
2455
+ if not num_prompt_logprobs_dict:
2456
+ return {}
2457
+
2458
+ in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
2459
+ prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
2460
+
2461
+ # Since prompt logprobs are a rare feature, prioritize simple,
2462
+ # maintainable loop over optimal performance.
2463
+ completed_prefill_reqs = []
2464
+ for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items():
2465
+ num_tokens = num_scheduled_tokens[req_id]
2466
+
2467
+ # Get metadata for this request.
2468
+ request = self.requests[req_id]
2469
+ num_prompt_tokens = len(request.prompt_token_ids)
2470
+ prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
2471
+ self.device, non_blocking=True)
2472
+
2473
+ # Set up target LogprobsTensors object.
2474
+ logprobs_tensors = in_progress_dict.get(req_id)
2475
+ if not logprobs_tensors:
2476
+ # Create empty logprobs CPU tensors for the entire prompt.
2477
+ # If chunked, we'll copy in slice by slice.
2478
+ logprobs_tensors = LogprobsTensors.empty_cpu(
2479
+ num_prompt_tokens - 1, num_prompt_logprobs + 1)
2480
+ in_progress_dict[req_id] = logprobs_tensors
2481
+
2482
+ # Determine number of logits to retrieve.
2483
+ start_idx = request.num_computed_tokens
2484
+ start_tok = start_idx + 1
2485
+ num_remaining_tokens = num_prompt_tokens - start_tok
2486
+ if num_tokens <= num_remaining_tokens:
2487
+ # This is a chunk, more tokens remain.
2488
+ # In the == case, there are no more prompt logprobs to produce
2489
+ # but we want to defer returning them to the next step where we
2490
+ # have new generated tokens to return.
2491
+ num_logits = num_tokens
2492
+ else:
2493
+ # This is the last chunk of prompt tokens to return.
2494
+ num_logits = num_remaining_tokens
2495
+ completed_prefill_reqs.append(req_id)
2496
+ prompt_logprobs_dict[req_id] = logprobs_tensors
2497
+
2498
+ if num_logits <= 0:
2499
+ # This can happen for the final chunk if we prefilled exactly
2500
+ # (num_prompt_tokens - 1) tokens for this request in the prior
2501
+ # step. There are no more prompt logprobs to produce.
2502
+ continue
2503
+
2504
+ # Get the logits corresponding to this req's prompt tokens.
2505
+ # If this is a partial request (i.e. chunked prefill),
2506
+ # then there is prompt logprob generated for each index.
2507
+ req_idx = self.input_batch.req_id_to_index[req_id]
2508
+ offset = self.query_start_loc.np[req_idx].item()
2509
+ prompt_hidden_states = hidden_states[offset:offset + num_logits]
2510
+ logits = self.model.compute_logits(prompt_hidden_states, None)
2511
+
2512
+ # Get the "target" tokens for each index. For prompt at index i,
2513
+ # the token at prompt index i+1 is the "sampled" token we want
2514
+ # to gather the logprob for.
2515
+ tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits]
2516
+
2517
+ # Compute prompt logprobs.
2518
+ logprobs = self.sampler.compute_logprobs(logits)
2519
+ token_ids, logprobs, ranks = self.sampler.gather_logprobs(
2520
+ logprobs, num_prompt_logprobs, tgt_token_ids)
2521
+
2522
+ # Transfer GPU->CPU async.
2523
+ chunk_slice = slice(start_idx, start_idx + num_logits)
2524
+ logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
2525
+ token_ids, non_blocking=True)
2526
+ logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
2527
+ non_blocking=True)
2528
+ logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
2529
+ ranks, non_blocking=True)
2530
+
2531
+ # Remove requests that have completed prefill from the batch
2532
+ # num_prompt_logprobs_dict.
2533
+ for req_id in completed_prefill_reqs:
2534
+ del num_prompt_logprobs_dict[req_id]
2535
+ del in_progress_dict[req_id]
2536
+
2537
+ # Must synchronize the non-blocking GPU->CPU transfers.
2538
+ if prompt_logprobs_dict:
2539
+ self._sync_device()
2540
+
2541
+ return prompt_logprobs_dict
2542
+
2543
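The chunk bookkeeping in `_get_prompt_logprobs_dict` can be traced with assumed sizes: a 100-token prompt, 40 prompt tokens already computed, and 32 tokens scheduled this step.

num_prompt_tokens = 100
start_idx = 40                        # request.num_computed_tokens
num_tokens = 32                       # scheduled for this request this step
start_tok = start_idx + 1             # 41: first target token to score
num_remaining_tokens = num_prompt_tokens - start_tok          # 59
num_logits = num_tokens               # 32 <= 59, so this is still a chunk

chunk_slice = slice(start_idx, start_idx + num_logits)        # rows 40..71
# Rows 40..71 of the per-request CPU tensors receive the logprobs of prompt
# tokens 41..72; the request completes its prefill in a later step.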
+ def _get_nans_in_logits(
2544
+ self,
2545
+ logits: Optional[torch.Tensor],
2546
+ ) -> dict[str, int]:
2547
+ try:
2548
+ if logits is None:
2549
+ return {req_id: 0 for req_id in self.input_batch.req_ids}
2550
+
2551
+ num_nans_in_logits = {}
2552
+ num_nans_for_index = logits.isnan().sum(dim=-1).cpu().numpy()
2553
+ for req_id in self.input_batch.req_ids:
2554
+ req_index = self.input_batch.req_id_to_index[req_id]
2555
+ num_nans_in_logits[req_id] = (
2556
+ int(num_nans_for_index[req_index])
2557
+ if num_nans_for_index is not None
2558
+ and req_index < logits.shape[0] else 0)
2559
+ return num_nans_in_logits
2560
+ except IndexError:
2561
+ return {}
2562
+
2563
+ @contextmanager
2564
+ def maybe_randomize_inputs(self, input_ids: torch.Tensor):
2565
+ """
2566
+ Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
2567
+ This is to help balance expert-selection
2568
+ - during profile_run
2569
+ - during DP rank dummy run
2570
+ """
2571
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
2572
+ randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
2573
+ if not randomize_inputs:
2574
+ yield
2575
+ else:
2576
+ import functools
2577
+
2578
+ @functools.cache
2579
+ def rand_input_ids() -> torch.Tensor:
2580
+ return torch.randint_like(
2581
+ self.input_ids.gpu,
2582
+ low=0,
2583
+ high=self.model_config.get_vocab_size(),
2584
+ dtype=input_ids.dtype)
2585
+
2586
+ logger.debug_once("Randomizing dummy data for DP Rank")
2587
+ input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
2588
+ non_blocking=True)
2589
+ yield
2590
+ input_ids.fill_(0)
2591
+
2592
+ def _get_mm_dummy_batch(
2593
+ self,
2594
+ modality: str,
2595
+ max_items_per_batch: int,
2596
+ ) -> BatchedTensorInputs:
2597
+ """Dummy data for profiling and precompiling multimodal models."""
2598
+ assert self.mm_budget is not None
2599
+
2600
+ dummy_decoder_data = self.mm_registry.get_decoder_dummy_data(
2601
+ model_config=self.model_config,
2602
+ seq_len=self.max_num_tokens,
2603
+ mm_counts={modality: 1},
2604
+ cache=self.mm_budget.cache,
2605
+ )
2606
+ dummy_mm_data = dummy_decoder_data.multi_modal_data
2607
+
2608
+ # Result in the maximum GPU consumption of the model
2609
+ dummy_mm_item = dummy_mm_data[modality][0]
2610
+ dummy_mm_items = [dummy_mm_item] * max_items_per_batch
2611
+
2612
+ return next(mm_kwargs_group
2613
+ for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
2614
+ dummy_mm_items,
2615
+ device=self.device,
2616
+ pin_memory=self.pin_memory,
2617
+ ))
2618
+
2619
+ @torch.inference_mode()
2620
+ def _dummy_run(
2621
+ self,
2622
+ num_tokens: int,
2623
+ cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
2624
+ force_attention: bool = False,
2625
+ uniform_decode: bool = False,
2626
+ skip_eplb: bool = False,
2627
+ is_profile: bool = False,
2628
+ create_mixed_batch: bool = False,
2629
+ remove_lora: bool = True,
2630
+ ) -> tuple[torch.Tensor, torch.Tensor]:
2631
+ """
2632
+ Run a dummy forward pass to warm up/profile run or capture the
2633
+ CUDA graph for the model.
2634
+
2635
+ Args:
2636
+ num_tokens: Number of tokens to run the dummy forward pass.
2637
+ cudagraph_runtime_mode: used to control the behavior.
2638
+ - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run
2639
+ - CUDAGraphMode.PIECEWISE: Piecewise cudagraph.
2640
+ - CUDAGraphMode.FULL: Full cudagraph, attention metadata is
2641
+ needed.
2642
+ force_attention: If True, always create attention metadata. Used to
2643
+ warm up attention backend when mode is NONE.
2644
+ uniform_decode: If True, the batch is a uniform decode batch.
2645
+ skip_eplb: If True, skip EPLB state update.
2646
+ is_profile: If True, this is a profile run.
2647
+ create_mixed_batch: If True, create a mixed batch with both decode
2648
+ (1 token) and prefill (multiple tokens) requests.
2649
+ remove_lora: If False, dummy LoRAs are not destroyed after the run
2650
+ """
2651
+ assert cudagraph_runtime_mode in {
2652
+ CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
2653
+ }
2654
+
2655
+ # Padding for DP
2656
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
2657
+ num_tokens += num_pad
2658
+
2659
+ # If cudagraph_mode.decode_mode() == FULL and
2660
+        # cudagraph_mode.separate_routine(). This means that we are using
2661
+ # different graphs and/or modes for mixed prefill-decode batches vs.
2662
+ # uniform decode batches. A uniform decode batch means that all
2663
+ # requests have identical query length, except a potential virtual
2664
+        # request (shorter) in the batch that accounts for padding.
2665
+ # Uniform decode batch could either be common pure decode, where
2666
+ # max_query_len == 1, or speculative decode, where
2667
+ # max_query_len == 1 + num_spec_decode_tokens.
2668
+
2669
+ # When setting max_query_len = 1, we switch to and capture the optimized
2670
+ # routine of FA2 for pure decode, i.e., Flashdecode + an optimization
2671
+ # for GQA/MQA.
2672
+ max_query_len = self.uniform_decode_query_len if uniform_decode else \
2673
+ num_tokens
2674
+
2675
+ # Set num_scheduled_tokens based on num_tokens and max_num_seqs
2676
+ # for dummy run with LoRA so that the num_reqs collectively
2677
+ # has num_tokens in total.
2678
+ assert num_tokens <= self.scheduler_config.max_num_batched_tokens
2679
+ max_num_reqs = self.scheduler_config.max_num_seqs
2680
+ if create_mixed_batch:
2681
+ assert not uniform_decode
2682
+ # Create mixed batch:
2683
+ # first half decode tokens, second half one prefill
2684
+ num_decode_tokens = num_tokens // 2
2685
+ num_prefill_tokens = num_tokens - num_decode_tokens
2686
+ num_reqs = num_decode_tokens + 1
2687
+
2688
+ # Create decode requests (1 token each) followed by prefill request
2689
+ num_scheduled_tokens_list = [1] * num_decode_tokens + [
2690
+ num_prefill_tokens
2691
+ ]
2692
+ # Note: Overriding max_query_len to be the prefill tokens
2693
+ max_query_len = num_prefill_tokens
2694
+ elif uniform_decode:
2695
+ num_reqs = num_tokens // max_query_len
2696
+ assert num_reqs <= max_num_reqs, \
2697
+ "Do not capture num_reqs > max_num_reqs for uniform batch"
2698
+ num_scheduled_tokens_list = [max_query_len] * num_reqs
2699
+ if num_tokens % max_query_len != 0:
2700
+ num_scheduled_tokens_list[-1] += num_tokens % max_query_len
2701
+ else:
2702
+ num_reqs = min(num_tokens, max_num_reqs)
2703
+ min_tokens_per_req = num_tokens // num_reqs
2704
+ num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
2705
+ num_scheduled_tokens_list[-1] += num_tokens % num_reqs
2706
+
2707
+ assert sum(num_scheduled_tokens_list) == num_tokens
2708
+ assert len(num_scheduled_tokens_list) == num_reqs
2709
+ num_scheduled_tokens = np.array(num_scheduled_tokens_list,
2710
+ dtype=np.int32)
2711
+
2712
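        # Worked example (assumed sizes) for the default branch above:
        # num_tokens=10, max_num_reqs=4 -> num_reqs=4, min_tokens_per_req=2,
        # num_scheduled_tokens_list=[2, 2, 2, 2]; the last entry absorbs the
        # remainder 10 % 4 = 2, giving [2, 2, 2, 4], which sums back to 10.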
+ attn_metadata: Optional[dict[str, Any]] = None
2713
+
2714
+ # If force_attention is True, we always capture attention. Otherwise,
2715
+ # it only happens for cudagraph_runtime_mode=FULL.
2716
+ if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
2717
+ attn_metadata = {}
2718
+
2719
+ if create_mixed_batch:
2720
+ # In the mixed batch mode (used for FI warmup), we use
2721
+ # shorter sequence lengths to run faster.
2722
+ # TODO(luka) better system for describing dummy batches
2723
+ seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
2724
+ else:
2725
+ # Make sure max_model_len is used at the graph capture time.
2726
+ seq_lens = self.max_model_len
2727
+ self.seq_lens.np[:num_reqs] = seq_lens
2728
+ self.seq_lens.np[num_reqs:] = 0
2729
+ self.seq_lens.copy_to_gpu()
2730
+
2731
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
2732
+ self.kv_cache_config.kv_cache_groups):
2733
+ common_attn_metadata = CommonAttentionMetadata(
2734
+ query_start_loc=self.query_start_loc.gpu[:num_reqs + 1],
2735
+ query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
2736
+ 1],
2737
+ seq_lens=self.seq_lens.gpu[:num_reqs],
2738
+ seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
2739
+ num_computed_tokens_cpu=self.input_batch.
2740
+ num_computed_tokens_cpu_tensor[:num_reqs],
2741
+ num_reqs=num_reqs,
2742
+ num_actual_tokens=num_tokens,
2743
+ max_query_len=max_query_len,
2744
+ max_seq_len=self.max_model_len,
2745
+ block_table_tensor=self.input_batch.block_table[
2746
+ kv_cache_group_id].get_device_tensor()[:num_reqs],
2747
+ slot_mapping=self.input_batch.
2748
+ block_table[kv_cache_group_id].slot_mapping[:num_tokens],
2749
+ causal=True)
2750
+
2751
+ for attn_group in self.attn_groups[kv_cache_group_id]:
2752
+ attn_metadata_i = attn_group.metadata_builder\
2753
+ .build_for_cudagraph_capture(common_attn_metadata)
2754
+ for layer_name in kv_cache_group_spec.layer_names:
2755
+ attn_metadata[layer_name] = attn_metadata_i
2756
+
2757
+ with self.maybe_dummy_run_with_lora(self.lora_config,
2758
+ num_scheduled_tokens, remove_lora):
2759
+ model_kwargs = self._init_model_kwargs(num_tokens)
2760
+ if (self.supports_mm_inputs
2761
+ and not self.model_config.is_encoder_decoder):
2762
+ input_ids = None
2763
+ inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
2764
+ model_kwargs = {
2765
+ **model_kwargs,
2766
+ **self._dummy_mm_kwargs(num_reqs),
2767
+ }
2768
+ else:
2769
+ input_ids = self.input_ids.gpu[:num_tokens]
2770
+ inputs_embeds = None
2771
+
2772
+ if self.uses_mrope:
2773
+ positions = self.mrope_positions.gpu[:, :num_tokens]
2774
+ else:
2775
+ positions = self.positions.gpu[:num_tokens]
2776
+
2777
+ if get_pp_group().is_first_rank:
2778
+ intermediate_tensors = None
2779
+ else:
2780
+ if self.intermediate_tensors is None:
2781
+ self.intermediate_tensors = (
2782
+ self.model.make_empty_intermediate_tensors(
2783
+ batch_size=self.max_num_tokens,
2784
+ dtype=self.model_config.dtype,
2785
+ device=self.device))
2786
+
2787
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
2788
+ num_tokens, None, False)
2789
+ if cudagraph_runtime_mode == CUDAGraphMode.NONE:
2790
+ batch_descriptor = None
2791
+ else:
2792
+ # resolve the valid batch descriptor via the cudagraph dispatcher
2793
+ _cg_mode, batch_descriptor = \
2794
+ self.cudagraph_dispatcher.dispatch(
2795
+ BatchDescriptor(num_tokens=num_tokens,
2796
+ uniform_decode=uniform_decode))
2797
+ # sanity check
2798
+ assert cudagraph_runtime_mode == _cg_mode, (
2799
+ f"Cudagraph runtime mode mismatch at dummy_run. "
2800
+ f"Expected {_cg_mode}, but got {cudagraph_runtime_mode}.")
2801
+
2802
+ with self.maybe_randomize_inputs(input_ids), set_forward_context(
2803
+ attn_metadata,
2804
+ self.vllm_config,
2805
+ num_tokens=num_tokens,
2806
+ num_tokens_across_dp=num_tokens_across_dp,
2807
+ cudagraph_runtime_mode=cudagraph_runtime_mode,
2808
+ batch_descriptor=batch_descriptor):
2809
+ outputs = self.model(
2810
+ input_ids=input_ids,
2811
+ positions=positions,
2812
+ intermediate_tensors=intermediate_tensors,
2813
+ inputs_embeds=inputs_embeds,
2814
+ **model_kwargs,
2815
+ )
2816
+
2817
+ if self.use_aux_hidden_state_outputs:
2818
+ hidden_states, _ = outputs
2819
+ else:
2820
+ hidden_states = outputs
2821
+
2822
+ if self.speculative_config and self.speculative_config.use_eagle():
2823
+ assert isinstance(self.drafter, EagleProposer)
2824
+ self.drafter.dummy_run(num_tokens)
2825
+
2826
+ # This is necessary to avoid blocking DP.
2827
+ # For dummy runs, we typically skip EPLB since we don't have any real
2828
+ # requests to process.
2829
+ # However, in DP settings, there may be cases when some DP ranks do
2830
+ # not have any requests to process, so they're executing dummy batches.
2831
+ # In such cases, we still have to trigger EPLB to make sure
2832
+ # ranks execute the rearrangement in synchronization.
2833
+ if not skip_eplb:
2834
+ self.eplb_step(is_dummy=True, is_profile=is_profile)
2835
+
2836
+ logit_indices = np.cumsum(num_scheduled_tokens) - 1
2837
+ return hidden_states, hidden_states[logit_indices]
2838
+
2839
+ @torch.inference_mode()
2840
+ def _dummy_sampler_run(
2841
+ self,
2842
+ hidden_states: torch.Tensor,
2843
+ ) -> torch.Tensor:
2844
+ # The dummy hidden states may contain special values,
2845
+ # like `inf` or `nan`.
2846
+ # To avoid breaking the sampler, we use a random tensor here instead.
2847
+ hidden_states = torch.rand_like(hidden_states)
2848
+
2849
+ logits = self.model.compute_logits(hidden_states, None)
2850
+ num_reqs = logits.size(0)
2851
+
2852
+ dummy_tensors = lambda v: torch.full(
2853
+ (num_reqs, ), v, device=self.device)
2854
+
2855
+ dummy_metadata = SamplingMetadata(
2856
+ temperature=dummy_tensors(0.5),
2857
+ all_greedy=False,
2858
+ all_random=False,
2859
+ top_p=dummy_tensors(0.9),
2860
+ top_k=dummy_tensors(logits.size(1) - 1),
2861
+ generators={},
2862
+ max_num_logprobs=None,
2863
+ no_penalties=True,
2864
+ prompt_token_ids=None,
2865
+ frequency_penalties=dummy_tensors(0.1),
2866
+ presence_penalties=dummy_tensors(0.1),
2867
+ repetition_penalties=dummy_tensors(0.1),
2868
+ output_token_ids=[[] for _ in range(num_reqs)],
2869
+ allowed_token_ids_mask=None,
2870
+ bad_words_token_ids={},
2871
+ logitsprocs=LogitsProcessors(),
2872
+ )
2873
+ try:
2874
+ sampler_output = self.sampler(logits=logits,
2875
+ sampling_metadata=dummy_metadata)
2876
+ except RuntimeError as e:
2877
+ if 'out of memory' in str(e):
2878
+ raise RuntimeError(
2879
+ "CUDA out of memory occurred when warming up sampler with "
2880
+ f"{num_reqs} dummy requests. Please try lowering "
2881
+ "`max_num_seqs` or `gpu_memory_utilization` when "
2882
+ "initializing the engine.") from e
2883
+ else:
2884
+ raise e
2885
+ if self.speculative_config:
2886
+ draft_token_ids = [[0] for _ in range(num_reqs)]
2887
+ dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
2888
+ draft_token_ids, self.device)
2889
+
2890
+ num_tokens = sum(len(ids) for ids in draft_token_ids)
2891
+ # draft_probs = torch.randn(
2892
+ # num_tokens, logits.shape[-1], device=self.device,
2893
+ # dtype=logits.dtype)
2894
+ draft_probs = None
2895
+ target_logits = torch.randn(num_tokens,
2896
+ logits.shape[-1],
2897
+ device=self.device,
2898
+ dtype=logits.dtype)
2899
+ # NOTE(woosuk): Here, we should use int32 because the sampler uses
2900
+ # int32 for bonus_token_ids. If the dtype mismatches, re-compilation
2901
+ # will occur at runtime.
2902
+ bonus_token_ids = torch.zeros(num_reqs,
2903
+ device=self.device,
2904
+ dtype=torch.int32)
2905
+ self.rejection_sampler(
2906
+ dummy_spec_decode_metadata,
2907
+ draft_probs,
2908
+ target_logits,
2909
+ bonus_token_ids,
2910
+ dummy_metadata,
2911
+ )
2912
+ return sampler_output
2913
+
2914
+ def _dummy_pooler_run_task(
2915
+ self,
2916
+ hidden_states: torch.Tensor,
2917
+ task: PoolingTask,
2918
+ ) -> PoolerOutput:
2919
+ num_tokens = hidden_states.shape[0]
2920
+ max_num_reqs = self.scheduler_config.max_num_seqs
2921
+ num_reqs = min(num_tokens, max_num_reqs)
2922
+ min_tokens_per_req = num_tokens // num_reqs
2923
+ num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
2924
+ num_scheduled_tokens_list[-1] += num_tokens % num_reqs
2925
+ assert sum(num_scheduled_tokens_list) == num_tokens
2926
+ assert len(num_scheduled_tokens_list) == num_reqs
2927
+
2928
+ req_num_tokens = num_tokens // num_reqs
2929
+
2930
+ dummy_prompt_lens = torch.tensor(
2931
+ num_scheduled_tokens_list,
2932
+ device="cpu",
2933
+ )
2934
+ dummy_token_ids = torch.zeros((num_reqs, req_num_tokens),
2935
+ dtype=torch.int32,
2936
+ device=self.device)
2937
+
2938
+ model = cast(VllmModelForPooling, self.get_model())
2939
+ dummy_pooling_params = PoolingParams(task=task)
2940
+ to_update = model.pooler.get_pooling_updates(task)
2941
+ to_update.apply(dummy_pooling_params)
2942
+
2943
+ dummy_metadata = PoolingMetadata(
2944
+ prompt_lens=dummy_prompt_lens,
2945
+ prompt_token_ids=dummy_token_ids,
2946
+ pooling_params=[dummy_pooling_params] * num_reqs,
2947
+ )
2948
+
2949
+ dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list,
2950
+ device=hidden_states.device)
2951
+
2952
+ try:
2953
+ return model.pooler(hidden_states=hidden_states,
2954
+ pooling_metadata=dummy_metadata)
2955
+ except RuntimeError as e:
2956
+ if 'out of memory' in str(e):
2957
+ raise RuntimeError(
2958
+ "CUDA out of memory occurred when warming up pooler "
2959
+ f"({task=}) with {num_reqs} dummy requests. Please try "
2960
+ "lowering `max_num_seqs` or `gpu_memory_utilization` when "
2961
+ "initializing the engine.") from e
2962
+ else:
2963
+ raise e
2964
+
2965
+ @torch.inference_mode()
2966
+ def _dummy_pooler_run(
2967
+ self,
2968
+ hidden_states: torch.Tensor,
2969
+ ) -> PoolerOutput:
2970
+ # Find the task that has the largest output for subsequent steps
2971
+ output_size = dict[PoolingTask, float]()
2972
+ for task in self.get_supported_pooling_tasks():
2973
+ # Run a full batch with each task to ensure none of them OOMs
2974
+ output = self._dummy_pooler_run_task(hidden_states, task)
2975
+ output_size[task] = output.get_data_nbytes()
2976
+ del output # Allow GC
2977
+
2978
+ max_task = max(output_size.items(), key=lambda x: x[1])[0]
2979
+ return self._dummy_pooler_run_task(hidden_states, max_task)
2980
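The loop above probes every supported pooling task once (so none of them can OOM later) and then reruns only the task with the largest output. A tiny generic sketch of that pattern, with hypothetical names (run_largest, run):

from typing import Callable, Iterable, TypeVar

T = TypeVar("T")

def run_largest(tasks: Iterable[T], run: Callable[[T], bytes]) -> bytes:
    sizes = {task: len(run(task)) for task in tasks}  # probe every task once
    biggest = max(sizes, key=sizes.get)               # largest output footprint
    return run(biggest)                               # rerun and keep that one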
+
2981
+ def profile_run(self) -> None:
2982
+ # Profile with multimodal encoder & encoder cache.
2983
+ if self.supports_mm_inputs:
2984
+ if self.model_config.multimodal_config.skip_mm_profiling:
2985
+ logger.info(
2986
+ "Skipping memory profiling for multimodal encoder and "
2987
+ "encoder cache.")
2988
+ else:
2989
+ mm_budget = self.mm_budget
2990
+ assert mm_budget is not None
2991
+
2992
+ if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
2993
+ # NOTE: Currently model is profiled with a single non-text
2994
+ # modality with the max possible input tokens even when
2995
+ # it supports multiple.
2996
+ dummy_modality = mm_budget.get_modality_with_max_tokens()
2997
+ max_mm_items_per_batch = mm_budget \
2998
+ .max_items_per_batch_by_modality[dummy_modality]
2999
+
3000
+ logger.info(
3001
+ "Encoder cache will be initialized with a budget of "
3002
+ "%s tokens, and profiled with %s %s items of the "
3003
+ "maximum feature size.",
3004
+ encoder_budget,
3005
+ max_mm_items_per_batch,
3006
+ dummy_modality,
3007
+ )
3008
+
3009
+ # Create dummy batch of multimodal inputs.
3010
+ batched_dummy_mm_inputs = self._get_mm_dummy_batch(
3011
+ dummy_modality,
3012
+ max_mm_items_per_batch,
3013
+ )
3014
+
3015
+ # Run multimodal encoder.
3016
+ dummy_encoder_outputs = \
3017
+ self.model.get_multimodal_embeddings(
3018
+ **batched_dummy_mm_inputs)
3019
+
3020
+ sanity_check_mm_encoder_outputs(
3021
+ dummy_encoder_outputs,
3022
+ expected_num_items=max_mm_items_per_batch,
3023
+ )
3024
+
3025
+ # Cache the dummy encoder outputs.
3026
+ self.encoder_cache["tmp"] = dict(
3027
+ enumerate(dummy_encoder_outputs))
3028
+
3029
+ # Add `is_profile` here to pre-allocate communication buffers
3030
+ hidden_states, last_hidden_states \
3031
+ = self._dummy_run(self.max_num_tokens, is_profile=True)
3032
+ if get_pp_group().is_last_rank:
3033
+ if self.is_pooling_model:
3034
+ output = self._dummy_pooler_run(hidden_states)
3035
+ else:
3036
+ output = self._dummy_sampler_run(last_hidden_states)
3037
+ else:
3038
+ output = None
3039
+ self._sync_device()
3040
+ del hidden_states, output
3041
+ self.encoder_cache.clear()
3042
+ gc.collect()
3043
+
3044
+ def capture_model(self) -> int:
3045
+ if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
3046
+ logger.warning(
3047
+ "Skipping CUDA graph capture. To turn on CUDA graph capture, "
3048
+ "ensure `cudagraph_mode` was not manually set to `NONE`")
3049
+ return 0
3050
+ else:
3051
+ self.initialize_cudagraph_capture()
3052
+
3053
+ compilation_counter.num_gpu_runner_capture_triggers += 1
3054
+
3055
+ start_time = time.perf_counter()
3056
+ start_free_gpu_memory = torch.cuda.mem_get_info()[0]
3057
+
3058
+ @contextmanager
3059
+ def freeze_gc():
3060
+ # Optimize garbage collection during CUDA graph capture.
3061
+ # Clean up, then freeze all remaining objects from being included
3062
+ # in future collections.
3063
+ gc.collect()
3064
+ should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC
3065
+ if should_freeze:
3066
+ gc.freeze()
3067
+ try:
3068
+ yield
3069
+ finally:
3070
+ if should_freeze:
3071
+ gc.unfreeze()
3072
+ gc.collect()
3073
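freeze_gc() above is a small optimization: collect once, then freeze the surviving objects so garbage collections triggered during graph capture do not re-scan long-lived state. A standalone sketch of the same pattern (not vLLM code, without the env-var gating) looks like:

import gc
from contextlib import contextmanager

@contextmanager
def frozen_gc():
    gc.collect()      # clean up existing garbage first
    gc.freeze()       # move survivors to the permanent generation
    try:
        yield
    finally:
        gc.unfreeze() # make those objects collectable again
        gc.collect()

# with frozen_gc():
#     capture_graphs()  # hypothetical latency-sensitive work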
+
3074
+ # Trigger CUDA graph capture for specific shapes.
3075
+ # Capture the large shapes first so that the smaller shapes
3076
+ # can reuse the memory pool allocated for the large shapes.
3077
+ set_cudagraph_capturing_enabled(True)
3078
+ with freeze_gc(), graph_capture(device=self.device):
3079
+ cudagraph_mode = self.compilation_config.cudagraph_mode
3080
+ if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
3081
+ cudagraph_runtime_mode = cudagraph_mode.mixed_mode()
3082
+
3083
+ compilation_cases = list(reversed(self.cudagraph_batch_sizes))
3084
+ self._capture_cudagraphs(
3085
+ compilation_cases,
3086
+ cudagraph_runtime_mode=cudagraph_runtime_mode,
3087
+ uniform_decode=False)
3088
+
3089
+ # Capture full cudagraphs for uniform decode batches if we
3090
+ # don't already have full mixed prefill-decode cudagraphs
3091
+ if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL and \
3092
+ cudagraph_mode.separate_routine():
3093
+ max_num_tokens = self.scheduler_config.max_num_seqs * \
3094
+ self.uniform_decode_query_len
3095
+ decode_cudagraph_batch_sizes = [
3096
+ x for x in self.cudagraph_batch_sizes if
3097
+ x <= max_num_tokens and x >= self.uniform_decode_query_len
3098
+ ]
3099
+ compilation_cases_decode = list(
3100
+ reversed(decode_cudagraph_batch_sizes))
3101
+ self._capture_cudagraphs(
3102
+ compilation_cases=compilation_cases_decode,
3103
+ cudagraph_runtime_mode=CUDAGraphMode.FULL,
3104
+ uniform_decode=True)
3105
+
3106
+ # Disable cudagraph capturing globally, so any unexpected cudagraph
3107
+ # capturing will be detected and raise an error after here.
3108
+ # Note: We don't put it into the graph_capture context manager because
3109
+ # we may do lazy capturing in the future that still allows capturing
3110
+ # after this point.
3111
+ set_cudagraph_capturing_enabled(False)
3112
+
3113
+ end_time = time.perf_counter()
3114
+ end_free_gpu_memory = torch.cuda.mem_get_info()[0]
3115
+ elapsed_time = end_time - start_time
3116
+ cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
3117
+ # This usually takes 5~20 seconds.
3118
+ logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
3119
+ elapsed_time, cuda_graph_size / (1 << 30))
3120
+ return cuda_graph_size
3121
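capture_model() above captures mixed-batch graphs from the largest size down (so smaller captures can reuse the memory pool) and, when a separate decode routine is used, restricts decode-only captures to sizes compatible with the uniform decode query length. A small illustrative sketch of that size planning, with a hypothetical helper name:

def plan_capture_sizes(batch_sizes, max_num_seqs, uniform_decode_query_len):
    mixed = sorted(batch_sizes, reverse=True)  # large shapes first
    max_decode_tokens = max_num_seqs * uniform_decode_query_len
    decode = sorted(
        (x for x in batch_sizes
         if uniform_decode_query_len <= x <= max_decode_tokens),
        reverse=True)
    return mixed, decode

# plan_capture_sizes([1, 2, 4, 8, 16], max_num_seqs=4,
#                    uniform_decode_query_len=2)
# -> ([16, 8, 4, 2, 1], [8, 4, 2])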
+
3122
+ def _capture_cudagraphs(self, compilation_cases: list[int],
3123
+ cudagraph_runtime_mode: CUDAGraphMode,
3124
+ uniform_decode: bool):
3125
+ assert cudagraph_runtime_mode != CUDAGraphMode.NONE and \
3126
+ cudagraph_runtime_mode in [CUDAGraphMode.FULL,
3127
+ CUDAGraphMode.PIECEWISE]
3128
+
3129
+ # Only rank 0 should print progress bar during capture
3130
+ if is_global_first_rank():
3131
+ compilation_cases = tqdm(
3132
+ compilation_cases,
3133
+ disable=not self.load_config.use_tqdm_on_load,
3134
+ desc="Capturing CUDA graphs ({}, {})".format(
3135
+ "decode" if uniform_decode else "mixed prefill-decode",
3136
+ cudagraph_runtime_mode.name))
3137
+ # We skip EPLB here since we don't want to record dummy metrics
3138
+ for num_tokens in compilation_cases:
3139
+ for _ in range(self.compilation_config.cudagraph_num_of_warmups):
3140
+ # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
3141
+ # But be careful, warm up with `NONE`is orthogonal to
3142
+ # if we want to warm up attention or not. This is
3143
+ # different from the case where `FULL` implies capture
3144
+ # attention while `PIECEWISE` implies no attention.
3145
+ force_attention = (
3146
+ cudagraph_runtime_mode == CUDAGraphMode.FULL)
3147
+ self._dummy_run(num_tokens,
3148
+ cudagraph_runtime_mode=CUDAGraphMode.NONE,
3149
+ force_attention=force_attention,
3150
+ uniform_decode=uniform_decode,
3151
+ skip_eplb=True,
3152
+ remove_lora=False)
3153
+ self._dummy_run(num_tokens,
3154
+ cudagraph_runtime_mode=cudagraph_runtime_mode,
3155
+ uniform_decode=uniform_decode,
3156
+ skip_eplb=True,
3157
+ remove_lora=False)
3158
+ self.maybe_remove_all_loras(self.lora_config)
3159
+
3160
+ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
3161
+ """
3162
+ Initialize the attention backends and attention metadata builders.
3163
+ """
3164
+ assert len(self.attn_groups) == 0, \
3165
+ "Attention backends are already initialized"
3166
+
3167
+ def get_attn_backends_for_layers(
3168
+ layer_names: list[str]
3169
+ ) -> dict[type[AttentionBackend], list[str]]:
3170
+ layers = get_layers_from_vllm_config(self.vllm_config,
3171
+ AttentionLayerBase,
3172
+ layer_names)
3173
+ attn_backends = {}
3174
+ attn_backend_layers = defaultdict(list)
3175
+ # Dedupe based on the full class name; this is a bit safer than
3176
+ # using the class itself as the key because, when we create dynamic
3177
+ # attention backend subclasses (e.g. ChunkedLocalAttention), there
3178
+ # will be a different object per layer unless the subclasses are
3179
+ # cached correctly.
3180
+ for layer_name in layer_names:
3181
+ attn_backend = layers[layer_name].get_attn_backend()
3182
+
3183
+ if layer_name in self.kv_sharing_fast_prefill_eligible_layers:
3184
+ attn_backend = create_fast_prefill_custom_backend(
3185
+ "FastPrefill",
3186
+ attn_backend,
3187
+ )
3188
+
3189
+ key = attn_backend.full_cls_name()
3190
+ attn_backends[key] = attn_backend
3191
+ attn_backend_layers[key].append(layer_name)
3192
+ return {
3193
+ attn_backends[k]: v
3194
+ for k, v in attn_backend_layers.items()
3195
+ }
3196
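get_attn_backends_for_layers() above groups layers by a stable string key rather than by the backend class object itself. A generic, standalone sketch of that dedupe-by-name grouping (with type(obj).__qualname__ standing in for full_cls_name()):

from collections import defaultdict

def group_by_class_name(items: dict[str, object]) -> dict[object, list[str]]:
    canonical: dict[str, object] = {}
    members: dict[str, list[str]] = defaultdict(list)
    for name, obj in items.items():
        key = type(obj).__qualname__    # stable key even for dynamic subclasses
        canonical.setdefault(key, obj)  # keep one representative per key
        members[key].append(name)
    return {canonical[k]: v for k, v in members.items()}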
+
3197
+ def create_attn_groups(
3198
+ attn_backends_map: dict[AttentionBackend, list[str]],
3199
+ kv_cache_spec: KVCacheSpec,
3200
+ ) -> list[AttentionGroup]:
3201
+ attn_groups: list[AttentionGroup] = []
3202
+ for attn_backend, layer_names in attn_backends_map.items():
3203
+ attn_metadata_builder_i = attn_backend.get_builder_cls()(
3204
+ kv_cache_spec,
3205
+ layer_names,
3206
+ self.vllm_config,
3207
+ self.device,
3208
+ )
3209
+ attn_group = AttentionGroup(attn_backend,
3210
+ attn_metadata_builder_i,
3211
+ layer_names)
3212
+ attn_groups.append(attn_group)
3213
+ return attn_groups
3214
+
3215
+ for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
3216
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
3217
+ attn_backends = get_attn_backends_for_layers(
3218
+ kv_cache_group_spec.layer_names)
3219
+ self.attn_groups.append(
3220
+ create_attn_groups(attn_backends, kv_cache_spec))
3221
+
3222
+ # Calculate reorder batch threshold (if needed)
3223
+ self.calculate_reorder_batch_threshold()
3224
+
3225
+ def initialize_cudagraph_capture(self) -> None:
3226
+ min_cg_support = AttentionCGSupport.ALWAYS
3227
+ min_cg_builder_name = None
3228
+
3229
+ for attn_group in self._attn_group_iterator():
3230
+ builder = attn_group.metadata_builder
3231
+ if builder.cudagraph_support.value < min_cg_support.value:
3232
+ min_cg_support = builder.cudagraph_support
3233
+ min_cg_builder_name = builder.__class__.__name__
3234
+
3235
+ # Flexibly resolve the cudagraph mode
3236
+ cudagraph_mode = self.compilation_config.cudagraph_mode
3237
+ # check that full cudagraph for mixed batches is supported
3238
+ if cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL \
3239
+ and min_cg_support != AttentionCGSupport.ALWAYS:
3240
+ msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
3241
+ f"with {min_cg_builder_name} backend (support: "
3242
+ f"{min_cg_support})")
3243
+ if min_cg_support == AttentionCGSupport.NEVER:
3244
+ # if full cudagraphs are not supported at all, just raise
3245
+ msg += "; please try cudagraph_mode=PIECEWISE, and "\
3246
+ "make sure compilation level is piecewise"
3247
+ raise ValueError(msg)
3248
+
3249
+ # attempt to resolve the full cudagraph related mode
3250
+ if self.compilation_config.splitting_ops_contain_attention():
3251
+ msg += "; setting cudagraph_mode=FULL_AND_PIECEWISE"
3252
+ cudagraph_mode = self.compilation_config.cudagraph_mode = \
3253
+ CUDAGraphMode.FULL_AND_PIECEWISE
3254
+ else:
3255
+ msg += "; setting cudagraph_mode=FULL_DECODE_ONLY"
3256
+ cudagraph_mode = self.compilation_config.cudagraph_mode = \
3257
+ CUDAGraphMode.FULL_DECODE_ONLY
3258
+ logger.warning(msg)
3259
+
3260
+ # check that if we are doing spec-decode + decode full-cudagraphs it is
3261
+ # supported
3262
+ if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
3263
+ and self.uniform_decode_query_len > 1 and min_cg_support.value
3264
+ < AttentionCGSupport.UNIFORM_BATCH.value):
3265
+ msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported"
3266
+ f" with spec-decode for attention backend "
3267
+ f"{min_cg_builder_name} (support: {min_cg_support})")
3268
+ if self.compilation_config.splitting_ops_contain_attention():
3269
+ msg += "; setting cudagraph_mode=PIECEWISE"
3270
+ cudagraph_mode = self.compilation_config.cudagraph_mode = \
3271
+ CUDAGraphMode.PIECEWISE
3272
+ else:
3273
+ msg += "; setting cudagraph_mode=NONE"
3274
+ cudagraph_mode = self.compilation_config.cudagraph_mode = \
3275
+ CUDAGraphMode.NONE
3276
+ logger.warning(msg)
3277
+
3278
+ # double check that we can support full cudagraph if they are requested
3279
+ # even after automatic downgrades
3280
+ if cudagraph_mode.has_full_cudagraphs() \
3281
+ and min_cg_support == AttentionCGSupport.NEVER:
3282
+ raise ValueError(f"CUDAGraphMode.{cudagraph_mode.name} is not "
3283
+ f"supported with {min_cg_builder_name} backend ("
3284
+ f"support:{min_cg_support}) "
3285
+ "; please try cudagraph_mode=PIECEWISE, "
3286
+ "and make sure compilation level is piecewise")
3287
+
3288
+ # Trigger cudagraph dispatching keys initialization here (after
3289
+ # initializing attn backends).
3290
+ self.cudagraph_dispatcher.initialize_cudagraph_keys(
3291
+ self.compilation_config.cudagraph_mode,
3292
+ self.uniform_decode_query_len)
3293
+
3294
+ def calculate_reorder_batch_threshold(self) -> None:
3295
+ """
3296
+ Check that, if any backends reorder batches, the reordering
3297
+ is compatible (e.g., the decode threshold is the same).
3298
+ """
3299
+ for group in self._attn_group_iterator():
3300
+ attn_metadata_builder_i = group.metadata_builder
3301
+
3302
+ # check that, if any backends reorder batches, the reordering
3303
+ # is compatible (e.g., the decode threshold is the same)
3304
+ reorder_batch_threshold_i = (
3305
+ attn_metadata_builder_i.reorder_batch_threshold)
3306
+ if reorder_batch_threshold_i is not None:
3307
+ if self.reorder_batch_threshold is not None:
3308
+ if reorder_batch_threshold_i != \
3309
+ self.reorder_batch_threshold:
3310
+ raise ValueError(
3311
+ f"Attention backend reorders decodes with "
3312
+ f"threshold {reorder_batch_threshold_i} but other "
3313
+ f"backend uses threshold "
3314
+ f"{self.reorder_batch_threshold}")
3315
+ else:
3316
+ self.reorder_batch_threshold = reorder_batch_threshold_i
3317
+
3318
+ def may_reinitialize_input_batch(self,
3319
+ kv_cache_config: KVCacheConfig) -> None:
3320
+ """
3321
+ Re-initialize the input batch if the block sizes are different from
3322
+ `[self.cache_config.block_size]`. This usually happens when there
3323
+ are multiple KV cache groups.
3324
+
3325
+ Args:
3326
+ kv_cache_config: The KV cache configuration.
3327
+ """
3328
+ block_sizes = [
3329
+ kv_cache_group.kv_cache_spec.block_size
3330
+ for kv_cache_group in kv_cache_config.kv_cache_groups
3331
+ ]
3332
+ if block_sizes != [self.cache_config.block_size]:
3333
+ assert self.cache_config.cpu_offload_gb == 0, (
3334
+ "Cannot re-initialize the input batch when CPU weight "
3335
+ "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
3336
+ "for more details.")
3337
+ self.input_batch = InputBatch(
3338
+ max_num_reqs=self.max_num_reqs,
3339
+ max_model_len=max(self.max_model_len, self.max_encoder_len),
3340
+ max_num_batched_tokens=self.max_num_tokens,
3341
+ device=self.device,
3342
+ pin_memory=self.pin_memory,
3343
+ vocab_size=self.model_config.get_vocab_size(),
3344
+ block_sizes=block_sizes,
3345
+ is_spec_decode=bool(self.vllm_config.speculative_config),
3346
+ logitsprocs=self.input_batch.logitsprocs,
3347
+ is_pooling_model=self.is_pooling_model,
3348
+ num_speculative_tokens=(
3349
+ self.vllm_config.speculative_config.num_speculative_tokens
3350
+ if self.vllm_config.speculative_config else 0),
3351
+ )
3352
+
3353
+ def _allocate_kv_cache_tensors(
3354
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
3355
+ """
3356
+ Initializes the KV cache buffer with the correct size. The buffer needs
3357
+ to be reshaped to the desired shape before being used by the models.
3358
+
3359
+ Args:
3360
+ kv_cache_config: The KV cache config
3361
+ Returns:
3362
+ dict[str, torch.Tensor]: A map between layer names to their
3363
+ corresponding memory buffer for KV cache.
3364
+ """
3365
+ kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
3366
+ for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
3367
+ tensor = torch.zeros(kv_cache_tensor.size,
3368
+ dtype=torch.int8,
3369
+ device=self.device)
3370
+ for layer_name in kv_cache_tensor.shared_by:
3371
+ kv_cache_raw_tensors[layer_name] = tensor
3372
+
3373
+ layer_names = set()
3374
+ for group in kv_cache_config.kv_cache_groups:
3375
+ for layer_name in group.layer_names:
3376
+ if layer_name in self.runner_only_attn_layers:
3377
+ continue
3378
+ layer_names.add(layer_name)
3379
+ assert layer_names == set(kv_cache_raw_tensors.keys(
3380
+ )), "Some layers are not correctly initialized"
3381
+ return kv_cache_raw_tensors
3382
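_allocate_kv_cache_tensors() above allocates one flat int8 buffer per KV-cache tensor spec and maps it to every layer name that shares it, so sharing layers alias the same storage. A minimal PyTorch sketch of that idea (hypothetical helper, CPU device for simplicity):

import torch

def allocate_shared_buffers(specs, device="cpu"):
    # specs: list of (size_in_bytes, [layer names that share the buffer])
    buffers = {}
    for size_bytes, shared_by in specs:
        raw = torch.zeros(size_bytes, dtype=torch.int8, device=device)
        for layer_name in shared_by:
            buffers[layer_name] = raw  # all names alias one buffer
    return buffers

caches = allocate_shared_buffers([(1024, ["layer.0", "layer.1"])])
assert caches["layer.0"].data_ptr() == caches["layer.1"].data_ptr()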
+
3383
+ def _attn_group_iterator(self) -> Iterator[AttentionGroup]:
3384
+ return itertools.chain.from_iterable(self.attn_groups)
3385
+
3386
+ def _kv_cache_spec_attn_group_iterator(
3387
+ self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]:
3388
+ if not self.kv_cache_config.kv_cache_groups:
3389
+ return
3390
+ for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups):
3391
+ for attn_group in attn_groups:
3392
+ yield self.kv_cache_config.kv_cache_groups[
3393
+ kv_cache_spec_id].kv_cache_spec, attn_group
3394
+
3395
+ def _reshape_kv_cache_tensors(
3396
+ self,
3397
+ kv_cache_config: KVCacheConfig,
3398
+ kv_cache_raw_tensors: dict[str, torch.Tensor],
3399
+ ) -> dict[str, torch.Tensor]:
3400
+ """
3401
+ Reshape the KV cache tensors to the desired shape and dtype.
3402
+
3403
+ Args:
3404
+ kv_cache_config: The KV cache config
3405
+ kv_cache_raw_tensors: The KV cache buffer of each layer, with
3406
+ the correct size but not yet reshaped to its final layout.
3407
+ Returns:
3408
+ Dict[str, torch.Tensor]: A map between layer names to their
3409
+ corresponding memory buffer for KV cache.
3410
+ """
3411
+ kv_caches: dict[str, torch.Tensor] = {}
3412
+ has_attn, has_mamba = False, False
3413
+ for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
3414
+ attn_backend = group.backend
3415
+ for layer_name in group.layer_names:
3416
+ if layer_name in self.runner_only_attn_layers:
3417
+ continue
3418
+ raw_tensor = kv_cache_raw_tensors[layer_name]
3419
+ assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
3420
+ num_blocks = (raw_tensor.numel() //
3421
+ kv_cache_spec.page_size_bytes)
3422
+ if isinstance(kv_cache_spec, AttentionSpec):
3423
+ has_attn = True
3424
+ kv_cache_shape = attn_backend.get_kv_cache_shape(
3425
+ num_blocks, kv_cache_spec.block_size,
3426
+ kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
3427
+ dtype = kv_cache_spec.dtype
3428
+ try:
3429
+ kv_cache_stride_order = \
3430
+ attn_backend.get_kv_cache_stride_order()
3431
+ assert len(kv_cache_stride_order) == len(
3432
+ kv_cache_shape)
3433
+ except (AttributeError, NotImplementedError):
3434
+ kv_cache_stride_order = tuple(
3435
+ range(len(kv_cache_shape)))
3436
+ # The allocation respects the backend-defined stride order
3437
+ # to ensure the semantic remains consistent for each
3438
+ # backend. We first obtain the generic kv cache shape and
3439
+ # then permute it according to the stride order which could
3440
+ # result in a non-contiguous tensor.
3441
+ kv_cache_shape = tuple(kv_cache_shape[i]
3442
+ for i in kv_cache_stride_order)
3443
+ # Maintain original KV shape view.
3444
+ inv_order = [
3445
+ kv_cache_stride_order.index(i)
3446
+ for i in range(len(kv_cache_stride_order))
3447
+ ]
3448
+ kv_caches[layer_name] = kv_cache_raw_tensors[
3449
+ layer_name].view(dtype).view(kv_cache_shape).permute(
3450
+ *inv_order)
3451
+ elif isinstance(kv_cache_spec, MambaSpec):
3452
+ has_mamba = True
3453
+ raw_tensor = kv_cache_raw_tensors[layer_name]
3454
+ state_tensors = []
3455
+ storage_offset_bytes = 0
3456
+ for (shape, dtype) in zip(kv_cache_spec.shapes,
3457
+ kv_cache_spec.dtypes):
3458
+ dtype_size = get_dtype_size(dtype)
3459
+ num_element_per_page = (
3460
+ kv_cache_spec.page_size_bytes // dtype_size)
3461
+ target_shape = (num_blocks, *shape)
3462
+ stride = torch.empty(target_shape).stride()
3463
+ target_stride = (num_element_per_page, *stride[1:])
3464
+ assert storage_offset_bytes % dtype_size == 0
3465
+ tensor = torch.as_strided(
3466
+ raw_tensor.view(dtype),
3467
+ size=target_shape,
3468
+ stride=target_stride,
3469
+ storage_offset=storage_offset_bytes // dtype_size,
3470
+ )
3471
+ state_tensors.append(tensor)
3472
+ storage_offset_bytes += stride[0] * dtype_size
3473
+
3474
+ kv_caches[layer_name] = state_tensors
3475
+ else:
3476
+ raise NotImplementedError
3477
+
3478
+ if has_attn and has_mamba:
3479
+ self._update_hybrid_attention_mamba_layout(kv_caches)
3480
+
3481
+ return kv_caches
3482
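The attention branch of _reshape_kv_cache_tensors() allocates in the backend's preferred physical stride order and then applies the inverse permutation so callers still see the generic logical shape. A minimal PyTorch sketch of just that trick (not vLLM code):

import torch

def view_with_stride_order(raw, logical_shape, stride_order):
    # Physical layout follows the backend's stride order...
    physical_shape = tuple(logical_shape[i] for i in stride_order)
    # ...and the inverse permutation restores the logical view.
    inv = [stride_order.index(i) for i in range(len(stride_order))]
    return raw.view(physical_shape).permute(*inv)

raw = torch.zeros(2 * 3 * 4)
t = view_with_stride_order(raw, (2, 3, 4), (2, 0, 1))
assert t.shape == (2, 3, 4) and not t.is_contiguous()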
+
3483
+ def _update_hybrid_attention_mamba_layout(
3484
+ self, kv_caches: dict[str, torch.Tensor]) -> None:
3485
+ """
3486
+ Update the layout of attention layers from (2, num_blocks, ...) to
3487
+ (num_blocks, 2, ...).
3488
+
3489
+ Args:
3490
+ kv_caches: The KV cache buffer of each layer.
3491
+ """
3492
+
3493
+ for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
3494
+ for layer_name in group.layer_names:
3495
+ kv_cache = kv_caches[layer_name]
3496
+ if (isinstance(kv_cache_spec, AttentionSpec)
3497
+ and kv_cache.shape[0] == 2):
3498
+ assert kv_cache.shape[1] != 2, \
3499
+ "Fail to determine whether the layout is " \
3500
+ "(2, num_blocks, ...) or (num_blocks, 2, ...) for " \
3501
+ f"a tensor of shape {kv_cache.shape}"
3502
+ hidden_size = kv_cache.shape[2:].numel()
3503
+ kv_cache.as_strided_(size=kv_cache.shape,
3504
+ stride=(hidden_size, 2 * hidden_size,
3505
+ *kv_cache.stride()[2:]))
3506
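_update_hybrid_attention_mamba_layout() keeps the logical (2, num_blocks, ...) shape but re-strides the buffer in place so each block's K and V become adjacent in memory, i.e. the physical layout turns into (num_blocks, 2, ...). A small PyTorch sketch of that as_strided_ trick (not vLLM code):

import torch

num_blocks, hidden = 4, 8
buf = torch.arange(2 * num_blocks * hidden, dtype=torch.float32)
kv = buf.view(2, num_blocks, hidden)

# Dim 0 (K/V) now steps by `hidden` and dim 1 (block) by `2 * hidden`,
# so each block's K and V are contiguous neighbours in the same buffer.
kv.as_strided_(size=(2, num_blocks, hidden), stride=(hidden, 2 * hidden, 1))

assert kv[0, 1].data_ptr() + hidden * kv.element_size() == kv[1, 1].data_ptr()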
+
3507
+ def initialize_kv_cache_tensors(
3508
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
3509
+ """
3510
+ Initialize the memory buffer for KV cache.
3511
+
3512
+ Args:
3513
+ kv_cache_config: The KV cache config
3514
+ Returns:
3515
+ Dict[str, torch.Tensor]: A map between layer names to their
3516
+ corresponding memory buffer for KV cache.
3517
+ """
3518
+ # Initialize the memory buffer for KV cache
3519
+ kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
3520
+ # Change the memory buffer to the desired shape
3521
+ kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
3522
+ kv_cache_raw_tensors)
3523
+
3524
+ # Set up cross-layer KV cache sharing
3525
+ for layer_name, target_layer_name in self.shared_kv_cache_layers.items(
3526
+ ):
3527
+ logger.debug("%s reuses KV cache of %s", layer_name,
3528
+ target_layer_name)
3529
+ kv_caches[layer_name] = kv_caches[target_layer_name]
3530
+
3531
+ bind_kv_cache(kv_caches,
3532
+ self.compilation_config.static_forward_context,
3533
+ self.kv_caches)
3534
+ return kv_caches
3535
+
3536
+ def maybe_add_kv_sharing_layers_to_kv_cache_groups(
3537
+ self, kv_cache_config: KVCacheConfig) -> None:
3538
+ """
3539
+ Add layers that re-use KV cache to KV cache group of its target layer.
3540
+ Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()`
3541
+ """
3542
+ if not self.shared_kv_cache_layers:
3543
+ # No cross-layer KV sharing, return
3544
+ return
3545
+
3546
+ add_kv_sharing_layers_to_kv_cache_groups(
3547
+ self.shared_kv_cache_layers,
3548
+ kv_cache_config.kv_cache_groups,
3549
+ self.runner_only_attn_layers,
3550
+ )
3551
+
3552
+ if self.cache_config.kv_sharing_fast_prefill:
3553
+ # In You Only Cache Once (https://arxiv.org/abs/2405.05254) or other
3554
+ # similar KV sharing setups, only the layers that generate KV caches
3555
+ # are involved in the prefill phase, enabling prefill to exit early.
3556
+ attn_layers = get_layers_from_vllm_config(self.vllm_config,
3557
+ Attention)
3558
+ for layer_name in reversed(attn_layers):
3559
+ if layer_name in self.shared_kv_cache_layers:
3560
+ self.kv_sharing_fast_prefill_eligible_layers.add(
3561
+ layer_name)
3562
+ else:
3563
+ break
3564
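The reversed loop above collects only the trailing run of layers that reuse another layer's KV cache; the first non-sharing layer (scanning from the end) stops the scan. A standalone sketch of that suffix scan, with hypothetical names:

def trailing_kv_sharing_layers(ordered_layers, shared):
    eligible = set()
    for name in reversed(ordered_layers):
        if name in shared:
            eligible.add(name)  # still inside the trailing sharing run
        else:
            break               # first non-sharing layer ends the run
    return eligible

# trailing_kv_sharing_layers(["l0", "l1", "l2", "l3"], {"l1", "l2", "l3"})
# -> {"l1", "l2", "l3"}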
+
3565
+ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
3566
+ """
3567
+ Initialize KV cache based on `kv_cache_config`.
3568
+ Args:
3569
+ kv_cache_config: Configuration for the KV cache, including the KV
3570
+ cache size of each layer
3571
+ """
3572
+ kv_cache_config = deepcopy(kv_cache_config)
3573
+ self.kv_cache_config = kv_cache_config
3574
+ self.may_reinitialize_input_batch(kv_cache_config)
3575
+ self.may_add_encoder_only_layers_to_kv_cache_config()
3576
+ self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
3577
+ self.initialize_attn_backend(kv_cache_config)
3578
+ kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
3579
+
3580
+ if self.speculative_config and self.speculative_config.use_eagle():
3581
+ assert isinstance(self.drafter, EagleProposer)
3582
+ # validate all draft model layers belong to the same kv cache
3583
+ # group
3584
+ self.drafter.validate_same_kv_cache_group(kv_cache_config)
3585
+
3586
+ if has_kv_transfer_group():
3587
+ get_kv_transfer_group().register_kv_caches(kv_caches)
3588
+ if self.device.type == 'xpu':
3589
+ get_kv_transfer_group().set_host_xfer_buffer_ops(
3590
+ copy_kv_blocks)
3591
+
3592
+ if self.dcp_world_size > 1:
3593
+ layer_names = self.attn_groups[0][0].layer_names
3594
+ layers = get_layers_from_vllm_config(self.vllm_config,
3595
+ AttentionLayerBase,
3596
+ layer_names)
3597
+ for layer in layers.values():
3598
+ assert layer.impl.need_to_return_lse_for_decode, (
3599
+ "DCP requires attention impls to return"
3600
+ " the softmax lse for decode, but the impl "
3601
+ f"{layer.impl.__class__.__name__} "
3602
+ "does not return the softmax lse for decode.")
3603
+
3604
+ def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
3605
+ """
3606
+ Add encoder-only layers to the KV cache config.
3607
+ """
3608
+ block_size = self.vllm_config.cache_config.block_size
3609
+ use_mla = self.vllm_config.model_config.use_mla
3610
+ encoder_only_attn_specs: dict[AttentionSpec,
3611
+ list[str]] = defaultdict(list)
3612
+ attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
3613
+ for layer_name, attn_module in attn_layers.items():
3614
+ if attn_module.attn_type == AttentionType.ENCODER_ONLY:
3615
+ attn_spec: AttentionSpec = EncoderOnlyAttentionSpec(
3616
+ block_size=block_size,
3617
+ num_kv_heads=attn_module.num_kv_heads,
3618
+ head_size=attn_module.head_size,
3619
+ dtype=self.kv_cache_dtype,
3620
+ use_mla=use_mla)
3621
+ encoder_only_attn_specs[attn_spec].append(layer_name)
3622
+ self.runner_only_attn_layers.add(layer_name)
3623
+ if len(encoder_only_attn_specs) > 0:
3624
+ assert len(
3625
+ encoder_only_attn_specs
3626
+ ) == 1, "Only support one encoder-only attention spec now"
3627
+ spec, layer_names = encoder_only_attn_specs.popitem()
3628
+ self.kv_cache_config.kv_cache_groups.append(
3629
+ KVCacheGroupSpec(layer_names=layer_names, kv_cache_spec=spec))
3630
+
3631
+ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
3632
+ """
3633
+ Generates the KVCacheSpec by parsing the kv cache format from each
3634
+ Attention module in the static forward context.
3635
+ Returns:
3636
+ KVCacheSpec: A dictionary mapping layer names to their KV cache
3637
+ format. Layers that do not need KV cache are not included.
3638
+ """
3639
+
3640
+ block_size = self.vllm_config.cache_config.block_size
3641
+ use_mla = self.vllm_config.model_config.use_mla
3642
+ kv_cache_spec: dict[str, KVCacheSpec] = {}
3643
+ attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
3644
+ for layer_name, attn_module in attn_layers.items():
3645
+ if (kv_tgt_layer :=
3646
+ attn_module.kv_sharing_target_layer_name) is not None:
3647
+ # The layer doesn't need its own KV cache and will use that of
3648
+ # the target layer. We skip creating a KVCacheSpec for it, so
3649
+ # that KV cache management logic will act as this layer does
3650
+ # not exist, and doesn't allocate KV cache for the layer. This
3651
+ # enables the memory savings of cross-layer KV sharing, allowing
3652
+ # a given amount of memory to accommodate longer context lengths
3653
+ # or to process more requests simultaneously.
3654
+ self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
3655
+ continue
3656
+
3657
+ # TODO(lucas): move the attention specs into the model layers like
3658
+ # the attention backends
3659
+ if attn_module.attn_type == AttentionType.DECODER:
3660
+ if attn_module.sliding_window is not None:
3661
+ kv_cache_spec[layer_name] = SlidingWindowSpec(
3662
+ block_size=block_size,
3663
+ num_kv_heads=attn_module.num_kv_heads,
3664
+ head_size=attn_module.head_size,
3665
+ dtype=self.kv_cache_dtype,
3666
+ sliding_window=attn_module.sliding_window,
3667
+ use_mla=use_mla)
3668
+ elif self.attention_chunk_size is not None \
3669
+ and isinstance(attn_module, ChunkedLocalAttention):
3670
+ kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec(
3671
+ block_size=block_size,
3672
+ num_kv_heads=attn_module.num_kv_heads,
3673
+ head_size=attn_module.head_size,
3674
+ dtype=self.kv_cache_dtype,
3675
+ attention_chunk_size=self.attention_chunk_size,
3676
+ use_mla=use_mla)
3677
+ else:
3678
+ kv_cache_spec[layer_name] = FullAttentionSpec(
3679
+ block_size=block_size,
3680
+ num_kv_heads=attn_module.num_kv_heads,
3681
+ head_size=attn_module.head_size,
3682
+ dtype=self.kv_cache_dtype,
3683
+ use_mla=use_mla)
3684
+ elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
3685
+ kv_cache_spec[layer_name] = CrossAttentionSpec(
3686
+ block_size=block_size,
3687
+ num_kv_heads=attn_module.num_kv_heads,
3688
+ head_size=attn_module.head_size,
3689
+ dtype=self.kv_cache_dtype,
3690
+ use_mla=use_mla)
3691
+ elif attn_module.attn_type in (AttentionType.ENCODER,
3692
+ AttentionType.ENCODER_ONLY):
3693
+ # encoder-only attention does not need KV cache.
3694
+ continue
3695
+ else:
3696
+ raise ValueError(
3697
+ f"Unknown attention type: {attn_module.attn_type}")
3698
+
3699
+ mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
3700
+ if len(mamba_layers) > 0:
3701
+ if (self.vllm_config.speculative_config is not None
3702
+ and self.vllm_config.model_config.hf_config.model_type
3703
+ not in ["qwen3_next"]):
3704
+ raise NotImplementedError(
3705
+ "Mamba with speculative decoding is not supported yet.")
3706
+ if self.vllm_config.cache_config.enable_prefix_caching:
3707
+ raise NotImplementedError(
3708
+ "Prefix caching is not supported for Mamba yet.")
3709
+ max_model_len = self.vllm_config.model_config.max_model_len
3710
+
3711
+ page_size_padded = (
3712
+ self.vllm_config.cache_config.mamba_page_size_padded)
3713
+
3714
+ # Set block_size to max_model_len, so that mamba model will always
3715
+ # have only one block in the KV cache.
3716
+ for layer_name, mamba_module in mamba_layers.items():
3717
+ kv_cache_spec[layer_name] = MambaSpec(
3718
+ shapes=mamba_module.get_state_shape(),
3719
+ dtypes=mamba_module.get_state_dtype(),
3720
+ block_size=max_model_len,
3721
+ page_size_padded=page_size_padded,
3722
+ mamba_type=mamba_module.mamba_type,
3723
+ num_speculative_blocks=(
3724
+ self.speculative_config.num_speculative_tokens
3725
+ if self.speculative_config else 0),
3726
+ )
3727
+
3728
+ return kv_cache_spec
3729
+
3730
+ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
3731
+ # This is a short term mitigation for issue mentioned in
3732
+ # https://github.com/vllm-project/vllm/issues/22754.
3733
+ # `tolist` would trigger a CUDA-wide stream sync, which
3734
+ # would block other copy ops from other cuda streams.
3735
+ # A cuda event sync would avoid such a situation. Since
3736
+ # this is in the critical path of every single model
3737
+ # forward loop, this has caused perf issue for a disagg
3738
+ # setup.
3739
+ pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
3740
+ pinned.copy_(sampled_token_ids, non_blocking=True)
3741
+ self.transfer_event.record()
3742
+ self.transfer_event.synchronize()
3743
+ return pinned.tolist()
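_to_list() above avoids a device-wide synchronization by copying into a pre-allocated pinned CPU buffer asynchronously and waiting on a CUDA event before calling .tolist() on the CPU copy. A standalone sketch of the same pattern (assumes a CUDA device; the names are hypothetical):

import torch

def gpu_ids_to_list(gpu_ids, pinned, event):
    n = gpu_ids.shape[0]
    pinned[:n].copy_(gpu_ids, non_blocking=True)  # async device-to-host copy
    event.record()        # record after the copy is enqueued
    event.synchronize()   # wait for this event only, not the whole device
    return pinned[:n].tolist()

# Setup, done once outside the hot loop:
# pinned = torch.empty((max_reqs, k), dtype=torch.int64, pin_memory=True)
# event = torch.cuda.Event()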