vllm-cpu-amxbf16 0.9.1 (cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
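For reference, a listing like the one below can be reproduced by inspecting the wheel archive directly. The following is a minimal Python sketch, assuming the wheel named in the title is downloadable from PyPI under that package name and version (it may instead be hosted on a custom index, in which case add the appropriate --index-url); a wheel is an ordinary zip archive, so its members can be enumerated with the standard library.

import subprocess
import zipfile
from pathlib import Path

# Download just the wheel (no dependencies) into the current directory.
# Assumption: "vllm-cpu-amxbf16" 0.9.1 is resolvable from the configured index.
subprocess.run(
    ["pip", "download", "vllm-cpu-amxbf16==0.9.1", "--no-deps", "-d", "."],
    check=True,
)

# A wheel is a zip archive; list its members and their uncompressed sizes.
wheel_path = next(Path(".").glob("vllm_cpu_amxbf16-0.9.1-*.whl"))
with zipfile.ZipFile(wheel_path) as whl:
    for info in sorted(whl.infolist(), key=lambda i: i.filename):
        print(f"{info.filename} ({info.file_size} bytes)")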
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2320 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import copy
5
+ import gc
6
+ import time
7
+ import weakref
8
+ from contextlib import contextmanager
9
+ from typing import TYPE_CHECKING, Any, Optional, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ import torch.distributed
14
+ import torch.nn as nn
15
+
16
+ import vllm.envs as envs
17
+ from vllm.attention import AttentionType, get_attn_backend
18
+ from vllm.attention.backends.abstract import (AttentionBackend,
19
+ AttentionMetadataBuilder)
20
+ from vllm.attention.layer import Attention
21
+ from vllm.attention.utils.fa_utils import get_flash_attn_version
22
+ from vllm.config import (CompilationLevel, VllmConfig,
23
+ get_layers_from_vllm_config)
24
+ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
25
+ has_kv_transfer_group)
26
+ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
27
+ from vllm.distributed.parallel_state import (
28
+ get_pp_group, get_tp_group, graph_capture,
29
+ prepare_communication_buffer_for_model)
30
+ from vllm.forward_context import (DPMetadata, get_forward_context,
31
+ set_forward_context)
32
+ from vllm.logger import init_logger
33
+ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
34
+ from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
35
+ from vllm.multimodal import MULTIMODAL_REGISTRY
36
+ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
37
+ from vllm.multimodal.utils import group_mm_inputs_by_modality
38
+ from vllm.sampling_params import SamplingType
39
+ from vllm.sequence import IntermediateTensors
40
+ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
41
+ GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
42
+ check_use_alibi, is_pin_memory_available)
43
+ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
44
+ from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
45
+ from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
46
+ KVCacheConfig, KVCacheSpec,
47
+ SlidingWindowSpec)
48
+ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
49
+ ModelRunnerOutput)
50
+ from vllm.v1.sample.metadata import SamplingMetadata
51
+ from vllm.v1.sample.rejection_sampler import RejectionSampler
52
+ from vllm.v1.sample.sampler import Sampler
53
+ from vllm.v1.spec_decode.eagle import EagleProposer
54
+ from vllm.v1.spec_decode.medusa import MedusaProposer
55
+ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
56
+ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
57
+ from vllm.v1.spec_decode.utils import is_spec_decode_supported
58
+ from vllm.v1.utils import bind_kv_cache
59
+ from vllm.v1.worker.block_table import BlockTable
60
+ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
61
+ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
62
+
63
+ from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,
64
+ sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
65
+
66
+ if TYPE_CHECKING:
67
+ import xgrammar as xgr
68
+
69
+ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
70
+ from vllm.v1.core.sched.output import SchedulerOutput
71
+ else:
72
+ xgr = LazyLoader("xgr", globals(), "xgrammar")
73
+
74
+ logger = init_logger(__name__)
75
+
76
+
77
+ class GPUModelRunner(LoRAModelRunnerMixin):
78
+
79
+ def __init__(
80
+ self,
81
+ vllm_config: VllmConfig,
82
+ device: torch.device,
83
+ ):
84
+ self.vllm_config = vllm_config
85
+ self.model_config = vllm_config.model_config
86
+ self.cache_config = vllm_config.cache_config
87
+ self.lora_config = vllm_config.lora_config
88
+ self.load_config = vllm_config.load_config
89
+ self.parallel_config = vllm_config.parallel_config
90
+ self.scheduler_config = vllm_config.scheduler_config
91
+ self.speculative_config = vllm_config.speculative_config
92
+ self.prompt_adapter_config = vllm_config.prompt_adapter_config
93
+ self.observability_config = vllm_config.observability_config
94
+
95
+ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
96
+ set_cpu_offload_max_bytes(
97
+ int(self.cache_config.cpu_offload_gb * 1024**3))
98
+
99
+ model_config = self.model_config
100
+ cache_config = self.cache_config
101
+ scheduler_config = self.scheduler_config
102
+ parallel_config = self.parallel_config
103
+ self.device = device
104
+ self.pin_memory = is_pin_memory_available()
105
+ self.dtype = self.model_config.dtype
106
+ if cache_config.cache_dtype == "auto":
107
+ self.kv_cache_dtype = self.dtype
108
+ else:
109
+ self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
110
+ cache_config.cache_dtype]
111
+
112
+ self.is_multimodal_model = model_config.is_multimodal_model
113
+ self.max_model_len = model_config.max_model_len
114
+ self.max_num_tokens = scheduler_config.max_num_batched_tokens
115
+ self.max_num_reqs = scheduler_config.max_num_seqs
116
+
117
+ # Model-related.
118
+ self.num_query_heads = model_config.get_num_attention_heads(
119
+ parallel_config)
120
+ self.hidden_size = model_config.get_hidden_size()
121
+ self.attention_chunk_size = model_config.attention_chunk_size
122
+
123
+ self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
124
+
125
+ # Multi-modal data support
126
+ self.mm_registry = MULTIMODAL_REGISTRY
127
+ self.uses_mrope = model_config.uses_mrope
128
+
129
+ encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
130
+ model_config=model_config,
131
+ scheduler_config=scheduler_config,
132
+ mm_registry=self.mm_registry,
133
+ )
134
+ self.max_num_encoder_input_tokens = encoder_compute_budget
135
+ self.encoder_cache_size = encoder_cache_size
136
+
137
+ # Sampler
138
+ self.sampler = Sampler()
139
+
140
+ # Lazy initializations
141
+ # self.model: nn.Module # Set after load_model
142
+ # Initialize in initialize_kv_cache
143
+ self.kv_caches: list[torch.Tensor] = []
144
+ self.attn_metadata_builders: list[AttentionMetadataBuilder] = []
145
+ self.attn_backends: list[type[AttentionBackend]] = []
146
+ # self.kv_cache_config: KVCacheConfig
147
+
148
+ # req_id -> (input_id -> encoder_output)
149
+ self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {}
150
+
151
+ self.use_aux_hidden_state_outputs = False
152
+ # Set up speculative decoding.
153
+ # NOTE(Jiayi): currently we put the entire draft model on
154
+ # the last PP rank. This is not ideal if there are many
155
+ # layers in the draft model.
156
+ if self.speculative_config and get_pp_group().is_last_rank:
157
+ if self.speculative_config.method == "ngram":
158
+ self.drafter = NgramProposer(self.vllm_config)
159
+ elif self.speculative_config.use_eagle():
160
+ self.drafter = EagleProposer(self.vllm_config, self.device,
161
+ self) # type: ignore
162
+ if self.speculative_config.method == "eagle3":
163
+ self.use_aux_hidden_state_outputs = True
164
+ elif self.speculative_config.method == "medusa":
165
+ self.drafter = MedusaProposer(
166
+ vllm_config=self.vllm_config,
167
+ device=self.device) # type: ignore
168
+ else:
169
+ raise ValueError("Unknown speculative decoding method: "
170
+ f"{self.speculative_config.method}")
171
+ self.rejection_sampler = RejectionSampler()
172
+
173
+ # Request states.
174
+ self.requests: dict[str, CachedRequestState] = {}
175
+
176
+ # Input Batch
177
+ # NOTE(Chen): Ideally, we should initialize the input batch inside
178
+ # `initialize_kv_cache` based on the kv cache config. However, as in
179
+ # https://github.com/vllm-project/vllm/pull/18298, due to some unknown
180
+ # reasons, we have to initialize the input batch before `load_model`,
181
+ # quantization + weight offloading will fail otherwise. As a temporary
182
+ # solution, we initialize the input batch here, and re-initialize it
183
+ # in `initialize_kv_cache` if the block_sizes here is different from
184
+ # the block_sizes in the kv cache config.
185
+ self.input_batch = InputBatch(
186
+ max_num_reqs=self.max_num_reqs,
187
+ max_model_len=self.max_model_len,
188
+ max_num_batched_tokens=self.max_num_tokens,
189
+ device=self.device,
190
+ pin_memory=self.pin_memory,
191
+ vocab_size=self.model_config.get_vocab_size(),
192
+ block_sizes=[self.cache_config.block_size],
193
+ )
194
+
195
+ self.use_cuda_graph = (self.vllm_config.compilation_config.level
196
+ == CompilationLevel.PIECEWISE
197
+ and not self.model_config.enforce_eager)
198
+ # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
199
+ # The convention is different.
200
+ # self.cudagraph_batch_sizes sorts in ascending order.
201
+ # The batch sizes in the config are in descending order.
202
+ self.cudagraph_batch_sizes = list(
203
+ reversed(
204
+ self.vllm_config.compilation_config.cudagraph_capture_sizes))
205
+
206
+ # Cache the device properties.
207
+ self._init_device_properties()
208
+
209
+ # Persistent buffers for CUDA graphs.
210
+ self.input_ids = torch.zeros(self.max_num_tokens,
211
+ dtype=torch.int32,
212
+ device=self.device)
213
+ self.positions = torch.zeros(self.max_num_tokens,
214
+ dtype=torch.int64,
215
+ device=self.device)
216
+ self.query_start_loc = torch.zeros(self.max_num_reqs + 1,
217
+ dtype=torch.int32,
218
+ device=self.device)
219
+ self.seq_lens = torch.zeros(self.max_num_reqs,
220
+ dtype=torch.int32,
221
+ device=self.device)
222
+ self.slot_mapping = torch.zeros(self.max_num_tokens,
223
+ dtype=torch.int64,
224
+ device=self.device)
225
+
226
+ # None in the first PP rank. The rest are set after load_model.
227
+ self.intermediate_tensors: Optional[IntermediateTensors] = None
228
+
229
+ # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
230
+ if self.uses_mrope:
231
+ # NOTE: `mrope_positions` is implemented with one additional dummy
232
+ # position on purpose to make it non-contiguous so that it can work
233
+ # with torch compile.
234
+ # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923
235
+
236
+ # NOTE: When M-RoPE is enabled, position ids are 3D regardless of
237
+ # the modality of inputs. For text-only inputs, each dimension has
238
+ # identical position IDs, making M-RoPE functionally equivalent to
239
+ # 1D-RoPE.
240
+ # See page 5 of https://arxiv.org/abs/2409.12191
241
+ self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1),
242
+ dtype=torch.int64,
243
+ device=self.device)
244
+ self.mrope_positions_cpu = torch.zeros(
245
+ (3, self.max_num_tokens + 1),
246
+ dtype=torch.int64,
247
+ device="cpu",
248
+ pin_memory=self.pin_memory)
249
+
250
+ # Only relevant for models using ALiBi (e.g, MPT)
251
+ self.use_alibi = check_use_alibi(model_config)
252
+
253
+ self.inputs_embeds = torch.zeros(
254
+ (self.max_num_tokens, self.hidden_size),
255
+ dtype=self.dtype,
256
+ device=self.device)
257
+
258
+ # OPTIMIZATION: Cache the tensors rather than creating them every step.
259
+ # Keep in int64 to avoid overflow with long context
260
+ self.arange_np = np.arange(max(self.max_num_reqs + 1,
261
+ self.max_model_len,
262
+ self.max_num_tokens),
263
+ dtype=np.int64)
264
+ # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
265
+ # a faster version of creating a new tensor every time. Thus, we should
266
+ # not make any assumptions about the values in these tensors.
267
+ self.input_ids_cpu = torch.zeros(self.max_num_tokens,
268
+ dtype=torch.int32,
269
+ device="cpu",
270
+ pin_memory=self.pin_memory)
271
+ self.positions_cpu = torch.zeros(self.max_num_tokens,
272
+ dtype=torch.int64,
273
+ device="cpu",
274
+ pin_memory=self.pin_memory)
275
+ self.positions_np = self.positions_cpu.numpy()
276
+ self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1,
277
+ dtype=torch.int32,
278
+ device="cpu",
279
+ pin_memory=self.pin_memory)
280
+ self.query_start_loc_np = self.query_start_loc_cpu.numpy()
281
+ self.seq_lens_cpu = torch.zeros(self.max_num_reqs,
282
+ dtype=torch.int32,
283
+ device="cpu",
284
+ pin_memory=self.pin_memory)
285
+ self.seq_lens_np = self.seq_lens_cpu.numpy()
286
+
287
+ # Layer pairings for cross-layer KV sharing.
288
+ # If an Attention layer `layer_name` is in the keys of this dict, it
289
+ # means this layer will perform attention using the keys and values
290
+ # from the KV cache of `shared_kv_cache_layers[layer_name]`.
291
+ self.shared_kv_cache_layers: dict[str, str] = {}
292
+
293
+ def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
294
+ """
295
+ Update the order of requests in the batch based on the attention
296
+ backend's needs. For example, some attention backends (namely MLA) may
297
+ want to separate requests based on if the attention computation will be
298
+ compute-bound or memory-bound.
299
+
300
+ Args:
301
+ scheduler_output: The scheduler output.
302
+
303
+ Returns:
304
+ True if the batch was reordered, False otherwise.
305
+ """
306
+ batch_reordered = self.attn_metadata_builders[0].reorder_batch(
307
+ self.input_batch, scheduler_output)
308
+
309
+ # For models with multiple KV cache groups, the groups should agree on
310
+ # the same order of requests. We ensure this by only allowing the first
311
+ # group to reorder the batch and asserting that all other groups do not
312
+ # reorder the batch.
313
+ for i in range(1, len(self.kv_cache_config.kv_cache_groups)):
314
+ assert not self.attn_metadata_builders[i].reorder_batch(
315
+ self.input_batch, scheduler_output)
316
+ return batch_reordered
317
+
318
+ # Note: used for model runner override.
319
+ def _init_device_properties(self) -> None:
320
+ """Initialize attributes from torch.cuda.get_device_properties
321
+ """
322
+ self.device_properties = torch.cuda.get_device_properties(self.device)
323
+ self.num_sms = self.device_properties.multi_processor_count
324
+
325
+ # Note: used for model runner override.
326
+ def _sync_device(self) -> None:
327
+ torch.cuda.synchronize()
328
+
329
+ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
330
+ """Update the cached states and the persistent batch with the scheduler
331
+ output.
332
+
333
+ The updated states are used by the `_prepare_inputs` function to create
334
+ the input GPU tensors for the model.
335
+
336
+ The SamplingMetadata is updated and copied to the GPU if there is a
337
+ new/resumed/paused/finished request in the batch.
338
+ """
339
+ # Remove finished requests from the cached states.
340
+ for req_id in scheduler_output.finished_req_ids:
341
+ self.requests.pop(req_id, None)
342
+ self.encoder_cache.pop(req_id, None)
343
+ # Remove the finished requests from the persistent batch.
344
+ # NOTE(woosuk): There could be an edge case where finished_req_ids and
345
+ # scheduled_req_ids overlap. This happens when a request is aborted and
346
+ # then resubmitted with the same ID. In this case, we treat them as two
347
+ # distinct requests - clearing the cached states for the first request
348
+ # and handling the second as a new request.
349
+ removed_req_indices: list[int] = []
350
+ for req_id in scheduler_output.finished_req_ids:
351
+ req_index = self.input_batch.remove_request(req_id)
352
+ if req_index is not None:
353
+ removed_req_indices.append(req_index)
354
+
355
+ # Free the cached encoder outputs.
356
+ for req_id, input_id in scheduler_output.free_encoder_input_ids:
357
+ encoder_outputs = self.encoder_cache.get(req_id)
358
+ if encoder_outputs is not None:
359
+ encoder_outputs.pop(input_id, None)
360
+ if not encoder_outputs:
361
+ self.encoder_cache.pop(req_id, None)
362
+
363
+ # Remove the unscheduled requests from the persistent batch.
364
+ # NOTE(woosuk): The unscheduled requests are either preempted requests
365
+ # or running requests that are not scheduled in this step. We remove
366
+ # them from the persistent batch but keep their cached states since
367
+ # they will be scheduled again sometime in the future.
368
+ scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
369
+ cached_req_ids = self.input_batch.req_id_to_index.keys()
370
+ unscheduled_req_ids = cached_req_ids - scheduled_req_ids
371
+ # NOTE(woosuk): The persistent batch optimization assumes that
372
+ # consecutive batches contain mostly the same requests. If batches
373
+ # have low request overlap (e.g., alternating between two distinct
374
+ # sets of requests), this optimization becomes very inefficient.
375
+ for req_id in unscheduled_req_ids:
376
+ req_index = self.input_batch.remove_request(req_id)
377
+ assert req_index is not None
378
+ removed_req_indices.append(req_index)
379
+
380
+ req_ids_to_add: list[str] = []
381
+ # Add new requests to the cached states.
382
+ for new_req_data in scheduler_output.scheduled_new_reqs:
383
+ req_id = new_req_data.req_id
384
+ sampling_params = new_req_data.sampling_params
385
+ if sampling_params.sampling_type == SamplingType.RANDOM_SEED:
386
+ generator = torch.Generator(device=self.device)
387
+ generator.manual_seed(sampling_params.seed)
388
+ else:
389
+ generator = None
390
+
391
+ self.requests[req_id] = CachedRequestState(
392
+ req_id=req_id,
393
+ prompt_token_ids=new_req_data.prompt_token_ids,
394
+ mm_inputs=new_req_data.mm_inputs,
395
+ mm_positions=new_req_data.mm_positions,
396
+ sampling_params=sampling_params,
397
+ generator=generator,
398
+ block_ids=new_req_data.block_ids,
399
+ num_computed_tokens=new_req_data.num_computed_tokens,
400
+ output_token_ids=[],
401
+ lora_request=new_req_data.lora_request,
402
+ )
403
+
404
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
405
+ if self.uses_mrope:
406
+ image_grid_thw = []
407
+ video_grid_thw = []
408
+ second_per_grid_ts = []
409
+ audio_feature_lengths = []
410
+ use_audio_in_video = False
411
+ for mm_input in self.requests[req_id].mm_inputs:
412
+ if mm_input.get("image_grid_thw") is not None:
413
+ image_grid_thw.extend(
414
+ mm_input["image_grid_thw"].tolist())
415
+ if mm_input.get("video_grid_thw") is not None:
416
+ video_grid_thw.extend(
417
+ mm_input["video_grid_thw"].tolist())
418
+ if mm_input.get("second_per_grid_ts") is not None:
419
+ second_per_grid_ts.extend(
420
+ mm_input["second_per_grid_ts"])
421
+ if mm_input.get("audio_feature_lengths") is not None:
422
+ audio_feature_lengths.extend(
423
+ mm_input["audio_feature_lengths"])
424
+ if mm_input.get("use_audio_in_video") is True:
425
+ use_audio_in_video = True
426
+
427
+ hf_config = self.model_config.hf_config
428
+
429
+ self.requests[req_id].mrope_positions, \
430
+ self.requests[req_id].mrope_position_delta = \
431
+ MRotaryEmbedding.get_input_positions_tensor(
432
+ self.requests[req_id].prompt_token_ids,
433
+ hf_config=hf_config,
434
+ image_grid_thw=image_grid_thw,
435
+ video_grid_thw=video_grid_thw,
436
+ second_per_grid_ts=second_per_grid_ts,
437
+ audio_feature_lengths=audio_feature_lengths,
438
+ use_audio_in_video=use_audio_in_video,
439
+ )
440
+
441
+ req_ids_to_add.append(req_id)
442
+
443
+ # Update the states of the running/resumed requests.
444
+ for req_data in scheduler_output.scheduled_cached_reqs:
445
+ req_id = req_data.req_id
446
+ req_state = self.requests[req_id]
447
+
448
+ # Update the cached states.
449
+ num_computed_tokens = req_data.num_computed_tokens
450
+ req_state.num_computed_tokens = num_computed_tokens
451
+ # Add the sampled token(s) from the previous step (if any).
452
+ # This doesn't include "unverified" tokens like spec decode tokens.
453
+ num_new_tokens = (num_computed_tokens +
454
+ len(req_data.new_token_ids) -
455
+ req_state.num_tokens)
456
+ if num_new_tokens == 1:
457
+ # Avoid slicing list in most common case.
458
+ req_state.output_token_ids.append(req_data.new_token_ids[-1])
459
+ elif num_new_tokens > 0:
460
+ req_state.output_token_ids.extend(
461
+ req_data.new_token_ids[-num_new_tokens:])
462
+ # Update the block IDs.
463
+ if not req_data.resumed_from_preemption:
464
+ # Append the new blocks to the existing block IDs.
465
+ for block_ids, new_block_ids in zip( # type: ignore[call-overload]
466
+ req_state.block_ids,
467
+ req_data.new_block_ids,
468
+ strict=True):
469
+ block_ids.extend(new_block_ids)
470
+ else:
471
+ # The request is resumed from preemption.
472
+ # Replace the existing block IDs with the new ones.
473
+ req_state.block_ids = req_data.new_block_ids
474
+
475
+ req_index = self.input_batch.req_id_to_index.get(req_id)
476
+ if req_index is None:
477
+ # The request is not in the persistent batch.
478
+ # The request was either preempted and resumed later, or was not
479
+ # scheduled in the previous step and needs to be added again.
480
+ req_ids_to_add.append(req_id)
481
+ continue
482
+
483
+ # Update the persistent batch.
484
+ self.input_batch.num_computed_tokens_cpu[req_index] = (
485
+ num_computed_tokens)
486
+ self.input_batch.block_table.append_row(req_data.new_block_ids,
487
+ req_index)
488
+ # Add new_token_ids to token_ids_cpu.
489
+ start_token_index = num_computed_tokens
490
+ end_token_index = num_computed_tokens + len(req_data.new_token_ids)
491
+ self.input_batch.token_ids_cpu[
492
+ req_index,
493
+ start_token_index:end_token_index] = req_data.new_token_ids
494
+ self.input_batch.num_tokens_no_spec[req_index] = end_token_index
495
+ # Add spec_token_ids to token_ids_cpu.
496
+ spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
497
+ req_id, ())
498
+ if spec_token_ids:
499
+ start_index = end_token_index
500
+ end_token_index += len(spec_token_ids)
501
+ self.input_batch.token_ids_cpu[
502
+ req_index, start_index:end_token_index] = spec_token_ids
503
+ # NOTE(woosuk): `num_tokens` here may include spec decode tokens.
504
+ self.input_batch.num_tokens[req_index] = end_token_index
505
+
506
+ # Check if the batch has changed. If not, we can skip copying the
507
+ # sampling metadata from CPU to GPU.
508
+ batch_changed = len(removed_req_indices) > 0 or len(req_ids_to_add) > 0
509
+
510
+ # Add the new or resumed requests to the persistent batch.
511
+ # The smaller empty indices are filled first.
512
+ removed_req_indices.sort(reverse=True)
513
+ for req_id in req_ids_to_add:
514
+ req_state = self.requests[req_id]
515
+ if removed_req_indices:
516
+ # Fill the empty index.
517
+ req_index = removed_req_indices.pop()
518
+ else:
519
+ # Append to the end.
520
+ req_index = None
521
+ self.input_batch.add_request(req_state, req_index)
522
+
523
+ # Condense the batched states if there are empty indices.
524
+ if removed_req_indices:
525
+ self.input_batch.condense(removed_req_indices)
526
+
527
+ batch_reordered = self._may_reorder_batch(scheduler_output)
528
+
529
+ if batch_changed or batch_reordered:
530
+ self.input_batch.refresh_sampling_metadata()
531
+
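For illustration, a minimal standalone sketch of the slot-reuse step above: indices freed by removed requests are handed out smallest-first to new or resumed requests, and any leftover holes are later closed by condense(). The request ids and indices below are hypothetical.

removed_req_indices = [5, 1, 3]          # slots freed by removed requests
removed_req_indices.sort(reverse=True)   # [5, 3, 1]; pop() yields 1, then 3
placement = {}
for req_id in ["req-a", "req-b"]:        # new/resumed requests to add back
    # Reuse the smallest freed slot first; None means "append at the end".
    placement[req_id] = (removed_req_indices.pop()
                         if removed_req_indices else None)
assert placement == {"req-a": 1, "req-b": 3}
assert removed_req_indices == [5]        # remaining hole, closed by condense()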
532
+ def _get_cumsum_and_arange(
533
+ self,
534
+ num_tokens: np.ndarray,
535
+ cumsum_dtype: Optional[np.dtype] = None,
536
+ ) -> tuple[np.ndarray, np.ndarray]:
537
+ """Get the cumulative sum and batched arange of the given array.
538
+ # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
539
+ # Equivalent to but faster than:
540
+ # np.concatenate([np.arange(n) for n in num_tokens])
541
+ """
542
+ # Step 1. [2, 5, 3] -> [2, 7, 10]
543
+ cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
544
+ total_num_tokens = cu_num_tokens[-1]
545
+ # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
546
+ cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
547
+ # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
548
+ arange = self.arange_np[:total_num_tokens] - cumsums_offsets
549
+
550
+ return cu_num_tokens, arange
551
+
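A minimal NumPy sketch of the cumulative-sum-and-batched-arange trick documented above, shown standalone with the same example values; the helper name here is hypothetical.

import numpy as np

def cumsum_and_arange(num_tokens: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # [2, 5, 3] -> cumulative sums [2, 7, 10]
    cu_num_tokens = np.cumsum(num_tokens)
    # Per-token offsets: [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
    offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
    # Batched arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
    arange = np.arange(cu_num_tokens[-1]) - offsets
    return cu_num_tokens, arange

cu, ar = cumsum_and_arange(np.array([2, 5, 3], dtype=np.int32))
assert cu.tolist() == [2, 7, 10]
assert ar.tolist() == [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]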
552
+ def _prepare_inputs(
553
+ self,
554
+ scheduler_output: "SchedulerOutput",
555
+ ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]:
556
+ total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
557
+ assert total_num_scheduled_tokens > 0
558
+ num_reqs = self.input_batch.num_reqs
559
+ assert num_reqs > 0
560
+
561
+ # OPTIMIZATION: Start copying the block table first.
562
+ # This way, we can overlap the copy with the following CPU operations.
563
+ self.input_batch.block_table.commit(num_reqs)
564
+
565
+ # Get the number of scheduled tokens for each request.
566
+ req_ids = self.input_batch.req_ids
567
+ tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
568
+ num_scheduled_tokens = np.array(tokens, dtype=np.int32)
569
+ max_num_scheduled_tokens = max(tokens)
570
+
571
+ # Get request indices.
572
+ # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
573
+ req_indices = np.repeat(self.arange_np[:num_reqs],
574
+ num_scheduled_tokens)
575
+
576
+ # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
577
+ # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
578
+ cu_num_tokens, arange = self._get_cumsum_and_arange(
579
+ num_scheduled_tokens)
580
+
581
+ # Get positions.
582
+ positions_np = self.positions_np[:total_num_scheduled_tokens]
583
+ np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
584
+ arange,
585
+ out=positions_np)
586
+
587
+ # Calculate M-RoPE positions.
588
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
589
+ if self.uses_mrope:
590
+ self._calc_mrope_positions(scheduler_output)
591
+
592
+ # Get token indices.
593
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
594
+ # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
595
+ # where M is the max_model_len.
596
+ token_indices = (positions_np +
597
+ req_indices * self.input_batch.token_ids_cpu.shape[1])
598
+
599
+ # NOTE(woosuk): We use torch.index_select instead of np.take here
600
+ # because torch.index_select is much faster than np.take for large
601
+ # tensors.
602
+ torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
603
+ 0,
604
+ torch.from_numpy(token_indices),
605
+ out=self.input_ids_cpu[:total_num_scheduled_tokens])
606
+
607
+ # Calculate the slot mapping for each KV cache group.
608
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
609
+ self.kv_cache_config.kv_cache_groups):
610
+ block_size = kv_cache_group_spec.kv_cache_spec.block_size
611
+ block_table: BlockTable = self.input_batch.block_table[
612
+ kv_cache_group_id]
613
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
614
+ # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
615
+ # where K is the max_num_blocks_per_req and the block size is 2.
616
+ # NOTE(woosuk): We can't simply use `token_indices // block_size`
617
+ # here because M (max_model_len) is not necessarily divisible by
618
+ # block_size.
619
+ block_table_indices = (
620
+ req_indices * block_table.max_num_blocks_per_req +
621
+ positions_np // block_size)
622
+ block_table_cpu = block_table.get_cpu_tensor()
623
+ block_numbers = block_table_cpu.flatten(
624
+ )[block_table_indices].numpy()
625
+ block_offsets = positions_np % block_size
626
+ np.add(
627
+ block_numbers * block_size,
628
+ block_offsets,
629
+ out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
630
+
631
+ # Prepare the attention metadata.
632
+ self.query_start_loc_np[0] = 0
633
+ self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
634
+
635
+ self.seq_lens_np[:num_reqs] = (
636
+ self.input_batch.num_computed_tokens_cpu[:num_reqs] +
637
+ num_scheduled_tokens)
638
+
639
+ # Copy the tensors to the GPU.
640
+ self.input_ids[:total_num_scheduled_tokens].copy_(
641
+ self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
642
+ if self.uses_mrope:
643
+ # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
644
+ self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
645
+ self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
646
+ non_blocking=True)
647
+ else:
648
+ # Common case (1D positions)
649
+ self.positions[:total_num_scheduled_tokens].copy_(
650
+ self.positions_cpu[:total_num_scheduled_tokens],
651
+ non_blocking=True)
652
+
653
+ self.query_start_loc[:num_reqs + 1].copy_(
654
+ self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
655
+ self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
656
+ non_blocking=True)
657
+
658
+ # Fill the unused seq_lens entries with 0.
659
+ self.seq_lens[num_reqs:].fill_(0)
660
+ # Note: pad query_start_loc to be non-decreasing, as kernels
661
+ # like FlashAttention require that
662
+ self.query_start_loc[num_reqs + 1:].fill_(
663
+ self.query_start_loc_cpu[num_reqs].item())
664
+
665
+ query_start_loc = self.query_start_loc[:num_reqs + 1]
666
+ seq_lens = self.seq_lens[:num_reqs]
667
+
668
+ common_attn_metadata = CommonAttentionMetadata(
669
+ query_start_loc=query_start_loc, seq_lens=seq_lens)
670
+
671
+ attn_metadata: dict[str, Any] = {}
672
+ # Prepare the attention metadata for each KV cache group and make layers
673
+ # in the same group share the same metadata.
674
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
675
+ self.kv_cache_config.kv_cache_groups):
676
+
677
+ # Prepare for cascade attention if enabled & beneficial.
678
+ common_prefix_len = 0
679
+ if self.cascade_attn_enabled:
680
+ common_prefix_len = self._compute_cascade_attn_prefix_len(
681
+ num_scheduled_tokens,
682
+ scheduler_output.
683
+ num_common_prefix_blocks[kv_cache_group_id],
684
+ kv_cache_group_spec.kv_cache_spec,
685
+ self.attn_metadata_builders[kv_cache_group_id],
686
+ )
687
+
688
+ attn_metadata_i = (
689
+ self.attn_metadata_builders[kv_cache_group_id].build(
690
+ num_reqs=num_reqs,
691
+ num_actual_tokens=total_num_scheduled_tokens,
692
+ max_query_len=max_num_scheduled_tokens,
693
+ common_prefix_len=common_prefix_len,
694
+ common_attn_metadata=common_attn_metadata))
695
+ for layer_name in kv_cache_group_spec.layer_names:
696
+ attn_metadata[layer_name] = attn_metadata_i
697
+
698
+ use_spec_decode = len(
699
+ scheduler_output.scheduled_spec_decode_tokens) > 0
700
+ if not use_spec_decode:
701
+ # NOTE(woosuk): Due to chunked prefills, the batch may contain
702
+ # partial requests. While we should not sample any token
703
+ # from these partial requests, we do so for simplicity.
704
+ # We will ignore the sampled tokens from the partial requests.
705
+ # TODO: Support prompt logprobs.
706
+ logits_indices = query_start_loc[1:] - 1
707
+ spec_decode_metadata = None
708
+ else:
709
+ # Get the number of draft tokens for each request.
710
+ # Iterate over the dictionary rather than all requests since not all
711
+ # requests have draft tokens.
712
+ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
713
+ for req_id, draft_token_ids in (
714
+ scheduler_output.scheduled_spec_decode_tokens.items()):
715
+ req_idx = self.input_batch.req_id_to_index[req_id]
716
+ num_draft_tokens[req_idx] = len(draft_token_ids)
717
+
718
+ spec_decode_metadata = self._calc_spec_decode_metadata(
719
+ num_draft_tokens, cu_num_tokens)
720
+ logits_indices = spec_decode_metadata.logits_indices
721
+
722
+ # Hot-Swap lora model
723
+ if self.lora_config:
724
+ self.set_active_loras(self.input_batch, num_scheduled_tokens)
725
+
726
+ return attn_metadata, logits_indices, spec_decode_metadata
727
+
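A small standalone sketch of the slot-mapping arithmetic used in the per-group loop inside _prepare_inputs above: a token position is split into a logical block index and an in-block offset, the block table maps the logical block to a physical block, and the slot id is block_number * block_size + offset. All shapes and values below are made up.

import numpy as np

block_size = 2
max_num_blocks_per_req = 4
# block_table[i, j]: physical block id of request i's j-th logical block.
block_table = np.array([[10, 11, 12, 13],
                        [20, 21, 22, 23]], dtype=np.int32)
req_indices = np.array([0, 0, 1, 1, 1], dtype=np.int32)  # token -> request
positions = np.array([0, 1, 2, 3, 4], dtype=np.int32)    # token -> position

flat_indices = req_indices * max_num_blocks_per_req + positions // block_size
block_numbers = block_table.reshape(-1)[flat_indices]
slot_mapping = block_numbers * block_size + positions % block_size
# Request 0, positions 0-1 -> block 10 -> slots 20, 21.
# Request 1, positions 2-4 -> blocks 21, 22 -> slots 42, 43, 44.
assert slot_mapping.tolist() == [20, 21, 42, 43, 44]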
728
+ def _compute_cascade_attn_prefix_len(
729
+ self,
730
+ num_scheduled_tokens: np.ndarray,
731
+ num_common_prefix_blocks: int,
732
+ kv_cache_spec: KVCacheSpec,
733
+ attn_metadata_builder: AttentionMetadataBuilder,
734
+ ) -> int:
735
+ """Compute the length of the common prefix for cascade attention.
736
+
737
+ NOTE(woosuk): The common prefix length returned by this function
738
+ represents the length used specifically for cascade attention, not the
739
+ actual number of tokens shared between requests. When cascade attention
740
+ is disabled (use_cascade=False), this function returns 0 even if
741
+ requests share common tokens. Additionally, the common prefix length is
742
+ truncated to a multiple of the block size and may be further truncated
743
+ due to implementation details explained below.
744
+
745
+ Args:
746
+ num_scheduled_tokens: Number of tokens scheduled per request.
747
+ num_common_prefix_blocks: Number of shared KV cache blocks.
748
+
749
+ Returns:
750
+ int: Length of common prefix in tokens.
751
+ """
752
+ common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size
753
+ if common_prefix_len == 0:
754
+ # Common case.
755
+ return 0
756
+
757
+ # NOTE(woosuk): Cascade attention uses two attention kernels: one
758
+ # for the common prefix and the other for the rest. For the first
759
+ # kernel, we concatenate all the query tokens (possibly from
760
+ # different requests) and treat them as if they are from the same
761
+ # request. Then, we use bi-directional attention to process the
762
+ # common prefix in the KV cache. Importantly, this means that the
763
+ # first kernel does not do any masking.
764
+
765
+ # Consider the following example:
766
+ # Request 1's input query: [D, E, X]
767
+ # Request 1's kv cache: [A, B, C, D, E, X]
768
+ # Request 1's num_computed_tokens: 3 (i.e., [A, B, C])
769
+ # Request 2's input query: [E, Y]
770
+ # Request 2's kv cache: [A, B, C, D, E, Y]
771
+ # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D])
772
+
773
+ # If we use [A, B, C, D, E] as the common prefix, then the
774
+ # first kernel will compute the bi-directional attention between
775
+ # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E].
776
+ # However, this is wrong because D in Request 1 should not attend to
777
+ # E in the common prefix (i.e., we need masking).
778
+ # To avoid this, [A, B, C, D] should be the common prefix.
779
+ # That is, the common prefix should be capped by the minimum
780
+ # num_computed_tokens among the requests, and plus one to include
781
+ # the first token of the query.
782
+
783
+ # In practice, we use [A, B, C] as the common prefix, instead of
784
+ # [A, B, C, D] (i.e., the common prefix is capped by the minimum
785
+ # num_computed_tokens, without plus one).
786
+ # This is because of an implementation detail: We want to always
787
+ # use two kernels for cascade attention. Let's imagine:
788
+ # Request 3's input query: [D]
789
+ # Request 3's kv cache: [A, B, C, D]
790
+ # Request 3's num_computed_tokens: 3 (i.e., [A, B, C])
791
+ # If we use [A, B, C, D] as the common prefix for Request 1-3,
792
+ # then Request 3 will be processed only by the first kernel,
793
+ # and the second kernel will get an empty input. While this is not
794
+ # a fundamental problem, our current implementation does not support
795
+ # this case.
796
+ num_reqs = len(num_scheduled_tokens)
797
+ common_prefix_len = min(
798
+ common_prefix_len,
799
+ self.input_batch.num_computed_tokens_cpu[:num_reqs].min())
800
+ # common_prefix_len should be a multiple of the block size.
801
+ common_prefix_len = (common_prefix_len // kv_cache_spec.block_size *
802
+ kv_cache_spec.block_size)
803
+ use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or
804
+ (isinstance(kv_cache_spec, FullAttentionSpec)
805
+ and kv_cache_spec.sliding_window is not None))
806
+ assert isinstance(kv_cache_spec, AttentionSpec)
807
+ use_cascade = attn_metadata_builder.use_cascade_attention(
808
+ common_prefix_len=common_prefix_len,
809
+ query_lens=num_scheduled_tokens,
810
+ num_query_heads=self.num_query_heads,
811
+ num_kv_heads=kv_cache_spec.num_kv_heads,
812
+ use_alibi=self.use_alibi,
813
+ use_sliding_window=use_sliding_window,
814
+ num_sms=self.num_sms,
815
+ )
816
+ return common_prefix_len if use_cascade else 0
817
+
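For illustration, a tiny numeric sketch of the capping and rounding applied above: the block-aligned common prefix is limited by the smallest num_computed_tokens across requests and then rounded down to a block boundary. The numbers are invented.

block_size = 16
num_common_prefix_blocks = 8
num_computed_tokens = [120, 90, 130]                       # per request
common_prefix_len = num_common_prefix_blocks * block_size  # 128
common_prefix_len = min(common_prefix_len, min(num_computed_tokens))  # 90
common_prefix_len = common_prefix_len // block_size * block_size      # 80
assert common_prefix_len == 80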
818
+ def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
819
+ mrope_pos_ptr = 0
820
+ for index, req_id in enumerate(self.input_batch.req_ids):
821
+ req = self.requests[req_id]
822
+ assert req.mrope_positions is not None
823
+
824
+ num_computed_tokens = \
825
+ self.input_batch.num_computed_tokens_cpu[index]
826
+ num_scheduled_tokens = \
827
+ scheduler_output.num_scheduled_tokens[req_id]
828
+ num_prompt_tokens = len(req.prompt_token_ids)
829
+
830
+ if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
831
+ prompt_part_len = max(0,
832
+ num_prompt_tokens - num_computed_tokens)
833
+ completion_part_len = max(
834
+ 0, num_scheduled_tokens - prompt_part_len)
835
+ else:
836
+ prompt_part_len = num_scheduled_tokens
837
+ completion_part_len = 0
838
+
839
+ assert num_scheduled_tokens == prompt_part_len + completion_part_len
840
+
841
+ if prompt_part_len > 0:
842
+ # prompt's mrope_positions are pre-computed
843
+ dst_start = mrope_pos_ptr
844
+ dst_end = mrope_pos_ptr + prompt_part_len
845
+ src_start = num_computed_tokens
846
+ src_end = num_computed_tokens + prompt_part_len
847
+
848
+ self.mrope_positions_cpu[:, dst_start:dst_end] = \
849
+ req.mrope_positions[:,src_start:src_end]
850
+
851
+ mrope_pos_ptr += prompt_part_len
852
+
853
+ if completion_part_len > 0:
854
+ # compute completion's mrope_positions on-the-fly
855
+ dst_start = mrope_pos_ptr
856
+ dst_end = mrope_pos_ptr + completion_part_len
857
+
858
+ self.mrope_positions_cpu[:, dst_start:dst_end] = \
859
+ MRotaryEmbedding.get_next_input_positions_tensor(
860
+ req.mrope_position_delta,
861
+ context_len=num_computed_tokens +
862
+ prompt_part_len,
863
+ seq_len=num_computed_tokens +
864
+ prompt_part_len +
865
+ completion_part_len,
866
+ )
867
+
868
+ mrope_pos_ptr += completion_part_len
869
+
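A minimal sketch of the prompt/completion split computed above when a step straddles the end of the prompt: the prompt part copies pre-computed M-RoPE positions, while the completion part is generated on the fly. The token counts are made up.

num_prompt_tokens = 10
num_computed_tokens = 8
num_scheduled_tokens = 5   # this step covers positions [8, 13)

if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
    # Two remaining prompt positions come from the precomputed positions;
    # the other three are computed on the fly.
    prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens)
    completion_part_len = max(0, num_scheduled_tokens - prompt_part_len)
else:
    prompt_part_len = num_scheduled_tokens
    completion_part_len = 0

assert (prompt_part_len, completion_part_len) == (2, 3)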
870
+ def _calc_spec_decode_metadata(
871
+ self,
872
+ num_draft_tokens: np.ndarray,
873
+ cu_num_scheduled_tokens: np.ndarray,
874
+ ) -> SpecDecodeMetadata:
875
+ # Inputs:
876
+ # cu_num_scheduled_tokens: [ 4, 104, 107, 207, 209]
877
+ # num_draft_tokens: [ 3, 0, 2, 0, 1]
878
+ # Outputs:
879
+ # cu_num_draft_tokens: [ 3, 3, 5, 5, 6]
880
+ # logits_indices: [ 0, 1, 2, 3, 103, 104, 105, 106,
881
+ # 206, 207, 208]
882
+ # target_logits_indices: [ 0, 1, 2, 5, 6, 9]
883
+ # bonus_logits_indices: [ 3, 4, 7, 8, 10]
884
+
885
+ # Compute the logits indices.
886
+ # [4, 1, 3, 1, 2]
887
+ num_sampled_tokens = num_draft_tokens + 1
888
+
889
+ # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11]
890
+ # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
891
+ cu_num_sampled_tokens, arange = self._get_cumsum_and_arange(
892
+ num_sampled_tokens, cumsum_dtype=np.int32)
893
+ # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
894
+ logits_indices = np.repeat(
895
+ cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
896
+ # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
897
+ logits_indices += arange
898
+
899
+ # Compute the bonus logits indices.
900
+ bonus_logits_indices = cu_num_sampled_tokens - 1
901
+
902
+ # Compute the draft logits indices.
903
+ # cu_num_draft_tokens: [3, 3, 5, 5, 6]
904
+ # arange: [0, 1, 2, 0, 1, 0]
905
+ cu_num_draft_tokens, arange = self._get_cumsum_and_arange(
906
+ num_draft_tokens, cumsum_dtype=np.int32)
907
+ # [0, 0, 0, 5, 5, 9]
908
+ target_logits_indices = np.repeat(
909
+ cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
910
+ # [0, 1, 2, 5, 6, 9]
911
+ target_logits_indices += arange
912
+
913
+ # TODO: Optimize the CPU -> GPU copy.
914
+ cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
915
+ self.device, non_blocking=True)
916
+ logits_indices = torch.from_numpy(logits_indices).to(self.device,
917
+ non_blocking=True)
918
+ target_logits_indices = torch.from_numpy(target_logits_indices).to(
919
+ self.device, non_blocking=True)
920
+ bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
921
+ self.device, non_blocking=True)
922
+
923
+ # Compute the draft token ids.
924
+ # draft_token_indices: [ 1, 2, 3, 105, 106, 208]
925
+ draft_token_ids = self.input_ids[logits_indices]
926
+ draft_token_ids = draft_token_ids[target_logits_indices + 1]
927
+
928
+ metadata = SpecDecodeMetadata(
929
+ draft_token_ids=draft_token_ids,
930
+ num_draft_tokens=num_draft_tokens.tolist(),
931
+ cu_num_draft_tokens=cu_num_draft_tokens,
932
+ target_logits_indices=target_logits_indices,
933
+ bonus_logits_indices=bonus_logits_indices,
934
+ logits_indices=logits_indices,
935
+ )
936
+ return metadata
937
+
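A standalone NumPy sketch reproducing the index arithmetic of _calc_spec_decode_metadata above, using the example values given in its comments; the helper name is hypothetical.

import numpy as np

def cumsum_and_arange(n: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    cu = np.cumsum(n)
    return cu, np.arange(cu[-1]) - np.repeat(cu - n, n)

cu_num_scheduled_tokens = np.array([4, 104, 107, 207, 209])
num_draft_tokens = np.array([3, 0, 2, 0, 1])
num_sampled_tokens = num_draft_tokens + 1

cu_num_sampled, arange = cumsum_and_arange(num_sampled_tokens)
logits_indices = (np.repeat(cu_num_scheduled_tokens - num_sampled_tokens,
                            num_sampled_tokens) + arange)
bonus_logits_indices = cu_num_sampled - 1
_, draft_arange = cumsum_and_arange(num_draft_tokens)
target_logits_indices = (np.repeat(cu_num_sampled - num_sampled_tokens,
                                   num_draft_tokens) + draft_arange)

assert logits_indices.tolist() == [0, 1, 2, 3, 103, 104, 105, 106,
                                   206, 207, 208]
assert bonus_logits_indices.tolist() == [3, 4, 7, 8, 10]
assert target_logits_indices.tolist() == [0, 1, 2, 5, 6, 9]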
938
+ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
939
+ scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
940
+ if not scheduled_encoder_inputs:
941
+ return
942
+
943
+ # Batch the multi-modal inputs.
944
+ mm_inputs = list[MultiModalKwargs]()
945
+ req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
946
+ for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
947
+ req_state = self.requests[req_id]
948
+
949
+ for mm_input_id in encoder_input_ids:
950
+ mm_inputs.append(req_state.mm_inputs[mm_input_id])
951
+ req_ids_pos.append(
952
+ (req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
953
+
954
+ # Batch mm inputs as much as we can: if a request in the batch has
955
+ # multiple modalities or a different modality than the previous one,
956
+ # we process it separately to preserve item order.
957
+ # FIXME(ywang96): This is a hacky way to deal with multiple modalities
958
+ # in the same batch while still being able to benefit from batching
959
+ # multimodal inputs. The proper solution should be reordering the
960
+ # encoder outputs.
961
+ grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
962
+
963
+ encoder_outputs = []
964
+ for grouped_mm_inputs in grouped_mm_inputs_list:
965
+ batched_mm_inputs = MultiModalKwargs.batch(
966
+ grouped_mm_inputs, pin_memory=self.pin_memory)
967
+ batched_mm_inputs = MultiModalKwargs.as_kwargs(
968
+ batched_mm_inputs,
969
+ device=self.device,
970
+ )
971
+
972
+ # Run the encoder.
973
+ # `curr_group_outputs` is either of the following:
974
+ # 1. A tensor of shape (num_items, feature_size, hidden_size)
975
+ # in case feature_size is fixed across all multimodal items.
976
+ # 2. A list or tuple (length: num_items) of tensors, each of shape
977
+ # (feature_size, hidden_size) in case the feature size is dynamic
978
+ # depending on the input multimodal items.
979
+ curr_group_outputs = self.model.get_multimodal_embeddings(
980
+ **batched_mm_inputs)
981
+
982
+ sanity_check_mm_encoder_outputs(
983
+ curr_group_outputs,
984
+ expected_num_items=len(grouped_mm_inputs),
985
+ )
986
+
987
+ for output in curr_group_outputs:
988
+ encoder_outputs.append(output)
989
+
990
+ # Cache the encoder outputs.
991
+ for (req_id, input_id, pos_info), output in zip(
992
+ req_ids_pos,
993
+ encoder_outputs,
994
+ ):
995
+ if req_id not in self.encoder_cache:
996
+ self.encoder_cache[req_id] = {}
997
+
998
+ self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
999
+ output,
1000
+ is_embed=pos_info.is_embed,
1001
+ )
1002
+
1003
+ def _gather_mm_embeddings(
1004
+ self,
1005
+ scheduler_output: "SchedulerOutput",
1006
+ ) -> list[torch.Tensor]:
1007
+ mm_embeds: list[torch.Tensor] = []
1008
+ for req_id in self.input_batch.req_ids:
1009
+ num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
1010
+ req_id]
1011
+ req_state = self.requests[req_id]
1012
+ num_computed_tokens = req_state.num_computed_tokens
1013
+ mm_positions = req_state.mm_positions
1014
+ for i, pos_info in enumerate(mm_positions):
1015
+ start_pos = pos_info.offset
1016
+ num_encoder_tokens = pos_info.length
1017
+
1018
+ # The encoder output is needed if the two ranges overlap:
1019
+ # [num_computed_tokens,
1020
+ # num_computed_tokens + num_scheduled_tokens) and
1021
+ # [start_pos, start_pos + num_encoder_tokens)
1022
+ if start_pos >= num_computed_tokens + num_scheduled_tokens:
1023
+ # The encoder output is not needed in this step.
1024
+ break
1025
+ if start_pos + num_encoder_tokens <= num_computed_tokens:
1026
+ # The encoder output is already processed and stored
1027
+ # in the decoder's KV cache.
1028
+ continue
1029
+
1030
+ start_idx = max(num_computed_tokens - start_pos, 0)
1031
+ end_idx = min(
1032
+ num_computed_tokens - start_pos + num_scheduled_tokens,
1033
+ num_encoder_tokens)
1034
+ assert start_idx < end_idx
1035
+ assert req_id in self.encoder_cache
1036
+ assert i in self.encoder_cache[req_id]
1037
+ encoder_output = self.encoder_cache[req_id][i]
1038
+
1039
+ if (is_embed := pos_info.is_embed) is not None:
1040
+ is_embed = is_embed[start_idx:end_idx]
1041
+
1042
+ mm_embeds_item = gather_mm_placeholders(
1043
+ encoder_output[start_idx:end_idx],
1044
+ is_embed=is_embed,
1045
+ )
1046
+ mm_embeds.append(mm_embeds_item)
1047
+ return mm_embeds
1048
+
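A small sketch of the overlap computation above, which decides what slice of a cached encoder output is needed when this step's token range only partially covers a multimodal placeholder. The values are made up.

num_computed_tokens = 50
num_scheduled_tokens = 20                # this step covers positions [50, 70)
start_pos, num_encoder_tokens = 45, 30   # the placeholder covers [45, 75)

# Intersect [num_computed, num_computed + num_scheduled) with
# [start_pos, start_pos + num_encoder_tokens), expressed relative to the
# start of the encoder output.
start_idx = max(num_computed_tokens - start_pos, 0)
end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens,
              num_encoder_tokens)
assert (start_idx, end_idx) == (5, 25)   # encoder_output[5:25] is gathered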
1049
+ def get_model(self) -> nn.Module:
1050
+ return self.model
1051
+
1052
+ def apply_grammar_bitmask(
1053
+ self,
1054
+ scheduler_output: "SchedulerOutput",
1055
+ logits: torch.Tensor,
1056
+ ):
1057
+ grammar_bitmask = scheduler_output.grammar_bitmask
1058
+ if grammar_bitmask is None:
1059
+ return
1060
+
1061
+ # We receive the structured output bitmask from the scheduler,
1062
+ # compacted to contain bitmasks only for structured output requests.
1063
+ # The order of the requests in the bitmask is not guaranteed to be the
1064
+ # same as the order of the requests in the gpu runner's batch. We need
1065
+ # to sort the bitmask to match the order of the requests used here.
1066
+
1067
+ # Get the batch indices of the structured output requests.
1068
+ # Keep track of the number of speculative tokens scheduled for every
1069
+ # request in the batch, as the logit indices are offset by this amount.
1070
+ struct_out_req_batch_indices: dict[str, int] = {}
1071
+ cumulative_offset = 0
1072
+ seq = sorted(self.input_batch.req_id_to_index.items(),
1073
+ key=lambda x: x[1])
1074
+ for req_id, batch_index in seq:
1075
+ logit_index = batch_index + cumulative_offset
1076
+ cumulative_offset += len(
1077
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1078
+ if req_id in scheduler_output.structured_output_request_ids:
1079
+ struct_out_req_batch_indices[req_id] = logit_index
1080
+
1081
+ out_indices = []
1082
+
1083
+ # Reorder the bitmask to match the order of the requests in the batch.
1084
+ sorted_bitmask = np.zeros_like(grammar_bitmask,
1085
+ shape=(logits.shape[0],
1086
+ grammar_bitmask.shape[1]))
1087
+ cumulative_index = 0
1088
+ seq = sorted(scheduler_output.structured_output_request_ids.items(),
1089
+ key=lambda x: x[1])
1090
+ for req_id, _ in seq:
1091
+ logit_index = struct_out_req_batch_indices[req_id]
1092
+ num_spec_tokens = len(
1093
+ scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
1094
+ for i in range(1 + num_spec_tokens):
1095
+ sorted_bitmask[logit_index + i] = \
1096
+ grammar_bitmask[cumulative_index + i]
1097
+ out_indices.append(logit_index + i)
1098
+ cumulative_index += 1 + num_spec_tokens
1099
+ grammar_bitmask = sorted_bitmask
1100
+
1101
+ # Serialization of np.ndarray is much more efficient than a tensor,
1102
+ # so we receive it in that format.
1103
+ grammar_bitmask = torch.from_numpy(grammar_bitmask)
1104
+
1105
+ xgr.apply_token_bitmask_inplace(
1106
+ logits,
1107
+ grammar_bitmask.to(self.device, non_blocking=True),
1108
+ indices=out_indices,
1109
+ )
1110
+
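A minimal sketch of the logit-index bookkeeping above: a request's row in the logits tensor is its batch index shifted by the speculative tokens of the requests placed before it, and a structured-output request then owns 1 + num_spec_tokens consecutive rows. The request ids and counts are hypothetical.

batch_order = ["req0", "req1", "req2"]            # persistent-batch order
spec_tokens = {"req0": 2, "req1": 0, "req2": 1}   # draft tokens per request

logit_index = {}
cumulative_offset = 0
for batch_index, req_id in enumerate(batch_order):
    logit_index[req_id] = batch_index + cumulative_offset
    cumulative_offset += spec_tokens[req_id]

assert logit_index == {"req0": 0, "req1": 3, "req2": 4}
# req0 owns logits rows 0-2, req1 owns row 3, req2 owns rows 4-5.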
1111
+ def sync_and_slice_intermediate_tensors(
1112
+ self, num_tokens: int, intermediate_tensors: IntermediateTensors,
1113
+ sync_self: bool) -> IntermediateTensors:
1114
+
1115
+ assert self.intermediate_tensors is not None
1116
+
1117
+ tp = self.vllm_config.parallel_config.tensor_parallel_size
1118
+ enabled_sp = self.vllm_config.compilation_config.pass_config. \
1119
+ enable_sequence_parallelism
1120
+ if enabled_sp:
1121
+ # When sequence parallelism is enabled, we always pad num_tokens
1122
+ # to be a multiple of tensor_parallel_size (tp) earlier
1123
+ assert num_tokens % tp == 0
1124
+ is_residual_scattered = tp > 1 and enabled_sp \
1125
+ and num_tokens % tp == 0
1126
+
1127
+ # When sequence parallelism is enabled, the "residual" tensor is sharded
1128
+ # across tensor parallel ranks, so each rank only needs its own slice.
1129
+ if sync_self:
1130
+ assert intermediate_tensors is not None
1131
+ for k, v in intermediate_tensors.items():
1132
+ is_scattered = k == "residual" and is_residual_scattered
1133
+ copy_len = num_tokens // tp if is_scattered else \
1134
+ num_tokens
1135
+ self.intermediate_tensors[k][:copy_len].copy_(
1136
+ v[:copy_len], non_blocking=True)
1137
+
1138
+ return IntermediateTensors({
1139
+ k:
1140
+ v[:num_tokens // tp]
1141
+ if k == "residual" and is_residual_scattered else v[:num_tokens]
1142
+ for k, v in self.intermediate_tensors.items()
1143
+ })
1144
+
1145
+ def get_dp_padding(self,
1146
+ num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
1147
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
1148
+ dp_rank = self.vllm_config.parallel_config.data_parallel_rank
1149
+
1150
+ # For DP: Don't pad when setting enforce_eager.
1151
+ # This lets us set enforce_eager on the prefiller in a P/D setup and
1152
+ # still use CUDA graphs (enabled by this padding) on the decoder.
1153
+ #
1154
+ # TODO(tms) : There are many cases where padding is enabled for
1155
+ # prefills, causing unnecessary and excessive padding of activations.
1156
+
1157
+ if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
1158
+ # Early exit.
1159
+ return 0, None
1160
+
1161
+ num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
1162
+ num_tokens, dp_size, dp_rank)
1163
+ max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
1164
+ num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
1165
+ dp_size,
1166
+ device="cpu",
1167
+ dtype=torch.int32)
1168
+ return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
1169
+
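A tiny sketch of the padding rule implemented above: every data-parallel rank pads its token count up to the maximum across ranks so that all ranks execute the same batch shape. The per-rank counts are invented.

num_tokens_across_dp = [7, 13, 13, 9]   # tokens scheduled on each DP rank
max_tokens = max(num_tokens_across_dp)  # 13
pad_per_rank = [max_tokens - n for n in num_tokens_across_dp]
num_tokens_after_padding = [max_tokens] * len(num_tokens_across_dp)
assert pad_per_rank == [6, 0, 0, 4]
assert num_tokens_after_padding == [13, 13, 13, 13]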
1170
+ @torch.inference_mode()
1171
+ def execute_model(
1172
+ self,
1173
+ scheduler_output: "SchedulerOutput",
1174
+ intermediate_tensors: Optional[IntermediateTensors] = None,
1175
+ ) -> Union[ModelRunnerOutput, IntermediateTensors]:
1176
+
1177
+ self._update_states(scheduler_output)
1178
+ if not scheduler_output.total_num_scheduled_tokens:
1179
+ if not has_kv_transfer_group():
1180
+ # Return empty ModelRunnerOutput if there's no work to do.
1181
+ return EMPTY_MODEL_RUNNER_OUTPUT
1182
+
1183
+ return self.kv_connector_no_forward(scheduler_output)
1184
+
1185
+ # Prepare the decoder inputs.
1186
+ attn_metadata, logits_indices, spec_decode_metadata = (
1187
+ self._prepare_inputs(scheduler_output))
1188
+ num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
1189
+ if (self.use_cuda_graph
1190
+ and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
1191
+ # Use piecewise CUDA graphs.
1192
+ # Add padding to the batch size.
1193
+ num_input_tokens = self.vllm_config.pad_for_cudagraph(
1194
+ num_scheduled_tokens)
1195
+ else:
1196
+ # Eager mode.
1197
+ # Pad tokens to a multiple of tensor_parallel_size when
1198
+ # collective fusion for sequence parallelism (SP) is enabled.
1199
+ tp_size = self.vllm_config.parallel_config.tensor_parallel_size
1200
+ if self.vllm_config.compilation_config.pass_config. \
1201
+ enable_sequence_parallelism and tp_size > 1:
1202
+ from vllm.utils import round_up
1203
+ num_input_tokens = round_up(num_scheduled_tokens, tp_size)
1204
+ else:
1205
+ num_input_tokens = num_scheduled_tokens
1206
+
1207
+ # Padding for DP
1208
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
1209
+ num_input_tokens += num_pad
1210
+
1211
+ # _prepare_inputs may reorder the batch, so we must gather multi
1212
+ # modal outputs after that to ensure the correct order
1213
+ if self.is_multimodal_model:
1214
+ # Run the multimodal encoder if any.
1215
+ self._execute_mm_encoder(scheduler_output)
1216
+ mm_embeds = self._gather_mm_embeddings(scheduler_output)
1217
+ else:
1218
+ mm_embeds = []
1219
+
1220
+ if self.is_multimodal_model and get_pp_group().is_first_rank:
1221
+ # NOTE(woosuk): To unify token ids and soft tokens (vision
1222
+ # embeddings), we always use embeddings (rather than token ids)
1223
+ # as input to the multimodal model, even when the input is text.
1224
+ input_ids = self.input_ids[:num_scheduled_tokens]
1225
+ if mm_embeds:
1226
+ inputs_embeds = self.model.get_input_embeddings(
1227
+ input_ids, mm_embeds)
1228
+ else:
1229
+ inputs_embeds = self.model.get_input_embeddings(input_ids)
1230
+ # TODO(woosuk): Avoid the copy. Optimize.
1231
+ self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
1232
+ inputs_embeds = self.inputs_embeds[:num_input_tokens]
1233
+ input_ids = None
1234
+ else:
1235
+ # For text-only models, we use token ids as input.
1236
+ # While it is possible to use embeddings as input just like the
1237
+ # multimodal models, it is not desirable for performance since
1238
+ # then the embedding layer is not included in the CUDA graph.
1239
+ input_ids = self.input_ids[:num_input_tokens]
1240
+ inputs_embeds = None
1241
+ if self.uses_mrope:
1242
+ positions = self.mrope_positions[:, :num_input_tokens]
1243
+ else:
1244
+ positions = self.positions[:num_input_tokens]
1245
+
1246
+ if get_pp_group().is_first_rank:
1247
+ intermediate_tensors = None
1248
+ else:
1249
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
1250
+ num_input_tokens, intermediate_tensors, True)
1251
+
1252
+ # Run the decoder.
1253
+ # Use persistent buffers for CUDA graphs.
1254
+ with set_forward_context(attn_metadata,
1255
+ self.vllm_config,
1256
+ num_tokens=num_input_tokens,
1257
+ num_tokens_across_dp=num_tokens_across_dp):
1258
+ self.maybe_setup_kv_connector(scheduler_output)
1259
+
1260
+ model_output = self.model(
1261
+ input_ids=input_ids,
1262
+ positions=positions,
1263
+ intermediate_tensors=intermediate_tensors,
1264
+ inputs_embeds=inputs_embeds,
1265
+ )
1266
+
1267
+ self.maybe_wait_for_kv_save()
1268
+ finished_sending, finished_recving = (
1269
+ self.get_finished_kv_transfers(scheduler_output))
1270
+
1271
+ if self.use_aux_hidden_state_outputs:
1272
+ hidden_states, aux_hidden_states = model_output
1273
+ else:
1274
+ hidden_states = model_output
1275
+ # Broadcast PP output for external_launcher (torchrun)
1276
+ # to make sure we are synced across pp ranks
1277
+ # TODO: Support overlapping micro-batches
1278
+ # https://github.com/vllm-project/vllm/issues/18019
1279
+ broadcast_pp_output = \
1280
+ self.parallel_config.distributed_executor_backend \
1281
+ == "external_launcher" and len(get_pp_group().ranks) > 0
1282
+ if not get_pp_group().is_last_rank:
1283
+ # For mid-pipeline stages, return the hidden states.
1284
+ if not broadcast_pp_output:
1285
+ return hidden_states
1286
+ assert isinstance(hidden_states, IntermediateTensors)
1287
+ get_pp_group().send_tensor_dict(hidden_states.tensors,
1288
+ all_gather_group=get_tp_group())
1289
+ logits = None
1290
+ else:
1291
+ sample_hidden_states = hidden_states[logits_indices]
1292
+ logits = self.model.compute_logits(sample_hidden_states, None)
1293
+ if broadcast_pp_output:
1294
+ model_output_broadcast_data = {
1295
+ "logits": logits.contiguous(),
1296
+ } if logits is not None else {}
1297
+ model_output_broadcast_data = get_pp_group().broadcast_tensor_dict(
1298
+ model_output_broadcast_data, src=len(get_pp_group().ranks) - 1)
1299
+ assert model_output_broadcast_data is not None
1300
+ logits = model_output_broadcast_data["logits"]
1301
+
1302
+ # Apply structured output bitmasks if present
1303
+ if scheduler_output.grammar_bitmask is not None:
1304
+ self.apply_grammar_bitmask(scheduler_output, logits)
1305
+
1306
+ # Sample the next token and get logprobs if needed.
1307
+ sampling_metadata = self.input_batch.sampling_metadata
1308
+ if spec_decode_metadata is None:
1309
+ sampler_output = self.sampler(
1310
+ logits=logits,
1311
+ sampling_metadata=sampling_metadata,
1312
+ )
1313
+ else:
1314
+ # When indexing with a tensor (bonus_logits_indices), PyTorch
1315
+ # creates a new tensor with separate storage from the original
1316
+ # logits tensor. This means any in-place operations on bonus_logits
1317
+ # won't affect the original logits tensor.
1318
+ assert logits is not None
1319
+ bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
1320
+ sampler_output = self.sampler(
1321
+ logits=bonus_logits,
1322
+ sampling_metadata=sampling_metadata,
1323
+ )
1324
+ bonus_token_ids = sampler_output.sampled_token_ids
1325
+
1326
+ # Just like `bonus_logits`, `target_logits` is a new tensor with
1327
+ # separate storage from the original `logits` tensor. Therefore,
1328
+ # it is safe to update `target_logits` in place.
1329
+ target_logits = logits[spec_decode_metadata.target_logits_indices]
1330
+ output_token_ids = self.rejection_sampler(
1331
+ spec_decode_metadata,
1332
+ None, # draft_probs
1333
+ target_logits,
1334
+ bonus_token_ids,
1335
+ sampling_metadata,
1336
+ )
1337
+ sampler_output.sampled_token_ids = output_token_ids
1338
+
1339
+ # TODO(woosuk): The following loop can be slow since it iterates over
1340
+ # the requests one by one. Optimize.
1341
+ discard_sampled_tokens_req_indices = []
1342
+ for i, req_id in enumerate(self.input_batch.req_ids):
1343
+ req_state = self.requests[req_id]
1344
+ seq_len = (req_state.num_computed_tokens +
1345
+ scheduler_output.num_scheduled_tokens[req_id])
1346
+ if seq_len < req_state.num_tokens:
1347
+ # Ignore the sampled token for partial prefills.
1348
+ # Rewind the generator state as if the token was not sampled.
1349
+ # This relies on cuda-specific torch-internal impl details
1350
+ generator = self.input_batch.generators.get(i)
1351
+ if generator is not None:
1352
+ generator.set_offset(generator.get_offset() - 4)
1353
+ # Record the index of the request that should not be sampled,
1354
+ # so that we could clear the sampled tokens before returning.
1355
+ discard_sampled_tokens_req_indices.append(i)
1356
+
1357
+ # NOTE: GPU -> CPU Sync happens here.
1358
+ # Move as many CPU operations as possible before this sync point.
1359
+ logprobs_tensors = sampler_output.logprobs_tensors
1360
+ logprobs_lists = logprobs_tensors.tolists() \
1361
+ if logprobs_tensors is not None else None
1362
+
1363
+ # Compute prompt logprobs if needed.
1364
+ prompt_logprobs_dict = self._get_prompt_logprobs_dict(
1365
+ hidden_states[:num_scheduled_tokens],
1366
+ scheduler_output,
1367
+ )
1368
+
1369
+ # Get the valid generated tokens.
1370
+ sampled_token_ids = sampler_output.sampled_token_ids
1371
+ max_gen_len = sampled_token_ids.shape[-1]
1372
+ if max_gen_len == 1:
1373
+ # No spec decode tokens.
1374
+ valid_sampled_token_ids = sampled_token_ids.tolist()
1375
+ else:
1376
+ # Includes spec decode tokens.
1377
+ valid_sampled_token_ids = self.rejection_sampler.parse_output(
1378
+ sampled_token_ids,
1379
+ self.input_batch.vocab_size,
1380
+ )
1381
+ # Mask out the sampled tokens that should not be sampled.
1382
+ for i in discard_sampled_tokens_req_indices:
1383
+ valid_sampled_token_ids[i].clear()
1384
+
1385
+ if not self.speculative_config:
1386
+ # Speculative decoding is not enabled.
1387
+ spec_token_ids = None
1388
+ elif self.speculative_config.method == "ngram":
1389
+ assert isinstance(self.drafter, NgramProposer)
1390
+ spec_token_ids = self.generate_draft_token_ids(
1391
+ valid_sampled_token_ids, sampling_metadata)
1392
+ elif self.speculative_config.method == "medusa":
1393
+ assert isinstance(self.drafter, MedusaProposer)
1394
+ if max_gen_len == 1:
1395
+ hidden_states = sample_hidden_states
1396
+ else:
1397
+ indices = []
1398
+ offset = 0
1399
+ for num_draft, tokens in zip(
1400
+ spec_decode_metadata.num_draft_tokens,
1401
+ valid_sampled_token_ids):
1402
+ indices.append(offset + len(tokens) - 1)
1403
+ offset += num_draft + 1
1404
+
1405
+ indices = torch.tensor(indices,
1406
+ device=sample_hidden_states.device)
1407
+ hidden_states = sample_hidden_states[indices]
1408
+
1409
+ spec_token_ids = self.drafter.propose(
1410
+ target_hidden_states=hidden_states,
1411
+ sampling_metadata=sampling_metadata,
1412
+ )
1413
+ elif self.speculative_config.use_eagle():
1414
+ assert isinstance(self.drafter, EagleProposer)
1415
+ # TODO(woosuk): Refactor the loop.
1416
+ next_token_ids: list[int] = []
1417
+ for i, token_ids in enumerate(valid_sampled_token_ids):
1418
+ if token_ids:
1419
+ # Common case.
1420
+ next_token_id = token_ids[-1]
1421
+ else:
1422
+ # Partial prefill (rare case).
1423
+ # Get the next token id from the request state.
1424
+ req_id = self.input_batch.req_ids[i]
1425
+ req_state = self.requests[req_id]
1426
+ seq_len = (req_state.num_computed_tokens +
1427
+ scheduler_output.num_scheduled_tokens[req_id])
1428
+ next_token_id = req_state.get_token_id(seq_len)
1429
+ next_token_ids.append(next_token_id)
1430
+ next_token_ids = torch.tensor(next_token_ids,
1431
+ dtype=torch.int32,
1432
+ device=self.device)
1433
+ # At this moment, we assume all eagle layers belong to the same KV
1434
+ # cache group, thus using the same attention metadata.
1435
+ eagle_attn_metadata = attn_metadata[
1436
+ self.drafter.attn_layer_names[0]]
1437
+
1438
+ # NOTE: deepseek_mtp uses MLA which does not have `block_table`
1439
+ if hasattr(eagle_attn_metadata, "block_table"):
1440
+ block_table = eagle_attn_metadata.block_table
1441
+ else:
1442
+ block_table = None
1443
+
1444
+ if spec_decode_metadata is None:
1445
+ # input_ids can be None for multimodal models.
1446
+ target_token_ids = self.input_ids[:num_scheduled_tokens]
1447
+ target_positions = positions[:num_scheduled_tokens]
1448
+ if self.use_aux_hidden_state_outputs:
1449
+ target_hidden_states = torch.cat(
1450
+ [h[:num_scheduled_tokens] for h in aux_hidden_states],
1451
+ dim=-1)
1452
+ else:
1453
+ target_hidden_states = hidden_states[:num_scheduled_tokens]
1454
+ target_slot_mapping = eagle_attn_metadata.slot_mapping
1455
+ cu_num_tokens = eagle_attn_metadata.query_start_loc
1456
+ else:
1457
+ # TODO(woosuk): Refactor this.
1458
+ num_draft_tokens = spec_decode_metadata.num_draft_tokens
1459
+ num_rejected_tokens = [
1460
+ n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
1461
+ for i, n in enumerate(num_draft_tokens)
1462
+ ]
1463
+ num_rejected_tokens_tensor = async_tensor_h2d(
1464
+ num_rejected_tokens,
1465
+ dtype=torch.int32,
1466
+ target_device=self.device,
1467
+ pin_memory=True)
1468
+ num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)
1469
+ cu_num_tokens, token_indices = self.drafter.prepare_inputs(
1470
+ eagle_attn_metadata.query_start_loc,
1471
+ num_rejected_tokens_tensor,
1472
+ num_tokens,
1473
+ )
1474
+ target_token_ids = self.input_ids[token_indices]
1475
+ target_positions = positions[token_indices]
1476
+ if self.use_aux_hidden_state_outputs:
1477
+ target_hidden_states = torch.cat(
1478
+ [h[token_indices] for h in aux_hidden_states], dim=-1)
1479
+ else:
1480
+ target_hidden_states = hidden_states[token_indices]
1481
+ target_slot_mapping = eagle_attn_metadata.slot_mapping[
1482
+ token_indices]
1483
+ draft_token_ids = self.drafter.propose(
1484
+ target_token_ids=target_token_ids,
1485
+ target_positions=target_positions,
1486
+ target_hidden_states=target_hidden_states,
1487
+ target_slot_mapping=target_slot_mapping,
1488
+ next_token_ids=next_token_ids,
1489
+ cu_num_tokens=cu_num_tokens,
1490
+ block_table=block_table,
1491
+ sampling_metadata=sampling_metadata,
1492
+ )
1493
+ spec_token_ids = draft_token_ids.tolist()
1494
+
1495
+ # Clear KVConnector state after all KVs are generated.
1496
+ if has_kv_transfer_group():
1497
+ get_kv_transfer_group().clear_connector_metadata()
1498
+
1499
+ return ModelRunnerOutput(
1500
+ req_ids=self.input_batch.req_ids,
1501
+ req_id_to_index=self.input_batch.req_id_to_index,
1502
+ sampled_token_ids=valid_sampled_token_ids,
1503
+ spec_token_ids=spec_token_ids,
1504
+ logprobs=logprobs_lists,
1505
+ prompt_logprobs_dict=prompt_logprobs_dict,
1506
+ finished_sending=finished_sending,
1507
+ finished_recving=finished_recving,
1508
+ )
1509
+
1510
+ def kv_connector_no_forward(
1511
+ self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
1512
+ # KV send/recv even if no work to do.
1513
+ with set_forward_context(None, self.vllm_config):
1514
+ self.maybe_setup_kv_connector(scheduler_output)
1515
+ finished_sending, finished_recving = (
1516
+ self.get_finished_kv_transfers(scheduler_output))
1517
+
1518
+ if not finished_sending and not finished_recving:
1519
+ return EMPTY_MODEL_RUNNER_OUTPUT
1520
+
1521
+ output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
1522
+ output.finished_sending = finished_sending
1523
+ output.finished_recving = finished_recving
1524
+ return output
1525
+
1526
+ @staticmethod
1527
+ def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"):
1528
+ # Update the KVConnector with the KVConnector metadata before forward().
1529
+ if has_kv_transfer_group():
1530
+ kv_connector = get_kv_transfer_group()
1531
+ assert isinstance(kv_connector, KVConnectorBase_V1)
1532
+ assert scheduler_output.kv_connector_metadata is not None
1533
+ kv_connector.bind_connector_metadata(
1534
+ scheduler_output.kv_connector_metadata)
1535
+
1536
+ # Background KV cache transfers happen here.
1537
+ # These transfers are designed to be async and the requests
1538
+ # involved may be disjoint from the running requests.
1539
+ # Do this here to save a collective_rpc.
1540
+ kv_connector.start_load_kv(get_forward_context())
1541
+
1542
+ @staticmethod
1543
+ def maybe_wait_for_kv_save() -> None:
1544
+ if has_kv_transfer_group():
1545
+ get_kv_transfer_group().wait_for_save()
1546
+
1547
+ @staticmethod
1548
+ def get_finished_kv_transfers(
1549
+ scheduler_output: "SchedulerOutput",
1550
+ ) -> tuple[Optional[set[str]], Optional[set[str]]]:
1551
+ if has_kv_transfer_group():
1552
+ return get_kv_transfer_group().get_finished(
1553
+ scheduler_output.finished_req_ids)
1554
+ return None, None
1555
+
1556
+ def generate_draft_token_ids(
1557
+ self,
1558
+ sampled_token_ids: list[list[int]],
1559
+ sampling_metadata: SamplingMetadata,
1560
+ ) -> list[list[int]]:
1561
+ # TODO(woosuk): Optimize.
1562
+ draft_token_ids: list[list[int]] = []
1563
+ for i, sampled_ids in enumerate(sampled_token_ids):
1564
+ num_sampled_ids = len(sampled_ids)
1565
+ if not num_sampled_ids:
1566
+ # Skip speculative decoding.
1567
+ draft_token_ids.append([])
1568
+ continue
1569
+
1570
+ # Skip requests that require sampling parameters that are not
1571
+ # supported with speculative decoding.
1572
+ req_id = self.input_batch.req_ids[i]
1573
+ if not is_spec_decode_supported(req_id, self.input_batch):
1574
+ draft_token_ids.append([])
1575
+ continue
1576
+
1577
+ # Add sampled_token_ids to token_ids_cpu.
1578
+ start_idx = self.input_batch.num_tokens_no_spec[i]
1579
+ end_idx = start_idx + num_sampled_ids
1580
+ if end_idx >= self.max_model_len:
1581
+ # Skip requests that have already reached the max model length.
1582
+ draft_token_ids.append([])
1583
+ continue
1584
+
1585
+ self.input_batch.token_ids_cpu[i, start_idx:end_idx] = sampled_ids
1586
+ drafter_output = self.drafter.propose(
1587
+ self.input_batch.token_ids_cpu[i, :end_idx])
1588
+ if drafter_output is None or len(drafter_output) == 0:
1589
+ draft_token_ids.append([])
1590
+ else:
1591
+ draft_token_ids.append(drafter_output.tolist())
1592
+ return draft_token_ids
1593
+
1594
+ def load_model(self) -> None:
1595
+ logger.info("Starting to load model %s...", self.model_config.model)
1596
+ with DeviceMemoryProfiler() as m: # noqa: SIM117
1597
+ time_before_load = time.perf_counter()
1598
+ model_loader = get_model_loader(self.load_config)
1599
+ if not hasattr(self, "model"):
1600
+ logger.info("Loading model from scratch...")
1601
+ self.model = model_loader.load_model(
1602
+ vllm_config=self.vllm_config,
1603
+ model_config=self.model_config)
1604
+ else:
1605
+ logger.info(
1606
+ "Model was already initialized. Loading weights inplace..."
1607
+ )
1608
+ model_loader.load_weights(self.model,
1609
+ model_config=self.model_config)
1610
+ if self.lora_config:
1611
+ self.model = self.load_lora_model(self.model,
1612
+ self.model_config,
1613
+ self.scheduler_config,
1614
+ self.lora_config,
1615
+ self.device)
1616
+ if hasattr(self, "drafter"):
1617
+ logger.info("Loading drafter model...")
1618
+ self.drafter.load_model(self.model)
1619
+ if self.use_aux_hidden_state_outputs:
1620
+ self.model.set_aux_hidden_state_layers(
1621
+ self.model.get_eagle3_aux_hidden_state_layers())
1622
+ time_after_load = time.perf_counter()
1623
+ self.model_memory_usage = m.consumed_memory
1624
+ logger.info("Model loading took %.4f GiB and %.6f seconds",
1625
+ self.model_memory_usage / GiB_bytes,
1626
+ time_after_load - time_before_load)
1627
+ prepare_communication_buffer_for_model(self.model)
1628
+
1629
+ def save_tensorized_model(
1630
+ self,
1631
+ tensorizer_config: "TensorizerConfig",
1632
+ ) -> None:
1633
+ TensorizerLoader.save_model(
1634
+ self.model,
1635
+ tensorizer_config=tensorizer_config,
1636
+ )
1637
+
1638
+ def _get_prompt_logprobs_dict(
1639
+ self,
1640
+ hidden_states: torch.Tensor,
1641
+ scheduler_output: "SchedulerOutput",
1642
+ ) -> dict[str, Optional[LogprobsTensors]]:
1643
+ num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
1644
+ if not num_prompt_logprobs_dict:
1645
+ return {}
1646
+
1647
+ in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
1648
+ prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
1649
+
1650
+ # Since prompt logprobs are a rare feature, prioritize simple,
1651
+ # maintainable loop over optimal performance.
1652
+ completed_prefill_reqs = []
1653
+ for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items():
1654
+
1655
+ num_tokens = scheduler_output.num_scheduled_tokens[req_id]
1656
+
1657
+ # Get metadata for this request.
1658
+ request = self.requests[req_id]
1659
+ num_prompt_tokens = len(request.prompt_token_ids)
1660
+ prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
1661
+ self.device, non_blocking=True)
1662
+
1663
+ # Set up target LogprobsTensors object.
1664
+ logprobs_tensors = in_progress_dict.get(req_id)
1665
+ if not logprobs_tensors:
1666
+ # Create empty logprobs CPU tensors for the entire prompt.
1667
+ # If chunked, we'll copy in slice by slice.
1668
+ logprobs_tensors = LogprobsTensors.empty_cpu(
1669
+ num_prompt_tokens - 1, num_prompt_logprobs + 1)
1670
+ in_progress_dict[req_id] = logprobs_tensors
1671
+
1672
+ # Determine number of logits to retrieve.
1673
+ start_idx = request.num_computed_tokens
1674
+ start_tok = start_idx + 1
1675
+ num_remaining_tokens = num_prompt_tokens - start_tok
1676
+ if num_tokens <= num_remaining_tokens:
1677
+ # This is a chunk, more tokens remain.
1678
+ # In the == case, there are no more prompt logprobs to produce
1679
+ # but we want to defer returning them to the next step where we
1680
+ # have new generated tokens to return.
1681
+ num_logits = num_tokens
1682
+ else:
1683
+ # This is the last chunk of prompt tokens to return.
1684
+ num_logits = num_remaining_tokens
1685
+ completed_prefill_reqs.append(req_id)
1686
+ prompt_logprobs_dict[req_id] = logprobs_tensors
1687
+
1688
+ if num_logits <= 0:
1689
+ # This can happen for the final chunk if we prefilled exactly
1690
+ # (num_prompt_tokens - 1) tokens for this request in the prior
1691
+ # step. There are no more prompt logprobs to produce.
1692
+ continue
1693
+
1694
+ # Get the logits corresponding to this req's prompt tokens.
1695
+ # If this is a partial request (i.e. chunked prefill),
1696
+ # then there is prompt logprob generated for each index.
1697
+ req_idx = self.input_batch.req_id_to_index[req_id]
1698
+ offset = self.query_start_loc_np[req_idx].item()
1699
+ prompt_hidden_states = hidden_states[offset:offset + num_logits]
1700
+ logits = self.model.compute_logits(prompt_hidden_states, None)
1701
+
1702
+ # Get the "target" tokens for each index. For prompt at index i,
1703
+ # the token at prompt index i+1 is the "sampled" token we want
1704
+ # to gather the logprob for.
1705
+ tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits]
1706
+
1707
+ # Compute prompt logprobs.
1708
+ logprobs = self.sampler.compute_logprobs(logits)
1709
+ token_ids, logprobs, ranks = self.sampler.gather_logprobs(
1710
+ logprobs, num_prompt_logprobs, tgt_token_ids)
1711
+
1712
+ # Transfer GPU->CPU async.
1713
+ chunk_slice = slice(start_idx, start_idx + num_logits)
1714
+ logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
1715
+ token_ids, non_blocking=True)
1716
+ logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
1717
+ non_blocking=True)
1718
+ logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
1719
+ ranks, non_blocking=True)
1720
+
1721
+ # Remove requests that have completed prefill from the batch
1722
+ # num_prompt_logprobs_dict.
1723
+ for req_id in completed_prefill_reqs:
1724
+ del num_prompt_logprobs_dict[req_id]
1725
+ del in_progress_dict[req_id]
1726
+
1727
+ # Must synchronize the non-blocking GPU->CPU transfers.
1728
+ if prompt_logprobs_dict:
1729
+ self._sync_device()
1730
+
1731
+ return prompt_logprobs_dict
1732
+
1733
+ @contextmanager
1734
+ def maybe_randomize_inputs(self, input_ids: torch.Tensor):
1735
+ """
1736
+ Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
1737
+ This is to help balance expert-selection
1738
+ - during profile_run
1739
+ - during DP rank dummy run
1740
+ """
1741
+ dp_size = self.vllm_config.parallel_config.data_parallel_size
1742
+ randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
1743
+ if not randomize_inputs:
1744
+ yield
1745
+ else:
1746
+ import functools
1747
+
1748
+ @functools.cache
1749
+ def rand_input_ids() -> torch.Tensor:
1750
+ return torch.randint_like(
1751
+ self.input_ids,
1752
+ low=0,
1753
+ high=self.model_config.get_vocab_size(),
1754
+ dtype=input_ids.dtype)
1755
+
1756
+ logger.debug("Randomizing dummy data for DP Rank")
1757
+ input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
1758
+ non_blocking=True)
1759
+ yield
1760
+ input_ids.fill_(0)
1761
+
1762
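maybe_randomize_inputs only perturbs the dummy input_ids when VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set and data parallelism is in use, and it caches the random tensor so repeated dummy runs reuse the same buffer. Below is a stripped-down sketch of the same caching-context-manager idea; the constants and buffer sizes are placeholders, not vllm internals.

import functools
from contextlib import contextmanager

import torch

VOCAB_SIZE = 32_000                                  # placeholder vocabulary size
_backing_ids = torch.zeros(64, dtype=torch.int64)    # placeholder dummy buffer


@functools.cache
def _rand_input_ids() -> torch.Tensor:
    # Built once, reused by every subsequent dummy run.
    return torch.randint_like(_backing_ids, low=0, high=VOCAB_SIZE)


@contextmanager
def maybe_randomize(input_ids: torch.Tensor, randomize: bool):
    if not randomize:
        yield
        return
    input_ids.copy_(_rand_input_ids()[:input_ids.size(0)])
    try:
        yield
    finally:
        input_ids.fill_(0)             # restore the zeroed dummy buffer


dummy = torch.zeros(16, dtype=torch.int64)
with maybe_randomize(dummy, randomize=True):
    pass                               # randomized ids are live inside the block
assert int(dummy.sum()) == 0           # and zeroed again afterwards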
+ @torch.inference_mode()
1763
+ def _dummy_run(
1764
+ self,
1765
+ num_tokens: int,
1766
+ skip_attn: bool = True,
1767
+ ) -> torch.Tensor:
1768
+
1769
+ # Padding for DP
1770
+ num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
1771
+ num_tokens += num_pad
1772
+
1773
+ # Set num_scheduled_tokens based on num_tokens and max_num_seqs
1774
+ # for the dummy run with LoRA so that the num_reqs requests
1775
+ # collectively have num_tokens in total.
1776
+ assert num_tokens <= self.scheduler_config.max_num_batched_tokens
1777
+ max_num_reqs = self.scheduler_config.max_num_seqs
1778
+ num_reqs = min(num_tokens, max_num_reqs)
1779
+ min_tokens_per_req = num_tokens // num_reqs
1780
+ num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
1781
+ num_scheduled_tokens_list[-1] += num_tokens % num_reqs
1782
+ assert sum(num_scheduled_tokens_list) == num_tokens
1783
+ assert len(num_scheduled_tokens_list) == num_reqs
1784
+ num_scheduled_tokens = np.array(num_scheduled_tokens_list,
1785
+ dtype=np.int32)
1786
+
1787
+ if skip_attn:
1788
+ attn_metadata: Optional[dict[str, Any]] = None
1789
+ else:
1790
+ query_start_loc = self.query_start_loc[:num_reqs + 1]
1791
+ # Make sure max_model_len is used at the graph capture time.
1792
+ self.seq_lens_np[:num_reqs] = self.max_model_len
1793
+ self.seq_lens_np[num_reqs:] = 0
1794
+ self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
1795
+ non_blocking=True)
1796
+ seq_lens = self.seq_lens[:num_reqs]
1797
+
1798
+ common_attn_metadata = CommonAttentionMetadata(
1799
+ query_start_loc=query_start_loc, seq_lens=seq_lens)
1800
+
1801
+ attn_metadata = {}
1802
+ for kv_cache_group_id, kv_cache_group_spec in enumerate(
1803
+ self.kv_cache_config.kv_cache_groups):
1804
+ attn_metadata_i = (
1805
+ self.attn_metadata_builders[kv_cache_group_id].build(
1806
+ num_reqs=num_reqs,
1807
+ num_actual_tokens=num_tokens,
1808
+ max_query_len=num_tokens,
1809
+ common_prefix_len=0,
1810
+ common_attn_metadata=common_attn_metadata,
1811
+ ))
1812
+ for layer_name in kv_cache_group_spec.layer_names:
1813
+ attn_metadata[layer_name] = attn_metadata_i
1814
+
1815
+ with self.maybe_dummy_run_with_lora(self.lora_config,
1816
+ num_scheduled_tokens):
1817
+ model = self.model
1818
+ if self.is_multimodal_model:
1819
+ input_ids = None
1820
+ inputs_embeds = self.inputs_embeds[:num_tokens]
1821
+ else:
1822
+ input_ids = self.input_ids[:num_tokens]
1823
+ inputs_embeds = None
1824
+ if self.uses_mrope:
1825
+ positions = self.mrope_positions[:, :num_tokens]
1826
+ else:
1827
+ positions = self.positions[:num_tokens]
1828
+
1829
+ if get_pp_group().is_first_rank:
1830
+ intermediate_tensors = None
1831
+ else:
1832
+ if self.intermediate_tensors is None:
1833
+ self.intermediate_tensors = (
1834
+ self.model.make_empty_intermediate_tensors(
1835
+ batch_size=self.max_num_tokens,
1836
+ dtype=self.model_config.dtype,
1837
+ device=self.device))
1838
+
1839
+ intermediate_tensors = self.sync_and_slice_intermediate_tensors(
1840
+ num_tokens, None, False)
1841
+
1842
+ with self.maybe_randomize_inputs(input_ids), set_forward_context(
1843
+ attn_metadata,
1844
+ self.vllm_config,
1845
+ num_tokens=num_tokens,
1846
+ num_tokens_across_dp=num_tokens_across_dp):
1847
+ outputs = model(
1848
+ input_ids=input_ids,
1849
+ positions=positions,
1850
+ intermediate_tensors=intermediate_tensors,
1851
+ inputs_embeds=inputs_embeds,
1852
+ )
1853
+ if self.use_aux_hidden_state_outputs:
1854
+ hidden_states, _ = outputs
1855
+ else:
1856
+ hidden_states = outputs
1857
+
1858
+ if self.speculative_config and self.speculative_config.use_eagle():
1859
+ assert isinstance(self.drafter, EagleProposer)
1860
+ self.drafter.dummy_run(num_tokens)
1861
+
1862
+ logit_indices = np.cumsum(num_scheduled_tokens) - 1
1863
+ return hidden_states[logit_indices]
1864
+
1865
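_dummy_run above spreads the (DP-padded) token budget as evenly as possible over at most max_num_seqs dummy requests, then uses a cumulative sum to pick the hidden state at each request's last token for the sampler. A small NumPy sketch of that bookkeeping, with made-up sizes:

import numpy as np

num_tokens, max_num_reqs = 37, 8            # illustrative values only

num_reqs = min(num_tokens, max_num_reqs)
per_req = num_tokens // num_reqs
num_scheduled = [per_req] * num_reqs
num_scheduled[-1] += num_tokens % num_reqs  # last request absorbs the remainder
assert sum(num_scheduled) == num_tokens

num_scheduled = np.array(num_scheduled, dtype=np.int32)

# Index of the last token of each request in the flattened token stream;
# these rows of the hidden states are the ones handed to the sampler.
logit_indices = np.cumsum(num_scheduled) - 1
print(num_scheduled.tolist())   # [4, 4, 4, 4, 4, 4, 4, 9]
print(logit_indices.tolist())   # [3, 7, 11, 15, 19, 23, 27, 36]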
+ @torch.inference_mode()
1866
+ def _dummy_sampler_run(
1867
+ self,
1868
+ hidden_states: torch.Tensor,
1869
+ ) -> torch.Tensor:
1870
+ # The dummy hidden states may contain special values,
1871
+ # like `inf` or `nan`.
1872
+ # To avoid breaking the sampler, we use a random tensor here instead.
1873
+ hidden_states = torch.rand_like(hidden_states)
1874
+
1875
+ logits = self.model.compute_logits(hidden_states, None)
1876
+ num_reqs = logits.size(0)
1877
+
1878
+ dummy_tensors = lambda v: torch.full(
1879
+ (num_reqs, ), v, device=self.device)
1880
+
1881
+ dummy_metadata = SamplingMetadata(
1882
+ temperature=dummy_tensors(0.5),
1883
+ all_greedy=False,
1884
+ all_random=False,
1885
+ top_p=dummy_tensors(0.9),
1886
+ top_k=dummy_tensors(logits.size(1) - 1),
1887
+ min_p=None,
1888
+ generators={},
1889
+ max_num_logprobs=None,
1890
+ no_penalties=True,
1891
+ prompt_token_ids=None,
1892
+ frequency_penalties=dummy_tensors(0.1),
1893
+ presence_penalties=dummy_tensors(0.1),
1894
+ repetition_penalties=dummy_tensors(0.1),
1895
+ output_token_ids=[[] for _ in range(num_reqs)],
1896
+ min_tokens={},
1897
+ logit_bias=[None for _ in range(num_reqs)],
1898
+ allowed_token_ids_mask=None,
1899
+ bad_words_token_ids={},
1900
+ )
1901
+ try:
1902
+ sampler_output = self.sampler(logits=logits,
1903
+ sampling_metadata=dummy_metadata)
1904
+ except RuntimeError as e:
1905
+ if 'out of memory' in str(e):
1906
+ raise RuntimeError(
1907
+ "CUDA out of memory occurred when warming up sampler with "
1908
+ f"{num_reqs} dummy requests. Please try lowering "
1909
+ "`max_num_seqs` or `gpu_memory_utilization` when "
1910
+ "initializing the engine.") from e
1911
+ else:
1912
+ raise e
1913
+ if self.speculative_config:
1914
+ draft_token_ids = [[0] for _ in range(num_reqs)]
1915
+ dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy(
1916
+ draft_token_ids, self.device)
1917
+
1918
+ num_tokens = sum(len(ids) for ids in draft_token_ids)
1919
+ # draft_probs = torch.randn(
1920
+ # num_tokens, logits.shape[-1], device=self.device,
1921
+ # dtype=logits.dtype)
1922
+ draft_probs = None
1923
+ target_logits = torch.randn(num_tokens,
1924
+ logits.shape[-1],
1925
+ device=self.device,
1926
+ dtype=logits.dtype)
1927
+ # NOTE(woosuk): Here, we should use int32 because the sampler uses
1928
+ # int32 for bonus_token_ids. If the dtype mismatches, re-compilation
1929
+ # will occur at runtime.
1930
+ bonus_token_ids = torch.zeros(num_reqs,
1931
+ device=self.device,
1932
+ dtype=torch.int32)
1933
+ self.rejection_sampler(
1934
+ dummy_spec_decode_metadata,
1935
+ draft_probs,
1936
+ target_logits,
1937
+ bonus_token_ids,
1938
+ dummy_metadata,
1939
+ )
1940
+ return sampler_output
1941
+
1942
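_dummy_sampler_run wraps the warmup so that a CUDA out-of-memory failure is re-raised with actionable advice about max_num_seqs and gpu_memory_utilization. The same guard, reduced to a standalone helper with a placeholder warmup callable (nothing here is a vllm API):

def run_warmup_guarded(warmup_fn, num_reqs: int):
    """Run a warmup callable and translate a CUDA OOM into a clearer error."""
    try:
        return warmup_fn()
    except RuntimeError as e:
        if "out of memory" in str(e):
            raise RuntimeError(
                f"CUDA out of memory while warming up with {num_reqs} dummy "
                "requests. Try lowering `max_num_seqs` or "
                "`gpu_memory_utilization`.") from e
        raise


def fake_warmup():                      # simulated failure for illustration
    raise RuntimeError("CUDA error: out of memory")


try:
    run_warmup_guarded(fake_warmup, num_reqs=256)
except RuntimeError as e:
    print(e)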
+ def profile_run(self) -> None:
1943
+ # Profile with multimodal encoder & encoder cache.
1944
+ # TODO: handle encoder-decoder models once we support them.
1945
+ if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
1946
+ and self.encoder_cache_size > 0):
1947
+
1948
+ # NOTE: Currently the model is profiled with a single non-text
1949
+ # modality with the max possible input tokens even when
1950
+ # it supports multiple.
1951
+ max_tokens_by_modality_dict = self.mm_registry \
1952
+ .get_max_tokens_per_item_by_nonzero_modality(self.model_config)
1953
+ dummy_data_modality, max_tokens_per_mm_item = max(
1954
+ max_tokens_by_modality_dict.items(), key=lambda item: item[1])
1955
+
1956
+ # Check how many items of this modality can be supported by
1957
+ # the encoder budget.
1958
+ encoder_budget = min(self.max_num_encoder_input_tokens,
1959
+ self.encoder_cache_size)
1960
+
1961
+ max_num_mm_items_encoder_budget = cdiv(encoder_budget,
1962
+ max_tokens_per_mm_item)
1963
+
1964
+ # Check how many items of this modality can be supported by
1965
+ # the decoder budget.
1966
+ max_mm_items_per_req = self.mm_registry.get_mm_limits_per_prompt(
1967
+ self.model_config)[dummy_data_modality]
1968
+
1969
+ # NOTE: We do not consider max_num_batched_tokens on purpose
1970
+ # because the multimodal embeddings can be generated in advance
1971
+ # and consumed via chunked prefill.
1972
+ max_num_mm_items_decoder_budget = self.max_num_reqs * \
1973
+ max_mm_items_per_req
1974
+
1975
+ max_num_mm_items = min(max_num_mm_items_encoder_budget,
1976
+ max_num_mm_items_decoder_budget)
1977
+
1978
+ logger.info(
1979
+ "Encoder cache will be initialized with a budget of %s tokens,"
1980
+ " and profiled with %s %s items of the maximum feature size.",
1981
+ encoder_budget, max_num_mm_items, dummy_data_modality)
1982
+
1983
+ # Create dummy batch of multimodal inputs.
1984
+ dummy_mm_kwargs = self.mm_registry.get_decoder_dummy_data(
1985
+ model_config=self.model_config,
1986
+ seq_len=self.max_num_tokens,
1987
+ mm_counts={
1988
+ dummy_data_modality: 1
1989
+ },
1990
+ ).multi_modal_data
1991
+
1992
+ batched_dummy_mm_inputs = MultiModalKwargs.batch(
1993
+ [dummy_mm_kwargs] * max_num_mm_items,
1994
+ pin_memory=self.pin_memory)
1995
+ batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
1996
+ batched_dummy_mm_inputs,
1997
+ device=self.device,
1998
+ )
1999
+
2000
+ # Run multimodal encoder.
2001
+ dummy_encoder_outputs = self.model.get_multimodal_embeddings(
2002
+ **batched_dummy_mm_inputs)
2003
+
2004
+ sanity_check_mm_encoder_outputs(
2005
+ dummy_encoder_outputs,
2006
+ expected_num_items=max_num_mm_items,
2007
+ )
2008
+
2009
+ # Cache the dummy encoder outputs.
2010
+ self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
2011
+
2012
+ hidden_states = self._dummy_run(self.max_num_tokens)
2013
+ if get_pp_group().is_last_rank:
2014
+ sampler_output = self._dummy_sampler_run(hidden_states)
2015
+ else:
2016
+ sampler_output = None
2017
+ self._sync_device()
2018
+ del hidden_states, sampler_output
2019
+ self.encoder_cache.clear()
2020
+ gc.collect()
2021
+
2022
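profile_run sizes the dummy multimodal batch from two independent limits: an encoder budget (the smaller of max_num_encoder_input_tokens and encoder_cache_size, divided by the per-item token cost) and a decoder budget (max requests times the per-request item limit), and takes the smaller of the two. A tiny arithmetic sketch with invented numbers:

def cdiv(a: int, b: int) -> int:
    """Ceiling division, matching the helper used in the runner."""
    return -(-a // b)

# Invented example values.
max_num_encoder_input_tokens = 8192
encoder_cache_size = 4096
max_tokens_per_mm_item = 576            # e.g. one image's worth of embeddings
max_num_reqs = 16
max_mm_items_per_req = 2

encoder_budget = min(max_num_encoder_input_tokens, encoder_cache_size)
max_items_encoder = cdiv(encoder_budget, max_tokens_per_mm_item)   # 8
max_items_decoder = max_num_reqs * max_mm_items_per_req            # 32

max_num_mm_items = min(max_items_encoder, max_items_decoder)
print(encoder_budget, max_items_encoder, max_items_decoder, max_num_mm_items)
# 4096 8 32 8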
+ def capture_model(self) -> None:
2023
+ if not self.use_cuda_graph:
2024
+ logger.warning(
2025
+ "Skipping CUDA graph capture. Please add "
2026
+ "-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE)
2027
+ return
2028
+
2029
+ start_time = time.perf_counter()
2030
+ start_free_gpu_memory = torch.cuda.mem_get_info()[0]
2031
+
2032
+ # Trigger CUDA graph capture for specific shapes.
2033
+ # Capture the large shapes first so that the smaller shapes
2034
+ # can reuse the memory pool allocated for the large shapes.
2035
+ with graph_capture(device=self.device):
2036
+ skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
2037
+ for num_tokens in reversed(self.cudagraph_batch_sizes):
2038
+ for _ in range(self.vllm_config.compilation_config.
2039
+ cudagraph_num_of_warmups):
2040
+ self._dummy_run(num_tokens, skip_attn=skip_attn)
2041
+ self._dummy_run(num_tokens, skip_attn=skip_attn)
2042
+
2043
+ end_time = time.perf_counter()
2044
+ end_free_gpu_memory = torch.cuda.mem_get_info()[0]
2045
+ elapsed_time = end_time - start_time
2046
+ cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
2047
+ # This usually takes 5~20 seconds.
2048
+ logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
2049
+ elapsed_time, cuda_graph_size / (1 << 30))
2050
+
2051
+ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
2052
+ """
2053
+ Initialize the attention backends and attention metadata builders.
2054
+ """
2055
+ assert len(self.attn_backends) == 0 and len(
2056
+ self.attn_metadata_builders
2057
+ ) == 0, "Attention backends are already initialized"
2058
+ for i, kv_cache_group_spec in enumerate(
2059
+ kv_cache_config.kv_cache_groups):
2060
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
2061
+ if not isinstance(kv_cache_spec, AttentionSpec):
2062
+ raise NotImplementedError(
2063
+ "Only AttentionSpec is supported for now.")
2064
+ attn_backend_i = get_attn_backend(
2065
+ kv_cache_spec.head_size,
2066
+ self.dtype,
2067
+ kv_cache_spec.dtype,
2068
+ kv_cache_spec.block_size,
2069
+ self.model_config.is_attention_free,
2070
+ use_mla=kv_cache_spec.use_mla,
2071
+ )
2072
+ if attn_backend_i is None:
2073
+ error_msg = (
2074
+ f"Error with get_attn_backend: {kv_cache_spec.head_size=}, "
2075
+ f"{self.dtype=}, {kv_cache_spec.dtype=}, "
2076
+ f"{kv_cache_spec.block_size=}, "
2077
+ f"{self.model_config.is_attention_free=}, "
2078
+ f"{kv_cache_spec.use_mla=}")
2079
+ logger.error(error_msg)
2080
+ raise NotImplementedError(
2081
+ "Non-Attention backend is not supported by V1 "
2082
+ "GPUModelRunner.")
2083
+
2084
+ if self.vllm_config.compilation_config.full_cuda_graph:
2085
+ attn_backend_name = attn_backend_i.__name__
2086
+ flash_attn_version = get_flash_attn_version()
2087
+ if attn_backend_name != "FlashAttentionBackend" or \
2088
+ flash_attn_version != 3:
2089
+ raise ValueError(
2090
+ f"full_cuda_graph is only supported with "
2091
+ f"FA3. Current attention backend is "
2092
+ f"{attn_backend_name}, FlashAttention version is "
2093
+ f"{flash_attn_version}.")
2094
+
2095
+ block_table_i = self.input_batch.block_table[i]
2096
+ attn_metadata_builder_i = attn_backend_i.get_builder_cls()(
2097
+ weakref.proxy(self), kv_cache_spec, block_table_i)
2098
+ self.attn_backends.append(attn_backend_i)
2099
+ self.attn_metadata_builders.append(attn_metadata_builder_i)
2100
+
2101
+ def may_reinitialize_input_batch(self,
2102
+ kv_cache_config: KVCacheConfig) -> None:
2103
+ """
2104
+ Re-initialize the input batch if the block sizes are different from
2105
+ `[self.cache_config.block_size]`. This usually happens when there
2106
+ are multiple KV cache groups.
2107
+
2108
+ Args:
2109
+ kv_cache_config: The KV cache configuration.
2110
+ """
2111
+ block_sizes = [
2112
+ kv_cache_group.kv_cache_spec.block_size
2113
+ for kv_cache_group in kv_cache_config.kv_cache_groups
2114
+ ]
2115
+ if block_sizes != [self.cache_config.block_size]:
2116
+ assert self.cache_config.cpu_offload_gb == 0, (
2117
+ "Cannot re-initialize the input batch when CPU weight "
2118
+ "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
2119
+ "for more details.")
2120
+ self.input_batch = InputBatch(
2121
+ max_num_reqs=self.max_num_reqs,
2122
+ max_model_len=self.max_model_len,
2123
+ max_num_batched_tokens=self.max_num_tokens,
2124
+ device=self.device,
2125
+ pin_memory=self.pin_memory,
2126
+ vocab_size=self.model_config.get_vocab_size(),
2127
+ block_sizes=block_sizes,
2128
+ )
2129
+
2130
+ def _allocate_kv_cache_tensors(
2131
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
2132
+ """
2133
+ Initializes the KV cache buffer with the correct size. The buffer needs
2134
+ to be reshaped to the desired shape before being used by the models.
2135
+
2136
+ Args:
2137
+ kv_cache_config: The KV cache config
2138
+ Returns:
2139
+ dict[str, torch.Tensor]: A map between layer names to their
2140
+ corresponding memory buffer for KV cache.
2141
+ """
2142
+ kv_cache_raw_tensors: dict[str, torch.Tensor] = {}
2143
+ for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
2144
+ tensor = torch.zeros(kv_cache_tensor.size,
2145
+ dtype=torch.int8,
2146
+ device=self.device)
2147
+ for layer_name in kv_cache_tensor.shared_by:
2148
+ kv_cache_raw_tensors[layer_name] = tensor
2149
+
2150
+ layer_names = set()
2151
+ for group in kv_cache_config.kv_cache_groups:
2152
+ layer_names.update(group.layer_names)
2153
+ assert layer_names == set(kv_cache_raw_tensors.keys(
2154
+ )), "Some layers are not correctly initialized"
2155
+ return kv_cache_raw_tensors
2156
+
2157
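_allocate_kv_cache_tensors makes one flat int8 allocation per KVCacheTensor and lets every layer listed in shared_by alias the same storage; reshaping to the backend layout happens later in _reshape_kv_cache_tensors. A compact sketch of the allocate-and-share step, using a hypothetical stand-in for the config type:

from dataclasses import dataclass

import torch


@dataclass
class FakeKVCacheTensor:                 # stand-in for vllm's KVCacheTensor
    size: int                            # size in bytes
    shared_by: list[str]                 # layer names that reuse this buffer


tensors = [
    FakeKVCacheTensor(size=1 << 20, shared_by=["layer.0", "layer.2"]),
    FakeKVCacheTensor(size=1 << 20, shared_by=["layer.1"]),
]

raw: dict[str, torch.Tensor] = {}
for t in tensors:
    buf = torch.zeros(t.size, dtype=torch.int8)   # one allocation per tensor
    for name in t.shared_by:
        raw[name] = buf                           # layers alias the same storage

assert raw["layer.0"].data_ptr() == raw["layer.2"].data_ptr()
assert raw["layer.0"].data_ptr() != raw["layer.1"].data_ptr()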
+ def _reshape_kv_cache_tensors(
2158
+ self,
2159
+ kv_cache_config: KVCacheConfig,
2160
+ kv_cache_raw_tensors: dict[str, torch.Tensor],
2161
+ ) -> dict[str, torch.Tensor]:
2162
+ """
2163
+ Reshape the KV cache tensors to the desired shape and dtype.
2164
+
2165
+ Args:
2166
+ kv_cache_config: The KV cache config
2167
+ kv_cache_raw_tensors: The KV cache buffer of each layer, allocated
2168
+ with the correct size but not yet reshaped.
2169
+ Returns:
2170
+ Dict[str, torch.Tensor]: A map between layer names to their
2171
+ corresponding memory buffer for KV cache.
2172
+ """
2173
+ kv_caches: dict[str, torch.Tensor] = {}
2174
+ for i, kv_cache_group_spec in enumerate(
2175
+ kv_cache_config.kv_cache_groups):
2176
+ kv_cache_spec = kv_cache_group_spec.kv_cache_spec
2177
+ for layer_name in kv_cache_group_spec.layer_names:
2178
+ raw_tensor = kv_cache_raw_tensors[layer_name]
2179
+ assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0
2180
+ num_blocks = (raw_tensor.numel() //
2181
+ kv_cache_spec.page_size_bytes)
2182
+ if isinstance(kv_cache_spec, AttentionSpec):
2183
+ kv_cache_shape = self.attn_backends[i].get_kv_cache_shape(
2184
+ num_blocks, kv_cache_spec.block_size,
2185
+ kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
2186
+ dtype = kv_cache_spec.dtype
2187
+ try:
2188
+ kv_cache_stride_order = self.attn_backends[
2189
+ i].get_kv_cache_stride_order()
2190
+ assert len(kv_cache_stride_order) == len(
2191
+ kv_cache_shape)
2192
+ except (AttributeError, NotImplementedError):
2193
+ kv_cache_stride_order = tuple(
2194
+ range(len(kv_cache_shape)))
2195
+ # The allocation respects the backend-defined stride order
2196
+ # to ensure the semantics remain consistent for each
2197
+ # backend. We first obtain the generic kv cache shape and
2198
+ # then permute it according to the stride order which could
2199
+ # result in a non-contiguous tensor.
2200
+ kv_cache_shape = tuple(kv_cache_shape[i]
2201
+ for i in kv_cache_stride_order)
2202
+ # Maintain original KV shape view.
2203
+ inv_order = [
2204
+ kv_cache_stride_order.index(i)
2205
+ for i in range(len(kv_cache_stride_order))
2206
+ ]
2207
+ kv_caches[layer_name] = kv_cache_raw_tensors[
2208
+ layer_name].view(dtype).view(kv_cache_shape).permute(
2209
+ *inv_order)
2210
+ else:
2211
+ raise NotImplementedError
2212
+ return kv_caches
2213
+
2214
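The stride-order logic in _reshape_kv_cache_tensors permutes the generic KV cache shape into the backend's preferred physical layout, views the flat buffer with that layout, and then applies the inverse permutation so callers still see the generic shape. A self-contained illustration of the permute / inverse-permute trick; the shape and stride order below are made up:

import math

import torch

generic_shape = (4, 2, 8, 2, 16)       # hypothetical generic KV cache shape
stride_order = (1, 0, 2, 3, 4)         # hypothetical backend preference

flat = torch.zeros(math.prod(generic_shape), dtype=torch.int8)

# View the flat buffer in the backend's physical layout...
physical = flat.view(tuple(generic_shape[i] for i in stride_order))

# ...then undo the permutation so the logical view matches the generic shape.
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
logical = physical.permute(*inv_order)

assert tuple(logical.shape) == generic_shape
assert not logical.is_contiguous()     # the permuted view is non-contiguous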
+ def initialize_kv_cache_tensors(
2215
+ self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
2216
+ """
2217
+ Initialize the memory buffer for KV cache.
2218
+
2219
+ Args:
2220
+ kv_cache_config: The KV cache config
2221
+ Returns:
2222
+ Dict[str, torch.Tensor]: A map between layer names to their
2223
+ corresponding memory buffer for KV cache.
2224
+ """
2225
+ # Initialize the memory buffer for KV cache
2226
+ kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config)
2227
+ # Change the memory buffer to the desired shape
2228
+ kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
2229
+ kv_cache_raw_tensors)
2230
+
2231
+ # Setup `kv_cache_config` and `kv_caches` for models
2232
+ # with cross-layer KV sharing
2233
+ if self.shared_kv_cache_layers:
2234
+ initialize_kv_cache_for_kv_sharing(
2235
+ self.shared_kv_cache_layers,
2236
+ kv_cache_config.kv_cache_groups,
2237
+ kv_caches,
2238
+ )
2239
+
2240
+ bind_kv_cache(
2241
+ kv_caches,
2242
+ self.vllm_config.compilation_config.static_forward_context,
2243
+ self.kv_caches)
2244
+ return kv_caches
2245
+
2246
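When shared_kv_cache_layers is non-empty, initialize_kv_cache_for_kv_sharing points each sharing layer at its target layer's cache tensor instead of allocating a new one. The sketch below only illustrates that aliasing idea with a plain dict and hypothetical layer names; it is not the helper's real signature:

import torch

kv_caches = {"model.layers.0.attn": torch.zeros(2, 16, dtype=torch.int8)}

# Hypothetical mapping: layer 1 declares layer 0 as its KV sharing target.
shared_kv_cache_layers = {"model.layers.1.attn": "model.layers.0.attn"}

for layer_name, target_name in shared_kv_cache_layers.items():
    # The sharing layer simply reuses the target layer's buffer; no new
    # memory is allocated for it.
    kv_caches[layer_name] = kv_caches[target_name]

assert (kv_caches["model.layers.1.attn"].data_ptr()
        == kv_caches["model.layers.0.attn"].data_ptr())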
+ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
2247
+ """
2248
+ Initialize KV cache based on `kv_cache_config`.
2249
+ Args:
2250
+ kv_cache_config: Configuration for the KV cache, including the KV
2251
+ cache size of each layer
2252
+ """
2253
+ self.kv_cache_config = kv_cache_config
2254
+ self.may_reinitialize_input_batch(kv_cache_config)
2255
+ self.initialize_attn_backend(kv_cache_config)
2256
+ kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
2257
+
2258
+ if self.speculative_config and self.speculative_config.use_eagle():
2259
+ assert isinstance(self.drafter, EagleProposer)
2260
+ # validate all draft model layers belong to the same kv cache
2261
+ # group
2262
+ self.drafter.validate_same_kv_cache_group(kv_cache_config)
2263
+
2264
+ if has_kv_transfer_group():
2265
+ get_kv_transfer_group().register_kv_caches(kv_caches)
2266
+
2267
+ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
2268
+ """
2269
+ Generates the KVCacheSpec by parsing the kv cache format from each
2270
+ Attention module in the static forward context.
2271
+ Returns:
2272
+ KVCacheSpec: A dictionary mapping layer names to their KV cache
2273
+ format. Layers that do not need KV cache are not included.
2274
+ """
2275
+
2276
+ layers = get_layers_from_vllm_config(self.vllm_config, Attention)
2277
+ block_size = self.vllm_config.cache_config.block_size
2278
+ use_mla = self.vllm_config.model_config.use_mla
2279
+ kv_cache_spec: dict[str, KVCacheSpec] = {}
2280
+ for layer_name, attn_module in layers.items():
2281
+ if (kv_tgt_layer :=
2282
+ attn_module.kv_sharing_target_layer_name) is not None:
2283
+ # The layer doesn't need its own KV cache and will use that of
2284
+ # the target layer. We skip creating a KVCacheSpec for it, so
2285
+ # that KV cache management logic will act as if this layer does
2286
+ # not exist and will not allocate KV cache for it. This
2287
+ # enables the memory saving of cross-layer KV sharing, allowing
2288
+ # a given amount of memory to accommodate longer context lengths
2289
+ # or to process more requests simultaneously.
2290
+ self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
2291
+ continue
2292
+
2293
+ # TODO: Support other attention modules, e.g., cross-attention
2294
+ if attn_module.attn_type == AttentionType.DECODER:
2295
+ if attn_module.sliding_window is not None:
2296
+ kv_cache_spec[layer_name] = SlidingWindowSpec(
2297
+ block_size=block_size,
2298
+ num_kv_heads=attn_module.num_kv_heads,
2299
+ head_size=attn_module.head_size,
2300
+ dtype=self.kv_cache_dtype,
2301
+ sliding_window=attn_module.sliding_window,
2302
+ use_mla=use_mla)
2303
+ else:
2304
+ kv_cache_spec[layer_name] = FullAttentionSpec(
2305
+ block_size=block_size,
2306
+ num_kv_heads=attn_module.num_kv_heads,
2307
+ head_size=attn_module.head_size,
2308
+ dtype=self.kv_cache_dtype,
2309
+ use_mla=use_mla)
2310
+ elif attn_module.attn_type in (AttentionType.ENCODER,
2311
+ AttentionType.ENCODER_ONLY):
2312
+ # encoder-only attention does not need KV cache.
2313
+ continue
2314
+ elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
2315
+ raise NotImplementedError
2316
+ else:
2317
+ raise ValueError(
2318
+ f"Unknown attention type: {attn_module.attn_type}")
2319
+
2320
+ return kv_cache_spec
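get_kv_cache_spec walks every Attention module and picks a spec per layer: layers that reuse another layer's KV cache are skipped, decoder layers get a SlidingWindowSpec when a sliding window is configured and a FullAttentionSpec otherwise, and encoder-only attention needs no cache. The simplified stand-in below pictures that dispatch; the dataclass and the returned strings are placeholders, not vllm's spec types:

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeAttnModule:                 # placeholder for a vllm Attention layer
    attn_type: str                    # "decoder" or "encoder_only"
    sliding_window: Optional[int] = None
    kv_sharing_target: Optional[str] = None


def pick_spec(mod: FakeAttnModule) -> Optional[str]:
    if mod.kv_sharing_target is not None:
        return None                   # reuses the target layer's cache
    if mod.attn_type == "encoder_only":
        return None                   # no KV cache needed
    if mod.sliding_window is not None:
        return f"SlidingWindowSpec(window={mod.sliding_window})"
    return "FullAttentionSpec"


layers = {
    "layers.0": FakeAttnModule("decoder"),
    "layers.1": FakeAttnModule("decoder", sliding_window=4096),
    "layers.2": FakeAttnModule("decoder", kv_sharing_target="layers.0"),
}
specs = {name: s for name, mod in layers.items() if (s := pick_spec(mod))}
print(specs)
# {'layers.0': 'FullAttentionSpec', 'layers.1': 'SlidingWindowSpec(window=4096)'}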