vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
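A listing like the one below can be reproduced locally by enumerating the files inside the wheel, since a wheel is an ordinary ZIP archive. The following is a minimal sketch, not part of the registry's tooling; the wheel path is a placeholder for a locally downloaded copy (e.g. fetched with `pip download vllm-cpu==0.8.5.post2 --no-deps`).

    import zipfile

    # Placeholder path to a locally downloaded wheel of this release.
    WHEEL_PATH = "vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl"

    # A wheel is a ZIP archive, so its member list can be read directly.
    with zipfile.ZipFile(WHEEL_PATH) as whl:
        for info in whl.infolist():
            print(f"{info.filename} ({info.file_size} bytes)")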

Potentially problematic release: this version of vllm-cpu might be problematic.

Files changed (1103)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1536 -0
  4. vllm/_ipex_ops.py +241 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +38 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +31 -0
  16. vllm/assets/video.py +103 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +303 -0
  22. vllm/attention/backends/flash_attn.py +999 -0
  23. vllm/attention/backends/flashinfer.py +1092 -0
  24. vllm/attention/backends/flashmla.py +242 -0
  25. vllm/attention/backends/hpu_attn.py +301 -0
  26. vllm/attention/backends/ipex_attn.py +396 -0
  27. vllm/attention/backends/mla/__init__.py +0 -0
  28. vllm/attention/backends/mla/common.py +1444 -0
  29. vllm/attention/backends/pallas.py +346 -0
  30. vllm/attention/backends/placeholder_attn.py +399 -0
  31. vllm/attention/backends/rocm_aiter_mla.py +412 -0
  32. vllm/attention/backends/rocm_flash_attn.py +969 -0
  33. vllm/attention/backends/torch_sdpa.py +691 -0
  34. vllm/attention/backends/triton_mla.py +113 -0
  35. vllm/attention/backends/utils.py +609 -0
  36. vllm/attention/backends/xformers.py +798 -0
  37. vllm/attention/layer.py +443 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  41. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  42. vllm/attention/ops/blocksparse_attention/utils.py +244 -0
  43. vllm/attention/ops/chunked_prefill_paged_decode.py +366 -0
  44. vllm/attention/ops/flashmla.py +115 -0
  45. vllm/attention/ops/hpu_paged_attn.py +105 -0
  46. vllm/attention/ops/ipex_attn.py +193 -0
  47. vllm/attention/ops/merge_attn_states.py +42 -0
  48. vllm/attention/ops/nki_flash_attn.py +905 -0
  49. vllm/attention/ops/paged_attn.py +255 -0
  50. vllm/attention/ops/prefix_prefill.py +902 -0
  51. vllm/attention/ops/rocm_aiter_mla.py +42 -0
  52. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  53. vllm/attention/ops/triton_decode_attention.py +675 -0
  54. vllm/attention/ops/triton_flash_attention.py +1375 -0
  55. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  56. vllm/attention/selector.py +186 -0
  57. vllm/attention/utils/fa_utils.py +54 -0
  58. vllm/beam_search.py +82 -0
  59. vllm/benchmarks/__init__.py +0 -0
  60. vllm/benchmarks/datasets.py +831 -0
  61. vllm/benchmarks/endpoint_request_func.py +160 -0
  62. vllm/benchmarks/latency.py +181 -0
  63. vllm/benchmarks/serve.py +925 -0
  64. vllm/benchmarks/throughput.py +608 -0
  65. vllm/benchmarks/utils.py +69 -0
  66. vllm/collect_env.py +795 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/backends.py +715 -0
  69. vllm/compilation/compiler_interface.py +437 -0
  70. vllm/compilation/counter.py +33 -0
  71. vllm/compilation/decorators.py +249 -0
  72. vllm/compilation/fix_functionalization.py +182 -0
  73. vllm/compilation/fusion.py +617 -0
  74. vllm/compilation/fx_utils.py +60 -0
  75. vllm/compilation/inductor_pass.py +114 -0
  76. vllm/compilation/monitor.py +38 -0
  77. vllm/compilation/multi_output_match.py +108 -0
  78. vllm/compilation/noop_elimination.py +135 -0
  79. vllm/compilation/pass_manager.py +74 -0
  80. vllm/compilation/sequence_parallelism.py +266 -0
  81. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  82. vllm/compilation/vllm_inductor_pass.py +68 -0
  83. vllm/compilation/wrapper.py +129 -0
  84. vllm/config.py +4179 -0
  85. vllm/connections.py +170 -0
  86. vllm/core/__init__.py +0 -0
  87. vllm/core/block/__init__.py +0 -0
  88. vllm/core/block/block_table.py +398 -0
  89. vllm/core/block/common.py +370 -0
  90. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  91. vllm/core/block/interfaces.py +318 -0
  92. vllm/core/block/naive_block.py +465 -0
  93. vllm/core/block/prefix_caching_block.py +1134 -0
  94. vllm/core/block/utils.py +27 -0
  95. vllm/core/block_manager.py +520 -0
  96. vllm/core/evictor.py +156 -0
  97. vllm/core/interfaces.py +134 -0
  98. vllm/core/placeholder_block_space_manager.py +99 -0
  99. vllm/core/scheduler.py +2060 -0
  100. vllm/device_allocator/__init__.py +0 -0
  101. vllm/device_allocator/cumem.py +280 -0
  102. vllm/distributed/__init__.py +5 -0
  103. vllm/distributed/communication_op.py +40 -0
  104. vllm/distributed/device_communicators/__init__.py +0 -0
  105. vllm/distributed/device_communicators/base_device_communicator.py +151 -0
  106. vllm/distributed/device_communicators/cpu_communicator.py +139 -0
  107. vllm/distributed/device_communicators/cuda_communicator.py +131 -0
  108. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  109. vllm/distributed/device_communicators/custom_all_reduce.py +301 -0
  110. vllm/distributed/device_communicators/custom_all_reduce_utils.py +257 -0
  111. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  112. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  113. vllm/distributed/device_communicators/pynccl.py +217 -0
  114. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  115. vllm/distributed/device_communicators/shm_broadcast.py +557 -0
  116. vllm/distributed/device_communicators/tpu_communicator.py +93 -0
  117. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  118. vllm/distributed/kv_transfer/README.md +29 -0
  119. vllm/distributed/kv_transfer/__init__.py +11 -0
  120. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  121. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  122. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  123. vllm/distributed/kv_transfer/kv_connector/factory.py +107 -0
  124. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  125. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +201 -0
  126. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +90 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +8 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +209 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +131 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  132. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  133. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  134. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  135. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  136. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  137. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  138. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  139. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  140. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  141. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  142. vllm/distributed/parallel_state.py +1209 -0
  143. vllm/distributed/utils.py +366 -0
  144. vllm/engine/__init__.py +0 -0
  145. vllm/engine/arg_utils.py +1724 -0
  146. vllm/engine/async_llm_engine.py +1261 -0
  147. vllm/engine/async_timeout.py +191 -0
  148. vllm/engine/llm_engine.py +2150 -0
  149. vllm/engine/metrics.py +717 -0
  150. vllm/engine/metrics_types.py +96 -0
  151. vllm/engine/multiprocessing/__init__.py +183 -0
  152. vllm/engine/multiprocessing/client.py +745 -0
  153. vllm/engine/multiprocessing/engine.py +450 -0
  154. vllm/engine/output_processor/__init__.py +0 -0
  155. vllm/engine/output_processor/interfaces.py +74 -0
  156. vllm/engine/output_processor/multi_step.py +210 -0
  157. vllm/engine/output_processor/single_step.py +136 -0
  158. vllm/engine/output_processor/stop_checker.py +130 -0
  159. vllm/engine/output_processor/util.py +27 -0
  160. vllm/engine/protocol.py +302 -0
  161. vllm/entrypoints/__init__.py +0 -0
  162. vllm/entrypoints/api_server.py +177 -0
  163. vllm/entrypoints/chat_utils.py +1259 -0
  164. vllm/entrypoints/cli/__init__.py +0 -0
  165. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  166. vllm/entrypoints/cli/benchmark/base.py +38 -0
  167. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  168. vllm/entrypoints/cli/benchmark/main.py +53 -0
  169. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  170. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  171. vllm/entrypoints/cli/collect_env.py +35 -0
  172. vllm/entrypoints/cli/main.py +59 -0
  173. vllm/entrypoints/cli/openai.py +175 -0
  174. vllm/entrypoints/cli/serve.py +59 -0
  175. vllm/entrypoints/cli/types.py +24 -0
  176. vllm/entrypoints/launcher.py +146 -0
  177. vllm/entrypoints/llm.py +1450 -0
  178. vllm/entrypoints/logger.py +44 -0
  179. vllm/entrypoints/openai/__init__.py +0 -0
  180. vllm/entrypoints/openai/api_server.py +1130 -0
  181. vllm/entrypoints/openai/cli_args.py +296 -0
  182. vllm/entrypoints/openai/logits_processors.py +89 -0
  183. vllm/entrypoints/openai/protocol.py +1806 -0
  184. vllm/entrypoints/openai/run_batch.py +439 -0
  185. vllm/entrypoints/openai/serving_chat.py +1210 -0
  186. vllm/entrypoints/openai/serving_completion.py +557 -0
  187. vllm/entrypoints/openai/serving_embedding.py +245 -0
  188. vllm/entrypoints/openai/serving_engine.py +569 -0
  189. vllm/entrypoints/openai/serving_models.py +314 -0
  190. vllm/entrypoints/openai/serving_pooling.py +237 -0
  191. vllm/entrypoints/openai/serving_score.py +439 -0
  192. vllm/entrypoints/openai/serving_tokenization.py +147 -0
  193. vllm/entrypoints/openai/serving_transcription.py +421 -0
  194. vllm/entrypoints/openai/tool_parsers/__init__.py +19 -0
  195. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  196. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +254 -0
  197. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +232 -0
  198. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  199. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +211 -0
  200. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +303 -0
  201. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +262 -0
  202. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  203. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +110 -0
  204. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +292 -0
  205. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  206. vllm/entrypoints/score_utils.py +49 -0
  207. vllm/entrypoints/ssl.py +74 -0
  208. vllm/entrypoints/utils.py +136 -0
  209. vllm/env_override.py +34 -0
  210. vllm/envs.py +800 -0
  211. vllm/executor/__init__.py +0 -0
  212. vllm/executor/executor_base.py +400 -0
  213. vllm/executor/mp_distributed_executor.py +243 -0
  214. vllm/executor/msgspec_utils.py +29 -0
  215. vllm/executor/multiproc_worker_utils.py +312 -0
  216. vllm/executor/ray_distributed_executor.py +700 -0
  217. vllm/executor/ray_utils.py +400 -0
  218. vllm/executor/uniproc_executor.py +141 -0
  219. vllm/forward_context.py +159 -0
  220. vllm/inputs/__init__.py +37 -0
  221. vllm/inputs/data.py +248 -0
  222. vllm/inputs/parse.py +121 -0
  223. vllm/inputs/preprocess.py +745 -0
  224. vllm/inputs/registry.py +212 -0
  225. vllm/jsontree.py +79 -0
  226. vllm/logger.py +210 -0
  227. vllm/logging_utils/__init__.py +7 -0
  228. vllm/logging_utils/formatter.py +17 -0
  229. vllm/logits_process.py +121 -0
  230. vllm/lora/__init__.py +0 -0
  231. vllm/lora/fully_sharded_layers.py +335 -0
  232. vllm/lora/layers.py +1263 -0
  233. vllm/lora/lora.py +198 -0
  234. vllm/lora/models.py +802 -0
  235. vllm/lora/ops/__init__.py +0 -0
  236. vllm/lora/ops/torch_ops/__init__.py +15 -0
  237. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  238. vllm/lora/ops/triton_ops/__init__.py +11 -0
  239. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  240. vllm/lora/ops/triton_ops/lora_expand.py +293 -0
  241. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  242. vllm/lora/ops/triton_ops/lora_shrink.py +247 -0
  243. vllm/lora/ops/triton_ops/utils.py +121 -0
  244. vllm/lora/peft_helper.py +115 -0
  245. vllm/lora/punica_wrapper/__init__.py +9 -0
  246. vllm/lora/punica_wrapper/punica_base.py +483 -0
  247. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  248. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  249. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  250. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  251. vllm/lora/punica_wrapper/utils.py +161 -0
  252. vllm/lora/request.py +97 -0
  253. vllm/lora/resolver.py +83 -0
  254. vllm/lora/utils.py +237 -0
  255. vllm/lora/worker_manager.py +251 -0
  256. vllm/model_executor/__init__.py +15 -0
  257. vllm/model_executor/custom_op.py +153 -0
  258. vllm/model_executor/guided_decoding/__init__.py +180 -0
  259. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  260. vllm/model_executor/guided_decoding/guidance_logits_processors.py +85 -0
  261. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  262. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  263. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  264. vllm/model_executor/guided_decoding/outlines_logits_processors.py +271 -0
  265. vllm/model_executor/guided_decoding/reasoner/__init__.py +35 -0
  266. vllm/model_executor/guided_decoding/utils.py +241 -0
  267. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  268. vllm/model_executor/layers/__init__.py +0 -0
  269. vllm/model_executor/layers/activation.py +368 -0
  270. vllm/model_executor/layers/fused_moe/__init__.py +51 -0
  271. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  272. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  273. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  274. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  275. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  276. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  277. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  278. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  279. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  280. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  281. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  282. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  283. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  284. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  285. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  286. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  287. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  426. vllm/model_executor/layers/fused_moe/cutlass_moe.py +180 -0
  427. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +294 -0
  428. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +374 -0
  429. vllm/model_executor/layers/fused_moe/fused_moe.py +1539 -0
  430. vllm/model_executor/layers/fused_moe/layer.py +949 -0
  431. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  432. vllm/model_executor/layers/fused_moe/moe_pallas.py +64 -0
  433. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  434. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +416 -0
  435. vllm/model_executor/layers/fused_moe/utils.py +48 -0
  436. vllm/model_executor/layers/layernorm.py +277 -0
  437. vllm/model_executor/layers/lightning_attn.py +651 -0
  438. vllm/model_executor/layers/linear.py +1518 -0
  439. vllm/model_executor/layers/logits_processor.py +196 -0
  440. vllm/model_executor/layers/mamba/__init__.py +0 -0
  441. vllm/model_executor/layers/mamba/mamba2_metadata.py +109 -0
  442. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  443. vllm/model_executor/layers/mamba/mamba_mixer2.py +538 -0
  444. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  445. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  446. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +415 -0
  447. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  448. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  449. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  450. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  451. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  452. vllm/model_executor/layers/pooler.py +336 -0
  453. vllm/model_executor/layers/quantization/__init__.py +153 -0
  454. vllm/model_executor/layers/quantization/aqlm.py +374 -0
  455. vllm/model_executor/layers/quantization/awq.py +184 -0
  456. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  457. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  458. vllm/model_executor/layers/quantization/base_config.py +145 -0
  459. vllm/model_executor/layers/quantization/bitblas.py +459 -0
  460. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  461. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  462. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +624 -0
  463. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1100 -0
  464. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
  465. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  466. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  467. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  468. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +119 -0
  469. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  470. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  471. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  472. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  473. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +213 -0
  474. vllm/model_executor/layers/quantization/deepspeedfp.py +193 -0
  475. vllm/model_executor/layers/quantization/experts_int8.py +194 -0
  476. vllm/model_executor/layers/quantization/fbgemm_fp8.py +168 -0
  477. vllm/model_executor/layers/quantization/fp8.py +832 -0
  478. vllm/model_executor/layers/quantization/gguf.py +408 -0
  479. vllm/model_executor/layers/quantization/gptq.py +276 -0
  480. vllm/model_executor/layers/quantization/gptq_bitblas.py +438 -0
  481. vllm/model_executor/layers/quantization/gptq_marlin.py +643 -0
  482. vllm/model_executor/layers/quantization/gptq_marlin_24.py +295 -0
  483. vllm/model_executor/layers/quantization/hqq_marlin.py +328 -0
  484. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  485. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  486. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  487. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  488. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  489. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  490. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  491. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  492. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +132 -0
  493. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  494. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  495. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  496. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  497. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  498. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  499. vllm/model_executor/layers/quantization/kv_cache.py +137 -0
  500. vllm/model_executor/layers/quantization/marlin.py +259 -0
  501. vllm/model_executor/layers/quantization/modelopt.py +410 -0
  502. vllm/model_executor/layers/quantization/moe_wna16.py +447 -0
  503. vllm/model_executor/layers/quantization/neuron_quant.py +67 -0
  504. vllm/model_executor/layers/quantization/ptpc_fp8.py +125 -0
  505. vllm/model_executor/layers/quantization/qqq.py +273 -0
  506. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  507. vllm/model_executor/layers/quantization/quark/quark.py +385 -0
  508. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  509. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +7 -0
  510. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  511. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +142 -0
  512. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  513. vllm/model_executor/layers/quantization/quark/utils.py +102 -0
  514. vllm/model_executor/layers/quantization/schema.py +85 -0
  515. vllm/model_executor/layers/quantization/torchao.py +127 -0
  516. vllm/model_executor/layers/quantization/tpu_int8.py +119 -0
  517. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  518. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  519. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +198 -0
  520. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  521. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  522. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  523. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  524. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  525. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  526. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  527. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  528. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  529. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  530. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  531. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  532. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  533. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  534. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  535. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  536. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  537. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  538. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  539. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  540. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  541. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  542. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  543. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  544. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  545. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  546. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  547. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  548. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  549. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  550. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  551. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/fp8_utils.py +523 -0
  723. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  724. vllm/model_executor/layers/quantization/utils/int8_utils.py +459 -0
  725. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  726. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  727. vllm/model_executor/layers/quantization/utils/marlin_utils.py +413 -0
  728. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +110 -0
  729. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  730. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  731. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +127 -0
  732. vllm/model_executor/layers/quantization/utils/quant_utils.py +571 -0
  733. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  734. vllm/model_executor/layers/rejection_sampler.py +400 -0
  735. vllm/model_executor/layers/resampler.py +269 -0
  736. vllm/model_executor/layers/rotary_embedding.py +1598 -0
  737. vllm/model_executor/layers/sampler.py +1221 -0
  738. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  739. vllm/model_executor/layers/typical_acceptance_sampler.py +172 -0
  740. vllm/model_executor/layers/utils.py +99 -0
  741. vllm/model_executor/layers/vocab_parallel_embedding.py +485 -0
  742. vllm/model_executor/model_loader/__init__.py +20 -0
  743. vllm/model_executor/model_loader/loader.py +1542 -0
  744. vllm/model_executor/model_loader/neuron.py +243 -0
  745. vllm/model_executor/model_loader/tensorizer.py +468 -0
  746. vllm/model_executor/model_loader/utils.py +171 -0
  747. vllm/model_executor/model_loader/weight_utils.py +749 -0
  748. vllm/model_executor/models/__init__.py +27 -0
  749. vllm/model_executor/models/adapters.py +247 -0
  750. vllm/model_executor/models/arctic.py +559 -0
  751. vllm/model_executor/models/aria.py +656 -0
  752. vllm/model_executor/models/aya_vision.py +461 -0
  753. vllm/model_executor/models/baichuan.py +469 -0
  754. vllm/model_executor/models/bamba.py +542 -0
  755. vllm/model_executor/models/bart.py +936 -0
  756. vllm/model_executor/models/bert.py +725 -0
  757. vllm/model_executor/models/blip.py +337 -0
  758. vllm/model_executor/models/blip2.py +717 -0
  759. vllm/model_executor/models/bloom.py +358 -0
  760. vllm/model_executor/models/chameleon.py +1135 -0
  761. vllm/model_executor/models/chatglm.py +476 -0
  762. vllm/model_executor/models/clip.py +410 -0
  763. vllm/model_executor/models/commandr.py +466 -0
  764. vllm/model_executor/models/constant_size_cache.py +136 -0
  765. vllm/model_executor/models/dbrx.py +469 -0
  766. vllm/model_executor/models/deepseek.py +484 -0
  767. vllm/model_executor/models/deepseek_mtp.py +266 -0
  768. vllm/model_executor/models/deepseek_v2.py +830 -0
  769. vllm/model_executor/models/deepseek_vl2.py +647 -0
  770. vllm/model_executor/models/eagle.py +247 -0
  771. vllm/model_executor/models/exaone.py +548 -0
  772. vllm/model_executor/models/fairseq2_llama.py +153 -0
  773. vllm/model_executor/models/falcon.py +508 -0
  774. vllm/model_executor/models/florence2.py +1102 -0
  775. vllm/model_executor/models/fuyu.py +388 -0
  776. vllm/model_executor/models/gemma.py +423 -0
  777. vllm/model_executor/models/gemma2.py +423 -0
  778. vllm/model_executor/models/gemma3.py +531 -0
  779. vllm/model_executor/models/gemma3_mm.py +716 -0
  780. vllm/model_executor/models/glm.py +22 -0
  781. vllm/model_executor/models/glm4.py +303 -0
  782. vllm/model_executor/models/glm4v.py +647 -0
  783. vllm/model_executor/models/gpt2.py +313 -0
  784. vllm/model_executor/models/gpt_bigcode.py +336 -0
  785. vllm/model_executor/models/gpt_j.py +337 -0
  786. vllm/model_executor/models/gpt_neox.py +330 -0
  787. vllm/model_executor/models/granite.py +494 -0
  788. vllm/model_executor/models/granite_speech.py +777 -0
  789. vllm/model_executor/models/granitemoe.py +435 -0
  790. vllm/model_executor/models/granitemoeshared.py +339 -0
  791. vllm/model_executor/models/gritlm.py +245 -0
  792. vllm/model_executor/models/grok1.py +560 -0
  793. vllm/model_executor/models/h2ovl.py +542 -0
  794. vllm/model_executor/models/idefics2_vision_model.py +387 -0
  795. vllm/model_executor/models/idefics3.py +767 -0
  796. vllm/model_executor/models/interfaces.py +569 -0
  797. vllm/model_executor/models/interfaces_base.py +163 -0
  798. vllm/model_executor/models/intern_vit.py +476 -0
  799. vllm/model_executor/models/internlm2.py +453 -0
  800. vllm/model_executor/models/internlm2_ve.py +146 -0
  801. vllm/model_executor/models/internvl.py +945 -0
  802. vllm/model_executor/models/jais.py +371 -0
  803. vllm/model_executor/models/jamba.py +590 -0
  804. vllm/model_executor/models/kimi_vl.py +577 -0
  805. vllm/model_executor/models/llama.py +619 -0
  806. vllm/model_executor/models/llama4.py +530 -0
  807. vllm/model_executor/models/llama_eagle.py +152 -0
  808. vllm/model_executor/models/llama_eagle3.py +232 -0
  809. vllm/model_executor/models/llava.py +869 -0
  810. vllm/model_executor/models/llava_next.py +582 -0
  811. vllm/model_executor/models/llava_next_video.py +470 -0
  812. vllm/model_executor/models/llava_onevision.py +954 -0
  813. vllm/model_executor/models/mamba.py +271 -0
  814. vllm/model_executor/models/mamba2.py +302 -0
  815. vllm/model_executor/models/mamba_cache.py +76 -0
  816. vllm/model_executor/models/medusa.py +210 -0
  817. vllm/model_executor/models/minicpm.py +592 -0
  818. vllm/model_executor/models/minicpm3.py +229 -0
  819. vllm/model_executor/models/minicpmo.py +725 -0
  820. vllm/model_executor/models/minicpmv.py +1287 -0
  821. vllm/model_executor/models/minimax_cache.py +35 -0
  822. vllm/model_executor/models/minimax_text_01.py +1261 -0
  823. vllm/model_executor/models/mistral3.py +598 -0
  824. vllm/model_executor/models/mixtral.py +485 -0
  825. vllm/model_executor/models/mixtral_quant.py +447 -0
  826. vllm/model_executor/models/mllama.py +1623 -0
  827. vllm/model_executor/models/mllama4.py +838 -0
  828. vllm/model_executor/models/mlp_speculator.py +205 -0
  829. vllm/model_executor/models/modernbert.py +325 -0
  830. vllm/model_executor/models/module_mapping.py +71 -0
  831. vllm/model_executor/models/molmo.py +1567 -0
  832. vllm/model_executor/models/moonvit.py +628 -0
  833. vllm/model_executor/models/mpt.py +329 -0
  834. vllm/model_executor/models/nemotron.py +506 -0
  835. vllm/model_executor/models/nemotron_nas.py +446 -0
  836. vllm/model_executor/models/nvlm_d.py +212 -0
  837. vllm/model_executor/models/olmo.py +390 -0
  838. vllm/model_executor/models/olmo2.py +412 -0
  839. vllm/model_executor/models/olmoe.py +449 -0
  840. vllm/model_executor/models/opt.py +410 -0
  841. vllm/model_executor/models/orion.py +356 -0
  842. vllm/model_executor/models/paligemma.py +397 -0
  843. vllm/model_executor/models/persimmon.py +342 -0
  844. vllm/model_executor/models/phi.py +354 -0
  845. vllm/model_executor/models/phi3.py +18 -0
  846. vllm/model_executor/models/phi3_small.py +463 -0
  847. vllm/model_executor/models/phi3v.py +722 -0
  848. vllm/model_executor/models/phi4mm.py +1263 -0
  849. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  850. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  851. vllm/model_executor/models/phimoe.py +666 -0
  852. vllm/model_executor/models/pixtral.py +1281 -0
  853. vllm/model_executor/models/plamo2.py +736 -0
  854. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  855. vllm/model_executor/models/qwen.py +360 -0
  856. vllm/model_executor/models/qwen2.py +552 -0
  857. vllm/model_executor/models/qwen2_5_omni_thinker.py +901 -0
  858. vllm/model_executor/models/qwen2_5_vl.py +1136 -0
  859. vllm/model_executor/models/qwen2_audio.py +402 -0
  860. vllm/model_executor/models/qwen2_moe.py +531 -0
  861. vllm/model_executor/models/qwen2_rm.py +130 -0
  862. vllm/model_executor/models/qwen2_vl.py +1409 -0
  863. vllm/model_executor/models/qwen3.py +319 -0
  864. vllm/model_executor/models/qwen3_moe.py +528 -0
  865. vllm/model_executor/models/qwen_vl.py +784 -0
  866. vllm/model_executor/models/registry.py +611 -0
  867. vllm/model_executor/models/roberta.py +332 -0
  868. vllm/model_executor/models/siglip.py +522 -0
  869. vllm/model_executor/models/skyworkr1v.py +949 -0
  870. vllm/model_executor/models/smolvlm.py +51 -0
  871. vllm/model_executor/models/solar.py +504 -0
  872. vllm/model_executor/models/stablelm.py +349 -0
  873. vllm/model_executor/models/starcoder2.py +355 -0
  874. vllm/model_executor/models/telechat2.py +139 -0
  875. vllm/model_executor/models/teleflm.py +78 -0
  876. vllm/model_executor/models/transformers.py +442 -0
  877. vllm/model_executor/models/ultravox.py +655 -0
  878. vllm/model_executor/models/utils.py +714 -0
  879. vllm/model_executor/models/vision.py +149 -0
  880. vllm/model_executor/models/whisper.py +746 -0
  881. vllm/model_executor/models/zamba2.py +1008 -0
  882. vllm/model_executor/parameter.py +458 -0
  883. vllm/model_executor/pooling_metadata.py +71 -0
  884. vllm/model_executor/sampling_metadata.py +596 -0
  885. vllm/model_executor/utils.py +53 -0
  886. vllm/multimodal/__init__.py +31 -0
  887. vllm/multimodal/audio.py +105 -0
  888. vllm/multimodal/base.py +218 -0
  889. vllm/multimodal/hasher.py +103 -0
  890. vllm/multimodal/image.py +77 -0
  891. vllm/multimodal/inputs.py +843 -0
  892. vllm/multimodal/parse.py +454 -0
  893. vllm/multimodal/processing.py +1760 -0
  894. vllm/multimodal/profiling.py +274 -0
  895. vllm/multimodal/registry.py +321 -0
  896. vllm/multimodal/utils.py +386 -0
  897. vllm/multimodal/video.py +166 -0
  898. vllm/outputs.py +521 -0
  899. vllm/platforms/__init__.py +286 -0
  900. vllm/platforms/cpu.py +182 -0
  901. vllm/platforms/cuda.py +463 -0
  902. vllm/platforms/hpu.py +94 -0
  903. vllm/platforms/interface.py +427 -0
  904. vllm/platforms/neuron.py +69 -0
  905. vllm/platforms/rocm.py +346 -0
  906. vllm/platforms/tpu.py +174 -0
  907. vllm/platforms/xpu.py +142 -0
  908. vllm/plugins/__init__.py +82 -0
  909. vllm/pooling_params.py +53 -0
  910. vllm/profiler/__init__.py +7 -0
  911. vllm/profiler/layerwise_profile.py +374 -0
  912. vllm/profiler/utils.py +147 -0
  913. vllm/prompt_adapter/__init__.py +0 -0
  914. vllm/prompt_adapter/layers.py +82 -0
  915. vllm/prompt_adapter/models.py +357 -0
  916. vllm/prompt_adapter/request.py +36 -0
  917. vllm/prompt_adapter/utils.py +97 -0
  918. vllm/prompt_adapter/worker_manager.py +178 -0
  919. vllm/py.typed +2 -0
  920. vllm/reasoning/__init__.py +12 -0
  921. vllm/reasoning/abs_reasoning_parsers.py +189 -0
  922. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  923. vllm/reasoning/granite_reasoning_parser.py +362 -0
  924. vllm/sampling_params.py +598 -0
  925. vllm/scalar_type.py +335 -0
  926. vllm/scripts.py +14 -0
  927. vllm/sequence.py +1486 -0
  928. vllm/spec_decode/__init__.py +0 -0
  929. vllm/spec_decode/batch_expansion.py +505 -0
  930. vllm/spec_decode/draft_model_runner.py +335 -0
  931. vllm/spec_decode/interfaces.py +98 -0
  932. vllm/spec_decode/medusa_worker.py +137 -0
  933. vllm/spec_decode/metrics.py +212 -0
  934. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  935. vllm/spec_decode/mqa_scorer.py +159 -0
  936. vllm/spec_decode/multi_step_worker.py +416 -0
  937. vllm/spec_decode/ngram_worker.py +195 -0
  938. vllm/spec_decode/proposer_worker_base.py +58 -0
  939. vllm/spec_decode/smaller_tp_proposer_worker.py +194 -0
  940. vllm/spec_decode/spec_decode_worker.py +1324 -0
  941. vllm/spec_decode/target_model_runner.py +44 -0
  942. vllm/spec_decode/top1_proposer.py +274 -0
  943. vllm/spec_decode/util.py +276 -0
  944. vllm/test_utils.py +129 -0
  945. vllm/third_party/__init__.py +0 -0
  946. vllm/third_party/pynvml.py +6139 -0
  947. vllm/tracing.py +130 -0
  948. vllm/transformers_utils/__init__.py +19 -0
  949. vllm/transformers_utils/config.py +813 -0
  950. vllm/transformers_utils/configs/__init__.py +52 -0
  951. vllm/transformers_utils/configs/arctic.py +206 -0
  952. vllm/transformers_utils/configs/chatglm.py +71 -0
  953. vllm/transformers_utils/configs/cohere2.py +194 -0
  954. vllm/transformers_utils/configs/dbrx.py +280 -0
  955. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  956. vllm/transformers_utils/configs/eagle.py +65 -0
  957. vllm/transformers_utils/configs/exaone.py +191 -0
  958. vllm/transformers_utils/configs/falcon.py +89 -0
  959. vllm/transformers_utils/configs/h2ovl.py +15 -0
  960. vllm/transformers_utils/configs/internvl.py +53 -0
  961. vllm/transformers_utils/configs/jais.py +237 -0
  962. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  963. vllm/transformers_utils/configs/medusa.py +62 -0
  964. vllm/transformers_utils/configs/mllama.py +30 -0
  965. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  966. vllm/transformers_utils/configs/moonvit.py +32 -0
  967. vllm/transformers_utils/configs/mpt.py +179 -0
  968. vllm/transformers_utils/configs/nemotron.py +204 -0
  969. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  970. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  971. vllm/transformers_utils/configs/solar.py +246 -0
  972. vllm/transformers_utils/configs/telechat2.py +63 -0
  973. vllm/transformers_utils/configs/ultravox.py +107 -0
  974. vllm/transformers_utils/detokenizer.py +167 -0
  975. vllm/transformers_utils/detokenizer_utils.py +188 -0
  976. vllm/transformers_utils/processor.py +210 -0
  977. vllm/transformers_utils/processors/__init__.py +6 -0
  978. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  979. vllm/transformers_utils/s3_utils.py +161 -0
  980. vllm/transformers_utils/tokenizer.py +291 -0
  981. vllm/transformers_utils/tokenizer_base.py +146 -0
  982. vllm/transformers_utils/tokenizer_group.py +110 -0
  983. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  984. vllm/transformers_utils/tokenizers/mistral.py +483 -0
  985. vllm/transformers_utils/utils.py +98 -0
  986. vllm/triton_utils/__init__.py +5 -0
  987. vllm/triton_utils/importing.py +53 -0
  988. vllm/usage/__init__.py +0 -0
  989. vllm/usage/usage_lib.py +255 -0
  990. vllm/utils.py +2692 -0
  991. vllm/v1/__init__.py +0 -0
  992. vllm/v1/attention/__init__.py +0 -0
  993. vllm/v1/attention/backends/__init__.py +0 -0
  994. vllm/v1/attention/backends/flash_attn.py +783 -0
  995. vllm/v1/attention/backends/flashinfer.py +638 -0
  996. vllm/v1/attention/backends/mla/__init__.py +0 -0
  997. vllm/v1/attention/backends/mla/common.py +974 -0
  998. vllm/v1/attention/backends/mla/flashmla.py +149 -0
  999. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1000. vllm/v1/attention/backends/pallas.py +221 -0
  1001. vllm/v1/attention/backends/triton_attn.py +198 -0
  1002. vllm/v1/core/__init__.py +0 -0
  1003. vllm/v1/core/block_pool.py +281 -0
  1004. vllm/v1/core/encoder_cache_manager.py +149 -0
  1005. vllm/v1/core/kv_cache_manager.py +385 -0
  1006. vllm/v1/core/kv_cache_utils.py +744 -0
  1007. vllm/v1/core/sched/__init__.py +0 -0
  1008. vllm/v1/core/sched/interface.py +134 -0
  1009. vllm/v1/core/sched/output.py +126 -0
  1010. vllm/v1/core/sched/scheduler.py +838 -0
  1011. vllm/v1/core/sched/utils.py +22 -0
  1012. vllm/v1/core/specialized_manager.py +161 -0
  1013. vllm/v1/engine/__init__.py +166 -0
  1014. vllm/v1/engine/async_llm.py +532 -0
  1015. vllm/v1/engine/core.py +701 -0
  1016. vllm/v1/engine/core_client.py +942 -0
  1017. vllm/v1/engine/detokenizer.py +260 -0
  1018. vllm/v1/engine/exceptions.py +16 -0
  1019. vllm/v1/engine/llm_engine.py +285 -0
  1020. vllm/v1/engine/logprobs.py +198 -0
  1021. vllm/v1/engine/mm_input_cache.py +82 -0
  1022. vllm/v1/engine/output_processor.py +420 -0
  1023. vllm/v1/engine/parallel_sampling.py +132 -0
  1024. vllm/v1/engine/processor.py +387 -0
  1025. vllm/v1/executor/__init__.py +0 -0
  1026. vllm/v1/executor/abstract.py +112 -0
  1027. vllm/v1/executor/multiproc_executor.py +480 -0
  1028. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1029. vllm/v1/kv_cache_interface.py +166 -0
  1030. vllm/v1/metrics/__init__.py +0 -0
  1031. vllm/v1/metrics/loggers.py +498 -0
  1032. vllm/v1/metrics/stats.py +238 -0
  1033. vllm/v1/outputs.py +111 -0
  1034. vllm/v1/request.py +178 -0
  1035. vllm/v1/sample/__init__.py +0 -0
  1036. vllm/v1/sample/metadata.py +43 -0
  1037. vllm/v1/sample/ops/__init__.py +0 -0
  1038. vllm/v1/sample/ops/bad_words.py +38 -0
  1039. vllm/v1/sample/ops/penalties.py +58 -0
  1040. vllm/v1/sample/ops/topk_topp_sampler.py +315 -0
  1041. vllm/v1/sample/rejection_sampler.py +631 -0
  1042. vllm/v1/sample/sampler.py +270 -0
  1043. vllm/v1/sample/tpu/__init__.py +0 -0
  1044. vllm/v1/sample/tpu/metadata.py +118 -0
  1045. vllm/v1/sample/tpu/sampler.py +154 -0
  1046. vllm/v1/serial_utils.py +274 -0
  1047. vllm/v1/spec_decode/__init__.py +0 -0
  1048. vllm/v1/spec_decode/eagle.py +318 -0
  1049. vllm/v1/spec_decode/metadata.py +61 -0
  1050. vllm/v1/spec_decode/metrics.py +164 -0
  1051. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1052. vllm/v1/spec_decode/utils.py +18 -0
  1053. vllm/v1/stats/__init__.py +0 -0
  1054. vllm/v1/stats/common.py +453 -0
  1055. vllm/v1/structured_output/__init__.py +113 -0
  1056. vllm/v1/structured_output/backend_guidance.py +215 -0
  1057. vllm/v1/structured_output/backend_types.py +96 -0
  1058. vllm/v1/structured_output/backend_xgrammar.py +299 -0
  1059. vllm/v1/structured_output/request.py +84 -0
  1060. vllm/v1/structured_output/utils.py +174 -0
  1061. vllm/v1/utils.py +249 -0
  1062. vllm/v1/worker/__init__.py +0 -0
  1063. vllm/v1/worker/block_table.py +87 -0
  1064. vllm/v1/worker/gpu_input_batch.py +677 -0
  1065. vllm/v1/worker/gpu_model_runner.py +1776 -0
  1066. vllm/v1/worker/gpu_worker.py +349 -0
  1067. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1068. vllm/v1/worker/tpu_model_runner.py +1419 -0
  1069. vllm/v1/worker/tpu_worker.py +260 -0
  1070. vllm/v1/worker/utils.py +74 -0
  1071. vllm/v1/worker/worker_base.py +64 -0
  1072. vllm/version.py +40 -0
  1073. vllm/vllm_flash_attn/.gitkeep +0 -0
  1074. vllm/worker/__init__.py +0 -0
  1075. vllm/worker/cache_engine.py +144 -0
  1076. vllm/worker/cpu_enc_dec_model_runner.py +323 -0
  1077. vllm/worker/cpu_model_runner.py +668 -0
  1078. vllm/worker/cpu_pooling_model_runner.py +122 -0
  1079. vllm/worker/cpu_worker.py +400 -0
  1080. vllm/worker/enc_dec_model_runner.py +542 -0
  1081. vllm/worker/hpu_model_runner.py +2221 -0
  1082. vllm/worker/hpu_worker.py +483 -0
  1083. vllm/worker/model_runner.py +2056 -0
  1084. vllm/worker/model_runner_base.py +281 -0
  1085. vllm/worker/multi_step_hpu_worker.py +122 -0
  1086. vllm/worker/multi_step_model_runner.py +908 -0
  1087. vllm/worker/multi_step_tpu_worker.py +107 -0
  1088. vllm/worker/multi_step_worker.py +196 -0
  1089. vllm/worker/neuron_model_runner.py +336 -0
  1090. vllm/worker/neuron_worker.py +138 -0
  1091. vllm/worker/pooling_model_runner.py +200 -0
  1092. vllm/worker/tpu_model_runner.py +908 -0
  1093. vllm/worker/tpu_worker.py +332 -0
  1094. vllm/worker/utils.py +52 -0
  1095. vllm/worker/worker.py +570 -0
  1096. vllm/worker/worker_base.py +644 -0
  1097. vllm/worker/xpu_model_runner.py +603 -0
  1098. vllm/worker/xpu_worker.py +185 -0
  1099. vllm_cpu-0.8.5.post2.dist-info/METADATA +309 -0
  1100. vllm_cpu-0.8.5.post2.dist-info/RECORD +1103 -0
  1101. vllm_cpu-0.8.5.post2.dist-info/WHEEL +5 -0
  1102. vllm_cpu-0.8.5.post2.dist-info/entry_points.txt +2 -0
  1103. vllm_cpu-0.8.5.post2.dist-info/top_level.txt +1 -0
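
The hunk below is the only file body shown in this part of the diff view; judging from its imports and the class it defines, it appears to be the V0 LLMEngine (vllm/engine/llm_engine.py). For orientation only, here is a minimal, illustrative sketch of how this engine is typically driven offline, assuming the engine's public entry points (EngineArgs, LLMEngine.from_engine_args, add_request, has_unfinished_requests, step); the model id and sampling values are placeholders and are not part of the package diff:

from vllm import EngineArgs, LLMEngine, SamplingParams

# Build a V0 engine from CLI-style arguments (model id is a placeholder).
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

# Queue one request; the scheduler picks it up on later step() calls.
engine.add_request("request-0", "Hello, my name is",
                   SamplingParams(temperature=0.8, max_tokens=32))

# Drive the engine loop until every queued request has finished.
while engine.has_unfinished_requests():
    for output in engine.step():
        if output.finished:
            print(output.outputs[0].text)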
@@ -0,0 +1,2150 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ import copy
+ import time
+ from collections import Counter as collectionsCounter
+ from collections import deque
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from functools import partial
+ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+                     Iterable, List, Literal, Mapping, NamedTuple, Optional)
+ from typing import Sequence as GenericSequence
+ from typing import Set, Type, Union, cast, overload
+
+ import torch
+ from typing_extensions import TypeVar, deprecated
+
+ import vllm.envs as envs
+ from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
+                          ObservabilityConfig, ParallelConfig, SchedulerConfig,
+                          VllmConfig)
+ from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
+ from vllm.engine.arg_utils import EngineArgs
+ from vllm.engine.metrics_types import StatLoggerBase, Stats
+ from vllm.engine.output_processor.interfaces import (
+     SequenceGroupOutputProcessor)
+ from vllm.engine.output_processor.stop_checker import StopChecker
+ from vllm.engine.output_processor.util import create_output_by_sequence_group
+ from vllm.entrypoints.openai.logits_processors import (
+     get_logits_processors as get_openai_logits_processors)
+ from vllm.executor.executor_base import ExecutorBase
+ from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
+ from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
+ from vllm.inputs.preprocess import InputPreprocessor
+ from vllm.logger import init_logger
+ from vllm.logits_process import get_bad_words_logits_processors
+ from vllm.lora.request import LoRARequest
+ from vllm.model_executor.guided_decoding import (
+     get_local_guided_decoding_logits_processor)
+ from vllm.model_executor.layers.sampler import SamplerOutput
+ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+ from vllm.multimodal.processing import EncDecMultiModalProcessor
+ from vllm.outputs import (PoolingRequestOutput, RequestOutput,
+                           RequestOutputFactory)
+ from vllm.pooling_params import PoolingParams
+ from vllm.prompt_adapter.request import PromptAdapterRequest
+ from vllm.sampling_params import RequestOutputKind, SamplingParams
+ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
+                            PoolingSequenceGroupOutput, Sequence, SequenceGroup,
+                            SequenceGroupBase, SequenceGroupMetadata,
+                            SequenceGroupOutput, SequenceStatus)
+ from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
+                           init_tracer)
+ from vllm.transformers_utils.detokenizer import Detokenizer
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
+ from vllm.transformers_utils.tokenizer_group import (
+     TokenizerGroup, init_tokenizer_from_configs)
+ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
+                                   usage_message)
+ from vllm.utils import (Counter, Device, deprecate_kwargs,
+                         resolve_obj_by_qualname, weak_bind)
+ from vllm.version import __version__ as VLLM_VERSION
+ from vllm.worker.model_runner_base import InputProcessingError
+
+ logger = init_logger(__name__)
+ _LOCAL_LOGGING_INTERVAL_SEC = 5
+
+ _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
+ _R = TypeVar("_R", default=Any)
+
+
+ @dataclass
+ class SchedulerOutputState:
+     """Caches the scheduler outputs for a virtual engine. Used for Multi-Step"""
+     seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None
+     scheduler_outputs: Optional[SchedulerOutputs] = None
+     allow_async_output_proc: bool = False
+     last_output: Optional[SamplerOutput] = None
+
+
+ class OutputData(NamedTuple):
+     outputs: List[SamplerOutput]
+     seq_group_metadata_list: List[SequenceGroupMetadata]
+     scheduler_outputs: SchedulerOutputs
+     is_async: bool
+     is_last_step: bool
+     # Indicates if this output is from the first step of the
+     # multi-step. When multi-step is disabled, this is always
+     # set to True.
+     # is_first_step_output is invalid when `outputs` has
+     # outputs from multiple steps.
+     is_first_step_output: Optional[bool]
+     skip: List[int]
+
+
+ class SchedulerContext:
+
+     def __init__(self, multi_step_stream_outputs: bool = False):
+         self.output_queue: Deque[OutputData] = deque()
+         self.request_outputs: List[Union[RequestOutput,
+                                          PoolingRequestOutput]] = []
+         self.seq_group_metadata_list: Optional[
+             List[SequenceGroupMetadata]] = None
+         self.scheduler_outputs: Optional[SchedulerOutputs] = None
+
+         self.multi_step_stream_outputs: bool = multi_step_stream_outputs
+
+     def append_output(self, outputs: List[SamplerOutput],
+                       seq_group_metadata_list: List[SequenceGroupMetadata],
+                       scheduler_outputs: SchedulerOutputs, is_async: bool,
+                       is_last_step: bool,
+                       is_first_step_output: Optional[bool]):
+         self.output_queue.append(
+             OutputData(outputs=outputs,
+                        seq_group_metadata_list=seq_group_metadata_list,
+                        scheduler_outputs=scheduler_outputs,
+                        is_async=is_async,
+                        is_last_step=is_last_step,
+                        is_first_step_output=is_first_step_output,
+                        skip=[]))
+
+
+ class LLMEngine:
+     """An LLM engine that receives requests and generates texts.
+
+     This is the main class for the vLLM engine. It receives requests
+     from clients and generates texts from the LLM. It includes a tokenizer, a
+     language model (possibly distributed across multiple GPUs), and GPU memory
+     space allocated for intermediate states (aka KV cache). This class utilizes
+     iteration-level scheduling and efficient memory management to maximize the
+     serving throughput.
+
+     The :class:`~vllm.LLM` class wraps this class for offline batched inference
+     and the :class:`AsyncLLMEngine` class wraps this class for online serving.
+
+     The config arguments are derived from :class:`~vllm.EngineArgs`. (See
+     :ref:`engine-args`)
+
+     Args:
+         model_config: The configuration related to the LLM model.
+         cache_config: The configuration related to the KV cache memory
+             management.
+         parallel_config: The configuration related to distributed execution.
+         scheduler_config: The configuration related to the request scheduler.
+         device_config: The configuration related to the device.
+         lora_config (Optional): The configuration related to serving multi-LoRA.
+         speculative_config (Optional): The configuration related to speculative
+             decoding.
+         executor_class: The model executor class for managing distributed
+             execution.
+         prompt_adapter_config (Optional): The configuration related to serving
+             prompt adapters.
+         log_stats: Whether to log statistics.
+         usage_context: Specified entry point, used for usage info collection.
+     """
+
+     DO_VALIDATE_OUTPUT: ClassVar[bool] = False
+     """A flag to toggle whether to validate the type of request output."""
+
+     @classmethod
+     @contextmanager
+     def enable_output_validation(cls):
+         cls.DO_VALIDATE_OUTPUT = True
+
+         yield
+
+         cls.DO_VALIDATE_OUTPUT = False
+
+     @classmethod
+     def validate_output(
+         cls,
+         output: object,
+         output_type: Type[_O],
+     ) -> _O:
+         do_validate = cls.DO_VALIDATE_OUTPUT
+
+         if ((TYPE_CHECKING or do_validate)
+                 and not isinstance(output, output_type)):
+             raise TypeError(f"Expected output of type {output_type}, "
+                             f"but found type {type(output)}")
+
+         return cast(_O, output)
+
+     @classmethod
+     def validate_outputs(
+         cls,
+         outputs: GenericSequence[object],
+         output_type: Type[_O],
+     ) -> List[_O]:
+         do_validate = cls.DO_VALIDATE_OUTPUT
+
+         outputs_: List[_O]
+         if TYPE_CHECKING or do_validate:
+             outputs_ = []
+             for output in outputs:
+                 if not isinstance(output, output_type):
+                     raise TypeError(f"Expected output of type {output_type}, "
+                                     f"but found type {type(output)}")
+
+                 outputs_.append(output)
+         else:
+             outputs_ = outputs
+
+         return outputs_
+
+     tokenizer: Optional[TokenizerGroup]
+
+     def __init__(
+         self,
+         vllm_config: VllmConfig,
+         executor_class: Type[ExecutorBase],
+         log_stats: bool,
+         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+         stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+         use_cached_outputs: bool = False,
+     ) -> None:
+         if envs.VLLM_USE_V1:
+             raise ValueError(
+                 "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. "
+                 "This should not happen. As a workaround, try using "
+                 "LLMEngine.from_vllm_config(...) or explicitly set "
+                 "VLLM_USE_V1=0 or 1 and report this issue on Github.")
+
+         self.vllm_config = vllm_config
+         self.model_config = vllm_config.model_config
+         self.cache_config = vllm_config.cache_config
+         self.lora_config = vllm_config.lora_config
+         self.parallel_config = vllm_config.parallel_config
+         self.scheduler_config = vllm_config.scheduler_config
+         self.device_config = vllm_config.device_config
+         self.speculative_config = vllm_config.speculative_config # noqa
+         self.load_config = vllm_config.load_config
+         self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
+         )
+         self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
+         self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
+         )
+
+         logger.info(
+             "Initializing a V0 LLM engine (v%s) with config: %s, "
+             "use_cached_outputs=%s, ",
+             VLLM_VERSION,
+             vllm_config,
+             use_cached_outputs,
+         )
+
+         self.log_stats = log_stats
+         self.use_cached_outputs = use_cached_outputs
+
+         if not self.model_config.skip_tokenizer_init:
+             self.tokenizer = self._init_tokenizer()
+             self.detokenizer = Detokenizer(self.tokenizer)
+             tokenizer_group = self.get_tokenizer_group()
+         else:
+             self.tokenizer = None
+             self.detokenizer = None
+             tokenizer_group = None
+
+         # Ensure that the function doesn't contain a reference to self,
+         # to avoid engine GC issues
+         def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
+             assert tokenizer_group, ("tokenizer_group cannot be None, "
+                                      "make sure skip_tokenizer_init is False")
+             return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
+
+         self.seq_counter = Counter()
+         self.generation_config_fields = (
+             self.model_config.try_get_generation_config())
+
+         self.input_preprocessor = InputPreprocessor(self.model_config,
+                                                     self.tokenizer,
+                                                     mm_registry)
+
+         self.model_executor = executor_class(vllm_config=vllm_config)
+
+         if self.model_config.runner_type != "pooling":
+             self._initialize_kv_caches()
+
+         # If usage stat is enabled, collect relevant info.
+         if is_usage_stats_enabled():
+             from vllm.model_executor.model_loader import (
+                 get_architecture_class_name)
+             usage_message.report_usage(
+                 get_architecture_class_name(self.model_config),
+                 usage_context,
+                 extra_kvs={
+                     # Common configuration
+                     "dtype":
+                     str(self.model_config.dtype),
+                     "tensor_parallel_size":
+                     self.parallel_config.tensor_parallel_size,
+                     "block_size":
+                     self.cache_config.block_size,
+                     "gpu_memory_utilization":
+                     self.cache_config.gpu_memory_utilization,
+
+                     # Quantization
+                     "quantization":
+                     self.model_config.quantization,
+                     "kv_cache_dtype":
+                     str(self.cache_config.cache_dtype),
+
+                     # Feature flags
+                     "enable_lora":
+                     bool(self.lora_config),
+                     "enable_prompt_adapter":
+                     bool(self.prompt_adapter_config),
+                     "enable_prefix_caching":
+                     self.cache_config.enable_prefix_caching,
+                     "enforce_eager":
+                     self.model_config.enforce_eager,
+                     "disable_custom_all_reduce":
+                     self.parallel_config.disable_custom_all_reduce,
+                 })
+
+         self.cached_scheduler_outputs = [
+             SchedulerOutputState()
+             for _ in range(self.parallel_config.pipeline_parallel_size)
+         ]
+
+         self.scheduler_contexts = [
+             SchedulerContext(multi_step_stream_outputs=self.scheduler_config.
+                              multi_step_stream_outputs)
+             for _ in range(self.parallel_config.pipeline_parallel_size)
+         ]
+
+         if self.model_config.use_async_output_proc:
+             process_model_outputs = weak_bind(self._process_model_outputs)
+
+             self.async_callbacks = [
+                 partial(process_model_outputs,
+                         ctx=self.scheduler_contexts[v_id])
+                 for v_id in range(self.parallel_config.pipeline_parallel_size)
+             ]
+         else:
+             self.async_callbacks = []
+
+         # Currently used by AsyncLLMEngine to ensure quick append
+         # of request outputs to asyncio queues
+         self.process_request_outputs_callback: Optional[Callable] = None
+
+         # Create the scheduler.
+         # NOTE: the cache_config here have been updated with the numbers of
+         # GPU and CPU blocks, which are profiled in the distributed executor.
+         if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str):
+             Scheduler = resolve_obj_by_qualname(
+                 self.vllm_config.scheduler_config.scheduler_cls)
+         else:
+             Scheduler = self.vllm_config.scheduler_config.scheduler_cls
+         self.scheduler = [
+             Scheduler(
+                 self.scheduler_config, self.cache_config, self.lora_config,
+                 self.parallel_config.pipeline_parallel_size,
+                 self.async_callbacks[v_id]
+                 if self.model_config.use_async_output_proc else None)
+             for v_id in range(self.parallel_config.pipeline_parallel_size)
+         ]
+
+         # Metric Logging.
+         if self.log_stats:
+             if stat_loggers is not None:
+                 self.stat_loggers = stat_loggers
+             else:
+                 # Lazy import for prometheus multiprocessing.
+                 # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
+                 # before prometheus_client is imported.
+                 # See https://prometheus.github.io/client_python/multiprocess/
+                 from vllm.engine.metrics import (LoggingStatLogger,
+                                                  PrometheusStatLogger)
+
+                 self.stat_loggers = {
+                     "logging":
+                     LoggingStatLogger(
+                         local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+                         vllm_config=vllm_config),
+                     "prometheus":
+                     PrometheusStatLogger(
+                         local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+                         labels=dict(
+                             model_name=self.model_config.served_model_name),
+                         vllm_config=vllm_config),
+                 }
+                 self.stat_loggers["prometheus"].info("cache_config",
+                                                      self.cache_config)
+
+         self.tracer = None
+         if self.observability_config.otlp_traces_endpoint:
+             self.tracer = init_tracer(
+                 "vllm.llm_engine",
+                 self.observability_config.otlp_traces_endpoint)
+
+         # Create sequence output processor, e.g. for beam search or
+         # speculative decoding.
+         self.output_processor = (
+             SequenceGroupOutputProcessor.create_output_processor(
+                 self.scheduler_config,
+                 self.detokenizer,
+                 self.scheduler,
+                 self.seq_counter,
+                 get_tokenizer_for_seq,
+                 stop_checker=StopChecker(
+                     self.scheduler_config.max_model_len,
+                     get_tokenizer_for_seq,
+                 ),
+             ))
+
+         self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
+
+         # Flag to set when an input fails to process and the engine should run
+         # the next step without re-scheduling.
+         self._skip_scheduling_next_step = False
+
414
+ def _initialize_kv_caches(self) -> None:
415
+ """Initialize the KV cache in the worker(s).
416
+
417
+ The workers will determine the number of blocks in both the GPU cache
418
+ and the swap CPU cache.
419
+ """
420
+ start = time.time()
421
+ num_gpu_blocks, num_cpu_blocks = (
422
+ self.model_executor.determine_num_available_blocks())
423
+
424
+ if self.cache_config.num_gpu_blocks_override is not None:
425
+ num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
426
+ logger.info(
427
+ "Overriding num_gpu_blocks=%d with "
428
+ "num_gpu_blocks_override=%d", num_gpu_blocks,
429
+ num_gpu_blocks_override)
430
+ num_gpu_blocks = num_gpu_blocks_override
431
+
432
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
433
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
434
+
435
+ self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
436
+ elapsed = time.time() - start
437
+ logger.info(("init engine (profile, create kv cache, "
438
+ "warmup model) took %.2f seconds"), elapsed)
439
+
440
+ @classmethod
441
+ def _get_executor_cls(cls,
442
+ engine_config: VllmConfig) -> Type[ExecutorBase]:
443
+ # distributed_executor_backend must be set in VllmConfig.__post_init__
444
+ distributed_executor_backend = (
445
+ engine_config.parallel_config.distributed_executor_backend)
446
+ # Initialize the cluster and specify the executor class.
447
+ if isinstance(distributed_executor_backend, type):
448
+ if not issubclass(distributed_executor_backend, ExecutorBase):
449
+ raise TypeError(
450
+ "distributed_executor_backend must be a subclass of "
451
+ f"ExecutorBase. Got {distributed_executor_backend}.")
452
+ executor_class = distributed_executor_backend
453
+ elif distributed_executor_backend == "ray":
454
+ from vllm.executor.ray_distributed_executor import (
455
+ RayDistributedExecutor)
456
+ executor_class = RayDistributedExecutor
457
+ elif distributed_executor_backend == "mp":
458
+ from vllm.executor.mp_distributed_executor import (
459
+ MultiprocessingDistributedExecutor)
460
+ assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
461
+ "multiprocessing distributed executor backend does not "
462
+ "support VLLM_USE_RAY_SPMD_WORKER=1")
463
+ executor_class = MultiprocessingDistributedExecutor
464
+ elif distributed_executor_backend == "uni":
465
+ # JAX-style, single-process, multi-device executor.
466
+ from vllm.executor.uniproc_executor import UniProcExecutor
467
+ executor_class = UniProcExecutor
468
+ elif distributed_executor_backend == "external_launcher":
469
+ # executor with external launcher
470
+ from vllm.executor.uniproc_executor import ( # noqa
471
+ ExecutorWithExternalLauncher)
472
+ executor_class = ExecutorWithExternalLauncher
473
+ else:
474
+ raise ValueError("unrecognized distributed_executor_backend: "
475
+ f"{distributed_executor_backend}")
476
+ return executor_class
477
+
478
+ @classmethod
479
+ def from_vllm_config(
480
+ cls,
481
+ vllm_config: VllmConfig,
482
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
483
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
484
+ disable_log_stats: bool = False,
485
+ ) -> "LLMEngine":
486
+ return cls(
487
+ vllm_config=vllm_config,
488
+ executor_class=cls._get_executor_cls(vllm_config),
489
+ log_stats=(not disable_log_stats),
490
+ usage_context=usage_context,
491
+ stat_loggers=stat_loggers,
492
+ )
493
+
494
+ @classmethod
495
+ def from_engine_args(
496
+ cls,
497
+ engine_args: EngineArgs,
498
+ usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
499
+ stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
500
+ ) -> "LLMEngine":
501
+ """Creates an LLM engine from the engine arguments."""
502
+ # Create the engine configs.
503
+ vllm_config = engine_args.create_engine_config(usage_context)
504
+
505
+ engine_cls = cls
506
+ if envs.VLLM_USE_V1:
507
+ from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
508
+ engine_cls = V1LLMEngine
509
+
510
+ return engine_cls.from_vllm_config(
511
+ vllm_config=vllm_config,
512
+ usage_context=usage_context,
513
+ stat_loggers=stat_loggers,
514
+ disable_log_stats=engine_args.disable_log_stats,
515
+ )
516
+
517
+ def __reduce__(self):
518
+ # This is to ensure that the LLMEngine is not referenced in
519
+ # the closure used to initialize Ray worker actors
520
+ raise RuntimeError("LLMEngine should not be pickled!")
521
+
522
+ def __del__(self):
523
+ # Shutdown model executor when engine is garbage collected
524
+ # Use getattr since __init__ can fail before the field is set
525
+ if model_executor := getattr(self, "model_executor", None):
526
+ model_executor.shutdown()
527
+
528
+ def get_tokenizer_group(self) -> TokenizerGroup:
529
+ if self.tokenizer is None:
530
+ raise ValueError("Unable to get tokenizer because "
531
+ "skip_tokenizer_init is True")
532
+
533
+ return self.tokenizer
534
+
535
+ def get_tokenizer(
536
+ self,
537
+ lora_request: Optional[LoRARequest] = None,
538
+ ) -> AnyTokenizer:
539
+ return self.get_tokenizer_group().get_lora_tokenizer(lora_request)
540
+
541
+ def _init_tokenizer(self) -> TokenizerGroup:
542
+ return init_tokenizer_from_configs(
543
+ model_config=self.model_config,
544
+ scheduler_config=self.scheduler_config,
545
+ lora_config=self.lora_config)
546
+
547
+ def _verify_args(self) -> None:
548
+ self.model_config.verify_with_parallel_config(self.parallel_config)
549
+ self.cache_config.verify_with_parallel_config(self.parallel_config)
550
+ if self.lora_config:
551
+ self.lora_config.verify_with_model_config(self.model_config)
552
+ self.lora_config.verify_with_scheduler_config(
553
+ self.scheduler_config)
554
+ if self.prompt_adapter_config:
555
+ self.prompt_adapter_config.verify_with_model_config(
556
+ self.model_config)
557
+
558
+ def _add_processed_request(
559
+ self,
560
+ request_id: str,
561
+ processed_inputs: ProcessorInputs,
562
+ params: Union[SamplingParams, PoolingParams],
563
+ arrival_time: float,
564
+ lora_request: Optional[LoRARequest],
565
+ prompt_adapter_request: Optional[PromptAdapterRequest],
566
+ trace_headers: Optional[Mapping[str, str]] = None,
567
+ priority: int = 0,
568
+ ) -> Optional[SequenceGroup]:
569
+ """Add a processed request to the engine's request pool.
570
+ return the created sequence group.
571
+ """
572
+ if isinstance(params, SamplingParams) and params.n > 1:
573
+ ParallelSampleSequenceGroup.add_request(
574
+ request_id,
575
+ self,
576
+ params,
577
+ processed_inputs=processed_inputs,
578
+ arrival_time=arrival_time,
579
+ lora_request=lora_request,
580
+ trace_headers=trace_headers,
581
+ prompt_adapter_request=prompt_adapter_request,
582
+ priority=priority,
583
+ )
584
+ return None
585
+
586
+ self._validate_model_inputs(processed_inputs, lora_request)
587
+ # Create the sequences.
588
+ block_size = self.cache_config.block_size
589
+ seq_id = next(self.seq_counter)
590
+ eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
591
+
592
+ encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
593
+
594
+ seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
595
+ lora_request, prompt_adapter_request)
596
+
597
+ encoder_seq = (None if encoder_inputs is None else Sequence(
598
+ seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
599
+ prompt_adapter_request))
600
+
601
+ # Create a SequenceGroup based on SamplingParams or PoolingParams
602
+ if isinstance(params, SamplingParams):
603
+ seq_group = self._create_sequence_group_with_sampling(
604
+ request_id,
605
+ seq,
606
+ params,
607
+ arrival_time=arrival_time,
608
+ lora_request=lora_request,
609
+ trace_headers=trace_headers,
610
+ prompt_adapter_request=prompt_adapter_request,
611
+ encoder_seq=encoder_seq,
612
+ priority=priority)
613
+ elif isinstance(params, PoolingParams):
614
+ seq_group = self._create_sequence_group_with_pooling(
615
+ request_id,
616
+ seq,
617
+ params,
618
+ arrival_time=arrival_time,
619
+ lora_request=lora_request,
620
+ prompt_adapter_request=prompt_adapter_request,
621
+ encoder_seq=encoder_seq,
622
+ priority=priority)
623
+ else:
624
+ raise ValueError(
625
+ "Either SamplingParams or PoolingParams must be provided.")
626
+
627
+ # Add the sequence group to the scheduler with the fewest unfinished seq groups.
628
+ costs = [
629
+ scheduler.get_num_unfinished_seq_groups()
630
+ for scheduler in self.scheduler
631
+ ]
632
+ min_cost_scheduler = self.scheduler[costs.index(min(costs))]
633
+ min_cost_scheduler.add_seq_group(seq_group)
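+ # e.g. with per-scheduler unfinished counts [3, 1, 2], the new
+ # sequence group lands on self.scheduler[1].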
634
+
635
+ return seq_group
636
+
637
+ def stop_remote_worker_execution_loop(self) -> None:
638
+ self.model_executor.stop_remote_worker_execution_loop()
639
+
640
+ @overload
641
+ def add_request(
642
+ self,
643
+ request_id: str,
644
+ prompt: PromptType,
645
+ params: Union[SamplingParams, PoolingParams],
646
+ arrival_time: Optional[float] = None,
647
+ lora_request: Optional[LoRARequest] = None,
648
+ trace_headers: Optional[Mapping[str, str]] = None,
649
+ prompt_adapter_request: Optional[PromptAdapterRequest] = None,
650
+ priority: int = 0,
651
+ ) -> None:
652
+ ...
653
+
654
+ @overload
655
+ @deprecated("'inputs' will be renamed to 'prompt")
656
+ def add_request(
657
+ self,
658
+ request_id: str,
659
+ *,
660
+ inputs: PromptType,
661
+ params: Union[SamplingParams, PoolingParams],
662
+ arrival_time: Optional[float] = None,
663
+ lora_request: Optional[LoRARequest] = None,
664
+ trace_headers: Optional[Mapping[str, str]] = None,
665
+ prompt_adapter_request: Optional[PromptAdapterRequest] = None,
666
+ priority: int = 0,
667
+ ) -> None:
668
+ ...
669
+
670
+ @deprecate_kwargs(
671
+ "inputs",
672
+ additional_message="Please use the 'prompt' parameter instead.",
673
+ )
674
+ def add_request(
675
+ self,
676
+ request_id: str,
677
+ prompt: Optional[PromptType] = None,
678
+ params: Optional[Union[SamplingParams, PoolingParams]] = None,
679
+ arrival_time: Optional[float] = None,
680
+ lora_request: Optional[LoRARequest] = None,
681
+ trace_headers: Optional[Mapping[str, str]] = None,
682
+ prompt_adapter_request: Optional[PromptAdapterRequest] = None,
683
+ priority: int = 0,
684
+ *,
685
+ inputs: Optional[PromptType] = None, # DEPRECATED
686
+ ) -> None:
687
+ """Add a request to the engine's request pool.
688
+
689
+ The request is added to the request pool and will be processed by the
690
+ scheduler as `engine.step()` is called. The exact scheduling policy is
691
+ determined by the scheduler.
692
+
693
+ Args:
694
+ request_id: The unique ID of the request.
695
+ prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
696
+ for more details about the format of each input.
697
+ params: Parameters for sampling or pooling.
698
+ :class:`~vllm.SamplingParams` for text generation.
699
+ :class:`~vllm.PoolingParams` for pooling.
700
+ arrival_time: The arrival time of the request. If None, we use
701
+ the current monotonic time.
702
+ lora_request: The LoRA request to add.
703
+ trace_headers: OpenTelemetry trace headers.
704
+ prompt_adapter_request: The prompt adapter request to add.
705
+ priority: The priority of the request.
706
+ Only applicable with priority scheduling.
707
+
708
+ Details:
709
+ - Set arrival_time to the current time if it is None.
710
+ - Set prompt_token_ids to the encoded prompt if it is None.
711
+ - Create `n` number of :class:`~vllm.Sequence` objects.
712
+ - Create a :class:`~vllm.SequenceGroup` object
713
+ from the list of :class:`~vllm.Sequence`.
714
+ - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
715
+
716
+ Example:
717
+ >>> # initialize engine
718
+ >>> engine = LLMEngine.from_engine_args(engine_args)
719
+ >>> # set request arguments
720
+ >>> example_prompt = "Who is the president of the United States?"
721
+ >>> sampling_params = SamplingParams(temperature=0.0)
722
+ >>> request_id = 0
723
+ >>>
724
+ >>> # add the request to the engine
725
+ >>> engine.add_request(
726
+ >>> str(request_id),
727
+ >>> example_prompt,
728
+ >>> SamplingParams(temperature=0.0))
729
+ >>> # continue the request processing
730
+ >>> ...
731
+ """
732
+ if inputs is not None:
733
+ prompt = inputs
734
+ assert prompt is not None and params is not None
735
+
736
+ if lora_request is not None and not self.lora_config:
737
+ raise ValueError(f"Got lora_request {lora_request} but LoRA is "
738
+ "not enabled!")
739
+
740
+ if priority != 0 and self.scheduler_config.policy != "priority":
741
+ raise ValueError(f"Got priority {priority} but "
742
+ "Priority scheduling is not enabled.")
743
+
744
+ if isinstance(params, SamplingParams) \
745
+ and (params.guided_decoding or params.logits_processors) \
746
+ and self.scheduler_config.num_scheduler_steps > 1:
747
+ raise ValueError(
748
+ "Guided decoding and logits processors are not supported "
749
+ "in multi-step decoding")
750
+
751
+ if arrival_time is None:
752
+ arrival_time = time.time()
753
+
754
+ if self.tokenizer is not None:
755
+ self._validate_token_prompt(
756
+ prompt,
757
+ tokenizer=self.get_tokenizer(lora_request=lora_request))
758
+
759
+ processed_inputs = self.input_preprocessor.preprocess(
760
+ prompt,
761
+ lora_request=lora_request,
762
+ prompt_adapter_request=prompt_adapter_request,
763
+ )
764
+
765
+ self._add_processed_request(
766
+ request_id=request_id,
767
+ processed_inputs=processed_inputs,
768
+ params=params,
769
+ arrival_time=arrival_time,
770
+ lora_request=lora_request,
771
+ prompt_adapter_request=prompt_adapter_request,
772
+ trace_headers=trace_headers,
773
+ priority=priority,
774
+ )
775
+
776
+ def _validate_token_prompt(self, prompt: PromptType,
777
+ tokenizer: AnyTokenizer):
778
+ # Guard against out-of-vocab tokens.
779
+ # For some tokenizers, tokenizer.decode will happily return empty text
780
+ # for token ids that are out of vocab, and we don't detect token ids
781
+ # that are greater than the max token id before running the model.
782
+ # However, these token ids will later crash a cuda kernel at runtime
783
+ # with an index out of bounds error. This will crash the entire engine.
784
+ # This needs to happen before multimodal input pre-processing, which
785
+ # may add dummy <image> tokens that aren't part of the tokenizer's
786
+ # vocabulary.
787
+ if is_token_prompt(prompt):
788
+ prompt_ids = prompt["prompt_token_ids"]
789
+ if len(prompt_ids) == 0:
790
+ # Empty prompt check is handled later
791
+ return
792
+ max_input_id = max(prompt_ids)
793
+ if max_input_id > tokenizer.max_token_id:
794
+ raise ValueError(
795
+ "Token id {} is out of vocabulary".format(max_input_id))
796
+
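+ # Illustrative failure mode for the guard above (assuming a GPT-2-style
+ # vocabulary whose max_token_id is 50256):
+ #
+ #     from vllm import SamplingParams
+ #     engine.add_request("r0", {"prompt_token_ids": [60_000]},
+ #                        SamplingParams())   # raises ValueError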
797
+ def _create_sequence_group_with_sampling(
798
+ self,
799
+ request_id: str,
800
+ seq: Sequence,
801
+ sampling_params: SamplingParams,
802
+ arrival_time: float,
803
+ lora_request: Optional[LoRARequest],
804
+ trace_headers: Optional[Mapping[str, str]] = None,
805
+ prompt_adapter_request: Optional[PromptAdapterRequest] = None,
806
+ encoder_seq: Optional[Sequence] = None,
807
+ priority: int = 0,
808
+ ) -> SequenceGroup:
809
+ """Creates a SequenceGroup with SamplingParams."""
810
+ max_logprobs = self.get_model_config().max_logprobs
811
+ if (sampling_params.logprobs
812
+ and sampling_params.logprobs > max_logprobs) or (
813
+ sampling_params.prompt_logprobs
814
+ and sampling_params.prompt_logprobs > max_logprobs):
815
+ raise ValueError(f"Cannot request more than "
816
+ f"{max_logprobs} logprobs.")
817
+
818
+ sampling_params = self._build_logits_processors(
819
+ sampling_params, lora_request)
820
+
821
+ # Defensive copy of SamplingParams, which are used by the sampler;
822
+ # this doesn't deep-copy LogitsProcessor objects
823
+ sampling_params = sampling_params.clone()
824
+
825
+ sampling_params.update_from_generation_config(
826
+ self.generation_config_fields, seq.eos_token_id)
827
+
828
+ # Create the sequence group.
829
+ draft_size = 1
830
+ if self.vllm_config.speculative_config is not None:
831
+ draft_size = \
832
+ self.vllm_config.speculative_config.num_speculative_tokens + 1
833
+ seq_group = SequenceGroup(
834
+ request_id=request_id,
835
+ seqs=[seq],
836
+ arrival_time=arrival_time,
837
+ sampling_params=sampling_params,
838
+ lora_request=lora_request,
839
+ trace_headers=trace_headers,
840
+ prompt_adapter_request=prompt_adapter_request,
841
+ encoder_seq=encoder_seq,
842
+ priority=priority,
843
+ draft_size=draft_size)
844
+
845
+ return seq_group
846
+
847
+ def _create_sequence_group_with_pooling(
848
+ self,
849
+ request_id: str,
850
+ seq: Sequence,
851
+ pooling_params: PoolingParams,
852
+ arrival_time: float,
853
+ lora_request: Optional[LoRARequest],
854
+ prompt_adapter_request: Optional[PromptAdapterRequest],
855
+ encoder_seq: Optional[Sequence] = None,
856
+ priority: int = 0,
857
+ ) -> SequenceGroup:
858
+ """Creates a SequenceGroup with PoolingParams."""
859
+ # Defensive copy of PoolingParams, which are used by the pooler
860
+ pooling_params = pooling_params.clone()
861
+ # Create the sequence group.
862
+ seq_group = SequenceGroup(
863
+ request_id=request_id,
864
+ seqs=[seq],
865
+ arrival_time=arrival_time,
866
+ lora_request=lora_request,
867
+ pooling_params=pooling_params,
868
+ prompt_adapter_request=prompt_adapter_request,
869
+ encoder_seq=encoder_seq,
870
+ priority=priority)
871
+ return seq_group
872
+
873
+ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
874
+ """Aborts a request(s) with the given ID.
875
+
876
+ Args:
877
+ request_id: The ID(s) of the request to abort.
878
+
879
+ Details:
880
+ - Refer to the
881
+ :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
882
+ from class :class:`~vllm.core.scheduler.Scheduler`.
883
+
884
+ Example:
885
+ >>> # initialize engine and add a request with request_id
886
+ >>> request_id = str(0)
887
+ >>> # abort the request
888
+ >>> engine.abort_request(request_id)
889
+ """
890
+ for scheduler in self.scheduler:
891
+ scheduler.abort_seq_group(
892
+ request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
893
+
894
+ def get_vllm_config(self) -> VllmConfig:
895
+ """Gets the vllm configuration."""
896
+ return self.vllm_config
897
+
898
+ def get_model_config(self) -> ModelConfig:
899
+ """Gets the model configuration."""
900
+ return self.model_config
901
+
902
+ def get_parallel_config(self) -> ParallelConfig:
903
+ """Gets the parallel configuration."""
904
+ return self.parallel_config
905
+
906
+ def get_decoding_config(self) -> DecodingConfig:
907
+ """Gets the decoding configuration."""
908
+ return self.decoding_config
909
+
910
+ def get_scheduler_config(self) -> SchedulerConfig:
911
+ """Gets the scheduler configuration."""
912
+ return self.scheduler_config
913
+
914
+ def get_lora_config(self) -> LoRAConfig:
915
+ """Gets the LoRA configuration."""
916
+ return self.lora_config
917
+
918
+ def get_num_unfinished_requests(self) -> int:
919
+ """Gets the number of unfinished requests."""
920
+ return sum(scheduler.get_num_unfinished_seq_groups()
921
+ for scheduler in self.scheduler)
922
+
923
+ def has_unfinished_requests(self) -> bool:
924
+ """Returns True if there are unfinished requests."""
925
+ return any(scheduler.has_unfinished_seqs()
926
+ for scheduler in self.scheduler)
927
+
928
+ def has_unfinished_requests_for_virtual_engine(
929
+ self, virtual_engine: int) -> bool:
930
+ """
931
+ Returns True if there are unfinished requests for the virtual engine.
932
+ """
933
+ return self.scheduler[virtual_engine].has_unfinished_seqs()
934
+
935
+ def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
936
+ """Reset prefix cache for all devices."""
937
+
938
+ success = True
939
+ for scheduler in self.scheduler:
940
+ success = success and scheduler.reset_prefix_cache(device)
941
+ return success
942
+
943
+ @staticmethod
944
+ def _process_sequence_group_outputs(
945
+ seq_group: SequenceGroup,
946
+ outputs: List[PoolingSequenceGroupOutput],
947
+ ) -> None:
948
+ seq_group.pooled_data = outputs[0].data
949
+
950
+ for seq in seq_group.get_seqs():
951
+ seq.status = SequenceStatus.FINISHED_STOPPED
952
+
953
+ return
954
+
955
+ def _update_num_computed_tokens_for_multi_step_prefill(
956
+ self, seq_group: SequenceGroup,
957
+ seq_group_meta: SequenceGroupMetadata,
958
+ is_first_step_output: Optional[bool]):
959
+ """
960
+ This function updates num_computed_tokens for prompt sequences
961
+ when Multi-Step is enabled.
962
+
963
+ seq_group: SequenceGroup to update the num_computed_tokens for.
964
+ seq_group_meta: Metadata of the given SequenceGroup.
965
+ is_first_step_output: Optional[bool] -
966
+ When available, is_first_step_output indicates if the appended
967
+ output token is the output of the first step in multi-step.
968
+ A value of None indicates that outputs from all steps in
969
+ multi-step are submitted in a single burst.
970
+ """
971
+
972
+ assert self.scheduler_config.is_multi_step
973
+
974
+ if not seq_group_meta.is_prompt:
975
+ # num_computed_token updates for multi-step decodes happen after
976
+ # the tokens are appended to the sequence.
977
+ return
978
+
979
+ do_update: bool = False
980
+ if self.scheduler_config.chunked_prefill_enabled:
981
+ # In multi-step + chunked-prefill case, the prompt sequences
982
+ # that are scheduled are fully processed in the first step.
983
+ do_update = is_first_step_output is None or is_first_step_output
984
+ else:
985
+ # Normal multi-step decoding case. In this case prompt-sequences
986
+ # are actually single-stepped. Always update in this case.
987
+ assert seq_group.state.num_steps == 1
988
+ do_update = True
989
+
990
+ if do_update:
991
+ seq_group.update_num_computed_tokens(
992
+ seq_group_meta.token_chunk_size)
993
+
994
+ def _process_model_outputs(self,
995
+ ctx: SchedulerContext,
996
+ request_id: Optional[str] = None) -> None:
997
+ """Apply the model output to the sequences in the scheduled seq groups
998
+ and append the resulting request outputs to ctx.request_outputs.
999
+
1000
+ ctx: The virtual engine context to work on
1001
+ request_id: If provided, only this request is processed
1002
+ """
1003
+
1004
+ now = time.time()
1005
+
1006
+ if len(ctx.output_queue) == 0:
1007
+ return None
1008
+
1009
+ # Get pending async postprocessor
1010
+ if request_id:
1011
+ # When we process only one request, no pop is required
1012
+ # (since later we will process all of the rest)
1013
+ (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
1014
+ is_last_step, is_first_step_output, skip) = ctx.output_queue[0]
1015
+ else:
1016
+ (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
1017
+ is_last_step, is_first_step_output,
1018
+ skip) = ctx.output_queue.popleft()
1019
+
1020
+ # Sanity check
1021
+ assert len(seq_group_metadata_list) == len(
1022
+ scheduler_outputs.scheduled_seq_groups)
1023
+
1024
+ has_multiple_outputs: bool = len(outputs) > 1
1025
+ outputs_by_sequence_group: List[List[SequenceGroupOutput]]
1026
+ if has_multiple_outputs:
1027
+ assert self.scheduler_config.is_multi_step or \
1028
+ self.speculative_config
1029
+ # Organize outputs by [step][sequence group] instead of
1030
+ # [sequence group][step].
1031
+ if self.scheduler_config.is_multi_step:
1032
+ outputs_by_sequence_group = create_output_by_sequence_group(
1033
+ outputs, len(seq_group_metadata_list))
1034
+ elif self.speculative_config:
1035
+ # Decodes are multi-step while prefills are not, outputting at
1036
+ # most 1 token. Separate them so that we can trigger chunk
1037
+ # processing without having to pad or copy over prompts K times
1038
+ # to match decodes structure (costly with prompt_logprobs).
1039
+ num_prefills = sum(sg.is_prompt
1040
+ for sg in seq_group_metadata_list)
1041
+ prefills, decodes = outputs[:num_prefills], outputs[
1042
+ num_prefills:]
1043
+ outputs_by_sequence_group = create_output_by_sequence_group(
1044
+ decodes,
1045
+ num_seq_groups=len(seq_group_metadata_list) - num_prefills)
1046
+ outputs_by_sequence_group = [p.outputs for p in prefills
1047
+ ] + outputs_by_sequence_group
1048
+ # We have outputs for multiple steps submitted in a single burst,
1049
+ # so invalidate is_first_step_output.
1050
+ is_first_step_output = None
1051
+ else:
1052
+ outputs_by_sequence_group = outputs
1053
+
1054
+ # Determine the requests we need to operate on
1055
+ if request_id:
1056
+ indices = []
1057
+ for i, seq_group_meta in enumerate(seq_group_metadata_list):
1058
+ if seq_group_meta.request_id == request_id:
1059
+ assert i not in skip # Cannot be called twice
1060
+ indices.append(i)
1061
+ break
1062
+
1063
+ # If the request_id was not found, then it means that
1064
+ # this is a new request that has no pending async
1065
+ # postprocessor
1066
+ if not indices:
1067
+ return
1068
+ else:
1069
+ indices = range(len(seq_group_metadata_list)) # type: ignore
1070
+
1071
+ finished_before: List[int] = []
1072
+ finished_now: List[int] = []
1073
+ for i in indices:
1074
+ if i in skip:
1075
+ continue
1076
+
1077
+ seq_group_meta = seq_group_metadata_list[i]
1078
+ scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
1079
+
1080
+ seq_group: SequenceGroup = scheduled_seq_group.seq_group
1081
+
1082
+ if seq_group.is_finished():
1083
+ finished_before.append(i)
1084
+ continue
1085
+
1086
+ output: List[SequenceGroupOutput]
1087
+ if has_multiple_outputs:
1088
+ output = outputs_by_sequence_group[i]
1089
+ else:
1090
+ output = [outputs_by_sequence_group[0][i]]
1091
+
1092
+ if not is_async:
1093
+ if self.scheduler_config.is_multi_step:
1094
+ # Updates happen only if the sequence is prefill
1095
+ self._update_num_computed_tokens_for_multi_step_prefill(
1096
+ seq_group, seq_group_meta, is_first_step_output)
1097
+ else:
1098
+ seq_group.update_num_computed_tokens(
1099
+ seq_group_meta.token_chunk_size or 0)
1100
+
1101
+ if outputs:
1102
+ for o in outputs:
1103
+ if (isinstance(o, SamplerOutput)
1104
+ and seq_group.metrics is not None):
1105
+ if seq_group.metrics.model_forward_time is not None:
1106
+ seq_group.metrics.model_forward_time += (
1107
+ o.model_forward_time or 0)
1108
+ else:
1109
+ seq_group.metrics.model_forward_time = (
1110
+ o.model_forward_time)
1111
+ if seq_group.metrics.model_execute_time is not None:
1112
+ seq_group.metrics.model_execute_time += (
1113
+ o.model_execute_time or 0)
1114
+ else:
1115
+ seq_group.metrics.model_execute_time = (
1116
+ o.model_execute_time)
1117
+
1118
+ if self.model_config.runner_type == "pooling":
1119
+ self._process_sequence_group_outputs(seq_group, output)
1120
+ else:
1121
+ self.output_processor.process_prompt_logprob(seq_group, output)
1122
+ if seq_group_meta.do_sample:
1123
+ self.output_processor.process_outputs(
1124
+ seq_group, output, is_async)
1125
+
1126
+ if seq_group.is_finished():
1127
+ finished_now.append(i)
1128
+
1129
+ # Generate outputs for the requests that finished this iteration
1130
+ for i in finished_now:
1131
+ scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
1132
+
1133
+ seq_group = scheduled_seq_group.seq_group
1134
+ seq_group.maybe_set_first_token_time(now)
1135
+ if not seq_group.is_prefill():
1136
+ seq_group.set_last_token_time(now)
1137
+ request_output = RequestOutputFactory.create(
1138
+ seq_group,
1139
+ self.seq_id_to_seq_group,
1140
+ use_cache=self.use_cached_outputs)
1141
+ if request_output:
1142
+ ctx.request_outputs.append(request_output)
1143
+
1144
+ # When we process a single request, we skip it for the next time,
1145
+ # and invoke the request output callback (if there was final output)
1146
+ if request_id:
1147
+ assert len(indices) == 1
1148
+ skip.append(indices[0])
1149
+
1150
+ if (finished_now
1151
+ and self.process_request_outputs_callback is not None):
1152
+ self.process_request_outputs_callback(ctx.request_outputs)
1153
+ ctx.request_outputs.clear()
1154
+ return
1155
+
1156
+ # Free currently finished requests
1157
+ if finished_now:
1158
+ for scheduler in self.scheduler:
1159
+ scheduler.free_finished_seq_groups()
1160
+
1161
+ # For multi-step without streaming, don't create outputs each iteration
1162
+ if not is_last_step and not ctx.multi_step_stream_outputs:
1163
+ # Immediately process request outputs here (if callback is given)
1164
+ if (finished_now
1165
+ and self.process_request_outputs_callback is not None):
1166
+ self.process_request_outputs_callback(ctx.request_outputs)
1167
+ ctx.request_outputs.clear()
1168
+ return
1169
+
1170
+ # Create the outputs
1171
+ for i in indices:
1172
+ if i in skip or i in finished_before or i in finished_now:
1173
+ continue # Avoids double processing
1174
+
1175
+ scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
1176
+
1177
+ seq_group = scheduled_seq_group.seq_group
1178
+ seq_group.maybe_set_first_token_time(now)
1179
+ if not seq_group.is_prefill():
1180
+ seq_group.set_last_token_time(now)
1181
+ request_output = RequestOutputFactory.create(
1182
+ seq_group,
1183
+ self.seq_id_to_seq_group,
1184
+ use_cache=self.use_cached_outputs)
1185
+ if request_output:
1186
+ ctx.request_outputs.append(request_output)
1187
+
1188
+ # For multi-step with streaming, create outputs each iteration
1189
+ if not is_last_step and ctx.multi_step_stream_outputs:
1190
+ # Immediately process request outputs here (if callback is given)
1191
+ if self.process_request_outputs_callback is not None:
1192
+ self.process_request_outputs_callback(ctx.request_outputs)
1193
+ ctx.request_outputs.clear()
1194
+ return
1195
+
1196
+ for seq_group in scheduler_outputs.ignored_seq_groups:
1197
+ params = seq_group.sampling_params
1198
+ if params is not None and params.output_kind == (
1199
+ RequestOutputKind.DELTA) and not seq_group.is_finished():
1200
+ continue
1201
+
1202
+ request_output = RequestOutputFactory.create(
1203
+ seq_group,
1204
+ self.seq_id_to_seq_group,
1205
+ use_cache=self.use_cached_outputs,
1206
+ )
1207
+ if request_output:
1208
+ ctx.request_outputs.append(request_output)
1209
+
1210
+ # Immediately process request outputs here (if callback is given)
1211
+ if (ctx.request_outputs
1212
+ and self.process_request_outputs_callback is not None):
1213
+ self.process_request_outputs_callback(ctx.request_outputs)
1214
+ ctx.request_outputs.clear()
1215
+
1216
+ # For async case, we need to record the stats here.
1217
+ # For non-async case, the stats are done in the
1218
+ # LLMEngine/AsyncLLMEngine directly
1219
+ if is_async:
1220
+ # Log stats.
1221
+ self.do_log_stats(scheduler_outputs, outputs, finished_before,
1222
+ skip)
1223
+
1224
+ # Tracing
1225
+ self.do_tracing(scheduler_outputs, finished_before)
1226
+
1227
+ return None
1228
+
1229
+ def _advance_to_next_step(
1230
+ self, output: SamplerOutput,
1231
+ seq_group_metadata_list: List[SequenceGroupMetadata],
1232
+ scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None:
1233
+ """Given model output from a single run, append the tokens to the
1234
+ sequences. This is normally done inside output processor, but it is
1235
+ required if the worker is to perform async forward pass to next step.
1236
+ """
1237
+ for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \
1238
+ zip(seq_group_metadata_list, output, scheduled_seq_groups):
1239
+ seq_group = scheduled_seq_group.seq_group
1240
+
1241
+ if seq_group.is_finished():
1242
+ continue
1243
+
1244
+ if self.scheduler_config.is_multi_step:
1245
+ # Updates happen only if the sequence is prefill
1246
+ self._update_num_computed_tokens_for_multi_step_prefill(
1247
+ seq_group, seq_group_metadata,
1248
+ seq_group.state.num_steps == 1)
1249
+ else:
1250
+ token_chunk_size = (seq_group_metadata.token_chunk_size
1251
+ if seq_group_metadata.token_chunk_size
1252
+ is not None else 0)
1253
+ seq_group.update_num_computed_tokens(token_chunk_size)
1254
+
1255
+ if seq_group_metadata.do_sample:
1256
+ assert len(sequence_group_outputs.samples) == 1, (
1257
+ "Async output processor expects a single sample"
1258
+ " (i.e sampling_params.n == 1)")
1259
+ sample = sequence_group_outputs.samples[0]
1260
+
1261
+ assert len(seq_group.seqs) == 1
1262
+ seq = seq_group.seqs[0]
1263
+
1264
+ if self.scheduler_config.is_multi_step:
1265
+ is_prefill_append = seq.data.get_num_uncomputed_tokens(
1266
+ ) == 0
1267
+ seq.append_token_id(sample.output_token, sample.logprobs)
1268
+ if not is_prefill_append:
1269
+ seq_group.update_num_computed_tokens(1)
1270
+ else:
1271
+ seq.append_token_id(sample.output_token, sample.logprobs)
1272
+
1273
+ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
1274
+ """Performs one decoding iteration and returns newly generated results.
1275
+
1276
+ .. figure:: https://i.imgur.com/sv2HssD.png
1277
+ :alt: Overview of the step function
1278
+ :align: center
1279
+
1280
+ Overview of the step function.
1281
+
1282
+ Details:
1283
+ - Step 1: Schedules the sequences to be executed in the next
1284
+ iteration and the token blocks to be swapped in/out/copy.
1285
+
1286
+ - Depending on the scheduling policy,
1287
+ sequences may be `preempted/reordered`.
1288
+ - A Sequence Group (SG) refers to a group of sequences
1289
+ that are generated from the same prompt.
1290
+
1291
+ - Step 2: Calls the distributed executor to execute the model.
1292
+ - Step 3: Processes the model output. This mainly includes:
1293
+
1294
+ - Decodes the relevant outputs.
1295
+ - Updates the scheduled sequence groups with model outputs
1296
+ based on its `sampling parameters` (`use_beam_search` or not).
1297
+ - Frees the finished sequence groups.
1298
+
1299
+ - Finally, it creates and returns the newly generated results.
1300
+
1301
+ Example:
1302
+ >>> # Please see the example/ folder for more detailed examples.
1303
+ >>>
1304
+ >>> # initialize engine and request arguments
1305
+ >>> engine = LLMEngine.from_engine_args(engine_args)
1306
+ >>> example_inputs = [(0, "What is LLM?",
1307
+ >>> SamplingParams(temperature=0.0))]
1308
+ >>>
1309
+ >>> # Start the engine with an event loop
1310
+ >>> while True:
1311
+ >>> if example_inputs:
1312
+ >>> req_id, prompt, sampling_params = example_inputs.pop(0)
1313
+ >>> engine.add_request(str(req_id),prompt,sampling_params)
1314
+ >>>
1315
+ >>> # continue the request processing
1316
+ >>> request_outputs = engine.step()
1317
+ >>> for request_output in request_outputs:
1318
+ >>> if request_output.finished:
1319
+ >>> # return or show the request output
1320
+ >>>
1321
+ >>> if not (engine.has_unfinished_requests() or example_inputs):
1322
+ >>> break
1323
+ """
1324
+ if self.parallel_config.pipeline_parallel_size > 1:
1325
+ raise NotImplementedError(
1326
+ "Pipeline parallelism is only supported through AsyncLLMEngine "
1327
+ "as performance will be severely degraded otherwise.")
1328
+
1329
+ # For llm_engine, there is no pipeline parallel support, so the engine
1330
+ # used is always 0.
1331
+ virtual_engine = 0
1332
+
1333
+ # These are cached outputs from previous iterations. None if on first
1334
+ # iteration
1335
+ cached_outputs = self.cached_scheduler_outputs[virtual_engine]
1336
+ seq_group_metadata_list = cached_outputs.seq_group_metadata_list
1337
+ scheduler_outputs = cached_outputs.scheduler_outputs
1338
+ allow_async_output_proc = cached_outputs.allow_async_output_proc
1339
+
1340
+ ctx = self.scheduler_contexts[virtual_engine]
1341
+
1342
+ # Clear outputs for each new scheduler iteration
1343
+ ctx.request_outputs.clear()
1344
+
1345
+ # Skip the scheduler if there are any remaining steps in the seq groups.
1346
+ # This ensures that the scheduler is only called again when the current
1347
+ # batch has completed.
1348
+ # The scheduler is also skipped if a single request caused the last
1349
+ # engine step to fail, and the previous schedule needs to be rerun.
1350
+ if not self._has_remaining_steps(
1351
+ seq_group_metadata_list
1352
+ ) and not self._skip_scheduling_next_step:
1353
+ # Schedule iteration
1354
+ (seq_group_metadata_list, scheduler_outputs,
1355
+ allow_async_output_proc
1356
+ ) = self.scheduler[virtual_engine].schedule()
1357
+
1358
+ ctx.seq_group_metadata_list = seq_group_metadata_list
1359
+ ctx.scheduler_outputs = scheduler_outputs
1360
+
1361
+ finished_requests_ids = self.scheduler[
1362
+ virtual_engine].get_and_reset_finished_requests_ids()
1363
+ # When n>1, elements in self.seq_id_to_seq_group should be deleted
1364
+ # here, otherwise memory leaks.
1365
+ for finished_request_id in finished_requests_ids:
1366
+ if finished_request_id in self.seq_id_to_seq_group:
1367
+ del self.seq_id_to_seq_group[finished_request_id]
1368
+
1369
+ # Maybe switch from async mode to sync mode
1370
+ if not allow_async_output_proc and len(ctx.output_queue) > 0:
1371
+ self._process_model_outputs(ctx=ctx)
1372
+
1373
+ if (self.scheduler_config.is_multi_step
1374
+ and scheduler_outputs.num_lookahead_slots > 0):
1375
+ # cache the scheduler outputs for the next iteration if we have
1376
+ # lookahead slots
1377
+ self._cache_scheduler_outputs_for_multi_step(
1378
+ virtual_engine, seq_group_metadata_list, scheduler_outputs,
1379
+ allow_async_output_proc)
1380
+ else:
1381
+ finished_requests_ids = list()
1382
+
1383
+ assert seq_group_metadata_list is not None
1384
+ assert scheduler_outputs is not None
1385
+
1386
+ if not scheduler_outputs.is_empty():
1387
+
1388
+ # Check if we have a cached last_output from the previous iteration.
1389
+ # For supporting PP this is probably the best way to pass the
1390
+ # sampled_token_ids, as a separate broadcast over all the PP stages
1391
+ # will cause one virtual engine's microbatch to block the pipeline.
1392
+ last_sampled_token_ids = \
1393
+ self._get_last_sampled_token_ids(virtual_engine)
1394
+
1395
+ execute_model_req = ExecuteModelRequest(
1396
+ seq_group_metadata_list=seq_group_metadata_list,
1397
+ blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
1398
+ blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
1399
+ blocks_to_copy=scheduler_outputs.blocks_to_copy,
1400
+ num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
1401
+ running_queue_size=scheduler_outputs.running_queue_size,
1402
+ finished_requests_ids=finished_requests_ids,
1403
+ # We use ExecuteModelRequest to pass the last sampled_token_ids
1404
+ # to each of the non-last PP stages for in-place prepare_input.
1405
+ last_sampled_token_ids=last_sampled_token_ids)
1406
+
1407
+ if allow_async_output_proc:
1408
+ execute_model_req.async_callback = self.async_callbacks[
1409
+ virtual_engine]
1410
+
1411
+ try:
1412
+ outputs = self.model_executor.execute_model(
1413
+ execute_model_req=execute_model_req)
1414
+ self._skip_scheduling_next_step = False
1415
+ except InputProcessingError as e:
1416
+ # The input for this request cannot be processed, so we must
1417
+ # abort it. If there are remaining requests in the batch that
1418
+ # have been scheduled, they will be retried on the next step.
1419
+ invalid_request_id = e.request_id
1420
+ self._abort_and_cache_schedule(
1421
+ request_id=invalid_request_id,
1422
+ virtual_engine=virtual_engine,
1423
+ seq_group_metadata_list=seq_group_metadata_list,
1424
+ scheduler_outputs=scheduler_outputs,
1425
+ allow_async_output_proc=allow_async_output_proc)
1426
+ # Raise so the caller is notified that this request failed
1427
+ raise
1428
+
1429
+ # We need to do this here so that last step's sampled_token_ids can
1430
+ # be passed to the next iteration for PP.
1431
+ if self.scheduler_config.is_multi_step:
1432
+ self._update_cached_scheduler_output(virtual_engine, outputs)
1433
+ else:
1434
+ # Nothing scheduled => If there is pending async postprocessor,
1435
+ # then finish it here.
1436
+ if len(ctx.output_queue) > 0:
1437
+ self._process_model_outputs(ctx=ctx)
1438
+ # No outputs in this case
1439
+ outputs = []
1440
+
1441
+ # Finish the current step for all the sequence groups.
1442
+ if self.scheduler_config.is_multi_step:
1443
+ for seq_group in seq_group_metadata_list:
1444
+ seq_group.finish_step()
1445
+
1446
+ if not self._has_remaining_steps(seq_group_metadata_list):
1447
+ # clear the cache if we have finished all the steps.
1448
+ if self.scheduler_config.is_multi_step:
1449
+ self.cached_scheduler_outputs[0] = SchedulerOutputState()
1450
+
1451
+ # is_first_step_output is True only when the num_steps of all
1452
+ # the sequences are 1. When the num_steps > 1,
1453
+ # multi_step_model_runner does the first-step output append.
1454
+ is_first_step_output: bool = False if not seq_group_metadata_list \
1455
+ else seq_group_metadata_list[0].state.num_steps == 1
1456
+
1457
+ # Add results to the output_queue
1458
+ ctx.append_output(outputs=outputs,
1459
+ seq_group_metadata_list=seq_group_metadata_list,
1460
+ scheduler_outputs=scheduler_outputs,
1461
+ is_async=allow_async_output_proc,
1462
+ is_last_step=True,
1463
+ is_first_step_output=is_first_step_output)
1464
+
1465
+ if outputs and allow_async_output_proc:
1466
+ assert len(outputs) == 1, (
1467
+ "Async postprocessor expects only a single output set")
1468
+
1469
+ self._advance_to_next_step(
1470
+ outputs[0], seq_group_metadata_list,
1471
+ scheduler_outputs.scheduled_seq_groups)
1472
+
1473
+ # Check if need to run the usual non-async path
1474
+ if not allow_async_output_proc:
1475
+ self._process_model_outputs(ctx=ctx)
1476
+
1477
+ # Log stats.
1478
+ self.do_log_stats(scheduler_outputs, outputs)
1479
+
1480
+ # Tracing
1481
+ self.do_tracing(scheduler_outputs)
1482
+ else:
1483
+ # Multi-step case
1484
+ return ctx.request_outputs
1485
+
1486
+ if not self.has_unfinished_requests():
1487
+ # Drain async postprocessor (if exists)
1488
+ if len(ctx.output_queue) > 0:
1489
+ self._process_model_outputs(ctx=ctx)
1490
+ assert len(ctx.output_queue) == 0
1491
+
1492
+ # Stop the execute model loop in parallel workers until there are
1493
+ # more requests to process. This avoids waiting indefinitely in
1494
+ # torch.distributed ops which may otherwise timeout, and unblocks
1495
+ # the RPC thread in the workers so that they can process any other
1496
+ # queued control plane messages, such as add/remove lora adapters.
1497
+ logger.debug("Stopping remote worker execution loop.")
1498
+ self.model_executor.stop_remote_worker_execution_loop()
1499
+
1500
+ return ctx.request_outputs
1501
+
1502
+ def _abort_and_cache_schedule(
1503
+ self, request_id: str, virtual_engine: int,
1504
+ seq_group_metadata_list: List[SequenceGroupMetadata],
1505
+ scheduler_outputs: SchedulerOutputs,
1506
+ allow_async_output_proc: bool) -> None:
1507
+ """Aborts a single request, and caches the scheduler outputs minus that
1508
+ request. This allows the next step to continue processing the remaining
1509
+ requests without having to re-run the scheduler."""
1510
+
1511
+ # Abort the request and remove its sequence group from the current
1512
+ # schedule
1513
+ self.abort_request(request_id)
1514
+ for i, metadata in enumerate(seq_group_metadata_list):
1515
+ if metadata.request_id == request_id:
1516
+ del seq_group_metadata_list[i]
1517
+ break
1518
+ for i, group in enumerate(scheduler_outputs.scheduled_seq_groups):
1519
+ if group.seq_group.request_id == request_id:
1520
+ del scheduler_outputs.scheduled_seq_groups[i]
1521
+ break
1522
+
1523
+ # If there are still other sequence groups left in the schedule, cache
1524
+ # them and flag the engine to reuse the schedule.
1525
+ if len(seq_group_metadata_list) > 0:
1526
+ self._skip_scheduling_next_step = True
1527
+ # Reuse multi-step caching logic
1528
+ self._cache_scheduler_outputs_for_multi_step(
1529
+ virtual_engine=virtual_engine,
1530
+ scheduler_outputs=scheduler_outputs,
1531
+ seq_group_metadata_list=seq_group_metadata_list,
1532
+ allow_async_output_proc=allow_async_output_proc)
1533
+
1534
+ def _has_remaining_steps(
1535
+ self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]]
1536
+ ) -> bool:
1537
+ if (not self.scheduler_config.is_multi_step
1538
+ or not seq_group_metadata_list):
1539
+ return False
1540
+
1541
+ # TODO(will) this is a sanity check for now to make sure that all the
1542
+ # seqs are on the same steps. Eventually we will want to do some sort of
1543
+ # dynamic scheduling when doing multi-step decoding.
1544
+ ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps
1545
+ if any([
1546
+ seq_group.state.remaining_steps != ref_remaining_steps
1547
+ for seq_group in seq_group_metadata_list[1:]
1548
+ ]):
1549
+ raise AssertionError("All running sequence groups should "
1550
+ "have the same remaining steps.")
1551
+
1552
+ return ref_remaining_steps > 0
1553
+
1554
+ def _cache_scheduler_outputs_for_multi_step(
1555
+ self, virtual_engine: int,
1556
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
1557
+ scheduler_outputs: SchedulerOutputs,
1558
+ allow_async_output_proc: bool) -> None:
1559
+ co = self.cached_scheduler_outputs[virtual_engine]
1560
+
1561
+ co.seq_group_metadata_list = seq_group_metadata_list
1562
+ co.scheduler_outputs = scheduler_outputs
1563
+ co.allow_async_output_proc = allow_async_output_proc
1564
+ co.last_output = None
1565
+
1566
+ def _update_cached_scheduler_output(
1567
+ self, virtual_engine: int,
1568
+ output: List[Optional[SamplerOutput]]) -> None:
1569
+ if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0
1570
+ and output[0] is not None):
1571
+ last_output = output[-1]
1572
+ assert last_output is not None
1573
+ assert last_output.sampled_token_ids_cpu is not None
1574
+ assert last_output.sampled_token_ids is None
1575
+ assert last_output.sampled_token_probs is None
1576
+ self.cached_scheduler_outputs[
1577
+ virtual_engine].last_output = last_output
1578
+
1579
+ def _get_last_sampled_token_ids(
1580
+ self, virtual_engine: int) -> Optional[torch.Tensor]:
1581
+ cached_last_output = self.cached_scheduler_outputs[
1582
+ virtual_engine].last_output
1583
+ if (self.scheduler_config.is_multi_step
1584
+ and self.parallel_config.pipeline_parallel_size > 1
1585
+ and cached_last_output is not None
1586
+ and cached_last_output.sampled_token_ids_cpu is not None):
1587
+ return cached_last_output.sampled_token_ids_cpu
1588
+ return None
1589
+
1590
+ def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
1591
+ if not self.log_stats:
1592
+ raise RuntimeError(
1593
+ "Stat logging is disabled. Set `disable_log_stats=False` "
1594
+ "argument to enable.")
1595
+ if logger_name in self.stat_loggers:
1596
+ raise KeyError(f"Logger with name {logger_name} already exists.")
1597
+ self.stat_loggers[logger_name] = logger
1598
+
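+ # Sketch of a custom stat logger (assuming StatLoggerBase accepts a
+ # local_interval argument and only requires the log(stats) hook invoked
+ # by do_log_stats below; the real base class may declare further
+ # abstract methods):
+ #
+ #     class PrintingLogger(StatLoggerBase):
+ #         def log(self, stats):
+ #             print(stats.num_running_sys, stats.gpu_cache_usage_sys)
+ #
+ #     engine.add_logger("printing", PrintingLogger(local_interval=0.0))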
1599
+ def remove_logger(self, logger_name: str) -> None:
1600
+ if not self.log_stats:
1601
+ raise RuntimeError(
1602
+ "Stat logging is disabled. Set `disable_log_stats=False` "
1603
+ "argument to enable.")
1604
+ if logger_name not in self.stat_loggers:
1605
+ raise KeyError(f"Logger with name {logger_name} does not exist.")
1606
+ del self.stat_loggers[logger_name]
1607
+
1608
+ def do_log_stats(self,
1609
+ scheduler_outputs: Optional[SchedulerOutputs] = None,
1610
+ model_output: Optional[List[SamplerOutput]] = None,
1611
+ finished_before: Optional[List[int]] = None,
1612
+ skip: Optional[List[int]] = None) -> None:
1613
+ """Forced log when no requests active."""
1614
+ if self.log_stats:
1615
+ stats = self._get_stats(scheduler_outputs, model_output,
1616
+ finished_before, skip)
1617
+ for logger in self.stat_loggers.values():
1618
+ logger.log(stats)
1619
+
1620
+ def _get_stats(self,
1621
+ scheduler_outputs: Optional[SchedulerOutputs],
1622
+ model_output: Optional[List[SamplerOutput]] = None,
1623
+ finished_before: Optional[List[int]] = None,
1624
+ skip: Optional[List[int]] = None) -> Stats:
1625
+ """Get Stats to be Logged to Prometheus.
1626
+
1627
+ Args:
1628
+ scheduler_outputs: Optional, used to populate metrics related to
1629
+ the scheduled batch,
1630
+ model_output: Optional, used to emit speculative decoding metrics
1631
+ which are created by the workers.
1632
+ finished_before: Optional, indices of sequences that were finished
1633
+ before. These sequences will be ignored.
1634
+ skip: Optional, indices of sequences that were preempted. These
1635
+ sequences will be ignored.
1636
+ """
1637
+ now = time.time()
1638
+
1639
+ # System State
1640
+ # Scheduler State
1641
+ num_running_sys = sum(
1642
+ len(scheduler.running) for scheduler in self.scheduler)
1643
+ num_swapped_sys = sum(
1644
+ len(scheduler.swapped) for scheduler in self.scheduler)
1645
+ num_waiting_sys = sum(
1646
+ len(scheduler.waiting) for scheduler in self.scheduler)
1647
+
1648
+ # KV Cache Usage in %
1649
+ num_total_gpu = self.cache_config.num_gpu_blocks
1650
+ gpu_cache_usage_sys = 0.
1651
+ if num_total_gpu: # Guard against both None and 0
1652
+ num_free_gpu = sum(
1653
+ scheduler.block_manager.get_num_free_gpu_blocks()
1654
+ for scheduler in self.scheduler)
1655
+ gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)
1656
+
1657
+ num_total_cpu = self.cache_config.num_cpu_blocks
1658
+ cpu_cache_usage_sys = 0.
1659
+ if num_total_cpu: # Guard against both None and 0
1660
+ num_free_cpu = sum(
1661
+ scheduler.block_manager.get_num_free_cpu_blocks()
1662
+ for scheduler in self.scheduler)
1663
+ cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)
1664
+
1665
+ # Prefix Cache Hit Rate. Note that we always use
1666
+ # the cache hit rate of the first virtual engine.
1667
+ cpu_prefix_cache_hit_rate = self.scheduler[
1668
+ 0].get_prefix_cache_hit_rate(Device.CPU)
1669
+ gpu_prefix_cache_hit_rate = self.scheduler[
1670
+ 0].get_prefix_cache_hit_rate(Device.GPU)
1671
+
1672
+ # Iteration stats
1673
+ num_prompt_tokens_iter = 0
1674
+ num_generation_tokens_iter = 0
1675
+ num_tokens_iter = 0
1676
+ time_to_first_tokens_iter: List[float] = []
1677
+ time_per_output_tokens_iter: List[float] = []
1678
+ num_preemption_iter = (0 if scheduler_outputs is None else
1679
+ scheduler_outputs.preempted)
1680
+
1681
+ # Request stats
1682
+ # Latency
1683
+ time_e2e_requests: List[float] = []
1684
+ time_queue_requests: List[float] = []
1685
+ time_inference_requests: List[float] = []
1686
+ time_prefill_requests: List[float] = []
1687
+ time_decode_requests: List[float] = []
1688
+ time_in_queue_requests: List[float] = []
1689
+ model_forward_time_requests: List[float] = []
1690
+ model_execute_time_requests: List[float] = []
1691
+ # Metadata
1692
+ num_prompt_tokens_requests: List[int] = []
1693
+ num_generation_tokens_requests: List[int] = []
1694
+ n_requests: List[int] = []
1695
+ max_num_generation_tokens_requests: List[int] = []
1696
+ max_tokens_requests: List[int] = []
1697
+ finished_reason_requests: List[str] = []
1698
+
1699
+ # LoRA requests
1700
+ running_lora_adapters = dict(
1701
+ collectionsCounter([
1702
+ running_request.lora_request.lora_name
1703
+ for scheduler in self.scheduler
1704
+ for running_request in scheduler.running
1705
+ if running_request.lora_request
1706
+ ]))
1707
+ waiting_lora_adapters = dict(
1708
+ collectionsCounter([
1709
+ waiting_request.lora_request.lora_name
1710
+ for scheduler in self.scheduler
1711
+ for waiting_request in scheduler.waiting
1712
+ if waiting_request.lora_request
1713
+ ]))
1714
+ max_lora_stat = "0"
1715
+ if self.lora_config:
1716
+ max_lora_stat = str(self.lora_config.max_loras)
1717
+
1718
+ # NOTE: This loop assumes prefill seq_groups are before
1719
+ # decode seq_groups in scheduled_seq_groups.
1720
+ if scheduler_outputs is not None:
1721
+ # For async postprocessor, sequences that already finished must not
1722
+ # be counted again (to avoid double counting)
1723
+ actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore
1724
+
1725
+ num_generation_tokens_from_prefill_groups = 0
1726
+ # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
1727
+ # len(scheduler_outputs.scheduled_seq_groups) !=
1728
+ # scheduler_outputs.num_prefill_groups, this means that
1729
+ # chunked prefills have been detected.
1730
+
1731
+ for idx, scheduled_seq_group in enumerate(
1732
+ scheduler_outputs.scheduled_seq_groups):
1733
+ # Skip double logging when using async output proc
1734
+ if finished_before and idx in finished_before:
1735
+ actual_num_batched_tokens -= 1
1736
+ continue
1737
+
1738
+ # Currently, skip == preempted sequences, so we need to skip
1739
+ # their log stats
1740
+ if skip and idx in skip:
1741
+ continue
1742
+
1743
+ group_was_prefill = idx < scheduler_outputs.num_prefill_groups
1744
+ seq_group = scheduled_seq_group.seq_group
1745
+
1746
+ # NOTE: a seq_group that completed all of its prefill tokens
1747
+ # in the last iteration will have seq_group.is_prefill() = False
1748
+ # with group_was_prefill = True
1749
+ if group_was_prefill:
1750
+ # Number of prompt tokens.
1751
+ num_prompt_tokens_iter += (
1752
+ scheduled_seq_group.token_chunk_size)
1753
+
1754
+ # If the seq_group just finished the prefill state
1755
+ # get TTFT.
1756
+ if not seq_group.is_prefill():
1757
+ latency = seq_group.get_last_token_latency()
1758
+ time_to_first_tokens_iter.append(latency)
1759
+
1760
+ # One generation token per finished prefill.
1761
+ num_generation_tokens_from_prefill_groups += (
1762
+ seq_group.num_seqs())
1763
+ else:
1764
+ # TPOTs.
1765
+ latency = seq_group.get_last_token_latency()
1766
+ time_per_output_tokens_iter.append(latency)
1767
+ if seq_group.state.current_step == 0:
1768
+ # For async_output_proc, the do_log_stats()
1769
+ # is called following init_multi_step(), which
1770
+ # sets the current_step to zero.
1771
+ actual_num_batched_tokens +=\
1772
+ seq_group.state.num_steps - 1
1773
+ else:
1774
+ actual_num_batched_tokens +=\
1775
+ seq_group.state.current_step - 1
1776
+
1777
+ # Because of chunked prefill, we can have a single sequence
1778
+ # group that does multiple prompt_runs. To prevent logging
1779
+ # the same metadata more than once per request, we standardize
1780
+ # on logging request level information for finished requests,
1781
+ # which can only happen once.
1782
+ if seq_group.is_finished():
1783
+ # Latency timings
1784
+ time_e2e_requests.append(now -
1785
+ seq_group.metrics.arrival_time)
1786
+ if (seq_group.metrics.first_scheduled_time is not None and
1787
+ seq_group.metrics.first_token_time is not None):
1788
+ time_queue_requests.append(
1789
+ seq_group.metrics.first_scheduled_time -
1790
+ seq_group.metrics.arrival_time)
1791
+ time_prefill_requests.append(
1792
+ seq_group.metrics.first_token_time -
1793
+ seq_group.metrics.first_scheduled_time)
1794
+ time_decode_requests.append(
1795
+ now - seq_group.metrics.first_token_time)
1796
+ time_inference_requests.append(
1797
+ now - seq_group.metrics.first_scheduled_time)
1798
+ if seq_group.metrics.time_in_queue is not None:
1799
+ time_in_queue_requests.append(
1800
+ seq_group.metrics.time_in_queue)
1801
+ if seq_group.metrics.model_forward_time is not None:
1802
+ model_forward_time_requests.append(
1803
+ seq_group.metrics.model_forward_time)
1804
+ if seq_group.metrics.model_execute_time is not None:
1805
+ model_execute_time_requests.append(
1806
+ seq_group.metrics.model_execute_time * 1000)
1807
+ # Metadata
1808
+ num_prompt_tokens_requests.append(
1809
+ len(seq_group.prompt_token_ids))
1810
+ num_generation_tokens_requests.extend([
1811
+ seq.get_output_len()
1812
+ for seq in seq_group.get_finished_seqs()
1813
+ ])
1814
+ max_num_generation_tokens_requests.append(
1815
+ max(seq.get_output_len()
1816
+ for seq in seq_group.get_seqs()))
1817
+ if seq_group.sampling_params is not None:
1818
+ n_requests.append(seq_group.sampling_params.n)
1819
+ max_tokens_requests.append(
1820
+ seq_group.sampling_params.max_tokens)
1821
+ finished_reason_requests.extend([
1822
+ SequenceStatus.get_finished_reason(seq.status)
1823
+ for seq in seq_group.get_finished_seqs()
1824
+ ])
1825
+
1826
+ # Number of generation tokens.
1827
+ # num_batched_tokens equals the number of prompt_tokens plus the
1828
+ # number of decode_tokens in a single iteration. So,
1829
+ # num_generation_tokens = num_batched_tokens - num_prompt_tokens
1830
+ # + num_generation_tokens_from_prefill_groups (since we generate
1831
+ # one token on prefills on iters where the prefill finishes).
1832
+ num_generation_tokens_iter = (
1833
+ actual_num_batched_tokens - num_prompt_tokens_iter +
1834
+ num_generation_tokens_from_prefill_groups)
1835
+ num_tokens_iter = (num_generation_tokens_iter +
1836
+ num_prompt_tokens_iter)
1837
+ # Spec decode, if enabled, emits specialized metrics from the worker in
1838
+ # sampler output.
1839
+ if model_output and isinstance(model_output[0], SamplerOutput) and (
1840
+ model_output[0].spec_decode_worker_metrics is not None):
1841
+ spec_decode_metrics = model_output[0].spec_decode_worker_metrics
1842
+ else:
1843
+ spec_decode_metrics = None
1844
+
1845
+ return Stats(
1846
+ now=now,
1847
+ # System stats
1848
+ # Scheduler State
1849
+ num_running_sys=num_running_sys,
1850
+ num_swapped_sys=num_swapped_sys,
1851
+ num_waiting_sys=num_waiting_sys,
1852
+ # KV Cache Usage in %
1853
+ gpu_cache_usage_sys=gpu_cache_usage_sys,
1854
+ cpu_cache_usage_sys=cpu_cache_usage_sys,
1855
+ # Prefix Cache Hit Rate
1856
+ cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
1857
+ gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,
1858
+
1859
+ # Iteration stats
1860
+ num_prompt_tokens_iter=num_prompt_tokens_iter,
1861
+ num_generation_tokens_iter=num_generation_tokens_iter,
1862
+ num_tokens_iter=num_tokens_iter,
1863
+ time_to_first_tokens_iter=time_to_first_tokens_iter,
1864
+ time_per_output_tokens_iter=time_per_output_tokens_iter,
1865
+ spec_decode_metrics=spec_decode_metrics,
1866
+ num_preemption_iter=num_preemption_iter,
1867
+
1868
+ # Request stats
1869
+ # Latency
1870
+ time_e2e_requests=time_e2e_requests,
1871
+ time_queue_requests=time_queue_requests,
1872
+ time_inference_requests=time_inference_requests,
1873
+ time_prefill_requests=time_prefill_requests,
1874
+ time_decode_requests=time_decode_requests,
1875
+ time_in_queue_requests=time_in_queue_requests,
1876
+ model_forward_time_requests=model_forward_time_requests,
1877
+ model_execute_time_requests=model_execute_time_requests,
1878
+ # Metadata
1879
+ num_prompt_tokens_requests=num_prompt_tokens_requests,
1880
+ num_generation_tokens_requests=num_generation_tokens_requests,
1881
+ max_num_generation_tokens_requests=
1882
+ max_num_generation_tokens_requests,
1883
+ n_requests=n_requests,
1884
+ max_tokens_requests=max_tokens_requests,
1885
+ finished_reason_requests=finished_reason_requests,
1886
+ max_lora=str(max_lora_stat),
1887
+ waiting_lora_adapters=list(waiting_lora_adapters.keys()),
1888
+ running_lora_adapters=list(running_lora_adapters.keys()))
1889
+
1890
+ def add_lora(self, lora_request: LoRARequest) -> bool:
1891
+ return self.model_executor.add_lora(lora_request)
1892
+
1893
+ def remove_lora(self, lora_id: int) -> bool:
1894
+ return self.model_executor.remove_lora(lora_id)
1895
+
1896
+ def list_loras(self) -> Set[int]:
1897
+ return self.model_executor.list_loras()
1898
+
1899
+ def pin_lora(self, lora_id: int) -> bool:
1900
+ return self.model_executor.pin_lora(lora_id)
1901
+
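+ # Runtime LoRA management sketch (assuming the LoRARequest(lora_name,
+ # lora_int_id, lora_path) constructor; the adapter path and integer id
+ # are purely illustrative):
+ #
+ #     from vllm.lora.request import LoRARequest
+ #     engine.add_lora(LoRARequest("my-adapter", 1, "/path/to/adapter"))
+ #     assert 1 in engine.list_loras()
+ #     engine.pin_lora(1)
+ #     engine.remove_lora(1)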
1902
+ def add_prompt_adapter(
1903
+ self, prompt_adapter_request: PromptAdapterRequest) -> bool:
1904
+ return self.model_executor.add_prompt_adapter(prompt_adapter_request)
1905
+
1906
+ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
1907
+ return self.model_executor.remove_prompt_adapter(prompt_adapter_id)
1908
+
1909
+ def list_prompt_adapters(self) -> List[int]:
1910
+ return self.model_executor.list_prompt_adapters()
1911
+
1912
+ def start_profile(self) -> None:
1913
+ self.model_executor.start_profile()
1914
+
1915
+ def stop_profile(self) -> None:
1916
+ self.model_executor.stop_profile()
1917
+
1918
+ def sleep(self, level: int = 1) -> None:
1919
+ assert self.vllm_config.model_config.enable_sleep_mode, (
1920
+ "Sleep mode is not enabled in the model config")
1921
+ self.model_executor.sleep(level=level)
1922
+
1923
+ def wake_up(self, tags: Optional[list[str]] = None) -> None:
1924
+ assert self.vllm_config.model_config.enable_sleep_mode, (
1925
+ "Sleep mode is not enabled in the model config")
1926
+ self.model_executor.wake_up(tags)
1927
+
1928
+ def is_sleeping(self) -> bool:
1929
+ return self.model_executor.is_sleeping
1930
+
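+ # Sleep-mode sketch (only valid when the engine was created with
+ # enable_sleep_mode=True in the model config; otherwise the asserts
+ # above fire):
+ #
+ #     engine.sleep(level=1)       # release executor resources
+ #     assert engine.is_sleeping()
+ #     engine.wake_up()            # restore them before the next step()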
1931
+ def check_health(self) -> None:
1932
+ self.model_executor.check_health()
1933
+
1934
+ def is_tracing_enabled(self) -> bool:
1935
+ return self.tracer is not None
1936
+
1937
+ def do_tracing(self,
1938
+ scheduler_outputs: SchedulerOutputs,
1939
+ finished_before: Optional[List[int]] = None) -> None:
1940
+ if self.tracer is None:
1941
+ return
1942
+
1943
+ for idx, scheduled_seq_group in enumerate(
1944
+ scheduler_outputs.scheduled_seq_groups):
1945
+ # Skip double tracing when using async output proc
1946
+ if finished_before and idx in finished_before:
1947
+ continue
1948
+
1949
+ seq_group = scheduled_seq_group.seq_group
1950
+ if seq_group.is_finished():
1951
+ self.create_trace_span(seq_group)
1952
+
1953
+ def create_trace_span(self, seq_group: SequenceGroup) -> None:
1954
+ if self.tracer is None or seq_group.sampling_params is None:
1955
+ return
1956
+ arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9)
1957
+
1958
+ trace_context = extract_trace_context(seq_group.trace_headers)
1959
+
1960
+ with self.tracer.start_as_current_span(
1961
+ "llm_request",
1962
+ kind=SpanKind.SERVER,
1963
+ context=trace_context,
1964
+ start_time=arrival_time_nano_seconds) as seq_span:
1965
+ metrics = seq_group.metrics
1966
+ ttft = metrics.first_token_time - metrics.arrival_time
1967
+ e2e_time = metrics.finished_time - metrics.arrival_time
1968
+ seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
1969
+ self.model_config.model)
1970
+ seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
1971
+ seq_group.request_id)
1972
+ seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
1973
+ seq_group.sampling_params.temperature)
1974
+ seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
1975
+ seq_group.sampling_params.top_p)
1976
+ seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
1977
+ seq_group.sampling_params.max_tokens)
1978
+ seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
1979
+ seq_group.sampling_params.n)
1980
+ seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
1981
+ seq_group.num_seqs())
1982
+ seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
1983
+ len(seq_group.prompt_token_ids))
1984
+ seq_span.set_attribute(
1985
+ SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
1986
+ sum([
1987
+ seq.get_output_len()
1988
+ for seq in seq_group.get_finished_seqs()
1989
+ ]))
1990
+ seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
1991
+ metrics.time_in_queue)
1992
+ seq_span.set_attribute(
1993
+ SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
1994
+ seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
1995
+ if metrics.scheduler_time is not None:
1996
+ seq_span.set_attribute(
1997
+ SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
1998
+ metrics.scheduler_time)
1999
+ if metrics.model_forward_time is not None:
2000
+ seq_span.set_attribute(
2001
+ SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
2002
+ metrics.model_forward_time / 1000.0)
2003
+ if metrics.model_execute_time is not None:
2004
+ seq_span.set_attribute(
2005
+ SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
2006
+ metrics.model_execute_time)
2007
+
2008
+    def _validate_model_inputs(self, inputs: ProcessorInputs,
+                               lora_request: Optional[LoRARequest]):
+        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
+
+        if encoder_inputs is not None:
+            self._validate_model_input(encoder_inputs,
+                                       lora_request,
+                                       prompt_type="encoder")
+
+        self._validate_model_input(decoder_inputs,
+                                   lora_request,
+                                   prompt_type="decoder")
+
+    def _validate_model_input(
+        self,
+        prompt_inputs: SingletonInputs,
+        lora_request: Optional[LoRARequest],
+        *,
+        prompt_type: Literal["encoder", "decoder"],
+    ):
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
+
+        prompt_ids = prompt_inputs["prompt_token_ids"]
+        if not prompt_ids:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) > max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+
+            if model_config.is_multimodal_model:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens plus multimodal tokens. For image "
+                    "inputs, the number of image tokens depends on the number "
+                    "of images, and possibly their aspect ratios as well.")
+            else:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens.")
+
+            raise ValueError(
+                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"longer than the maximum model length of {max_prompt_len}. "
+                f"{suggestion}")
+
+        # TODO: Find out how many placeholder tokens are there so we can
+        # check that chunked prefill does not truncate them
+        # max_batch_len = self.scheduler_config.max_num_batched_tokens
+
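The length check above is what ultimately surfaces to callers as a ValueError when a prompt does not fit in the model's context window. A hedged usage sketch from the public API (the model name and lengths are placeholders, not taken from this file):

    from vllm import LLM, SamplingParams

    # Deliberately small context window so the decoder prompt check trips.
    llm = LLM(model="facebook/opt-125m", max_model_len=128)

    try:
        llm.generate(["word " * 1000], SamplingParams(max_tokens=8))
    except ValueError as err:
        # Expected message shape: "The decoder prompt (length N) is longer
        # than the maximum model length of 128. Make sure that `max_model_len`
        # is no smaller than the number of text tokens."
        print(err)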
+    def _build_logits_processors(
+            self, sampling_params: SamplingParams,
+            lora_request: Optional[LoRARequest]) -> SamplingParams:
+        """Constructs logits processors based on the guided_decoding,
+        logits_bias, and allowed_token_ids fields in sampling_params. Deletes
+        those fields and adds the constructed logits processors to the
+        logits_processors field. Returns the modified sampling params."""
+
+        logits_processors = []
+
+        if sampling_params.guided_decoding is not None:
+            # Defensively copy sampling params since guided decoding logits
+            # processors can have different state for each request
+            sampling_params = copy.copy(sampling_params)
+            guided_decoding = sampling_params.guided_decoding
+
+            logger.debug(
+                "Building guided decoding logits processor in "
+                "LLMEngine. Params: %s", guided_decoding)
+
+            tokenizer = self.get_tokenizer(lora_request=lora_request)
+            guided_decoding.backend = guided_decoding.backend or \
+                self.decoding_config.guided_decoding_backend
+
+            if self.decoding_config.reasoning_backend is not None:
+                logger.debug("Building with reasoning backend %s",
+                             self.decoding_config.reasoning_backend)
+
+            processor = get_local_guided_decoding_logits_processor(
+                guided_params=guided_decoding,
+                tokenizer=tokenizer,
+                model_config=self.model_config,
+                reasoning_backend=self.decoding_config.reasoning_backend,
+            )
+            if processor:
+                logits_processors.append(processor)
+
+            # Unset so this doesn't get passed down to the model
+            sampling_params.guided_decoding = None
+
+        if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
+            tokenizer = self.get_tokenizer(lora_request=lora_request)
+
+            processors = get_openai_logits_processors(
+                logit_bias=sampling_params.logit_bias,
+                allowed_token_ids=sampling_params.allowed_token_ids,
+                tokenizer=tokenizer)
+            logits_processors.extend(processors)
+
+            # Unset so these don't get passed down to the model
+            sampling_params.logit_bias = None
+            sampling_params.allowed_token_ids = None
+
+        if len(sampling_params.bad_words) > 0:
+            tokenizer = self.get_tokenizer(lora_request)
+            processors = get_bad_words_logits_processors(
+                bad_words=sampling_params.bad_words, tokenizer=tokenizer)
+            logits_processors.extend(processors)
+
+        if logits_processors:
+            if sampling_params.logits_processors is None:
+                sampling_params.logits_processors = logits_processors
+            else:
+                sampling_params.logits_processors.extend(logits_processors)
+
+        return sampling_params
+
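Every processor built above (guided decoding, logit_bias / allowed_token_ids, and bad_words) ends up in SamplingParams.logits_processors, a list of callables applied to the next-token logits before sampling. A minimal sketch of such a callable (the function name and banned token id are illustrative; the two-argument form shown takes the tokens generated so far plus the logits tensor):

    import torch

    def ban_token_42(token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
        # Called once per decoding step; returning modified logits changes
        # what the sampler is allowed to pick next.
        logits[42] = float("-inf")
        return logits

    # Hypothetical usage, mirroring how the engine attaches processors above:
    # params = SamplingParams(logits_processors=[ban_token_42])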
+    def collective_rpc(self,
+                       method: Union[str, Callable[..., _R]],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+        return self.model_executor.collective_rpc(method, timeout, args,
+                                                  kwargs)
+
+
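collective_rpc simply forwards to the executor, which invokes the named method (or callable) on every worker and gathers one result per worker. A hedged sketch, assuming `engine` is an already-constructed LLMEngine and that the workers expose the named method (the method name here is only an illustration, not taken from this file):

    # Each element of the returned list comes from one worker.
    results = engine.collective_rpc("get_cache_block_size_bytes", timeout=5.0)
    for rank, value in enumerate(results):
        print(f"worker {rank}: {value}")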
+if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+    LLMEngine = V1LLMEngine  # type: ignore
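The final block rebinds the public LLMEngine name to the V1 implementation whenever the VLLM_USE_V1 environment variable is set. A minimal opt-in sketch; the variable has to be set before vllm is imported so the alias above is evaluated with it in place:

    import os

    os.environ["VLLM_USE_V1"] = "1"  # opt in to the V1 engine

    from vllm.engine.llm_engine import LLMEngine  # resolves to the V1 class here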