vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
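Since a wheel is just a zip archive, a listing like the one below can be approximated locally by enumerating the archive members. The following is a minimal sketch, not part of the registry's tooling; the wheel path is an assumption, and byte sizes stand in for the per-file line counts shown in the diff.

    # Sketch: reproduce a per-file manifest from a downloaded wheel.
    # The wheel filename below is an assumption based on this release.
    import zipfile

    WHEEL = "vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl"

    with zipfile.ZipFile(WHEEL) as wheel:
        members = wheel.infolist()
        print(f"Files changed ({len(members)})")
        for i, info in enumerate(members, start=1):
            # Every file is new in this release, so the diff shows
            # "+lines -0" per entry; an archive listing only exposes sizes.
            print(f"{i}. {info.filename} ({info.file_size} bytes)")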

Potentially problematic release.


This version of vllm-cpu might be problematic.

Files changed (1103)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1536 -0
  4. vllm/_ipex_ops.py +241 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +38 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +31 -0
  16. vllm/assets/video.py +103 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +303 -0
  22. vllm/attention/backends/flash_attn.py +999 -0
  23. vllm/attention/backends/flashinfer.py +1092 -0
  24. vllm/attention/backends/flashmla.py +242 -0
  25. vllm/attention/backends/hpu_attn.py +301 -0
  26. vllm/attention/backends/ipex_attn.py +396 -0
  27. vllm/attention/backends/mla/__init__.py +0 -0
  28. vllm/attention/backends/mla/common.py +1444 -0
  29. vllm/attention/backends/pallas.py +346 -0
  30. vllm/attention/backends/placeholder_attn.py +399 -0
  31. vllm/attention/backends/rocm_aiter_mla.py +412 -0
  32. vllm/attention/backends/rocm_flash_attn.py +969 -0
  33. vllm/attention/backends/torch_sdpa.py +691 -0
  34. vllm/attention/backends/triton_mla.py +113 -0
  35. vllm/attention/backends/utils.py +609 -0
  36. vllm/attention/backends/xformers.py +798 -0
  37. vllm/attention/layer.py +443 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  41. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  42. vllm/attention/ops/blocksparse_attention/utils.py +244 -0
  43. vllm/attention/ops/chunked_prefill_paged_decode.py +366 -0
  44. vllm/attention/ops/flashmla.py +115 -0
  45. vllm/attention/ops/hpu_paged_attn.py +105 -0
  46. vllm/attention/ops/ipex_attn.py +193 -0
  47. vllm/attention/ops/merge_attn_states.py +42 -0
  48. vllm/attention/ops/nki_flash_attn.py +905 -0
  49. vllm/attention/ops/paged_attn.py +255 -0
  50. vllm/attention/ops/prefix_prefill.py +902 -0
  51. vllm/attention/ops/rocm_aiter_mla.py +42 -0
  52. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  53. vllm/attention/ops/triton_decode_attention.py +675 -0
  54. vllm/attention/ops/triton_flash_attention.py +1375 -0
  55. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  56. vllm/attention/selector.py +186 -0
  57. vllm/attention/utils/fa_utils.py +54 -0
  58. vllm/beam_search.py +82 -0
  59. vllm/benchmarks/__init__.py +0 -0
  60. vllm/benchmarks/datasets.py +831 -0
  61. vllm/benchmarks/endpoint_request_func.py +160 -0
  62. vllm/benchmarks/latency.py +181 -0
  63. vllm/benchmarks/serve.py +925 -0
  64. vllm/benchmarks/throughput.py +608 -0
  65. vllm/benchmarks/utils.py +69 -0
  66. vllm/collect_env.py +795 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/backends.py +715 -0
  69. vllm/compilation/compiler_interface.py +437 -0
  70. vllm/compilation/counter.py +33 -0
  71. vllm/compilation/decorators.py +249 -0
  72. vllm/compilation/fix_functionalization.py +182 -0
  73. vllm/compilation/fusion.py +617 -0
  74. vllm/compilation/fx_utils.py +60 -0
  75. vllm/compilation/inductor_pass.py +114 -0
  76. vllm/compilation/monitor.py +38 -0
  77. vllm/compilation/multi_output_match.py +108 -0
  78. vllm/compilation/noop_elimination.py +135 -0
  79. vllm/compilation/pass_manager.py +74 -0
  80. vllm/compilation/sequence_parallelism.py +266 -0
  81. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  82. vllm/compilation/vllm_inductor_pass.py +68 -0
  83. vllm/compilation/wrapper.py +129 -0
  84. vllm/config.py +4179 -0
  85. vllm/connections.py +170 -0
  86. vllm/core/__init__.py +0 -0
  87. vllm/core/block/__init__.py +0 -0
  88. vllm/core/block/block_table.py +398 -0
  89. vllm/core/block/common.py +370 -0
  90. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  91. vllm/core/block/interfaces.py +318 -0
  92. vllm/core/block/naive_block.py +465 -0
  93. vllm/core/block/prefix_caching_block.py +1134 -0
  94. vllm/core/block/utils.py +27 -0
  95. vllm/core/block_manager.py +520 -0
  96. vllm/core/evictor.py +156 -0
  97. vllm/core/interfaces.py +134 -0
  98. vllm/core/placeholder_block_space_manager.py +99 -0
  99. vllm/core/scheduler.py +2060 -0
  100. vllm/device_allocator/__init__.py +0 -0
  101. vllm/device_allocator/cumem.py +280 -0
  102. vllm/distributed/__init__.py +5 -0
  103. vllm/distributed/communication_op.py +40 -0
  104. vllm/distributed/device_communicators/__init__.py +0 -0
  105. vllm/distributed/device_communicators/base_device_communicator.py +151 -0
  106. vllm/distributed/device_communicators/cpu_communicator.py +139 -0
  107. vllm/distributed/device_communicators/cuda_communicator.py +131 -0
  108. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  109. vllm/distributed/device_communicators/custom_all_reduce.py +301 -0
  110. vllm/distributed/device_communicators/custom_all_reduce_utils.py +257 -0
  111. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  112. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  113. vllm/distributed/device_communicators/pynccl.py +217 -0
  114. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  115. vllm/distributed/device_communicators/shm_broadcast.py +557 -0
  116. vllm/distributed/device_communicators/tpu_communicator.py +93 -0
  117. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  118. vllm/distributed/kv_transfer/README.md +29 -0
  119. vllm/distributed/kv_transfer/__init__.py +11 -0
  120. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  121. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  122. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  123. vllm/distributed/kv_transfer/kv_connector/factory.py +107 -0
  124. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  125. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +201 -0
  126. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +90 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +8 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +209 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +131 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  132. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  133. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  134. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  135. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  136. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  137. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  138. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  139. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  140. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  141. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  142. vllm/distributed/parallel_state.py +1209 -0
  143. vllm/distributed/utils.py +366 -0
  144. vllm/engine/__init__.py +0 -0
  145. vllm/engine/arg_utils.py +1724 -0
  146. vllm/engine/async_llm_engine.py +1261 -0
  147. vllm/engine/async_timeout.py +191 -0
  148. vllm/engine/llm_engine.py +2150 -0
  149. vllm/engine/metrics.py +717 -0
  150. vllm/engine/metrics_types.py +96 -0
  151. vllm/engine/multiprocessing/__init__.py +183 -0
  152. vllm/engine/multiprocessing/client.py +745 -0
  153. vllm/engine/multiprocessing/engine.py +450 -0
  154. vllm/engine/output_processor/__init__.py +0 -0
  155. vllm/engine/output_processor/interfaces.py +74 -0
  156. vllm/engine/output_processor/multi_step.py +210 -0
  157. vllm/engine/output_processor/single_step.py +136 -0
  158. vllm/engine/output_processor/stop_checker.py +130 -0
  159. vllm/engine/output_processor/util.py +27 -0
  160. vllm/engine/protocol.py +302 -0
  161. vllm/entrypoints/__init__.py +0 -0
  162. vllm/entrypoints/api_server.py +177 -0
  163. vllm/entrypoints/chat_utils.py +1259 -0
  164. vllm/entrypoints/cli/__init__.py +0 -0
  165. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  166. vllm/entrypoints/cli/benchmark/base.py +38 -0
  167. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  168. vllm/entrypoints/cli/benchmark/main.py +53 -0
  169. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  170. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  171. vllm/entrypoints/cli/collect_env.py +35 -0
  172. vllm/entrypoints/cli/main.py +59 -0
  173. vllm/entrypoints/cli/openai.py +175 -0
  174. vllm/entrypoints/cli/serve.py +59 -0
  175. vllm/entrypoints/cli/types.py +24 -0
  176. vllm/entrypoints/launcher.py +146 -0
  177. vllm/entrypoints/llm.py +1450 -0
  178. vllm/entrypoints/logger.py +44 -0
  179. vllm/entrypoints/openai/__init__.py +0 -0
  180. vllm/entrypoints/openai/api_server.py +1130 -0
  181. vllm/entrypoints/openai/cli_args.py +296 -0
  182. vllm/entrypoints/openai/logits_processors.py +89 -0
  183. vllm/entrypoints/openai/protocol.py +1806 -0
  184. vllm/entrypoints/openai/run_batch.py +439 -0
  185. vllm/entrypoints/openai/serving_chat.py +1210 -0
  186. vllm/entrypoints/openai/serving_completion.py +557 -0
  187. vllm/entrypoints/openai/serving_embedding.py +245 -0
  188. vllm/entrypoints/openai/serving_engine.py +569 -0
  189. vllm/entrypoints/openai/serving_models.py +314 -0
  190. vllm/entrypoints/openai/serving_pooling.py +237 -0
  191. vllm/entrypoints/openai/serving_score.py +439 -0
  192. vllm/entrypoints/openai/serving_tokenization.py +147 -0
  193. vllm/entrypoints/openai/serving_transcription.py +421 -0
  194. vllm/entrypoints/openai/tool_parsers/__init__.py +19 -0
  195. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  196. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +254 -0
  197. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +232 -0
  198. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  199. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +211 -0
  200. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +303 -0
  201. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +262 -0
  202. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  203. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +110 -0
  204. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +292 -0
  205. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  206. vllm/entrypoints/score_utils.py +49 -0
  207. vllm/entrypoints/ssl.py +74 -0
  208. vllm/entrypoints/utils.py +136 -0
  209. vllm/env_override.py +34 -0
  210. vllm/envs.py +800 -0
  211. vllm/executor/__init__.py +0 -0
  212. vllm/executor/executor_base.py +400 -0
  213. vllm/executor/mp_distributed_executor.py +243 -0
  214. vllm/executor/msgspec_utils.py +29 -0
  215. vllm/executor/multiproc_worker_utils.py +312 -0
  216. vllm/executor/ray_distributed_executor.py +700 -0
  217. vllm/executor/ray_utils.py +400 -0
  218. vllm/executor/uniproc_executor.py +141 -0
  219. vllm/forward_context.py +159 -0
  220. vllm/inputs/__init__.py +37 -0
  221. vllm/inputs/data.py +248 -0
  222. vllm/inputs/parse.py +121 -0
  223. vllm/inputs/preprocess.py +745 -0
  224. vllm/inputs/registry.py +212 -0
  225. vllm/jsontree.py +79 -0
  226. vllm/logger.py +210 -0
  227. vllm/logging_utils/__init__.py +7 -0
  228. vllm/logging_utils/formatter.py +17 -0
  229. vllm/logits_process.py +121 -0
  230. vllm/lora/__init__.py +0 -0
  231. vllm/lora/fully_sharded_layers.py +335 -0
  232. vllm/lora/layers.py +1263 -0
  233. vllm/lora/lora.py +198 -0
  234. vllm/lora/models.py +802 -0
  235. vllm/lora/ops/__init__.py +0 -0
  236. vllm/lora/ops/torch_ops/__init__.py +15 -0
  237. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  238. vllm/lora/ops/triton_ops/__init__.py +11 -0
  239. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  240. vllm/lora/ops/triton_ops/lora_expand.py +293 -0
  241. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  242. vllm/lora/ops/triton_ops/lora_shrink.py +247 -0
  243. vllm/lora/ops/triton_ops/utils.py +121 -0
  244. vllm/lora/peft_helper.py +115 -0
  245. vllm/lora/punica_wrapper/__init__.py +9 -0
  246. vllm/lora/punica_wrapper/punica_base.py +483 -0
  247. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  248. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  249. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  250. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  251. vllm/lora/punica_wrapper/utils.py +161 -0
  252. vllm/lora/request.py +97 -0
  253. vllm/lora/resolver.py +83 -0
  254. vllm/lora/utils.py +237 -0
  255. vllm/lora/worker_manager.py +251 -0
  256. vllm/model_executor/__init__.py +15 -0
  257. vllm/model_executor/custom_op.py +153 -0
  258. vllm/model_executor/guided_decoding/__init__.py +180 -0
  259. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  260. vllm/model_executor/guided_decoding/guidance_logits_processors.py +85 -0
  261. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  262. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  263. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  264. vllm/model_executor/guided_decoding/outlines_logits_processors.py +271 -0
  265. vllm/model_executor/guided_decoding/reasoner/__init__.py +35 -0
  266. vllm/model_executor/guided_decoding/utils.py +241 -0
  267. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  268. vllm/model_executor/layers/__init__.py +0 -0
  269. vllm/model_executor/layers/activation.py +368 -0
  270. vllm/model_executor/layers/fused_moe/__init__.py +51 -0
  271. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  272. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  273. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  274. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  275. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  276. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  277. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  278. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  279. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  280. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  281. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  282. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  283. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  284. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  285. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  286. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  287. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  426. vllm/model_executor/layers/fused_moe/cutlass_moe.py +180 -0
  427. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +294 -0
  428. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +374 -0
  429. vllm/model_executor/layers/fused_moe/fused_moe.py +1539 -0
  430. vllm/model_executor/layers/fused_moe/layer.py +949 -0
  431. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  432. vllm/model_executor/layers/fused_moe/moe_pallas.py +64 -0
  433. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  434. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +416 -0
  435. vllm/model_executor/layers/fused_moe/utils.py +48 -0
  436. vllm/model_executor/layers/layernorm.py +277 -0
  437. vllm/model_executor/layers/lightning_attn.py +651 -0
  438. vllm/model_executor/layers/linear.py +1518 -0
  439. vllm/model_executor/layers/logits_processor.py +196 -0
  440. vllm/model_executor/layers/mamba/__init__.py +0 -0
  441. vllm/model_executor/layers/mamba/mamba2_metadata.py +109 -0
  442. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  443. vllm/model_executor/layers/mamba/mamba_mixer2.py +538 -0
  444. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  445. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  446. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +415 -0
  447. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  448. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  449. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  450. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  451. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  452. vllm/model_executor/layers/pooler.py +336 -0
  453. vllm/model_executor/layers/quantization/__init__.py +153 -0
  454. vllm/model_executor/layers/quantization/aqlm.py +374 -0
  455. vllm/model_executor/layers/quantization/awq.py +184 -0
  456. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  457. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  458. vllm/model_executor/layers/quantization/base_config.py +145 -0
  459. vllm/model_executor/layers/quantization/bitblas.py +459 -0
  460. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  461. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  462. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +624 -0
  463. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1100 -0
  464. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
  465. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  466. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  467. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  468. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +119 -0
  469. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  470. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  471. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  472. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  473. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +213 -0
  474. vllm/model_executor/layers/quantization/deepspeedfp.py +193 -0
  475. vllm/model_executor/layers/quantization/experts_int8.py +194 -0
  476. vllm/model_executor/layers/quantization/fbgemm_fp8.py +168 -0
  477. vllm/model_executor/layers/quantization/fp8.py +832 -0
  478. vllm/model_executor/layers/quantization/gguf.py +408 -0
  479. vllm/model_executor/layers/quantization/gptq.py +276 -0
  480. vllm/model_executor/layers/quantization/gptq_bitblas.py +438 -0
  481. vllm/model_executor/layers/quantization/gptq_marlin.py +643 -0
  482. vllm/model_executor/layers/quantization/gptq_marlin_24.py +295 -0
  483. vllm/model_executor/layers/quantization/hqq_marlin.py +328 -0
  484. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  485. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  486. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  487. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  488. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  489. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  490. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  491. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  492. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +132 -0
  493. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  494. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  495. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  496. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  497. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  498. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  499. vllm/model_executor/layers/quantization/kv_cache.py +137 -0
  500. vllm/model_executor/layers/quantization/marlin.py +259 -0
  501. vllm/model_executor/layers/quantization/modelopt.py +410 -0
  502. vllm/model_executor/layers/quantization/moe_wna16.py +447 -0
  503. vllm/model_executor/layers/quantization/neuron_quant.py +67 -0
  504. vllm/model_executor/layers/quantization/ptpc_fp8.py +125 -0
  505. vllm/model_executor/layers/quantization/qqq.py +273 -0
  506. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  507. vllm/model_executor/layers/quantization/quark/quark.py +385 -0
  508. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  509. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +7 -0
  510. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  511. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +142 -0
  512. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  513. vllm/model_executor/layers/quantization/quark/utils.py +102 -0
  514. vllm/model_executor/layers/quantization/schema.py +85 -0
  515. vllm/model_executor/layers/quantization/torchao.py +127 -0
  516. vllm/model_executor/layers/quantization/tpu_int8.py +119 -0
  517. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  518. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  519. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +198 -0
  520. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  521. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  522. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  523. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  524. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  525. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  526. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  527. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  528. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  529. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  530. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  531. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  532. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  533. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  534. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  535. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  536. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  537. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  538. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  539. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  540. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  541. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  542. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  543. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  544. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  545. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  546. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  547. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  548. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  549. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  550. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  551. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/fp8_utils.py +523 -0
  723. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  724. vllm/model_executor/layers/quantization/utils/int8_utils.py +459 -0
  725. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  726. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  727. vllm/model_executor/layers/quantization/utils/marlin_utils.py +413 -0
  728. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +110 -0
  729. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  730. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  731. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +127 -0
  732. vllm/model_executor/layers/quantization/utils/quant_utils.py +571 -0
  733. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  734. vllm/model_executor/layers/rejection_sampler.py +400 -0
  735. vllm/model_executor/layers/resampler.py +269 -0
  736. vllm/model_executor/layers/rotary_embedding.py +1598 -0
  737. vllm/model_executor/layers/sampler.py +1221 -0
  738. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  739. vllm/model_executor/layers/typical_acceptance_sampler.py +172 -0
  740. vllm/model_executor/layers/utils.py +99 -0
  741. vllm/model_executor/layers/vocab_parallel_embedding.py +485 -0
  742. vllm/model_executor/model_loader/__init__.py +20 -0
  743. vllm/model_executor/model_loader/loader.py +1542 -0
  744. vllm/model_executor/model_loader/neuron.py +243 -0
  745. vllm/model_executor/model_loader/tensorizer.py +468 -0
  746. vllm/model_executor/model_loader/utils.py +171 -0
  747. vllm/model_executor/model_loader/weight_utils.py +749 -0
  748. vllm/model_executor/models/__init__.py +27 -0
  749. vllm/model_executor/models/adapters.py +247 -0
  750. vllm/model_executor/models/arctic.py +559 -0
  751. vllm/model_executor/models/aria.py +656 -0
  752. vllm/model_executor/models/aya_vision.py +461 -0
  753. vllm/model_executor/models/baichuan.py +469 -0
  754. vllm/model_executor/models/bamba.py +542 -0
  755. vllm/model_executor/models/bart.py +936 -0
  756. vllm/model_executor/models/bert.py +725 -0
  757. vllm/model_executor/models/blip.py +337 -0
  758. vllm/model_executor/models/blip2.py +717 -0
  759. vllm/model_executor/models/bloom.py +358 -0
  760. vllm/model_executor/models/chameleon.py +1135 -0
  761. vllm/model_executor/models/chatglm.py +476 -0
  762. vllm/model_executor/models/clip.py +410 -0
  763. vllm/model_executor/models/commandr.py +466 -0
  764. vllm/model_executor/models/constant_size_cache.py +136 -0
  765. vllm/model_executor/models/dbrx.py +469 -0
  766. vllm/model_executor/models/deepseek.py +484 -0
  767. vllm/model_executor/models/deepseek_mtp.py +266 -0
  768. vllm/model_executor/models/deepseek_v2.py +830 -0
  769. vllm/model_executor/models/deepseek_vl2.py +647 -0
  770. vllm/model_executor/models/eagle.py +247 -0
  771. vllm/model_executor/models/exaone.py +548 -0
  772. vllm/model_executor/models/fairseq2_llama.py +153 -0
  773. vllm/model_executor/models/falcon.py +508 -0
  774. vllm/model_executor/models/florence2.py +1102 -0
  775. vllm/model_executor/models/fuyu.py +388 -0
  776. vllm/model_executor/models/gemma.py +423 -0
  777. vllm/model_executor/models/gemma2.py +423 -0
  778. vllm/model_executor/models/gemma3.py +531 -0
  779. vllm/model_executor/models/gemma3_mm.py +716 -0
  780. vllm/model_executor/models/glm.py +22 -0
  781. vllm/model_executor/models/glm4.py +303 -0
  782. vllm/model_executor/models/glm4v.py +647 -0
  783. vllm/model_executor/models/gpt2.py +313 -0
  784. vllm/model_executor/models/gpt_bigcode.py +336 -0
  785. vllm/model_executor/models/gpt_j.py +337 -0
  786. vllm/model_executor/models/gpt_neox.py +330 -0
  787. vllm/model_executor/models/granite.py +494 -0
  788. vllm/model_executor/models/granite_speech.py +777 -0
  789. vllm/model_executor/models/granitemoe.py +435 -0
  790. vllm/model_executor/models/granitemoeshared.py +339 -0
  791. vllm/model_executor/models/gritlm.py +245 -0
  792. vllm/model_executor/models/grok1.py +560 -0
  793. vllm/model_executor/models/h2ovl.py +542 -0
  794. vllm/model_executor/models/idefics2_vision_model.py +387 -0
  795. vllm/model_executor/models/idefics3.py +767 -0
  796. vllm/model_executor/models/interfaces.py +569 -0
  797. vllm/model_executor/models/interfaces_base.py +163 -0
  798. vllm/model_executor/models/intern_vit.py +476 -0
  799. vllm/model_executor/models/internlm2.py +453 -0
  800. vllm/model_executor/models/internlm2_ve.py +146 -0
  801. vllm/model_executor/models/internvl.py +945 -0
  802. vllm/model_executor/models/jais.py +371 -0
  803. vllm/model_executor/models/jamba.py +590 -0
  804. vllm/model_executor/models/kimi_vl.py +577 -0
  805. vllm/model_executor/models/llama.py +619 -0
  806. vllm/model_executor/models/llama4.py +530 -0
  807. vllm/model_executor/models/llama_eagle.py +152 -0
  808. vllm/model_executor/models/llama_eagle3.py +232 -0
  809. vllm/model_executor/models/llava.py +869 -0
  810. vllm/model_executor/models/llava_next.py +582 -0
  811. vllm/model_executor/models/llava_next_video.py +470 -0
  812. vllm/model_executor/models/llava_onevision.py +954 -0
  813. vllm/model_executor/models/mamba.py +271 -0
  814. vllm/model_executor/models/mamba2.py +302 -0
  815. vllm/model_executor/models/mamba_cache.py +76 -0
  816. vllm/model_executor/models/medusa.py +210 -0
  817. vllm/model_executor/models/minicpm.py +592 -0
  818. vllm/model_executor/models/minicpm3.py +229 -0
  819. vllm/model_executor/models/minicpmo.py +725 -0
  820. vllm/model_executor/models/minicpmv.py +1287 -0
  821. vllm/model_executor/models/minimax_cache.py +35 -0
  822. vllm/model_executor/models/minimax_text_01.py +1261 -0
  823. vllm/model_executor/models/mistral3.py +598 -0
  824. vllm/model_executor/models/mixtral.py +485 -0
  825. vllm/model_executor/models/mixtral_quant.py +447 -0
  826. vllm/model_executor/models/mllama.py +1623 -0
  827. vllm/model_executor/models/mllama4.py +838 -0
  828. vllm/model_executor/models/mlp_speculator.py +205 -0
  829. vllm/model_executor/models/modernbert.py +325 -0
  830. vllm/model_executor/models/module_mapping.py +71 -0
  831. vllm/model_executor/models/molmo.py +1567 -0
  832. vllm/model_executor/models/moonvit.py +628 -0
  833. vllm/model_executor/models/mpt.py +329 -0
  834. vllm/model_executor/models/nemotron.py +506 -0
  835. vllm/model_executor/models/nemotron_nas.py +446 -0
  836. vllm/model_executor/models/nvlm_d.py +212 -0
  837. vllm/model_executor/models/olmo.py +390 -0
  838. vllm/model_executor/models/olmo2.py +412 -0
  839. vllm/model_executor/models/olmoe.py +449 -0
  840. vllm/model_executor/models/opt.py +410 -0
  841. vllm/model_executor/models/orion.py +356 -0
  842. vllm/model_executor/models/paligemma.py +397 -0
  843. vllm/model_executor/models/persimmon.py +342 -0
  844. vllm/model_executor/models/phi.py +354 -0
  845. vllm/model_executor/models/phi3.py +18 -0
  846. vllm/model_executor/models/phi3_small.py +463 -0
  847. vllm/model_executor/models/phi3v.py +722 -0
  848. vllm/model_executor/models/phi4mm.py +1263 -0
  849. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  850. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  851. vllm/model_executor/models/phimoe.py +666 -0
  852. vllm/model_executor/models/pixtral.py +1281 -0
  853. vllm/model_executor/models/plamo2.py +736 -0
  854. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  855. vllm/model_executor/models/qwen.py +360 -0
  856. vllm/model_executor/models/qwen2.py +552 -0
  857. vllm/model_executor/models/qwen2_5_omni_thinker.py +901 -0
  858. vllm/model_executor/models/qwen2_5_vl.py +1136 -0
  859. vllm/model_executor/models/qwen2_audio.py +402 -0
  860. vllm/model_executor/models/qwen2_moe.py +531 -0
  861. vllm/model_executor/models/qwen2_rm.py +130 -0
  862. vllm/model_executor/models/qwen2_vl.py +1409 -0
  863. vllm/model_executor/models/qwen3.py +319 -0
  864. vllm/model_executor/models/qwen3_moe.py +528 -0
  865. vllm/model_executor/models/qwen_vl.py +784 -0
  866. vllm/model_executor/models/registry.py +611 -0
  867. vllm/model_executor/models/roberta.py +332 -0
  868. vllm/model_executor/models/siglip.py +522 -0
  869. vllm/model_executor/models/skyworkr1v.py +949 -0
  870. vllm/model_executor/models/smolvlm.py +51 -0
  871. vllm/model_executor/models/solar.py +504 -0
  872. vllm/model_executor/models/stablelm.py +349 -0
  873. vllm/model_executor/models/starcoder2.py +355 -0
  874. vllm/model_executor/models/telechat2.py +139 -0
  875. vllm/model_executor/models/teleflm.py +78 -0
  876. vllm/model_executor/models/transformers.py +442 -0
  877. vllm/model_executor/models/ultravox.py +655 -0
  878. vllm/model_executor/models/utils.py +714 -0
  879. vllm/model_executor/models/vision.py +149 -0
  880. vllm/model_executor/models/whisper.py +746 -0
  881. vllm/model_executor/models/zamba2.py +1008 -0
  882. vllm/model_executor/parameter.py +458 -0
  883. vllm/model_executor/pooling_metadata.py +71 -0
  884. vllm/model_executor/sampling_metadata.py +596 -0
  885. vllm/model_executor/utils.py +53 -0
  886. vllm/multimodal/__init__.py +31 -0
  887. vllm/multimodal/audio.py +105 -0
  888. vllm/multimodal/base.py +218 -0
  889. vllm/multimodal/hasher.py +103 -0
  890. vllm/multimodal/image.py +77 -0
  891. vllm/multimodal/inputs.py +843 -0
  892. vllm/multimodal/parse.py +454 -0
  893. vllm/multimodal/processing.py +1760 -0
  894. vllm/multimodal/profiling.py +274 -0
  895. vllm/multimodal/registry.py +321 -0
  896. vllm/multimodal/utils.py +386 -0
  897. vllm/multimodal/video.py +166 -0
  898. vllm/outputs.py +521 -0
  899. vllm/platforms/__init__.py +286 -0
  900. vllm/platforms/cpu.py +182 -0
  901. vllm/platforms/cuda.py +463 -0
  902. vllm/platforms/hpu.py +94 -0
  903. vllm/platforms/interface.py +427 -0
  904. vllm/platforms/neuron.py +69 -0
  905. vllm/platforms/rocm.py +346 -0
  906. vllm/platforms/tpu.py +174 -0
  907. vllm/platforms/xpu.py +142 -0
  908. vllm/plugins/__init__.py +82 -0
  909. vllm/pooling_params.py +53 -0
  910. vllm/profiler/__init__.py +7 -0
  911. vllm/profiler/layerwise_profile.py +374 -0
  912. vllm/profiler/utils.py +147 -0
  913. vllm/prompt_adapter/__init__.py +0 -0
  914. vllm/prompt_adapter/layers.py +82 -0
  915. vllm/prompt_adapter/models.py +357 -0
  916. vllm/prompt_adapter/request.py +36 -0
  917. vllm/prompt_adapter/utils.py +97 -0
  918. vllm/prompt_adapter/worker_manager.py +178 -0
  919. vllm/py.typed +2 -0
  920. vllm/reasoning/__init__.py +12 -0
  921. vllm/reasoning/abs_reasoning_parsers.py +189 -0
  922. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  923. vllm/reasoning/granite_reasoning_parser.py +362 -0
  924. vllm/sampling_params.py +598 -0
  925. vllm/scalar_type.py +335 -0
  926. vllm/scripts.py +14 -0
  927. vllm/sequence.py +1486 -0
  928. vllm/spec_decode/__init__.py +0 -0
  929. vllm/spec_decode/batch_expansion.py +505 -0
  930. vllm/spec_decode/draft_model_runner.py +335 -0
  931. vllm/spec_decode/interfaces.py +98 -0
  932. vllm/spec_decode/medusa_worker.py +137 -0
  933. vllm/spec_decode/metrics.py +212 -0
  934. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  935. vllm/spec_decode/mqa_scorer.py +159 -0
  936. vllm/spec_decode/multi_step_worker.py +416 -0
  937. vllm/spec_decode/ngram_worker.py +195 -0
  938. vllm/spec_decode/proposer_worker_base.py +58 -0
  939. vllm/spec_decode/smaller_tp_proposer_worker.py +194 -0
  940. vllm/spec_decode/spec_decode_worker.py +1324 -0
  941. vllm/spec_decode/target_model_runner.py +44 -0
  942. vllm/spec_decode/top1_proposer.py +274 -0
  943. vllm/spec_decode/util.py +276 -0
  944. vllm/test_utils.py +129 -0
  945. vllm/third_party/__init__.py +0 -0
  946. vllm/third_party/pynvml.py +6139 -0
  947. vllm/tracing.py +130 -0
  948. vllm/transformers_utils/__init__.py +19 -0
  949. vllm/transformers_utils/config.py +813 -0
  950. vllm/transformers_utils/configs/__init__.py +52 -0
  951. vllm/transformers_utils/configs/arctic.py +206 -0
  952. vllm/transformers_utils/configs/chatglm.py +71 -0
  953. vllm/transformers_utils/configs/cohere2.py +194 -0
  954. vllm/transformers_utils/configs/dbrx.py +280 -0
  955. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  956. vllm/transformers_utils/configs/eagle.py +65 -0
  957. vllm/transformers_utils/configs/exaone.py +191 -0
  958. vllm/transformers_utils/configs/falcon.py +89 -0
  959. vllm/transformers_utils/configs/h2ovl.py +15 -0
  960. vllm/transformers_utils/configs/internvl.py +53 -0
  961. vllm/transformers_utils/configs/jais.py +237 -0
  962. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  963. vllm/transformers_utils/configs/medusa.py +62 -0
  964. vllm/transformers_utils/configs/mllama.py +30 -0
  965. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  966. vllm/transformers_utils/configs/moonvit.py +32 -0
  967. vllm/transformers_utils/configs/mpt.py +179 -0
  968. vllm/transformers_utils/configs/nemotron.py +204 -0
  969. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  970. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  971. vllm/transformers_utils/configs/solar.py +246 -0
  972. vllm/transformers_utils/configs/telechat2.py +63 -0
  973. vllm/transformers_utils/configs/ultravox.py +107 -0
  974. vllm/transformers_utils/detokenizer.py +167 -0
  975. vllm/transformers_utils/detokenizer_utils.py +188 -0
  976. vllm/transformers_utils/processor.py +210 -0
  977. vllm/transformers_utils/processors/__init__.py +6 -0
  978. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  979. vllm/transformers_utils/s3_utils.py +161 -0
  980. vllm/transformers_utils/tokenizer.py +291 -0
  981. vllm/transformers_utils/tokenizer_base.py +146 -0
  982. vllm/transformers_utils/tokenizer_group.py +110 -0
  983. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  984. vllm/transformers_utils/tokenizers/mistral.py +483 -0
  985. vllm/transformers_utils/utils.py +98 -0
  986. vllm/triton_utils/__init__.py +5 -0
  987. vllm/triton_utils/importing.py +53 -0
  988. vllm/usage/__init__.py +0 -0
  989. vllm/usage/usage_lib.py +255 -0
  990. vllm/utils.py +2692 -0
  991. vllm/v1/__init__.py +0 -0
  992. vllm/v1/attention/__init__.py +0 -0
  993. vllm/v1/attention/backends/__init__.py +0 -0
  994. vllm/v1/attention/backends/flash_attn.py +783 -0
  995. vllm/v1/attention/backends/flashinfer.py +638 -0
  996. vllm/v1/attention/backends/mla/__init__.py +0 -0
  997. vllm/v1/attention/backends/mla/common.py +974 -0
  998. vllm/v1/attention/backends/mla/flashmla.py +149 -0
  999. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1000. vllm/v1/attention/backends/pallas.py +221 -0
  1001. vllm/v1/attention/backends/triton_attn.py +198 -0
  1002. vllm/v1/core/__init__.py +0 -0
  1003. vllm/v1/core/block_pool.py +281 -0
  1004. vllm/v1/core/encoder_cache_manager.py +149 -0
  1005. vllm/v1/core/kv_cache_manager.py +385 -0
  1006. vllm/v1/core/kv_cache_utils.py +744 -0
  1007. vllm/v1/core/sched/__init__.py +0 -0
  1008. vllm/v1/core/sched/interface.py +134 -0
  1009. vllm/v1/core/sched/output.py +126 -0
  1010. vllm/v1/core/sched/scheduler.py +838 -0
  1011. vllm/v1/core/sched/utils.py +22 -0
  1012. vllm/v1/core/specialized_manager.py +161 -0
  1013. vllm/v1/engine/__init__.py +166 -0
  1014. vllm/v1/engine/async_llm.py +532 -0
  1015. vllm/v1/engine/core.py +701 -0
  1016. vllm/v1/engine/core_client.py +942 -0
  1017. vllm/v1/engine/detokenizer.py +260 -0
  1018. vllm/v1/engine/exceptions.py +16 -0
  1019. vllm/v1/engine/llm_engine.py +285 -0
  1020. vllm/v1/engine/logprobs.py +198 -0
  1021. vllm/v1/engine/mm_input_cache.py +82 -0
  1022. vllm/v1/engine/output_processor.py +420 -0
  1023. vllm/v1/engine/parallel_sampling.py +132 -0
  1024. vllm/v1/engine/processor.py +387 -0
  1025. vllm/v1/executor/__init__.py +0 -0
  1026. vllm/v1/executor/abstract.py +112 -0
  1027. vllm/v1/executor/multiproc_executor.py +480 -0
  1028. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1029. vllm/v1/kv_cache_interface.py +166 -0
  1030. vllm/v1/metrics/__init__.py +0 -0
  1031. vllm/v1/metrics/loggers.py +498 -0
  1032. vllm/v1/metrics/stats.py +238 -0
  1033. vllm/v1/outputs.py +111 -0
  1034. vllm/v1/request.py +178 -0
  1035. vllm/v1/sample/__init__.py +0 -0
  1036. vllm/v1/sample/metadata.py +43 -0
  1037. vllm/v1/sample/ops/__init__.py +0 -0
  1038. vllm/v1/sample/ops/bad_words.py +38 -0
  1039. vllm/v1/sample/ops/penalties.py +58 -0
  1040. vllm/v1/sample/ops/topk_topp_sampler.py +315 -0
  1041. vllm/v1/sample/rejection_sampler.py +631 -0
  1042. vllm/v1/sample/sampler.py +270 -0
  1043. vllm/v1/sample/tpu/__init__.py +0 -0
  1044. vllm/v1/sample/tpu/metadata.py +118 -0
  1045. vllm/v1/sample/tpu/sampler.py +154 -0
  1046. vllm/v1/serial_utils.py +274 -0
  1047. vllm/v1/spec_decode/__init__.py +0 -0
  1048. vllm/v1/spec_decode/eagle.py +318 -0
  1049. vllm/v1/spec_decode/metadata.py +61 -0
  1050. vllm/v1/spec_decode/metrics.py +164 -0
  1051. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1052. vllm/v1/spec_decode/utils.py +18 -0
  1053. vllm/v1/stats/__init__.py +0 -0
  1054. vllm/v1/stats/common.py +453 -0
  1055. vllm/v1/structured_output/__init__.py +113 -0
  1056. vllm/v1/structured_output/backend_guidance.py +215 -0
  1057. vllm/v1/structured_output/backend_types.py +96 -0
  1058. vllm/v1/structured_output/backend_xgrammar.py +299 -0
  1059. vllm/v1/structured_output/request.py +84 -0
  1060. vllm/v1/structured_output/utils.py +174 -0
  1061. vllm/v1/utils.py +249 -0
  1062. vllm/v1/worker/__init__.py +0 -0
  1063. vllm/v1/worker/block_table.py +87 -0
  1064. vllm/v1/worker/gpu_input_batch.py +677 -0
  1065. vllm/v1/worker/gpu_model_runner.py +1776 -0
  1066. vllm/v1/worker/gpu_worker.py +349 -0
  1067. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1068. vllm/v1/worker/tpu_model_runner.py +1419 -0
  1069. vllm/v1/worker/tpu_worker.py +260 -0
  1070. vllm/v1/worker/utils.py +74 -0
  1071. vllm/v1/worker/worker_base.py +64 -0
  1072. vllm/version.py +40 -0
  1073. vllm/vllm_flash_attn/.gitkeep +0 -0
  1074. vllm/worker/__init__.py +0 -0
  1075. vllm/worker/cache_engine.py +144 -0
  1076. vllm/worker/cpu_enc_dec_model_runner.py +323 -0
  1077. vllm/worker/cpu_model_runner.py +668 -0
  1078. vllm/worker/cpu_pooling_model_runner.py +122 -0
  1079. vllm/worker/cpu_worker.py +400 -0
  1080. vllm/worker/enc_dec_model_runner.py +542 -0
  1081. vllm/worker/hpu_model_runner.py +2221 -0
  1082. vllm/worker/hpu_worker.py +483 -0
  1083. vllm/worker/model_runner.py +2056 -0
  1084. vllm/worker/model_runner_base.py +281 -0
  1085. vllm/worker/multi_step_hpu_worker.py +122 -0
  1086. vllm/worker/multi_step_model_runner.py +908 -0
  1087. vllm/worker/multi_step_tpu_worker.py +107 -0
  1088. vllm/worker/multi_step_worker.py +196 -0
  1089. vllm/worker/neuron_model_runner.py +336 -0
  1090. vllm/worker/neuron_worker.py +138 -0
  1091. vllm/worker/pooling_model_runner.py +200 -0
  1092. vllm/worker/tpu_model_runner.py +908 -0
  1093. vllm/worker/tpu_worker.py +332 -0
  1094. vllm/worker/utils.py +52 -0
  1095. vllm/worker/worker.py +570 -0
  1096. vllm/worker/worker_base.py +644 -0
  1097. vllm/worker/xpu_model_runner.py +603 -0
  1098. vllm/worker/xpu_worker.py +185 -0
  1099. vllm_cpu-0.8.5.post2.dist-info/METADATA +309 -0
  1100. vllm_cpu-0.8.5.post2.dist-info/RECORD +1103 -0
  1101. vllm_cpu-0.8.5.post2.dist-info/WHEEL +5 -0
  1102. vllm_cpu-0.8.5.post2.dist-info/entry_points.txt +2 -0
  1103. vllm_cpu-0.8.5.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1724 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ # yapf: disable
+ import argparse
+ import dataclasses
+ import json
+ import re
+ import threading
+ from dataclasses import MISSING, dataclass, fields
+ from typing import (Any, Callable, Dict, List, Literal, Optional, Type,
+                     TypeVar, Union, cast, get_args, get_origin)
+
+ import torch
+ from typing_extensions import TypeIs, deprecated
+
+ import vllm.envs as envs
+ from vllm import version
+ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
+                          ConfigFormat, ConfigType, DecodingConfig, Device,
+                          DeviceConfig, DistributedExecutorBackend,
+                          GuidedDecodingBackendV1, HfOverrides,
+                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
+                          ModelConfig, ModelImpl, MultiModalConfig,
+                          ObservabilityConfig, ParallelConfig, PoolerConfig,
+                          PrefixCachingHashAlgo, PromptAdapterConfig,
+                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
+                          TaskOption, TokenizerPoolConfig, VllmConfig,
+                          get_attr_docs, get_field)
+ from vllm.executor.executor_base import ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+ from vllm.plugins import load_general_plugins
+ from vllm.reasoning import ReasoningParserManager
+ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
+ from vllm.transformers_utils.utils import check_gguf_file
+ from vllm.usage.usage_lib import UsageContext
+ from vllm.utils import FlexibleArgumentParser, GiB_bytes, is_in_ray_actor
+
+ # yapf: enable
+
+ logger = init_logger(__name__)
+
+ ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"]
+
+ # object is used to allow for special typing forms
+ T = TypeVar("T")
+ TypeHint = Union[type[Any], object]
+ TypeHintT = Union[type[T], object]
+
+
+ def optional_type(
+         return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
+
+     def _optional_type(val: str) -> Optional[T]:
+         if val == "" or val == "None":
+             return None
+         try:
+             if return_type is json.loads and not re.match("^{.*}$", val):
+                 return cast(T, nullable_kvs(val))
+             return return_type(val)
+         except ValueError as e:
+             raise argparse.ArgumentTypeError(
+                 f"Value {val} cannot be converted to {return_type}.") from e
+
+     return _optional_type
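+
+ # A minimal usage sketch (editor's illustration, not in the upstream file):
+ #
+ #     parse_int = optional_type(int)
+ #     parse_int("None")  # -> None
+ #     parse_int("8")     # -> 8
+ #     parse_int("x")     # raises argparse.ArgumentTypeError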
+
+
+ @deprecated(
+     "Passing a JSON argument as a string containing comma separated key=value "
+     "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
+     "string instead.")
+ def nullable_kvs(val: str) -> dict[str, int]:
+     """Parses a string containing comma separated key [str] to value [int]
+     pairs into a dictionary.
+
+     Args:
+         val: String value to be parsed.
+
+     Returns:
+         Dictionary with parsed values.
+     """
+     out_dict: dict[str, int] = {}
+     for item in val.split(","):
+         kv_parts = [part.lower().strip() for part in item.split("=")]
+         if len(kv_parts) != 2:
+             raise argparse.ArgumentTypeError(
+                 "Each item should be in the form KEY=VALUE")
+         key, value = kv_parts
+
+         try:
+             parsed_value = int(value)
+         except ValueError as exc:
+             msg = f"Failed to parse value of item {key}={value}"
+             raise argparse.ArgumentTypeError(msg) from exc
+
+         if key in out_dict and out_dict[key] != parsed_value:
+             raise argparse.ArgumentTypeError(
+                 f"Conflicting values specified for key: {key}")
+         out_dict[key] = parsed_value
+
+     return out_dict
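+
+ # A usage sketch (editor's illustration, not in the upstream file):
+ #
+ #     nullable_kvs("image=16,video=2")  # -> {"image": 16, "video": 2}
+ #     nullable_kvs("image=a")           # raises argparse.ArgumentTypeError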
+
+
+ def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
+     """Check if the type hint is a specific type."""
+     return type_hint is type or get_origin(type_hint) is type
+
+
+ def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool:
+     """Check if the type hints contain a specific type."""
+     return any(is_type(type_hint, type) for type_hint in type_hints)
+
+
+ def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT:
+     """Get the specific type from the type hints."""
+     return next((th for th in type_hints if is_type(th, type)), None)
+
+
+ def is_not_builtin(type_hint: TypeHint) -> bool:
+     """Check if the class is not a built-in type."""
+     return type_hint.__module__ != "builtins"
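+
+ # Behavior sketch (editor's illustration, not in the upstream file): these
+ # helpers treat a parameterized hint as its origin type, e.g.
+ #
+ #     is_type(list[int], list)               # -> True
+ #     contains_type({int, type(None)}, int)  # -> True
+ #     get_type({str, list[int]}, list)       # -> list[int]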
+
+
+ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
+     cls_docs = get_attr_docs(cls)
+     kwargs = {}
+     for field in fields(cls):
+         # Get the default value of the field
+         default = field.default
+         if field.default_factory is not MISSING:
+             default = field.default_factory()
+
+         # Get the help text for the field
+         name = field.name
+         help = cls_docs[name]
+         # Escape % for argparse
+         help = help.replace("%", "%%")
+
+         # Initialise the kwargs dictionary for the field
+         kwargs[name] = {"default": default, "help": help}
+
+         # Get the set of possible types for the field
+         type_hints: set[TypeHint] = set()
+         if get_origin(field.type) is Union:
+             type_hints.update(get_args(field.type))
+         else:
+             type_hints.add(field.type)
+
+         # Set other kwargs based on the type hints
+         if contains_type(type_hints, bool):
+             # Creates --no-<name> and --<name> flags
+             kwargs[name]["action"] = argparse.BooleanOptionalAction
+         elif contains_type(type_hints, Literal):
+             # Creates choices from Literal arguments
+             type_hint = get_type(type_hints, Literal)
+             choices = sorted(get_args(type_hint))
+             kwargs[name]["choices"] = choices
+             choice_type = type(choices[0])
+             assert all(type(c) is choice_type for c in choices), (
+                 "All choices must be of the same type. "
+                 f"Got {choices} with types {[type(c) for c in choices]}")
+             kwargs[name]["type"] = choice_type
+         elif contains_type(type_hints, tuple):
+             type_hint = get_type(type_hints, tuple)
+             types = get_args(type_hint)
+             tuple_type = types[0]
+             assert all(t is tuple_type for t in types if t is not Ellipsis), (
+                 "All non-Ellipsis tuple elements must be of the same "
+                 f"type. Got {types}.")
+             kwargs[name]["type"] = tuple_type
+             kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types)
+         elif contains_type(type_hints, list):
+             type_hint = get_type(type_hints, list)
+             types = get_args(type_hint)
+             assert len(types) == 1, (
+                 "List type must have exactly one type. Got "
+                 f"{type_hint} with types {types}")
+             kwargs[name]["type"] = types[0]
+             kwargs[name]["nargs"] = "+"
+         elif contains_type(type_hints, int):
+             kwargs[name]["type"] = int
+         elif contains_type(type_hints, float):
+             kwargs[name]["type"] = float
+         elif contains_type(type_hints, dict):
+             # Dict arguments will always be optional
+             kwargs[name]["type"] = optional_type(json.loads)
+         elif (contains_type(type_hints, str)
+               or any(is_not_builtin(th) for th in type_hints)):
+             kwargs[name]["type"] = str
+         else:
+             raise ValueError(
+                 f"Unsupported type {type_hints} for argument {name}.")
+
+         # If None is in type_hints, make the argument optional.
+         # But not if it's a bool, argparse will handle this better.
+         if type(None) in type_hints and not contains_type(type_hints, bool):
+             kwargs[name]["type"] = optional_type(kwargs[name]["type"])
+             if kwargs[name].get("choices"):
+                 kwargs[name]["choices"].append("None")
+     return kwargs
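+
+ # Output sketch (editor's illustration, not in the upstream file): for a
+ # hypothetical config field declared as
+ #
+ #     device: Literal["cpu", "cuda"] = "cpu"
+ #
+ # get_kwargs would produce roughly
+ #
+ #     {"device": {"default": "cpu", "help": "<attribute docstring>",
+ #                 "choices": ["cpu", "cuda"], "type": str}}
+ #
+ # i.e. keyword arguments ready to splat into parser.add_argument().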
+
+
+ @dataclass
+ class EngineArgs:
+     """Arguments for vLLM engine."""
+     model: str = 'facebook/opt-125m'
+     served_model_name: Optional[Union[str, List[str]]] = None
+     tokenizer: Optional[str] = None
+     hf_config_path: Optional[str] = None
+     task: TaskOption = "auto"
+     skip_tokenizer_init: bool = False
+     tokenizer_mode: str = 'auto'
+     trust_remote_code: bool = False
+     allowed_local_media_path: str = ""
+     download_dir: Optional[str] = LoadConfig.download_dir
+     load_format: str = LoadConfig.load_format
+     config_format: ConfigFormat = ConfigFormat.AUTO
+     dtype: str = 'auto'
+     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
+     seed: Optional[int] = None
+     max_model_len: Optional[int] = None
+     # Note: Specifying a custom executor backend by passing a class
+     # is intended for expert use only. The API may change without
+     # notice.
+     distributed_executor_backend: Optional[Union[
+         DistributedExecutorBackend,
+         Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
+     # number of P/D disaggregation (or other disaggregation) workers
+     pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
+     tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
+     data_parallel_size: int = ParallelConfig.data_parallel_size
+     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
+     max_parallel_loading_workers: Optional[
+         int] = ParallelConfig.max_parallel_loading_workers
+     block_size: Optional[BlockSize] = CacheConfig.block_size
+     enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
+     prefix_caching_hash_algo: PrefixCachingHashAlgo = \
+         CacheConfig.prefix_caching_hash_algo
+     disable_sliding_window: bool = False
+     disable_cascade_attn: bool = False
+     use_v2_block_manager: bool = True
+     swap_space: float = CacheConfig.swap_space
+     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
+     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+     max_num_batched_tokens: Optional[
+         int] = SchedulerConfig.max_num_batched_tokens
+     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
+     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
+     long_prefill_token_threshold: int = \
+         SchedulerConfig.long_prefill_token_threshold
+     max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
+     max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
+     disable_log_stats: bool = False
+     revision: Optional[str] = None
+     code_revision: Optional[str] = None
+     rope_scaling: Optional[Dict[str, Any]] = None
+     rope_theta: Optional[float] = None
+     hf_token: Optional[Union[bool, str]] = None
+     hf_overrides: Optional[HfOverrides] = None
+     tokenizer_revision: Optional[str] = None
+     quantization: Optional[str] = None
+     enforce_eager: Optional[bool] = None
+     max_seq_len_to_capture: int = 8192
+     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
+     # The following three fields are deprecated and will be removed in a future
+     # release. Setting them will have no effect. Please remove them from your
+     # configurations.
+     tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
+     tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
+     tokenizer_pool_extra_config: dict = \
+         get_field(TokenizerPoolConfig, "extra_config")
+     limit_mm_per_prompt: dict[str, int] = \
+         get_field(MultiModalConfig, "limit_per_prompt")
+     mm_processor_kwargs: Optional[Dict[str, Any]] = None
+     disable_mm_preprocessor_cache: bool = False
+     # LoRA fields
+     enable_lora: bool = False
+     enable_lora_bias: bool = LoRAConfig.bias_enabled
+     max_loras: int = LoRAConfig.max_loras
+     max_lora_rank: int = LoRAConfig.max_lora_rank
+     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
+     max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
+     lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
+     lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
+     long_lora_scaling_factors: Optional[tuple[float, ...]] = \
+         LoRAConfig.long_lora_scaling_factors
+     # PromptAdapter fields
+     enable_prompt_adapter: bool = False
+     max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters
+     max_prompt_adapter_token: int = \
+         PromptAdapterConfig.max_prompt_adapter_token
+
+     device: Device = DeviceConfig.device
+     num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
+     multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
+     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
+     num_gpu_blocks_override: Optional[
+         int] = CacheConfig.num_gpu_blocks_override
+     num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
+     model_loader_extra_config: dict = \
+         get_field(LoadConfig, "model_loader_extra_config")
+     ignore_patterns: Optional[Union[str,
+                                     List[str]]] = LoadConfig.ignore_patterns
+     preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
+
+     scheduler_delay_factor: float = SchedulerConfig.delay_factor
+     enable_chunked_prefill: Optional[
+         bool] = SchedulerConfig.enable_chunked_prefill
+     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
+
+     guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
+     logits_processor_pattern: Optional[str] = None
+
+     speculative_config: Optional[Dict[str, Any]] = None
+
+     qlora_adapter_name_or_path: Optional[str] = None
+     show_hidden_metrics_for_version: Optional[str] = None
+     otlp_traces_endpoint: Optional[str] = None
+     collect_detailed_traces: Optional[str] = None
+     disable_async_output_proc: bool = False
+     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
+     scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
+
+     override_neuron_config: Optional[Dict[str, Any]] = None
+     override_pooler_config: Optional[PoolerConfig] = None
+     compilation_config: Optional[CompilationConfig] = None
+     worker_cls: str = ParallelConfig.worker_cls
+     worker_extension_cls: str = ParallelConfig.worker_extension_cls
+
+     kv_transfer_config: Optional[KVTransferConfig] = None
+
+     generation_config: Optional[str] = "auto"
+     override_generation_config: Optional[Dict[str, Any]] = None
+     enable_sleep_mode: bool = False
+     model_impl: str = "auto"
+
+     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
+
+     additional_config: Optional[Dict[str, Any]] = None
+     enable_reasoning: Optional[bool] = None
+     reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
+     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
+
+     def __post_init__(self):
+         if not self.tokenizer:
+             self.tokenizer = self.model
+
+         # support `EngineArgs(compilation_config={...})`
+         # without having to manually construct a
+         # CompilationConfig object
+         if isinstance(self.compilation_config, (int, dict)):
+             self.compilation_config = CompilationConfig.from_cli(
+                 str(self.compilation_config))
+
+         # Setup plugins
+         from vllm.plugins import load_general_plugins
+         load_general_plugins()
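+
+         # Construction sketch (editor's illustration, not in the upstream
+         # file): per the coercion above, both of these are accepted and end
+         # up with a CompilationConfig object:
+         #
+         #     EngineArgs(model="facebook/opt-125m", compilation_config=3)
+         #     EngineArgs(model="facebook/opt-125m",
+         #                compilation_config={"level": 3})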
+
+     @staticmethod
+     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+         """Shared CLI arguments for vLLM engine."""
+
+         # Model arguments
+         parser.add_argument(
+             '--model',
+             type=str,
+             default=EngineArgs.model,
+             help='Name or path of the huggingface model to use.')
+         parser.add_argument(
+             '--task',
+             default=EngineArgs.task,
+             choices=get_args(TaskOption),
+             help='The task to use the model for. Each vLLM instance only '
+             'supports one task, even if the same model can be used for '
+             'multiple tasks. When the model only supports one task, ``"auto"`` '
+             'can be used to select it; otherwise, you must specify explicitly '
+             'which task to use.')
+         parser.add_argument(
+             '--tokenizer',
+             type=optional_type(str),
+             default=EngineArgs.tokenizer,
+             help='Name or path of the huggingface tokenizer to use. '
+             'If unspecified, model name or path will be used.')
+         parser.add_argument(
+             "--hf-config-path",
+             type=optional_type(str),
+             default=EngineArgs.hf_config_path,
+             help='Name or path of the huggingface config to use. '
+             'If unspecified, model name or path will be used.')
+         parser.add_argument(
+             '--skip-tokenizer-init',
+             action='store_true',
+             help='Skip initialization of tokenizer and detokenizer. '
+             'Expects valid prompt_token_ids and None for prompt from '
+             'the input. The generated output will contain token ids.')
+         parser.add_argument(
+             '--revision',
+             type=optional_type(str),
+             default=None,
+             help='The specific model version to use. It can be a branch '
+             'name, a tag name, or a commit id. If unspecified, will use '
+             'the default version.')
+         parser.add_argument(
+             '--code-revision',
+             type=optional_type(str),
+             default=None,
+             help='The specific revision to use for the model code on '
+             'Hugging Face Hub. It can be a branch name, a tag name, or a '
+             'commit id. If unspecified, will use the default version.')
+         parser.add_argument(
+             '--tokenizer-revision',
+             type=optional_type(str),
+             default=None,
+             help='Revision of the huggingface tokenizer to use. '
+             'It can be a branch name, a tag name, or a commit id. '
+             'If unspecified, will use the default version.')
+         parser.add_argument(
+             '--tokenizer-mode',
+             type=str,
+             default=EngineArgs.tokenizer_mode,
+             choices=['auto', 'slow', 'mistral', 'custom'],
+             help='The tokenizer mode.\n\n* "auto" will use the '
+             'fast tokenizer if available.\n* "slow" will '
+             'always use the slow tokenizer.\n* '
+             '"mistral" will always use the `mistral_common` tokenizer.\n* '
+             '"custom" will use --tokenizer to select the '
+             'preregistered tokenizer.')
+         parser.add_argument('--trust-remote-code',
+                             action='store_true',
+                             help='Trust remote code from huggingface.')
+         parser.add_argument(
+             '--allowed-local-media-path',
+             type=str,
+             help="Allow API requests to read local images or videos "
+             "from directories specified by the server file system. "
+             "This is a security risk. "
+             "Should only be enabled in trusted environments.")
+         # Model loading arguments
+         load_kwargs = get_kwargs(LoadConfig)
+         load_group = parser.add_argument_group(
+             title="LoadConfig",
+             description=LoadConfig.__doc__,
+         )
+         load_group.add_argument('--load-format',
+                                 choices=[f.value for f in LoadFormat],
+                                 **load_kwargs["load_format"])
+         load_group.add_argument('--download-dir',
+                                 **load_kwargs["download_dir"])
+         load_group.add_argument('--model-loader-extra-config',
+                                 **load_kwargs["model_loader_extra_config"])
+         load_group.add_argument('--use-tqdm-on-load',
+                                 **load_kwargs["use_tqdm_on_load"])
+
+         parser.add_argument(
+             '--config-format',
+             default=EngineArgs.config_format,
+             choices=[f.value for f in ConfigFormat],
+             help='The format of the model config to load.\n\n'
+             '* "auto" will try to load the config in hf format '
+             'if available, else it will try to load it in mistral format.')
+         parser.add_argument(
+             '--dtype',
+             type=str,
+             default=EngineArgs.dtype,
+             choices=[
+                 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
+             ],
+             help='Data type for model weights and activations.\n\n'
+             '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+             'BF16 precision for BF16 models.\n'
+             '* "half" for FP16. Recommended for AWQ quantization.\n'
+             '* "float16" is the same as "half".\n'
+             '* "bfloat16" for a balance between precision and range.\n'
+             '* "float" is shorthand for FP32 precision.\n'
+             '* "float32" for FP32 precision.')
+         parser.add_argument('--max-model-len',
+                             type=human_readable_int,
+                             default=EngineArgs.max_model_len,
+                             help='Model context length. If unspecified, will '
+                             'be automatically derived from the model config. '
+                             'Supports k/m/g/K/M/G in human-readable format.\n'
+                             'Examples:\n'
+                             '- 1k → 1000\n'
+                             '- 1K → 1024\n')
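+
+         # Illustrative only (editor's note, not in the upstream file):
+         # following the documented k/K semantics above,
+         #
+         #     --max-model-len 32k  -> 32000
+         #     --max-model-len 32K  -> 32768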
+
+         # Guided decoding arguments
+         guided_decoding_kwargs = get_kwargs(DecodingConfig)
+         guided_decoding_group = parser.add_argument_group(
+             title="DecodingConfig",
+             description=DecodingConfig.__doc__,
+         )
+         guided_decoding_group.add_argument(
+             '--guided-decoding-backend',
+             **guided_decoding_kwargs["guided_decoding_backend"])
+         guided_decoding_group.add_argument(
+             "--reasoning-parser",
+             # The choices are a special case because they are not static
+             choices=list(ReasoningParserManager.reasoning_parsers),
+             **guided_decoding_kwargs["reasoning_backend"])
+
+         parser.add_argument(
+             '--logits-processor-pattern',
+             type=optional_type(str),
+             default=None,
+             help='Optional regex pattern specifying valid logits processor '
+             'qualified names that can be passed with the `logits_processors` '
+             'extra completion argument. Defaults to None, which allows no '
+             'processors.')
+         parser.add_argument(
+             '--model-impl',
+             type=str,
+             default=EngineArgs.model_impl,
+             choices=[f.value for f in ModelImpl],
+             help='Which implementation of the model to use.\n\n'
+             '* "auto" will try to use the vLLM implementation if it exists '
+             'and fall back to the Transformers implementation if no vLLM '
+             'implementation is available.\n'
+             '* "vllm" will use the vLLM model implementation.\n'
+             '* "transformers" will use the Transformers model '
+             'implementation.\n')
+         # Parallel arguments
+         parallel_kwargs = get_kwargs(ParallelConfig)
+         parallel_group = parser.add_argument_group(
+             title="ParallelConfig",
+             description=ParallelConfig.__doc__,
+         )
+         parallel_group.add_argument(
+             '--distributed-executor-backend',
+             **parallel_kwargs["distributed_executor_backend"])
+         parallel_group.add_argument(
+             '--pipeline-parallel-size', '-pp',
+             **parallel_kwargs["pipeline_parallel_size"])
+         parallel_group.add_argument('--tensor-parallel-size', '-tp',
+                                     **parallel_kwargs["tensor_parallel_size"])
+         parallel_group.add_argument('--data-parallel-size', '-dp',
+                                     **parallel_kwargs["data_parallel_size"])
+         parallel_group.add_argument(
+             '--enable-expert-parallel',
+             **parallel_kwargs["enable_expert_parallel"])
+         parallel_group.add_argument(
+             '--max-parallel-loading-workers',
+             **parallel_kwargs["max_parallel_loading_workers"])
+         parallel_group.add_argument(
+             '--ray-workers-use-nsight',
+             **parallel_kwargs["ray_workers_use_nsight"])
+         parallel_group.add_argument(
+             '--disable-custom-all-reduce',
+             **parallel_kwargs["disable_custom_all_reduce"])
+
+         # KV cache arguments
+         cache_kwargs = get_kwargs(CacheConfig)
+         cache_group = parser.add_argument_group(
+             title="CacheConfig",
+             description=CacheConfig.__doc__,
+         )
+         cache_group.add_argument('--block-size', **cache_kwargs["block_size"])
+         cache_group.add_argument('--gpu-memory-utilization',
+                                  **cache_kwargs["gpu_memory_utilization"])
+         cache_group.add_argument('--swap-space', **cache_kwargs["swap_space"])
+         cache_group.add_argument('--kv-cache-dtype',
+                                  **cache_kwargs["cache_dtype"])
+         cache_group.add_argument('--num-gpu-blocks-override',
+                                  **cache_kwargs["num_gpu_blocks_override"])
+         cache_group.add_argument("--enable-prefix-caching",
+                                  **cache_kwargs["enable_prefix_caching"])
+         cache_group.add_argument("--prefix-caching-hash-algo",
+                                  **cache_kwargs["prefix_caching_hash_algo"])
+         cache_group.add_argument('--cpu-offload-gb',
+                                  **cache_kwargs["cpu_offload_gb"])
+         cache_group.add_argument('--calculate-kv-scales',
+                                  **cache_kwargs["calculate_kv_scales"])
572
+
+         parser.add_argument('--disable-sliding-window',
+                             action='store_true',
+                             help='Disables sliding window, capping the model '
+                             'to the sliding window size.')
+         parser.add_argument('--use-v2-block-manager',
+                             action='store_true',
+                             default=True,
+                             help='[DEPRECATED] block manager v1 has been '
+                             'removed and SelfAttnBlockSpaceManager (i.e. '
+                             'block manager v2) is now the default. '
+                             'Setting this flag to True or False '
+                             'has no effect on vLLM behavior.')
+
+         parser.add_argument('--seed',
+                             type=int,
+                             default=EngineArgs.seed,
+                             help='Random seed for operations.')
+         parser.add_argument(
+             '--max-logprobs',
+             type=int,
+             default=EngineArgs.max_logprobs,
+             help=('Max number of log probs to return when logprobs is '
+                   'specified in SamplingParams.'))
+         parser.add_argument('--disable-log-stats',
+                             action='store_true',
+                             help='Disable logging statistics.')
+         # Quantization settings.
+         parser.add_argument('--quantization',
+                             '-q',
+                             type=optional_type(str),
+                             choices=[*QUANTIZATION_METHODS, None],
+                             default=EngineArgs.quantization,
+                             help='Method used to quantize the weights. If '
+                             'None, we first check the `quantization_config` '
+                             'attribute in the model config file. If that is '
+                             'None, we assume the model weights are not '
+                             'quantized and use `dtype` to determine the data '
+                             'type of the weights.')
+         parser.add_argument(
+             '--rope-scaling',
+             default=None,
+             type=json.loads,
+             help='RoPE scaling configuration in JSON format. '
+             'For example, ``{"rope_type":"dynamic","factor":2.0}``')
+         parser.add_argument('--rope-theta',
+                             default=None,
+                             type=float,
+                             help='RoPE theta. Use with `rope_scaling`. In '
+                             'some cases, changing the RoPE theta improves the '
+                             'performance of the scaled model.')
+         parser.add_argument(
+             '--hf-token',
+             type=str,
+             nargs='?',
+             const=True,
+             default=None,
+             help='The token to use as HTTP bearer authorization'
+             ' for remote files. If `True`, will use the token '
+             'generated when running `huggingface-cli login` '
+             '(stored in `~/.huggingface`).')
+         parser.add_argument('--hf-overrides',
+                             type=json.loads,
+                             default=EngineArgs.hf_overrides,
+                             help='Extra arguments for the HuggingFace config. '
+                             'This should be a JSON string that will be '
+                             'parsed into a dictionary.')
+         parser.add_argument('--enforce-eager',
+                             action='store_true',
+                             help='Always use eager-mode PyTorch. If False, '
+                             'will use eager mode and CUDA graph in hybrid '
+                             'for maximal performance and flexibility.')
+         parser.add_argument('--max-seq-len-to-capture',
+                             type=int,
+                             default=EngineArgs.max_seq_len_to_capture,
+                             help='Maximum sequence length covered by CUDA '
+                             'graphs. When a sequence has context length '
+                             'larger than this, we fall back to eager mode. '
+                             'Additionally for encoder-decoder models, if the '
+                             'sequence length of the encoder input is larger '
+                             'than this, we fall back to eager mode.')
+
+         # Tokenizer arguments
+         tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
+         tokenizer_group = parser.add_argument_group(
+             title="TokenizerPoolConfig",
+             description=TokenizerPoolConfig.__doc__,
+         )
+         tokenizer_group.add_argument('--tokenizer-pool-size',
+                                      **tokenizer_kwargs["pool_size"])
+         tokenizer_group.add_argument('--tokenizer-pool-type',
+                                      **tokenizer_kwargs["pool_type"])
+         tokenizer_group.add_argument('--tokenizer-pool-extra-config',
+                                      **tokenizer_kwargs["extra_config"])
+
+         # Multimodal related configs
+         multimodal_kwargs = get_kwargs(MultiModalConfig)
+         multimodal_group = parser.add_argument_group(
+             title="MultiModalConfig",
+             description=MultiModalConfig.__doc__,
+         )
+         multimodal_group.add_argument('--limit-mm-per-prompt',
+                                       **multimodal_kwargs["limit_per_prompt"])
+
+         parser.add_argument(
+             '--mm-processor-kwargs',
+             default=None,
+             type=json.loads,
+             help=('Overrides for the multi-modal processor obtained from '
+                   '``AutoProcessor.from_pretrained``. The available overrides '
+                   'depend on the model that is being run. '
+                   'For example, for Phi-3-Vision: ``{"num_crops": 4}``.'))
+         parser.add_argument(
+             '--disable-mm-preprocessor-cache',
+             action='store_true',
+             help='If True, disable caching of the processed multi-modal '
+             'inputs.')
+
+         # LoRA related configs
+         lora_kwargs = get_kwargs(LoRAConfig)
+         lora_group = parser.add_argument_group(
+             title="LoRAConfig",
+             description=LoRAConfig.__doc__,
+         )
+         lora_group.add_argument(
+             '--enable-lora',
+             action=argparse.BooleanOptionalAction,
+             help='If True, enable handling of LoRA adapters.')
+         lora_group.add_argument('--enable-lora-bias',
+                                 **lora_kwargs["bias_enabled"])
+         lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
+         lora_group.add_argument('--max-lora-rank',
+                                 **lora_kwargs["max_lora_rank"])
+         lora_group.add_argument('--lora-extra-vocab-size',
+                                 **lora_kwargs["lora_extra_vocab_size"])
+         lora_group.add_argument(
+             '--lora-dtype',
+             **lora_kwargs["lora_dtype"],
+         )
+         lora_group.add_argument('--long-lora-scaling-factors',
+                                 **lora_kwargs["long_lora_scaling_factors"])
+         lora_group.add_argument('--max-cpu-loras',
+                                 **lora_kwargs["max_cpu_loras"])
+         lora_group.add_argument('--fully-sharded-loras',
+                                 **lora_kwargs["fully_sharded_loras"])
+
+         # PromptAdapter related configs
+         prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig)
+         prompt_adapter_group = parser.add_argument_group(
+             title="PromptAdapterConfig",
+             description=PromptAdapterConfig.__doc__,
+         )
+         prompt_adapter_group.add_argument(
+             '--enable-prompt-adapter',
+             action=argparse.BooleanOptionalAction,
+             help='If True, enable handling of PromptAdapters.')
+         prompt_adapter_group.add_argument(
+             '--max-prompt-adapters',
+             **prompt_adapter_kwargs["max_prompt_adapters"])
+         prompt_adapter_group.add_argument(
+             '--max-prompt-adapter-token',
+             **prompt_adapter_kwargs["max_prompt_adapter_token"])
+
+         # Device arguments
+         device_kwargs = get_kwargs(DeviceConfig)
+         device_group = parser.add_argument_group(
+             title="DeviceConfig",
+             description=DeviceConfig.__doc__,
+         )
+         device_group.add_argument("--device", **device_kwargs["device"])
+
+         # Speculative arguments
+         speculative_group = parser.add_argument_group(
+             title="SpeculativeConfig",
+             description=SpeculativeConfig.__doc__,
+         )
+         speculative_group.add_argument(
+             '--speculative-config',
+             type=json.loads,
+             default=None,
+             help='The configurations for speculative decoding. '
+             'Should be a JSON string.')
+
+         parser.add_argument(
+             '--ignore-patterns',
+             action="append",
+             type=str,
+             default=[],
+             help="The pattern(s) to ignore when loading the model. "
+             "Defaults to `original/**/*` to avoid repeated loading of llama's "
+             "checkpoints.")
+
+         parser.add_argument(
+             "--served-model-name",
+             nargs="+",
+             type=str,
+             default=None,
+             help="The model name(s) used in the API. If multiple "
+             "names are provided, the server will respond to any "
+             "of the provided names. The model name in the model "
+             "field of a response will be the first name in this "
+             "list. If not specified, the model name will be the "
+             "same as the ``--model`` argument. Note that these names "
+             "will also be used in the `model_name` tag content of "
+             "Prometheus metrics; if multiple names are provided, the "
+             "metrics tag will take the first one.")
+         parser.add_argument('--qlora-adapter-name-or-path',
+                             type=str,
+                             default=None,
+                             help='Name or path of the QLoRA adapter.')
+
+         parser.add_argument('--show-hidden-metrics-for-version',
+                             type=str,
+                             default=None,
+                             help='Enable deprecated Prometheus metrics that '
+                             'have been hidden since the specified version. '
+                             'For example, if a previously deprecated metric '
+                             'has been hidden since the v0.7.0 release, you '
+                             'can use --show-hidden-metrics-for-version=0.7 '
+                             'as a temporary escape hatch while you migrate '
+                             'to new metrics. The metric is likely to be '
+                             'removed completely in an upcoming release.')
+
+         parser.add_argument(
+             '--otlp-traces-endpoint',
+             type=str,
+             default=None,
+             help='Target URL to which OpenTelemetry traces will be sent.')
+         parser.add_argument(
+             '--collect-detailed-traces',
+             type=str,
+             default=None,
+             help="Valid choices are " +
+             ",".join(ALLOWED_DETAILED_TRACE_MODULES) +
+             ". It makes sense to set this only if ``--otlp-traces-endpoint`` is"
+             " set. If set, it will collect detailed traces for the specified "
+             "modules. This involves use of possibly costly and/or blocking "
+             "operations and hence might have a performance impact.")
+
+         parser.add_argument(
+             '--disable-async-output-proc',
+             action='store_true',
+             default=EngineArgs.disable_async_output_proc,
+             help="Disable async output processing. This may result in "
+             "lower performance.")
+
+         # Scheduler arguments
+         scheduler_kwargs = get_kwargs(SchedulerConfig)
+         scheduler_group = parser.add_argument_group(
+             title="SchedulerConfig",
+             description=SchedulerConfig.__doc__,
+         )
+         scheduler_group.add_argument(
+             '--max-num-batched-tokens',
+             **scheduler_kwargs["max_num_batched_tokens"])
+         scheduler_group.add_argument('--max-num-seqs',
+                                      **scheduler_kwargs["max_num_seqs"])
+         scheduler_group.add_argument(
+             "--max-num-partial-prefills",
+             **scheduler_kwargs["max_num_partial_prefills"])
+         scheduler_group.add_argument(
+             "--max-long-partial-prefills",
+             **scheduler_kwargs["max_long_partial_prefills"])
+         scheduler_group.add_argument(
+             "--long-prefill-token-threshold",
+             **scheduler_kwargs["long_prefill_token_threshold"])
+         scheduler_group.add_argument('--num-lookahead-slots',
+                                      **scheduler_kwargs["num_lookahead_slots"])
+         scheduler_group.add_argument('--scheduler-delay-factor',
+                                      **scheduler_kwargs["delay_factor"])
+         scheduler_group.add_argument('--preemption-mode',
+                                      **scheduler_kwargs["preemption_mode"])
+         scheduler_group.add_argument('--num-scheduler-steps',
+                                      **scheduler_kwargs["num_scheduler_steps"])
+         scheduler_group.add_argument(
+             '--multi-step-stream-outputs',
+             **scheduler_kwargs["multi_step_stream_outputs"])
+         scheduler_group.add_argument('--scheduling-policy',
+                                      **scheduler_kwargs["policy"])
+         scheduler_group.add_argument(
+             '--enable-chunked-prefill',
+             **scheduler_kwargs["enable_chunked_prefill"])
+         scheduler_group.add_argument(
+             "--disable-chunked-mm-input",
+             **scheduler_kwargs["disable_chunked_mm_input"])
+         parser.add_argument('--scheduler-cls',
+                             **scheduler_kwargs["scheduler_cls"])
+
+         parser.add_argument(
+             '--override-neuron-config',
+             type=json.loads,
+             default=None,
+             help="Override or set neuron device configuration. "
+             "e.g. ``{\"cast_logits_dtype\": \"bfloat16\"}``.")
+         parser.add_argument(
+             '--override-pooler-config',
+             type=PoolerConfig.from_json,
+             default=None,
+             help="Override or set the pooling method for pooling models. "
+             "e.g. ``{\"pooling_type\": \"mean\", \"normalize\": false}``.")
+
+         parser.add_argument('--compilation-config',
+                             '-O',
+                             type=CompilationConfig.from_cli,
+                             default=None,
+                             help='torch.compile configuration for the model. '
+                             'When it is a number (0, 1, 2, 3), it will be '
+                             'interpreted as the optimization level.\n'
+                             'NOTE: level 0 is the default level without '
+                             'any optimization. Levels 1 and 2 are for '
+                             'internal testing only. Level 3 is the '
+                             'recommended level for production.\n'
+                             'To specify the full compilation config, '
+                             'use a JSON string, e.g. ``{"level": 3, '
+                             '"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n'
+                             'Following the convention of traditional '
+                             'compilers, using ``-O`` without a space is also '
+                             'supported. ``-O3`` is equivalent to ``-O 3``.')
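Per the help text above, the following argv forms should all request optimization level 3 (illustrative values; parsing is done by CompilationConfig.from_cli):

argv_a = ['-O', '3']
argv_b = ['-O3']  # compiler-style form, no space
argv_c = ['--compilation-config', '3']
# Full config as a JSON string:
argv_d = ['--compilation-config',
          '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}']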
+
+         parser.add_argument('--kv-transfer-config',
+                             type=KVTransferConfig.from_cli,
+                             default=None,
+                             help='The configurations for distributed KV cache '
+                             'transfer. Should be a JSON string.')
+
+         parser.add_argument(
+             '--worker-cls',
+             type=str,
+             default="auto",
+             help='The worker class to use for distributed execution.')
+         parser.add_argument(
+             '--worker-extension-cls',
+             type=str,
+             default="",
+             help='The worker extension class on top of the worker cls. '
+             'It is useful if you just want to add new functions to the '
+             'worker class without changing the existing functions.')
+         parser.add_argument(
+             "--generation-config",
+             type=optional_type(str),
+             default="auto",
+             help="The folder path to the generation config. "
+             "Defaults to 'auto', in which case the generation config is "
+             "loaded from the model path. If set to 'vllm', no generation "
+             "config is loaded and vLLM defaults are used. If set to a "
+             "folder path, the generation config is loaded from that "
+             "folder. If `max_new_tokens` is specified in the generation "
+             "config, it sets a server-wide limit on the number of output "
+             "tokens for all requests.")
+
+         parser.add_argument(
+             "--override-generation-config",
+             type=json.loads,
+             default=None,
+             help="Overrides or sets the generation config in JSON format. "
+             "e.g. ``{\"temperature\": 0.5}``. If used with "
+             "--generation-config=auto, the override parameters will be "
+             "merged with the default config from the model. If "
+             "generation-config is None, only the override parameters are "
+             "used.")
+
+         parser.add_argument("--enable-sleep-mode",
+                             action="store_true",
+                             default=False,
+                             help="Enable sleep mode for the engine "
+                             "(only the CUDA platform is supported).")
+
+         parser.add_argument(
+             "--additional-config",
+             type=json.loads,
+             default=None,
+             help="Additional config for the specified platform, in JSON "
+             "format. Different platforms may support different configs. "
+             "Make sure the configs are valid for the platform you are "
+             "using. The input format is like "
+             "'{\"config_key\":\"config_value\"}'.")
+
+         parser.add_argument(
+             "--enable-reasoning",
+             action="store_true",
+             default=False,
+             help="Whether to enable reasoning_content for the model. "
+             "If enabled, the model will be able to generate reasoning "
+             "content.")
+
+         parser.add_argument(
+             "--disable-cascade-attn",
+             action="store_true",
+             default=False,
+             help="Disable cascade attention for V1. While cascade attention "
+             "does not change the mathematical correctness, disabling it "
+             "could be useful for preventing potential numerical issues. "
+             "Note that even if this is set to False, cascade attention will "
+             "only be used when the heuristic tells that it's beneficial.")
+
+         return parser
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         # Get the list of attributes of this dataclass.
+         attrs = [attr.name for attr in dataclasses.fields(cls)]
+         # Set the attributes from the parsed arguments.
+         engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+         return engine_args
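A minimal usage sketch of the CLI round trip, assuming the parser built by add_cli_args above (model name and flag values are illustrative):

parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args(['--model', 'facebook/opt-125m',
                          '--max-model-len', '4k'])
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.max_model_len == 4000  # '4k' parsed by human_readable_int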
+
+     def create_model_config(self) -> ModelConfig:
+         # gguf file needs a specific model loader and doesn't use hf_repo
+         if check_gguf_file(self.model):
+             self.quantization = self.load_format = "gguf"
+
+         # NOTE: This is to allow model loading from S3 in CI
+         if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                 and self.model in MODELS_ON_S3
+                 and self.load_format == LoadFormat.AUTO):  # noqa: E501
+             self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+             self.load_format = LoadFormat.RUNAI_STREAMER
+
+         return ModelConfig(
+             model=self.model,
+             hf_config_path=self.hf_config_path,
+             task=self.task,
+             # We know this is not None because we set it in __post_init__
+             tokenizer=cast(str, self.tokenizer),
+             tokenizer_mode=self.tokenizer_mode,
+             trust_remote_code=self.trust_remote_code,
+             allowed_local_media_path=self.allowed_local_media_path,
+             dtype=self.dtype,
+             seed=self.seed,
+             revision=self.revision,
+             code_revision=self.code_revision,
+             rope_scaling=self.rope_scaling,
+             rope_theta=self.rope_theta,
+             hf_token=self.hf_token,
+             hf_overrides=self.hf_overrides,
+             tokenizer_revision=self.tokenizer_revision,
+             max_model_len=self.max_model_len,
+             quantization=self.quantization,
+             enforce_eager=self.enforce_eager,
+             max_seq_len_to_capture=self.max_seq_len_to_capture,
+             max_logprobs=self.max_logprobs,
+             disable_sliding_window=self.disable_sliding_window,
+             disable_cascade_attn=self.disable_cascade_attn,
+             skip_tokenizer_init=self.skip_tokenizer_init,
+             served_model_name=self.served_model_name,
+             limit_mm_per_prompt=self.limit_mm_per_prompt,
+             use_async_output_proc=not self.disable_async_output_proc,
+             config_format=self.config_format,
+             mm_processor_kwargs=self.mm_processor_kwargs,
+             disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+             override_neuron_config=self.override_neuron_config,
+             override_pooler_config=self.override_pooler_config,
+             logits_processor_pattern=self.logits_processor_pattern,
+             generation_config=self.generation_config,
+             override_generation_config=self.override_generation_config,
+             enable_sleep_mode=self.enable_sleep_mode,
+             model_impl=self.model_impl,
+         )
+
+     def create_load_config(self) -> LoadConfig:
+
+         if (self.qlora_adapter_name_or_path is not None
+                 and self.quantization != "bitsandbytes"):
+             raise ValueError(
+                 "QLoRA adapters only support 'bitsandbytes' quantization, "
+                 f"but got {self.quantization}")
+
+         if self.quantization == "bitsandbytes":
+             self.load_format = "bitsandbytes"
+         return LoadConfig(
+             load_format=self.load_format,
+             download_dir=self.download_dir,
+             model_loader_extra_config=self.model_loader_extra_config,
+             ignore_patterns=self.ignore_patterns,
+             use_tqdm_on_load=self.use_tqdm_on_load,
+         )
+
+     def create_speculative_config(
+         self,
+         target_model_config: ModelConfig,
+         target_parallel_config: ParallelConfig,
+         enable_chunked_prefill: bool,
+         disable_log_stats: bool,
+     ) -> Optional["SpeculativeConfig"]:
+         """Initializes and returns a SpeculativeConfig object based on
+         `speculative_config`.
+
+         This function utilizes `speculative_config` to create a
+         SpeculativeConfig object. The `speculative_config` can either be
+         provided as a JSON string input via CLI arguments or directly as a
+         dictionary from the engine.
+         """
+         if self.speculative_config is None:
+             return None
+
+         # Note(Shangming): These parameters are not obtained from the CLI arg
+         # '--speculative-config' and must be passed in when creating the
+         # engine config.
+         self.speculative_config.update({
+             "target_model_config": target_model_config,
+             "target_parallel_config": target_parallel_config,
+             "enable_chunked_prefill": enable_chunked_prefill,
+             "disable_log_stats": disable_log_stats,
+         })
+         speculative_config = SpeculativeConfig.from_dict(
+             self.speculative_config)
+
+         return speculative_config
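For orientation, the data flow this method expects when the config arrives via the CLI (the JSON payload below is illustrative):

spec = json.loads('{"method": "ngram", "num_speculative_tokens": 4}')
# create_speculative_config then merges in the engine-level fields
# before handing the dict to SpeculativeConfig.from_dict:
spec.update({'enable_chunked_prefill': True, 'disable_log_stats': False})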
+
+     def create_engine_config(
+         self,
+         usage_context: Optional[UsageContext] = None,
+     ) -> VllmConfig:
+         """
+         Create the VllmConfig.
+
+         NOTE: for autoselection of V0 vs V1 engine, we need to
+         create the ModelConfig first, since ModelConfig's attrs
+         (e.g. the model arch) are needed to make the decision.
+
+         This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
+         unspecified by the user.
+
+         If VLLM_USE_V1 is specified by the user but the VllmConfig
+         is incompatible, we raise an error.
+         """
+         from vllm.platforms import current_platform
+         current_platform.pre_register_and_update()
+
+         device_config = DeviceConfig(device=self.device)
+         model_config = self.create_model_config()
+
+         # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
+         #   and fall back to V0 for experimental or unsupported features.
+         # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
+         #   features and raise an error for unsupported features.
+         # * If VLLM_USE_V1=0, we disable V1.
+         use_v1 = False
+         try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
+         if try_v1 and self._is_v1_supported_oracle(model_config):
+             use_v1 = True
+
+         # If the user explicitly set VLLM_USE_V1, sanity check we respect it.
+         if envs.is_set("VLLM_USE_V1"):
+             assert use_v1 == envs.VLLM_USE_V1
+         # Otherwise, set the VLLM_USE_V1 variable globally.
+         else:
+             envs.set_vllm_use_v1(use_v1)
+
+         # Set default arguments for the V0 or V1 engine.
+         if use_v1:
+             self._set_default_args_v1(usage_context)
+         else:
+             self._set_default_args_v0(model_config)
+
+         assert self.enable_chunked_prefill is not None
+
+         cache_config = CacheConfig(
+             block_size=self.block_size,
+             gpu_memory_utilization=self.gpu_memory_utilization,
+             swap_space=self.swap_space,
+             cache_dtype=self.kv_cache_dtype,
+             is_attention_free=model_config.is_attention_free,
+             num_gpu_blocks_override=self.num_gpu_blocks_override,
+             sliding_window=model_config.get_sliding_window(),
+             enable_prefix_caching=self.enable_prefix_caching,
+             prefix_caching_hash_algo=self.prefix_caching_hash_algo,
+             cpu_offload_gb=self.cpu_offload_gb,
+             calculate_kv_scales=self.calculate_kv_scales,
+         )
+
+         # Get the current placement group if Ray is initialized and
+         # we are in a Ray actor. If so, then the placement group will be
+         # passed to spawned processes.
+         placement_group = None
+         if is_in_ray_actor():
+             import ray
+
+             # This call initializes Ray automatically if it is not
+             # initialized, but we should not do this here.
+             placement_group = ray.util.get_current_placement_group()
+
+         parallel_config = ParallelConfig(
+             pipeline_parallel_size=self.pipeline_parallel_size,
+             tensor_parallel_size=self.tensor_parallel_size,
+             data_parallel_size=self.data_parallel_size,
+             enable_expert_parallel=self.enable_expert_parallel,
+             max_parallel_loading_workers=self.max_parallel_loading_workers,
+             disable_custom_all_reduce=self.disable_custom_all_reduce,
+             ray_workers_use_nsight=self.ray_workers_use_nsight,
+             placement_group=placement_group,
+             distributed_executor_backend=self.distributed_executor_backend,
+             worker_cls=self.worker_cls,
+             worker_extension_cls=self.worker_extension_cls,
+         )
+
+         speculative_config = self.create_speculative_config(
+             target_model_config=model_config,
+             target_parallel_config=parallel_config,
+             enable_chunked_prefill=self.enable_chunked_prefill,
+             disable_log_stats=self.disable_log_stats,
+         )
+
+         # Reminder: Please update docs/source/features/compatibility_matrix.md
+         # if the feature combo becomes valid.
+         if self.num_scheduler_steps > 1:
+             if speculative_config is not None:
+                 raise ValueError("Speculative decoding is not supported with "
+                                  "multi-step (--num-scheduler-steps > 1)")
+             if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
+                 raise ValueError("Multi-Step Chunked-Prefill is not supported "
+                                  "for pipeline-parallel-size > 1")
+             from vllm.platforms import current_platform
+             if current_platform.is_cpu():
+                 logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
+                                "currently not supported for CPUs and has "
+                                "been disabled.")
+                 self.num_scheduler_steps = 1
+
+         # Make sure num_lookahead_slots is set to the higher value depending
+         # on whether we are using speculative decoding or multi-step.
+         num_lookahead_slots = max(self.num_lookahead_slots,
+                                   self.num_scheduler_steps - 1)
+         num_lookahead_slots = num_lookahead_slots \
+             if speculative_config is None \
+             else speculative_config.num_lookahead_slots
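Worked example of the two rules above (illustrative values):

# --num-scheduler-steps=8 with no speculative config:
num_lookahead_slots = max(0, 8 - 1)  # -> 7
# With a speculative config present, its own num_lookahead_slots
# replaces the multi-step value entirely.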
+
+         scheduler_config = SchedulerConfig(
+             runner_type=model_config.runner_type,
+             max_num_batched_tokens=self.max_num_batched_tokens,
+             max_num_seqs=self.max_num_seqs,
+             max_model_len=model_config.max_model_len,
+             num_lookahead_slots=num_lookahead_slots,
+             delay_factor=self.scheduler_delay_factor,
+             enable_chunked_prefill=self.enable_chunked_prefill,
+             disable_chunked_mm_input=self.disable_chunked_mm_input,
+             is_multimodal_model=model_config.is_multimodal_model,
+             preemption_mode=self.preemption_mode,
+             num_scheduler_steps=self.num_scheduler_steps,
+             multi_step_stream_outputs=self.multi_step_stream_outputs,
+             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
+                              and parallel_config.use_ray),
+             policy=self.scheduling_policy,
+             scheduler_cls=self.scheduler_cls,
+             max_num_partial_prefills=self.max_num_partial_prefills,
+             max_long_partial_prefills=self.max_long_partial_prefills,
+             long_prefill_token_threshold=self.long_prefill_token_threshold,
+         )
+
+         lora_config = LoRAConfig(
+             bias_enabled=self.enable_lora_bias,
+             max_lora_rank=self.max_lora_rank,
+             max_loras=self.max_loras,
+             fully_sharded_loras=self.fully_sharded_loras,
+             lora_extra_vocab_size=self.lora_extra_vocab_size,
+             long_lora_scaling_factors=self.long_lora_scaling_factors,
+             lora_dtype=self.lora_dtype,
+             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
+             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+
+         if self.qlora_adapter_name_or_path is not None and \
+                 self.qlora_adapter_name_or_path != "":
+             self.model_loader_extra_config[
+                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
+
+         # bitsandbytes pre-quantized models need a specific model loader
+         if model_config.quantization == "bitsandbytes":
+             self.quantization = self.load_format = "bitsandbytes"
+
+         load_config = self.create_load_config()
+
+         prompt_adapter_config = PromptAdapterConfig(
+             max_prompt_adapters=self.max_prompt_adapters,
+             max_prompt_adapter_token=self.max_prompt_adapter_token) \
+             if self.enable_prompt_adapter else None
+
+         decoding_config = DecodingConfig(
+             guided_decoding_backend=self.guided_decoding_backend,
+             reasoning_backend=self.reasoning_parser
+             if self.enable_reasoning else None,
+         )
+
+         show_hidden_metrics = False
+         if self.show_hidden_metrics_for_version is not None:
+             show_hidden_metrics = version._prev_minor_version_was(
+                 self.show_hidden_metrics_for_version)
+
+         detailed_trace_modules = []
+         if self.collect_detailed_traces is not None:
+             detailed_trace_modules = self.collect_detailed_traces.split(",")
+         for m in detailed_trace_modules:
+             if m not in ALLOWED_DETAILED_TRACE_MODULES:
+                 raise ValueError(
+                     f"Invalid module {m} in collect_detailed_traces. "
+                     f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}")
+         observability_config = ObservabilityConfig(
+             show_hidden_metrics=show_hidden_metrics,
+             otlp_traces_endpoint=self.otlp_traces_endpoint,
+             collect_model_forward_time="model" in detailed_trace_modules
+             or "all" in detailed_trace_modules,
+             collect_model_execute_time="worker" in detailed_trace_modules
+             or "all" in detailed_trace_modules,
+         )
+
+         config = VllmConfig(
+             model_config=model_config,
+             cache_config=cache_config,
+             parallel_config=parallel_config,
+             scheduler_config=scheduler_config,
+             device_config=device_config,
+             lora_config=lora_config,
+             speculative_config=speculative_config,
+             load_config=load_config,
+             decoding_config=decoding_config,
+             observability_config=observability_config,
+             prompt_adapter_config=prompt_adapter_config,
+             compilation_config=self.compilation_config,
+             kv_transfer_config=self.kv_transfer_config,
+             additional_config=self.additional_config,
+         )
+
+         return config
+
+     def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
+         """Oracle for whether to use the V0 or V1 engine by default."""
+
+         #############################################################
+         # Unsupported feature flags on V1.
+
+         if (self.load_format == LoadFormat.TENSORIZER.value
+                 or self.load_format == LoadFormat.SHARDED_STATE.value):
+             _raise_or_fallback(
+                 feature_name=f"--load_format {self.load_format}",
+                 recommend_to_remove=False)
+             return False
+
+         if (self.logits_processor_pattern
+                 != EngineArgs.logits_processor_pattern):
+             _raise_or_fallback(feature_name="--logits-processor-pattern",
+                                recommend_to_remove=False)
+             return False
+
+         if self.preemption_mode != SchedulerConfig.preemption_mode:
+             _raise_or_fallback(feature_name="--preemption-mode",
+                                recommend_to_remove=True)
+             return False
+
+         if (self.disable_async_output_proc
+                 != EngineArgs.disable_async_output_proc):
+             _raise_or_fallback(feature_name="--disable-async-output-proc",
+                                recommend_to_remove=True)
+             return False
+
+         if self.scheduling_policy != SchedulerConfig.policy:
+             _raise_or_fallback(feature_name="--scheduling-policy",
+                                recommend_to_remove=False)
+             return False
+
+         if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
+             _raise_or_fallback(feature_name="--num-scheduler-steps",
+                                recommend_to_remove=True)
+             return False
+
+         if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
+             _raise_or_fallback(feature_name="--scheduler-delay-factor",
+                                recommend_to_remove=True)
+             return False
+
+         # Remove backend options when doing this check.
+         if self.guided_decoding_backend.split(':')[0] \
+                 not in get_args(GuidedDecodingBackendV1):
+             _raise_or_fallback(
+                 feature_name=
+                 f"--guided-decoding-backend={self.guided_decoding_backend}",
+                 recommend_to_remove=False)
+             return False
+
+         # Need at least Ampere for now (FA support required).
+         # Skip this check if we are running on a non-GPU platform,
+         # or if the device capability is not available
+         # (e.g. in a Ray actor without GPUs).
+         from vllm.platforms import current_platform
+         if (current_platform.is_cuda()
+                 and current_platform.get_device_capability()
+                 and current_platform.get_device_capability().major < 8):
+             _raise_or_fallback(feature_name="Compute Capability < 8.0",
+                                recommend_to_remove=False)
+             return False
+
+         # No FP8 KV cache so far.
+         if self.kv_cache_dtype != "auto":
+             fp8_attention = self.kv_cache_dtype.startswith("fp8")
+             will_use_fa = (
+                 current_platform.is_cuda()
+                 and not envs.is_set("VLLM_ATTENTION_BACKEND")
+             ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
+             supported = False
+             if fp8_attention and will_use_fa:
+                 from vllm.attention.utils.fa_utils import (
+                     flash_attn_supports_fp8)
+                 supported = flash_attn_supports_fp8()
+             if not supported:
+                 _raise_or_fallback(feature_name="--kv-cache-dtype",
+                                    recommend_to_remove=False)
+                 return False
+
+         # No Prompt Adapter so far.
+         if self.enable_prompt_adapter:
+             _raise_or_fallback(feature_name="--enable-prompt-adapter",
+                                recommend_to_remove=False)
+             return False
+
+         # Only FP16 and BF16 dtypes, since we only support FA.
+         V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+         if model_config.dtype not in V1_SUPPORTED_DTYPES:
+             _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+                                recommend_to_remove=False)
+             return False
+
+         # Some quantization is not compatible with torch.compile.
+         V1_UNSUPPORTED_QUANT = ["gguf"]
+         if model_config.quantization in V1_UNSUPPORTED_QUANT:
+             _raise_or_fallback(
+                 feature_name=f"--quantization {model_config.quantization}",
+                 recommend_to_remove=False)
+             return False
+
+         # No embedding models so far.
+         if model_config.task not in ["generate"]:
+             _raise_or_fallback(feature_name=f"--task {model_config.task}",
+                                recommend_to_remove=False)
+             return False
+
+         # No Mamba or encoder-decoder models so far.
+         if not model_config.is_v1_compatible:
+             _raise_or_fallback(feature_name=model_config.architectures,
+                                recommend_to_remove=False)
+             return False
+
+         # No concurrent partial prefills so far.
+         if (self.max_num_partial_prefills
+                 != SchedulerConfig.max_num_partial_prefills
+                 or self.max_long_partial_prefills
+                 != SchedulerConfig.max_long_partial_prefills):
+             _raise_or_fallback(feature_name="Concurrent Partial Prefill",
+                                recommend_to_remove=False)
+             return False
+
+         # No OTLP observability so far.
+         if (self.otlp_traces_endpoint or self.collect_detailed_traces):
+             _raise_or_fallback(feature_name="--otlp-traces-endpoint",
+                                recommend_to_remove=False)
+             return False
+
+         # Only ngram speculative decoding so far.
+         is_ngram_enabled = False
+         is_eagle_enabled = False
+         if self.speculative_config is not None:
+             # This is supported but experimental (handled below).
+             speculative_method = self.speculative_config.get("method")
+             if speculative_method:
+                 if speculative_method in ("ngram", "[ngram]"):
+                     is_ngram_enabled = True
+                 elif speculative_method in ("eagle", "eagle3"):
+                     is_eagle_enabled = True
+             else:
+                 speculative_model = self.speculative_config.get("model")
+                 if speculative_model in ("ngram", "[ngram]"):
+                     is_ngram_enabled = True
+             if not (is_ngram_enabled or is_eagle_enabled):
+                 # Other speculative decoding methods are not supported yet.
+                 _raise_or_fallback(feature_name="Speculative Decoding",
+                                    recommend_to_remove=False)
+                 return False
+
+         # No XFormers so far.
+         V1_BACKENDS = [
+             "FLASH_ATTN_VLLM_V1",
+             "FLASH_ATTN",
+             "PALLAS",
+             "PALLAS_VLLM_V1",
+             "TRITON_ATTN_VLLM_V1",
+             "TRITON_MLA",
+             "FLASHMLA",
+             "FLASHINFER",
+             "FLASHINFER_VLLM_V1",
+         ]
+         if (envs.is_set("VLLM_ATTENTION_BACKEND")
+                 and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
+             name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
+             _raise_or_fallback(feature_name=name, recommend_to_remove=True)
+             return False
+
+         # Platforms must decide if they can support V1 for this model.
+         if not current_platform.supports_v1(model_config=model_config):
+             _raise_or_fallback(
+                 feature_name=f"device type={current_platform.device_type}",
+                 recommend_to_remove=False)
+             return False
+         #############################################################
+         # Experimental features - allow users to opt in.
+
+         # Signal handlers require running in the main thread.
+         if (threading.current_thread() != threading.main_thread()
+                 and _warn_or_fallback("Engine in background thread")):
+             return False
+
+         # PP is supported on V1 with the Ray distributed executor,
+         # but off for the MP distributed executor for now.
+         if (self.pipeline_parallel_size > 1
+                 and self.distributed_executor_backend != "ray"):
+             name = "Pipeline Parallelism without Ray distributed executor"
+             _raise_or_fallback(feature_name=name, recommend_to_remove=False)
+             return False
+
+         # ngram is supported on V1, but off by default for now.
+         if is_ngram_enabled and _warn_or_fallback("ngram"):
+             return False
+
+         # Eagle is under development, so we don't support it yet.
+         if is_eagle_enabled and _warn_or_fallback("Eagle"):
+             return False
+
+         # Non-CUDA is supported on V1, but off by default for now.
+         not_cuda = not current_platform.is_cuda()
+         if not_cuda and _warn_or_fallback(  # noqa: SIM103
+                 current_platform.device_name):
+             return False
+         #############################################################
+
+         return True
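A condensed view of the oracle above (summary only; the checks run in the order written):

# 1. Hard-unsupported features (tensorizer/sharded-state loading, custom
#    logits processors, multi-step scheduling, unsupported KV cache dtypes,
#    prompt adapters, non-FP16/BF16 dtypes, GGUF, non-"generate" tasks,
#    unknown attention backends, ...): raise under VLLM_USE_V1=1,
#    otherwise fall back to V0.
# 2. Experimental features (engine in a background thread, ngram/EAGLE
#    speculation, non-CUDA platforms): used only when VLLM_USE_V1=1 is set
#    explicitly, otherwise fall back to V0.
# 3. Everything else: V1 is selected.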
+
+     def _set_default_args_v0(self, model_config: ModelConfig) -> None:
+         """Set default arguments for the V0 engine."""
+
+         max_model_len = model_config.max_model_len
+         use_long_context = max_model_len > 32768
+         if self.enable_chunked_prefill is None:
+             # Chunked prefill is not supported for multimodal or MLA in V0.
+             if model_config.is_multimodal_model or model_config.use_mla:
+                 self.enable_chunked_prefill = False
+
+             # Enable chunked prefill by default for long-context (> 32K)
+             # models to avoid OOM errors in the initial memory profiling
+             # phase.
+             elif use_long_context:
+                 from vllm.platforms import current_platform
+                 is_gpu = current_platform.is_cuda()
+                 use_sliding_window = (model_config.get_sliding_window()
+                                       is not None)
+                 use_spec_decode = self.speculative_config is not None
+
+                 if (is_gpu and not use_sliding_window and not use_spec_decode
+                         and not self.enable_lora
+                         and not self.enable_prompt_adapter
+                         and model_config.runner_type != "pooling"):
+                     self.enable_chunked_prefill = True
+                     logger.warning(
+                         "Chunked prefill is enabled by default for models "
+                         "with max_model_len > 32K. Chunked prefill might "
+                         "not work with some features or models. If you "
+                         "encounter any issues, please disable it by "
+                         "launching with --enable-chunked-prefill=False.")
+
+             if self.enable_chunked_prefill is None:
+                 self.enable_chunked_prefill = False
+
+         if not self.enable_chunked_prefill and use_long_context:
+             logger.warning(
+                 "The model has a long context length (%s). This may cause "
+                 "OOM during the initial memory profiling phase, or result "
+                 "in low performance due to small KV cache size. Consider "
+                 "setting --max-model-len to a smaller value.", max_model_len)
+         elif (self.enable_chunked_prefill
+               and model_config.runner_type == "pooling"):
+             msg = "Chunked prefill is not supported for pooling models"
+             raise ValueError(msg)
+
+         # If using prefix caching, we must set a hash algo.
+         if self.enable_prefix_caching:
+             # Disable prefix caching for multimodal models for VLLM_V0.
+             if model_config.is_multimodal_model:
+                 logger.warning(
+                     "--enable-prefix-caching is not supported for multimodal "
+                     "models in V0 and has been disabled.")
+                 self.enable_prefix_caching = False
+
+             # VLLM_V0 only supports the builtin hash algo for prefix caching.
+             if self.prefix_caching_hash_algo == "sha256":
+                 raise ValueError(
+                     "sha256 is not supported for prefix caching in the V0 "
+                     "engine. Please use 'builtin'.")
+
+         # Set max_num_seqs to 256 for VLLM_V0.
+         if self.max_num_seqs is None:
+             self.max_num_seqs = 256
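A concrete instance of the defaulting above (illustrative values):

# A CUDA model with max_model_len = 131072, no sliding window, no
# speculative decoding, no LoRA or prompt adapters, and a non-pooling
# runner -> enable_chunked_prefill defaults to True.
# The same model on CPU, or with any of those features enabled,
# keeps enable_chunked_prefill = False.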
+
+     def _set_default_args_v1(self, usage_context: UsageContext) -> None:
+         """Set default arguments for the V1 engine."""
+
+         # V1 always uses chunked prefills.
+         self.enable_chunked_prefill = True
+
+         # V1 enables prefix caching by default.
+         if self.enable_prefix_caching is None:
+             self.enable_prefix_caching = True
+
+         # V1 should use the new scheduler by default.
+         # Swap it only if this arg is set to the original V0 default.
+         if self.scheduler_cls == EngineArgs.scheduler_cls:
+             self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"
+
+         # When there is no user override, set the default values based on
+         # the usage context.
+         # Use different default values for different hardware.
+
+         # Try to query the device name on the current platform. If it fails,
+         # it may be because the platform that imports vLLM is not the same
+         # as the platform that vLLM is running on (e.g. the case of scaling
+         # vLLM with Ray) and has no GPUs. In this case we use the default
+         # values for non-H100/H200 GPUs.
+         try:
+             from vllm.platforms import current_platform
+             device_memory = current_platform.get_device_total_memory()
+         except Exception:
+             # This is only used to set default_max_num_batched_tokens.
+             device_memory = 0
+
+         if device_memory >= 70 * GiB_bytes:
+             # For GPUs like H100 and MI300x, use larger default values.
+             default_max_num_batched_tokens = {
+                 UsageContext.LLM_CLASS: 16384,
+                 UsageContext.OPENAI_API_SERVER: 8192,
+             }
+             default_max_num_seqs = 1024
+         else:
+             # TODO(woosuk): Tune the default values for other hardware.
+             default_max_num_batched_tokens = {
+                 UsageContext.LLM_CLASS: 8192,
+                 UsageContext.OPENAI_API_SERVER: 2048,
+             }
+             default_max_num_seqs = 256
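To make the 70 GiB threshold concrete (assuming GiB_bytes = 1 << 30, presumably imported near the top of this file):

GiB_bytes = 1 << 30
assert 80 * GiB_bytes >= 70 * GiB_bytes       # H100 80 GB: 16384/8192 tokens, 1024 seqs
assert not 40 * GiB_bytes >= 70 * GiB_bytes   # A100 40 GB: 8192/2048 tokens, 256 seqs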
+
+         use_context_value = usage_context.value if usage_context else None
+         if (self.max_num_batched_tokens is None
+                 and usage_context in default_max_num_batched_tokens):
+             self.max_num_batched_tokens = default_max_num_batched_tokens[
+                 usage_context]
+             logger.debug(
+                 "Setting max_num_batched_tokens to %d for %s usage context.",
+                 self.max_num_batched_tokens, use_context_value)
+
+         if self.max_num_seqs is None:
+             self.max_num_seqs = default_max_num_seqs
+
+             logger.debug("Setting max_num_seqs to %d for %s usage context.",
+                          self.max_num_seqs, use_context_value)
+
+
+ @dataclass
+ class AsyncEngineArgs(EngineArgs):
+     """Arguments for the asynchronous vLLM engine."""
+     disable_log_requests: bool = False
+
+     @staticmethod
+     def add_cli_args(parser: FlexibleArgumentParser,
+                      async_args_only: bool = False) -> FlexibleArgumentParser:
+         # Initialize plugins to update the parser; for example, a plugin may
+         # add a new kind of quantization method to the --quantization argument
+         # or a new device to the --device argument.
+         load_general_plugins()
+         if not async_args_only:
+             parser = EngineArgs.add_cli_args(parser)
+         parser.add_argument('--disable-log-requests',
+                             action='store_true',
+                             help='Disable logging requests.')
+         from vllm.platforms import current_platform
+         current_platform.pre_register_and_update(parser)
+         return parser
+
+
+ def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
+     if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+         raise NotImplementedError(
+             f"VLLM_USE_V1=1 is not supported with {feature_name}.")
+     msg = f"{feature_name} is not supported by the V1 Engine. "
+     msg += "Falling back to V0. "
+     if recommend_to_remove:
+         msg += f"We recommend removing {feature_name} from your config "
+         msg += "in favor of the V1 Engine."
+     logger.warning(msg)
+
+
+ def _warn_or_fallback(feature_name: str) -> bool:
+     if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+         logger.warning(
+             "Detected VLLM_USE_V1=1 with %s. Usage should "
+             "be considered experimental. Please report any "
+             "issues on GitHub.", feature_name)
+         should_exit = False
+     else:
+         logger.info(
+             "%s is experimental on VLLM_USE_V1=1. "
+             "Falling back to V0 Engine.", feature_name)
+         should_exit = True
+     return should_exit
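Behavior summary for the two helpers above, derived from the code:

# VLLM_USE_V1=1 : _raise_or_fallback raises NotImplementedError;
#                 _warn_or_fallback warns and returns False (stay on V1).
# unset or =0   : _raise_or_fallback logs a warning (engine falls back to V0);
#                 _warn_or_fallback logs and returns True (caller falls back).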
+
+
+ def human_readable_int(value):
+     """Parse human-readable integers like '1k', '2M', etc.,
+     including decimal values with decimal multipliers.
+
+     Examples:
+     - '1k' -> 1,000
+     - '1K' -> 1,024
+     - '25.6k' -> 25,600
+     """
+     value = value.strip()
+     match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
+     if match:
+         # The regex accepts t/T, so both multiplier tables need entries
+         # for them as well.
+         decimal_multiplier = {
+             'k': 10**3,
+             'm': 10**6,
+             'g': 10**9,
+             't': 10**12,
+         }
+         binary_multiplier = {
+             'K': 2**10,
+             'M': 2**20,
+             'G': 2**30,
+             'T': 2**40,
+         }
+
+         number, suffix = match.groups()
+         if suffix in decimal_multiplier:
+             mult = decimal_multiplier[suffix]
+             return int(float(number) * mult)
+         elif suffix in binary_multiplier:
+             mult = binary_multiplier[suffix]
+             # Do not allow decimals with binary multipliers.
+             try:
+                 return int(number) * mult
+             except ValueError as e:
+                 raise argparse.ArgumentTypeError(
+                     "Decimals are not allowed with binary suffixes like "
+                     f"{suffix}. Did you mean to use {number}{suffix.lower()} "
+                     "instead?") from e
+
+     # Regular plain number.
+     return int(value)
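Quick checks of the parsing rules above (runnable as-is against the function):

assert human_readable_int('1k') == 1_000      # decimal suffix
assert human_readable_int('1K') == 1_024      # binary suffix
assert human_readable_int('25.6k') == 25_600  # decimals allowed with decimal suffixes
assert human_readable_int('512') == 512       # plain integer
# human_readable_int('1.5K') raises argparse.ArgumentTypeError:
# decimals are rejected for binary suffixes.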
+
+
+ # These functions are used by Sphinx to build the documentation.
+ def _engine_args_parser():
+     return EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+
+ def _async_engine_args_parser():
+     return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
+                                         async_args_only=True)