vllm-cpu-avx512bf16 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1175)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1742 -0
  4. vllm/_ipex_ops.py +243 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +15 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +44 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +33 -0
  16. vllm/assets/video.py +114 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +305 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1494 -0
  23. vllm/attention/backends/flash_attn.py +999 -0
  24. vllm/attention/backends/flashinfer.py +1100 -0
  25. vllm/attention/backends/flashmla.py +242 -0
  26. vllm/attention/backends/hpu_attn.py +309 -0
  27. vllm/attention/backends/ipex_attn.py +394 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1381 -0
  30. vllm/attention/backends/pallas.py +347 -0
  31. vllm/attention/backends/placeholder_attn.py +399 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +970 -0
  34. vllm/attention/backends/torch_sdpa.py +691 -0
  35. vllm/attention/backends/triton_mla.py +113 -0
  36. vllm/attention/backends/utils.py +609 -0
  37. vllm/attention/backends/xformers.py +798 -0
  38. vllm/attention/layer.py +452 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +245 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +367 -0
  45. vllm/attention/ops/flashmla.py +115 -0
  46. vllm/attention/ops/hpu_paged_attn.py +87 -0
  47. vllm/attention/ops/ipex_attn.py +194 -0
  48. vllm/attention/ops/merge_attn_states.py +42 -0
  49. vllm/attention/ops/nki_flash_attn.py +905 -0
  50. vllm/attention/ops/paged_attn.py +255 -0
  51. vllm/attention/ops/prefix_prefill.py +901 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +99 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  54. vllm/attention/ops/triton_decode_attention.py +673 -0
  55. vllm/attention/ops/triton_flash_attention.py +1374 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  57. vllm/attention/ops/triton_unified_attention.py +337 -0
  58. vllm/attention/selector.py +186 -0
  59. vllm/attention/utils/fa_utils.py +54 -0
  60. vllm/beam_search.py +82 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +921 -0
  63. vllm/benchmarks/endpoint_request_func.py +160 -0
  64. vllm/benchmarks/latency.py +184 -0
  65. vllm/benchmarks/serve.py +925 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +69 -0
  68. vllm/collect_env.py +818 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +88 -0
  71. vllm/compilation/backends.py +560 -0
  72. vllm/compilation/base_piecewise_backend.py +71 -0
  73. vllm/compilation/collective_fusion.py +126 -0
  74. vllm/compilation/compiler_interface.py +533 -0
  75. vllm/compilation/counter.py +33 -0
  76. vllm/compilation/cuda_piecewise_backend.py +213 -0
  77. vllm/compilation/decorators.py +249 -0
  78. vllm/compilation/fix_functionalization.py +190 -0
  79. vllm/compilation/fusion.py +617 -0
  80. vllm/compilation/fx_utils.py +61 -0
  81. vllm/compilation/inductor_pass.py +114 -0
  82. vllm/compilation/monitor.py +38 -0
  83. vllm/compilation/multi_output_match.py +108 -0
  84. vllm/compilation/noop_elimination.py +136 -0
  85. vllm/compilation/pass_manager.py +77 -0
  86. vllm/compilation/sequence_parallelism.py +267 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  88. vllm/compilation/vllm_inductor_pass.py +66 -0
  89. vllm/compilation/wrapper.py +129 -0
  90. vllm/config.py +4600 -0
  91. vllm/connections.py +173 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +398 -0
  95. vllm/core/block/common.py +370 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  97. vllm/core/block/interfaces.py +318 -0
  98. vllm/core/block/naive_block.py +465 -0
  99. vllm/core/block/prefix_caching_block.py +1134 -0
  100. vllm/core/block/utils.py +27 -0
  101. vllm/core/block_manager.py +520 -0
  102. vllm/core/evictor.py +156 -0
  103. vllm/core/interfaces.py +134 -0
  104. vllm/core/placeholder_block_space_manager.py +99 -0
  105. vllm/core/scheduler.py +2092 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +280 -0
  108. vllm/distributed/__init__.py +5 -0
  109. vllm/distributed/communication_op.py +40 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +126 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +144 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +167 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +303 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +258 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  120. vllm/distributed/device_communicators/pynccl.py +217 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +541 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  125. vllm/distributed/kv_events.py +296 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +11 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +126 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +202 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +91 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +5 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +259 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +189 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +851 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  152. vllm/distributed/parallel_state.py +1294 -0
  153. vllm/distributed/utils.py +520 -0
  154. vllm/engine/__init__.py +0 -0
  155. vllm/engine/arg_utils.py +1649 -0
  156. vllm/engine/async_llm_engine.py +1274 -0
  157. vllm/engine/async_timeout.py +191 -0
  158. vllm/engine/llm_engine.py +2153 -0
  159. vllm/engine/metrics.py +717 -0
  160. vllm/engine/metrics_types.py +96 -0
  161. vllm/engine/multiprocessing/__init__.py +188 -0
  162. vllm/engine/multiprocessing/client.py +755 -0
  163. vllm/engine/multiprocessing/engine.py +459 -0
  164. vllm/engine/output_processor/__init__.py +0 -0
  165. vllm/engine/output_processor/interfaces.py +74 -0
  166. vllm/engine/output_processor/multi_step.py +215 -0
  167. vllm/engine/output_processor/single_step.py +144 -0
  168. vllm/engine/output_processor/stop_checker.py +130 -0
  169. vllm/engine/output_processor/util.py +27 -0
  170. vllm/engine/protocol.py +310 -0
  171. vllm/entrypoints/__init__.py +0 -0
  172. vllm/entrypoints/api_server.py +177 -0
  173. vllm/entrypoints/chat_utils.py +1298 -0
  174. vllm/entrypoints/cli/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/base.py +38 -0
  177. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  178. vllm/entrypoints/cli/benchmark/main.py +53 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  180. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  181. vllm/entrypoints/cli/collect_env.py +34 -0
  182. vllm/entrypoints/cli/main.py +62 -0
  183. vllm/entrypoints/cli/openai.py +204 -0
  184. vllm/entrypoints/cli/serve.py +141 -0
  185. vllm/entrypoints/cli/types.py +24 -0
  186. vllm/entrypoints/launcher.py +146 -0
  187. vllm/entrypoints/llm.py +1503 -0
  188. vllm/entrypoints/logger.py +49 -0
  189. vllm/entrypoints/openai/__init__.py +0 -0
  190. vllm/entrypoints/openai/api_server.py +1376 -0
  191. vllm/entrypoints/openai/cli_args.py +306 -0
  192. vllm/entrypoints/openai/logits_processors.py +89 -0
  193. vllm/entrypoints/openai/protocol.py +1890 -0
  194. vllm/entrypoints/openai/run_batch.py +439 -0
  195. vllm/entrypoints/openai/serving_chat.py +1192 -0
  196. vllm/entrypoints/openai/serving_classification.py +159 -0
  197. vllm/entrypoints/openai/serving_completion.py +590 -0
  198. vllm/entrypoints/openai/serving_embedding.py +200 -0
  199. vllm/entrypoints/openai/serving_engine.py +985 -0
  200. vllm/entrypoints/openai/serving_models.py +314 -0
  201. vllm/entrypoints/openai/serving_pooling.py +231 -0
  202. vllm/entrypoints/openai/serving_score.py +432 -0
  203. vllm/entrypoints/openai/serving_tokenization.py +151 -0
  204. vllm/entrypoints/openai/serving_transcription.py +421 -0
  205. vllm/entrypoints/openai/tool_parsers/__init__.py +22 -0
  206. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  207. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +369 -0
  208. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +258 -0
  209. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +236 -0
  210. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  211. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +215 -0
  212. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +307 -0
  213. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +302 -0
  214. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +266 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  216. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +111 -0
  217. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +296 -0
  218. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  219. vllm/entrypoints/score_utils.py +49 -0
  220. vllm/entrypoints/ssl.py +74 -0
  221. vllm/entrypoints/utils.py +219 -0
  222. vllm/env_override.py +34 -0
  223. vllm/envs.py +896 -0
  224. vllm/executor/__init__.py +0 -0
  225. vllm/executor/executor_base.py +400 -0
  226. vllm/executor/mp_distributed_executor.py +243 -0
  227. vllm/executor/msgspec_utils.py +29 -0
  228. vllm/executor/multiproc_worker_utils.py +312 -0
  229. vllm/executor/ray_distributed_executor.py +700 -0
  230. vllm/executor/ray_utils.py +398 -0
  231. vllm/executor/uniproc_executor.py +138 -0
  232. vllm/forward_context.py +147 -0
  233. vllm/inputs/__init__.py +40 -0
  234. vllm/inputs/data.py +330 -0
  235. vllm/inputs/parse.py +150 -0
  236. vllm/inputs/preprocess.py +908 -0
  237. vllm/inputs/registry.py +214 -0
  238. vllm/jsontree.py +79 -0
  239. vllm/logger.py +211 -0
  240. vllm/logging_utils/__init__.py +7 -0
  241. vllm/logging_utils/dump_input.py +84 -0
  242. vllm/logging_utils/formatter.py +17 -0
  243. vllm/logits_process.py +118 -0
  244. vllm/lora/__init__.py +0 -0
  245. vllm/lora/fully_sharded_layers.py +354 -0
  246. vllm/lora/layers.py +1284 -0
  247. vllm/lora/lora.py +198 -0
  248. vllm/lora/models.py +817 -0
  249. vllm/lora/ops/__init__.py +0 -0
  250. vllm/lora/ops/torch_ops/__init__.py +15 -0
  251. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  252. vllm/lora/ops/triton_ops/__init__.py +11 -0
  253. vllm/lora/ops/triton_ops/kernel_utils.py +242 -0
  254. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  255. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  256. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  257. vllm/lora/ops/triton_ops/utils.py +119 -0
  258. vllm/lora/ops/xla_ops/__init__.py +6 -0
  259. vllm/lora/ops/xla_ops/lora_ops.py +106 -0
  260. vllm/lora/ops/xla_ops/pallas.py +133 -0
  261. vllm/lora/peft_helper.py +135 -0
  262. vllm/lora/punica_wrapper/__init__.py +9 -0
  263. vllm/lora/punica_wrapper/punica_base.py +484 -0
  264. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  265. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  266. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  267. vllm/lora/punica_wrapper/punica_selector.py +19 -0
  268. vllm/lora/punica_wrapper/punica_tpu.py +325 -0
  269. vllm/lora/punica_wrapper/utils.py +163 -0
  270. vllm/lora/request.py +98 -0
  271. vllm/lora/resolver.py +84 -0
  272. vllm/lora/utils.py +239 -0
  273. vllm/lora/worker_manager.py +253 -0
  274. vllm/model_executor/__init__.py +15 -0
  275. vllm/model_executor/custom_op.py +151 -0
  276. vllm/model_executor/guided_decoding/__init__.py +180 -0
  277. vllm/model_executor/guided_decoding/guidance_decoding.py +62 -0
  278. vllm/model_executor/guided_decoding/guidance_logits_processors.py +103 -0
  279. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  280. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  281. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  282. vllm/model_executor/guided_decoding/outlines_logits_processors.py +283 -0
  283. vllm/model_executor/guided_decoding/utils.py +241 -0
  284. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  285. vllm/model_executor/layers/__init__.py +0 -0
  286. vllm/model_executor/layers/activation.py +368 -0
  287. vllm/model_executor/layers/fused_moe/__init__.py +53 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  449. vllm/model_executor/layers/fused_moe/cutlass_moe.py +382 -0
  450. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +227 -0
  451. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +755 -0
  452. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +231 -0
  453. vllm/model_executor/layers/fused_moe/fused_moe.py +1722 -0
  454. vllm/model_executor/layers/fused_moe/layer.py +1366 -0
  455. vllm/model_executor/layers/fused_moe/modular_kernel.py +364 -0
  456. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +242 -0
  457. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  458. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +188 -0
  459. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  460. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +146 -0
  461. vllm/model_executor/layers/fused_moe/prepare_finalize.py +60 -0
  462. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +372 -0
  463. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +112 -0
  464. vllm/model_executor/layers/fused_moe/utils.py +97 -0
  465. vllm/model_executor/layers/layernorm.py +287 -0
  466. vllm/model_executor/layers/lightning_attn.py +651 -0
  467. vllm/model_executor/layers/linear.py +1523 -0
  468. vllm/model_executor/layers/logits_processor.py +196 -0
  469. vllm/model_executor/layers/mamba/__init__.py +0 -0
  470. vllm/model_executor/layers/mamba/mamba2_metadata.py +124 -0
  471. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  472. vllm/model_executor/layers/mamba/mamba_mixer2.py +615 -0
  473. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  474. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  475. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
  476. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  477. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  478. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  479. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  480. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  481. vllm/model_executor/layers/pooler.py +343 -0
  482. vllm/model_executor/layers/quantization/__init__.py +156 -0
  483. vllm/model_executor/layers/quantization/aqlm.py +375 -0
  484. vllm/model_executor/layers/quantization/auto_round.py +308 -0
  485. vllm/model_executor/layers/quantization/awq.py +185 -0
  486. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  487. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  488. vllm/model_executor/layers/quantization/base_config.py +150 -0
  489. vllm/model_executor/layers/quantization/bitblas.py +460 -0
  490. vllm/model_executor/layers/quantization/bitsandbytes.py +397 -0
  491. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  492. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +644 -0
  493. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1252 -0
  494. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +21 -0
  495. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  496. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  497. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  498. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +92 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +120 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +214 -0
  505. vllm/model_executor/layers/quantization/deepspeedfp.py +194 -0
  506. vllm/model_executor/layers/quantization/experts_int8.py +195 -0
  507. vllm/model_executor/layers/quantization/fbgemm_fp8.py +171 -0
  508. vllm/model_executor/layers/quantization/fp8.py +876 -0
  509. vllm/model_executor/layers/quantization/gguf.py +564 -0
  510. vllm/model_executor/layers/quantization/gptq.py +277 -0
  511. vllm/model_executor/layers/quantization/gptq_bitblas.py +444 -0
  512. vllm/model_executor/layers/quantization/gptq_marlin.py +647 -0
  513. vllm/model_executor/layers/quantization/gptq_marlin_24.py +296 -0
  514. vllm/model_executor/layers/quantization/hqq_marlin.py +331 -0
  515. vllm/model_executor/layers/quantization/ipex_quant.py +249 -0
  516. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  517. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  518. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  519. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  520. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  521. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  522. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  523. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +130 -0
  524. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  525. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  526. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  527. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  528. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  529. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  530. vllm/model_executor/layers/quantization/kv_cache.py +138 -0
  531. vllm/model_executor/layers/quantization/marlin.py +260 -0
  532. vllm/model_executor/layers/quantization/modelopt.py +734 -0
  533. vllm/model_executor/layers/quantization/moe_wna16.py +448 -0
  534. vllm/model_executor/layers/quantization/neuron_quant.py +68 -0
  535. vllm/model_executor/layers/quantization/ptpc_fp8.py +126 -0
  536. vllm/model_executor/layers/quantization/qqq.py +274 -0
  537. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  538. vllm/model_executor/layers/quantization/quark/quark.py +440 -0
  539. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  540. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +8 -0
  541. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  542. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +125 -0
  543. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +145 -0
  544. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  545. vllm/model_executor/layers/quantization/quark/utils.py +104 -0
  546. vllm/model_executor/layers/quantization/schema.py +85 -0
  547. vllm/model_executor/layers/quantization/torchao.py +143 -0
  548. vllm/model_executor/layers/quantization/tpu_int8.py +120 -0
  549. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  550. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  551. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +207 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  754. vllm/model_executor/layers/quantization/utils/fp8_utils.py +611 -0
  755. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  756. vllm/model_executor/layers/quantization/utils/int8_utils.py +484 -0
  757. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  758. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  759. vllm/model_executor/layers/quantization/utils/marlin_utils.py +475 -0
  760. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +277 -0
  761. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +324 -0
  762. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  763. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +463 -0
  764. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +125 -0
  765. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +44 -0
  766. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +61 -0
  767. vllm/model_executor/layers/quantization/utils/quant_utils.py +572 -0
  768. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  769. vllm/model_executor/layers/rejection_sampler.py +405 -0
  770. vllm/model_executor/layers/resampler.py +269 -0
  771. vllm/model_executor/layers/rotary_embedding.py +1861 -0
  772. vllm/model_executor/layers/sampler.py +1203 -0
  773. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  774. vllm/model_executor/layers/typical_acceptance_sampler.py +165 -0
  775. vllm/model_executor/layers/utils.py +99 -0
  776. vllm/model_executor/layers/vocab_parallel_embedding.py +486 -0
  777. vllm/model_executor/model_loader/__init__.py +75 -0
  778. vllm/model_executor/model_loader/base_loader.py +24 -0
  779. vllm/model_executor/model_loader/bitsandbytes_loader.py +582 -0
  780. vllm/model_executor/model_loader/default_loader.py +295 -0
  781. vllm/model_executor/model_loader/dummy_loader.py +37 -0
  782. vllm/model_executor/model_loader/gguf_loader.py +113 -0
  783. vllm/model_executor/model_loader/neuron.py +475 -0
  784. vllm/model_executor/model_loader/neuronx_distributed.py +622 -0
  785. vllm/model_executor/model_loader/runai_streamer_loader.py +120 -0
  786. vllm/model_executor/model_loader/sharded_state_loader.py +211 -0
  787. vllm/model_executor/model_loader/tensorizer.py +632 -0
  788. vllm/model_executor/model_loader/tensorizer_loader.py +122 -0
  789. vllm/model_executor/model_loader/utils.py +301 -0
  790. vllm/model_executor/model_loader/weight_utils.py +781 -0
  791. vllm/model_executor/models/__init__.py +27 -0
  792. vllm/model_executor/models/adapters.py +247 -0
  793. vllm/model_executor/models/aimv2.py +199 -0
  794. vllm/model_executor/models/arctic.py +558 -0
  795. vllm/model_executor/models/aria.py +656 -0
  796. vllm/model_executor/models/aya_vision.py +461 -0
  797. vllm/model_executor/models/baichuan.py +473 -0
  798. vllm/model_executor/models/bamba.py +542 -0
  799. vllm/model_executor/models/bart.py +937 -0
  800. vllm/model_executor/models/bert.py +517 -0
  801. vllm/model_executor/models/bert_with_rope.py +714 -0
  802. vllm/model_executor/models/blip.py +338 -0
  803. vllm/model_executor/models/blip2.py +717 -0
  804. vllm/model_executor/models/bloom.py +372 -0
  805. vllm/model_executor/models/chameleon.py +1135 -0
  806. vllm/model_executor/models/chatglm.py +477 -0
  807. vllm/model_executor/models/clip.py +411 -0
  808. vllm/model_executor/models/commandr.py +471 -0
  809. vllm/model_executor/models/constant_size_cache.py +136 -0
  810. vllm/model_executor/models/dbrx.py +471 -0
  811. vllm/model_executor/models/deepseek.py +485 -0
  812. vllm/model_executor/models/deepseek_mtp.py +268 -0
  813. vllm/model_executor/models/deepseek_v2.py +842 -0
  814. vllm/model_executor/models/deepseek_vl2.py +647 -0
  815. vllm/model_executor/models/eagle.py +259 -0
  816. vllm/model_executor/models/exaone.py +550 -0
  817. vllm/model_executor/models/fairseq2_llama.py +153 -0
  818. vllm/model_executor/models/falcon.py +509 -0
  819. vllm/model_executor/models/falcon_h1.py +684 -0
  820. vllm/model_executor/models/florence2.py +1102 -0
  821. vllm/model_executor/models/fuyu.py +388 -0
  822. vllm/model_executor/models/gemma.py +424 -0
  823. vllm/model_executor/models/gemma2.py +424 -0
  824. vllm/model_executor/models/gemma3.py +532 -0
  825. vllm/model_executor/models/gemma3_mm.py +708 -0
  826. vllm/model_executor/models/glm.py +22 -0
  827. vllm/model_executor/models/glm4.py +304 -0
  828. vllm/model_executor/models/glm4v.py +647 -0
  829. vllm/model_executor/models/gpt2.py +327 -0
  830. vllm/model_executor/models/gpt_bigcode.py +334 -0
  831. vllm/model_executor/models/gpt_j.py +338 -0
  832. vllm/model_executor/models/gpt_neox.py +331 -0
  833. vllm/model_executor/models/granite.py +492 -0
  834. vllm/model_executor/models/granite_speech.py +778 -0
  835. vllm/model_executor/models/granitemoe.py +436 -0
  836. vllm/model_executor/models/granitemoehybrid.py +585 -0
  837. vllm/model_executor/models/granitemoeshared.py +340 -0
  838. vllm/model_executor/models/gritlm.py +223 -0
  839. vllm/model_executor/models/grok1.py +545 -0
  840. vllm/model_executor/models/h2ovl.py +545 -0
  841. vllm/model_executor/models/idefics2_vision_model.py +388 -0
  842. vllm/model_executor/models/idefics3.py +767 -0
  843. vllm/model_executor/models/interfaces.py +571 -0
  844. vllm/model_executor/models/interfaces_base.py +163 -0
  845. vllm/model_executor/models/intern_vit.py +475 -0
  846. vllm/model_executor/models/internlm2.py +454 -0
  847. vllm/model_executor/models/internlm2_ve.py +146 -0
  848. vllm/model_executor/models/internvl.py +1405 -0
  849. vllm/model_executor/models/jais.py +372 -0
  850. vllm/model_executor/models/jamba.py +591 -0
  851. vllm/model_executor/models/kimi_vl.py +576 -0
  852. vllm/model_executor/models/llama.py +643 -0
  853. vllm/model_executor/models/llama4.py +531 -0
  854. vllm/model_executor/models/llama_eagle.py +166 -0
  855. vllm/model_executor/models/llama_eagle3.py +257 -0
  856. vllm/model_executor/models/llava.py +865 -0
  857. vllm/model_executor/models/llava_next.py +585 -0
  858. vllm/model_executor/models/llava_next_video.py +470 -0
  859. vllm/model_executor/models/llava_onevision.py +955 -0
  860. vllm/model_executor/models/mamba.py +272 -0
  861. vllm/model_executor/models/mamba2.py +302 -0
  862. vllm/model_executor/models/mamba_cache.py +75 -0
  863. vllm/model_executor/models/medusa.py +218 -0
  864. vllm/model_executor/models/mimo.py +191 -0
  865. vllm/model_executor/models/mimo_mtp.py +284 -0
  866. vllm/model_executor/models/minicpm.py +590 -0
  867. vllm/model_executor/models/minicpm3.py +229 -0
  868. vllm/model_executor/models/minicpmo.py +758 -0
  869. vllm/model_executor/models/minicpmv.py +1286 -0
  870. vllm/model_executor/models/minimax_cache.py +35 -0
  871. vllm/model_executor/models/minimax_text_01.py +1303 -0
  872. vllm/model_executor/models/minimax_vl_01.py +363 -0
  873. vllm/model_executor/models/mistral3.py +603 -0
  874. vllm/model_executor/models/mixtral.py +487 -0
  875. vllm/model_executor/models/mixtral_quant.py +452 -0
  876. vllm/model_executor/models/mllama.py +1623 -0
  877. vllm/model_executor/models/mllama4.py +838 -0
  878. vllm/model_executor/models/mlp_speculator.py +205 -0
  879. vllm/model_executor/models/modernbert.py +329 -0
  880. vllm/model_executor/models/module_mapping.py +71 -0
  881. vllm/model_executor/models/molmo.py +1567 -0
  882. vllm/model_executor/models/moonvit.py +629 -0
  883. vllm/model_executor/models/mpt.py +330 -0
  884. vllm/model_executor/models/nemotron.py +507 -0
  885. vllm/model_executor/models/nemotron_nas.py +483 -0
  886. vllm/model_executor/models/nvlm_d.py +215 -0
  887. vllm/model_executor/models/olmo.py +388 -0
  888. vllm/model_executor/models/olmo2.py +413 -0
  889. vllm/model_executor/models/olmoe.py +446 -0
  890. vllm/model_executor/models/opt.py +411 -0
  891. vllm/model_executor/models/orion.py +348 -0
  892. vllm/model_executor/models/ovis.py +554 -0
  893. vllm/model_executor/models/paligemma.py +397 -0
  894. vllm/model_executor/models/persimmon.py +343 -0
  895. vllm/model_executor/models/phi.py +355 -0
  896. vllm/model_executor/models/phi3.py +18 -0
  897. vllm/model_executor/models/phi3_small.py +464 -0
  898. vllm/model_executor/models/phi3v.py +722 -0
  899. vllm/model_executor/models/phi4mm.py +1245 -0
  900. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  901. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  902. vllm/model_executor/models/phimoe.py +664 -0
  903. vllm/model_executor/models/pixtral.py +1315 -0
  904. vllm/model_executor/models/plamo2.py +737 -0
  905. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  906. vllm/model_executor/models/qwen.py +361 -0
  907. vllm/model_executor/models/qwen2.py +567 -0
  908. vllm/model_executor/models/qwen2_5_omni_thinker.py +903 -0
  909. vllm/model_executor/models/qwen2_5_vl.py +1171 -0
  910. vllm/model_executor/models/qwen2_audio.py +409 -0
  911. vllm/model_executor/models/qwen2_moe.py +539 -0
  912. vllm/model_executor/models/qwen2_rm.py +131 -0
  913. vllm/model_executor/models/qwen2_vl.py +1410 -0
  914. vllm/model_executor/models/qwen3.py +320 -0
  915. vllm/model_executor/models/qwen3_moe.py +534 -0
  916. vllm/model_executor/models/qwen_vl.py +784 -0
  917. vllm/model_executor/models/registry.py +618 -0
  918. vllm/model_executor/models/roberta.py +273 -0
  919. vllm/model_executor/models/siglip.py +523 -0
  920. vllm/model_executor/models/skyworkr1v.py +950 -0
  921. vllm/model_executor/models/smolvlm.py +51 -0
  922. vllm/model_executor/models/solar.py +505 -0
  923. vllm/model_executor/models/stablelm.py +342 -0
  924. vllm/model_executor/models/starcoder2.py +355 -0
  925. vllm/model_executor/models/telechat2.py +139 -0
  926. vllm/model_executor/models/teleflm.py +78 -0
  927. vllm/model_executor/models/transformers.py +507 -0
  928. vllm/model_executor/models/ultravox.py +655 -0
  929. vllm/model_executor/models/utils.py +730 -0
  930. vllm/model_executor/models/vision.py +146 -0
  931. vllm/model_executor/models/whisper.py +746 -0
  932. vllm/model_executor/models/zamba2.py +1008 -0
  933. vllm/model_executor/parameter.py +458 -0
  934. vllm/model_executor/pooling_metadata.py +71 -0
  935. vllm/model_executor/sampling_metadata.py +596 -0
  936. vllm/model_executor/utils.py +53 -0
  937. vllm/multimodal/__init__.py +32 -0
  938. vllm/multimodal/audio.py +105 -0
  939. vllm/multimodal/base.py +218 -0
  940. vllm/multimodal/hasher.py +117 -0
  941. vllm/multimodal/image.py +96 -0
  942. vllm/multimodal/inputs.py +872 -0
  943. vllm/multimodal/parse.py +460 -0
  944. vllm/multimodal/processing.py +1894 -0
  945. vllm/multimodal/profiling.py +273 -0
  946. vllm/multimodal/registry.py +330 -0
  947. vllm/multimodal/utils.py +392 -0
  948. vllm/multimodal/video.py +197 -0
  949. vllm/outputs.py +525 -0
  950. vllm/platforms/__init__.py +290 -0
  951. vllm/platforms/cpu.py +205 -0
  952. vllm/platforms/cuda.py +461 -0
  953. vllm/platforms/hpu.py +105 -0
  954. vllm/platforms/interface.py +492 -0
  955. vllm/platforms/neuron.py +152 -0
  956. vllm/platforms/rocm.py +388 -0
  957. vllm/platforms/tpu.py +215 -0
  958. vllm/platforms/xpu.py +155 -0
  959. vllm/plugins/__init__.py +86 -0
  960. vllm/plugins/lora_resolvers/README.md +15 -0
  961. vllm/plugins/lora_resolvers/__init__.py +0 -0
  962. vllm/plugins/lora_resolvers/filesystem_resolver.py +49 -0
  963. vllm/pooling_params.py +53 -0
  964. vllm/profiler/__init__.py +0 -0
  965. vllm/profiler/layerwise_profile.py +374 -0
  966. vllm/profiler/utils.py +147 -0
  967. vllm/prompt_adapter/__init__.py +0 -0
  968. vllm/prompt_adapter/layers.py +82 -0
  969. vllm/prompt_adapter/models.py +357 -0
  970. vllm/prompt_adapter/request.py +36 -0
  971. vllm/prompt_adapter/utils.py +97 -0
  972. vllm/prompt_adapter/worker_manager.py +178 -0
  973. vllm/py.typed +2 -0
  974. vllm/reasoning/__init__.py +14 -0
  975. vllm/reasoning/abs_reasoning_parsers.py +191 -0
  976. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  977. vllm/reasoning/granite_reasoning_parser.py +362 -0
  978. vllm/reasoning/qwen3_reasoning_parser.py +150 -0
  979. vllm/sampling_params.py +590 -0
  980. vllm/scalar_type.py +346 -0
  981. vllm/scripts.py +14 -0
  982. vllm/sequence.py +1567 -0
  983. vllm/spec_decode/__init__.py +0 -0
  984. vllm/spec_decode/batch_expansion.py +505 -0
  985. vllm/spec_decode/draft_model_runner.py +349 -0
  986. vllm/spec_decode/interfaces.py +98 -0
  987. vllm/spec_decode/medusa_worker.py +137 -0
  988. vllm/spec_decode/metrics.py +212 -0
  989. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  990. vllm/spec_decode/mqa_scorer.py +159 -0
  991. vllm/spec_decode/multi_step_worker.py +422 -0
  992. vllm/spec_decode/ngram_worker.py +195 -0
  993. vllm/spec_decode/proposer_worker_base.py +58 -0
  994. vllm/spec_decode/smaller_tp_proposer_worker.py +195 -0
  995. vllm/spec_decode/spec_decode_worker.py +1325 -0
  996. vllm/spec_decode/target_model_runner.py +44 -0
  997. vllm/spec_decode/top1_proposer.py +274 -0
  998. vllm/spec_decode/util.py +276 -0
  999. vllm/test_utils.py +129 -0
  1000. vllm/third_party/__init__.py +0 -0
  1001. vllm/third_party/pynvml.py +6139 -0
  1002. vllm/tracing.py +130 -0
  1003. vllm/transformers_utils/__init__.py +23 -0
  1004. vllm/transformers_utils/chat_templates/__init__.py +4 -0
  1005. vllm/transformers_utils/chat_templates/registry.py +59 -0
  1006. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1007. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1008. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1009. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1010. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1011. vllm/transformers_utils/config.py +835 -0
  1012. vllm/transformers_utils/configs/__init__.py +58 -0
  1013. vllm/transformers_utils/configs/arctic.py +206 -0
  1014. vllm/transformers_utils/configs/chatglm.py +71 -0
  1015. vllm/transformers_utils/configs/cohere2.py +194 -0
  1016. vllm/transformers_utils/configs/dbrx.py +279 -0
  1017. vllm/transformers_utils/configs/deepseek_vl2.py +215 -0
  1018. vllm/transformers_utils/configs/eagle.py +84 -0
  1019. vllm/transformers_utils/configs/exaone.py +189 -0
  1020. vllm/transformers_utils/configs/falcon.py +89 -0
  1021. vllm/transformers_utils/configs/h2ovl.py +15 -0
  1022. vllm/transformers_utils/configs/internvl.py +53 -0
  1023. vllm/transformers_utils/configs/jais.py +237 -0
  1024. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  1025. vllm/transformers_utils/configs/medusa.py +62 -0
  1026. vllm/transformers_utils/configs/minimax_text_01.py +69 -0
  1027. vllm/transformers_utils/configs/minimax_vl_01.py +70 -0
  1028. vllm/transformers_utils/configs/mllama.py +30 -0
  1029. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  1030. vllm/transformers_utils/configs/moonvit.py +32 -0
  1031. vllm/transformers_utils/configs/mpt.py +179 -0
  1032. vllm/transformers_utils/configs/nemotron.py +204 -0
  1033. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  1034. vllm/transformers_utils/configs/ovis.py +183 -0
  1035. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  1036. vllm/transformers_utils/configs/solar.py +246 -0
  1037. vllm/transformers_utils/configs/telechat2.py +63 -0
  1038. vllm/transformers_utils/configs/ultravox.py +107 -0
  1039. vllm/transformers_utils/detokenizer.py +167 -0
  1040. vllm/transformers_utils/detokenizer_utils.py +188 -0
  1041. vllm/transformers_utils/processor.py +220 -0
  1042. vllm/transformers_utils/processors/__init__.py +7 -0
  1043. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1044. vllm/transformers_utils/processors/ovis.py +419 -0
  1045. vllm/transformers_utils/s3_utils.py +161 -0
  1046. vllm/transformers_utils/tokenizer.py +301 -0
  1047. vllm/transformers_utils/tokenizer_base.py +148 -0
  1048. vllm/transformers_utils/tokenizer_group.py +119 -0
  1049. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  1050. vllm/transformers_utils/tokenizers/mistral.py +490 -0
  1051. vllm/transformers_utils/utils.py +98 -0
  1052. vllm/triton_utils/__init__.py +13 -0
  1053. vllm/triton_utils/importing.py +49 -0
  1054. vllm/usage/__init__.py +0 -0
  1055. vllm/usage/usage_lib.py +255 -0
  1056. vllm/utils.py +2844 -0
  1057. vllm/v1/__init__.py +0 -0
  1058. vllm/v1/attention/__init__.py +0 -0
  1059. vllm/v1/attention/backends/__init__.py +0 -0
  1060. vllm/v1/attention/backends/flash_attn.py +833 -0
  1061. vllm/v1/attention/backends/flashinfer.py +639 -0
  1062. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1063. vllm/v1/attention/backends/mla/common.py +926 -0
  1064. vllm/v1/attention/backends/mla/flashmla.py +150 -0
  1065. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +221 -0
  1066. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1067. vllm/v1/attention/backends/pallas.py +235 -0
  1068. vllm/v1/attention/backends/triton_attn.py +279 -0
  1069. vllm/v1/attention/backends/utils.py +18 -0
  1070. vllm/v1/core/__init__.py +0 -0
  1071. vllm/v1/core/block_pool.py +328 -0
  1072. vllm/v1/core/encoder_cache_manager.py +149 -0
  1073. vllm/v1/core/kv_cache_manager.py +372 -0
  1074. vllm/v1/core/kv_cache_utils.py +748 -0
  1075. vllm/v1/core/sched/__init__.py +0 -0
  1076. vllm/v1/core/sched/interface.py +143 -0
  1077. vllm/v1/core/sched/output.py +153 -0
  1078. vllm/v1/core/sched/scheduler.py +1015 -0
  1079. vllm/v1/core/sched/utils.py +22 -0
  1080. vllm/v1/core/single_type_kv_cache_manager.py +358 -0
  1081. vllm/v1/engine/__init__.py +171 -0
  1082. vllm/v1/engine/async_llm.py +546 -0
  1083. vllm/v1/engine/core.py +801 -0
  1084. vllm/v1/engine/core_client.py +1020 -0
  1085. vllm/v1/engine/detokenizer.py +260 -0
  1086. vllm/v1/engine/exceptions.py +16 -0
  1087. vllm/v1/engine/llm_engine.py +316 -0
  1088. vllm/v1/engine/logprobs.py +198 -0
  1089. vllm/v1/engine/mm_input_cache.py +90 -0
  1090. vllm/v1/engine/output_processor.py +427 -0
  1091. vllm/v1/engine/parallel_sampling.py +132 -0
  1092. vllm/v1/engine/processor.py +398 -0
  1093. vllm/v1/executor/__init__.py +0 -0
  1094. vllm/v1/executor/abstract.py +112 -0
  1095. vllm/v1/executor/multiproc_executor.py +532 -0
  1096. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1097. vllm/v1/kv_cache_interface.py +208 -0
  1098. vllm/v1/metrics/__init__.py +0 -0
  1099. vllm/v1/metrics/loggers.py +511 -0
  1100. vllm/v1/metrics/ray_wrappers.py +120 -0
  1101. vllm/v1/metrics/reader.py +245 -0
  1102. vllm/v1/metrics/stats.py +238 -0
  1103. vllm/v1/outputs.py +115 -0
  1104. vllm/v1/request.py +191 -0
  1105. vllm/v1/sample/__init__.py +0 -0
  1106. vllm/v1/sample/metadata.py +43 -0
  1107. vllm/v1/sample/ops/__init__.py +0 -0
  1108. vllm/v1/sample/ops/bad_words.py +38 -0
  1109. vllm/v1/sample/ops/penalties.py +58 -0
  1110. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1111. vllm/v1/sample/rejection_sampler.py +630 -0
  1112. vllm/v1/sample/sampler.py +270 -0
  1113. vllm/v1/sample/tpu/__init__.py +0 -0
  1114. vllm/v1/sample/tpu/metadata.py +123 -0
  1115. vllm/v1/sample/tpu/sampler.py +144 -0
  1116. vllm/v1/serial_utils.py +313 -0
  1117. vllm/v1/spec_decode/__init__.py +0 -0
  1118. vllm/v1/spec_decode/eagle.py +424 -0
  1119. vllm/v1/spec_decode/medusa.py +61 -0
  1120. vllm/v1/spec_decode/metadata.py +61 -0
  1121. vllm/v1/spec_decode/metrics.py +177 -0
  1122. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1123. vllm/v1/spec_decode/utils.py +45 -0
  1124. vllm/v1/structured_output/__init__.py +215 -0
  1125. vllm/v1/structured_output/backend_guidance.py +244 -0
  1126. vllm/v1/structured_output/backend_types.py +133 -0
  1127. vllm/v1/structured_output/backend_xgrammar.py +317 -0
  1128. vllm/v1/structured_output/request.py +85 -0
  1129. vllm/v1/structured_output/utils.py +174 -0
  1130. vllm/v1/utils.py +294 -0
  1131. vllm/v1/worker/__init__.py +0 -0
  1132. vllm/v1/worker/block_table.py +139 -0
  1133. vllm/v1/worker/gpu_input_batch.py +680 -0
  1134. vllm/v1/worker/gpu_model_runner.py +2084 -0
  1135. vllm/v1/worker/gpu_worker.py +373 -0
  1136. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1137. vllm/v1/worker/tpu_model_runner.py +1510 -0
  1138. vllm/v1/worker/tpu_worker.py +276 -0
  1139. vllm/v1/worker/utils.py +74 -0
  1140. vllm/v1/worker/worker_base.py +64 -0
  1141. vllm/version.py +40 -0
  1142. vllm/vllm_flash_attn/.gitkeep +0 -0
  1143. vllm/worker/__init__.py +0 -0
  1144. vllm/worker/cache_engine.py +144 -0
  1145. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1146. vllm/worker/cpu_model_runner.py +671 -0
  1147. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1148. vllm/worker/cpu_worker.py +400 -0
  1149. vllm/worker/enc_dec_model_runner.py +555 -0
  1150. vllm/worker/hpu_model_runner.py +2319 -0
  1151. vllm/worker/hpu_worker.py +483 -0
  1152. vllm/worker/model_runner.py +2178 -0
  1153. vllm/worker/model_runner_base.py +281 -0
  1154. vllm/worker/multi_step_hpu_worker.py +122 -0
  1155. vllm/worker/multi_step_model_runner.py +910 -0
  1156. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1157. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1158. vllm/worker/multi_step_tpu_worker.py +107 -0
  1159. vllm/worker/multi_step_worker.py +196 -0
  1160. vllm/worker/neuron_model_runner.py +418 -0
  1161. vllm/worker/neuron_worker.py +158 -0
  1162. vllm/worker/neuronx_distributed_model_runner.py +136 -0
  1163. vllm/worker/pooling_model_runner.py +211 -0
  1164. vllm/worker/tpu_model_runner.py +908 -0
  1165. vllm/worker/tpu_worker.py +336 -0
  1166. vllm/worker/utils.py +52 -0
  1167. vllm/worker/worker.py +574 -0
  1168. vllm/worker/worker_base.py +644 -0
  1169. vllm/worker/xpu_model_runner.py +606 -0
  1170. vllm/worker/xpu_worker.py +185 -0
  1171. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/METADATA +335 -0
  1172. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/RECORD +1175 -0
  1173. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/WHEEL +5 -0
  1174. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/entry_points.txt +5 -0
  1175. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1649 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ # yapf: disable
+ import argparse
+ import dataclasses
+ import json
+ import sys
+ import threading
+ import warnings
+ from dataclasses import MISSING, dataclass, fields, is_dataclass
+ from itertools import permutations
+ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
+                     Type, TypeVar, Union, cast, get_args, get_origin)
+
+ import regex as re
+ import torch
+ from typing_extensions import TypeIs, deprecated
+
+ import vllm.envs as envs
+ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
+                          ConfigFormat, ConfigType, DecodingConfig,
+                          DetailedTraceModules, Device, DeviceConfig,
+                          DistributedExecutorBackend, GuidedDecodingBackend,
+                          GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
+                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
+                          ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
+                          ObservabilityConfig, ParallelConfig, PoolerConfig,
+                          PrefixCachingHashAlgo, PromptAdapterConfig,
+                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
+                          TaskOption, TokenizerMode, TokenizerPoolConfig,
+                          VllmConfig, get_attr_docs, get_field)
+ from vllm.executor.executor_base import ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import QuantizationMethods
+ from vllm.plugins import load_general_plugins
+ from vllm.reasoning import ReasoningParserManager
+ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
+ from vllm.transformers_utils.utils import check_gguf_file
+ from vllm.usage.usage_lib import UsageContext
+ from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
+                         GiB_bytes, is_in_doc_build, is_in_ray_actor)
+
+ # yapf: enable
+
+ logger = init_logger(__name__)
+
+ # object is used to allow for special typing forms
+ T = TypeVar("T")
+ TypeHint = Union[type[Any], object]
+ TypeHintT = Union[type[T], object]
+
+
+ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
+
+     def _parse_type(val: str) -> T:
+         try:
+             if return_type is json.loads and not re.match("^{.*}$", val):
+                 return cast(T, nullable_kvs(val))
+             return return_type(val)
+         except ValueError as e:
+             raise argparse.ArgumentTypeError(
+                 f"Value {val} cannot be converted to {return_type}.") from e
+
+     return _parse_type
+
+
+ def optional_type(
+         return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
+
+     def _optional_type(val: str) -> Optional[T]:
+         if val == "" or val == "None":
+             return None
+         return parse_type(return_type)(val)
+
+     return _optional_type
+
+
+ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
+     if not re.match("^{.*}$", val):
+         return str(val)
+     return optional_type(json.loads)(val)
+
+
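
The three factories above are what let the vLLM CLI treat an empty string or the literal "None" as a null value, and accept either a raw string or a JSON object for union-typed flags. A minimal illustration of the behavior, traced from the code in the diff (the calls below are ours, not part of the package):

    parse_ratio = optional_type(float)
    parse_ratio("0.9")     # -> 0.9
    parse_ratio("None")    # -> None; "" is treated the same way

    # union_dict_and_str only attempts JSON when the value looks like an object
    union_dict_and_str("auto")         # -> "auto"
    union_dict_and_str('{"a": "b"}')   # -> {"a": "b"}
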
+ @deprecated(
+     "Passing a JSON argument as a string containing comma separated key=value "
+     "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
+     "string instead.")
+ def nullable_kvs(val: str) -> dict[str, int]:
+     """Parses a string containing comma separated key [str] to value [int]
+     pairs into a dictionary.
+
+     Args:
+         val: String value to be parsed.
+
+     Returns:
+         Dictionary with parsed values.
+     """
+     out_dict: dict[str, int] = {}
+     for item in val.split(","):
+         kv_parts = [part.lower().strip() for part in item.split("=")]
+         if len(kv_parts) != 2:
+             raise argparse.ArgumentTypeError(
+                 "Each item should be in the form KEY=VALUE")
+         key, value = kv_parts
+
+         try:
+             parsed_value = int(value)
+         except ValueError as exc:
+             msg = f"Failed to parse value of item {key}={value}"
+             raise argparse.ArgumentTypeError(msg) from exc
+
+         if key in out_dict and out_dict[key] != parsed_value:
+             raise argparse.ArgumentTypeError(
+                 f"Conflicting values specified for key: {key}")
+         out_dict[key] = parsed_value
+
+     return out_dict
+
+
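
For reference, the deprecated comma-separated form that nullable_kvs still accepts until v0.10.0 behaves like this (illustrative calls, not from the diff):

    nullable_kvs("image=16,video=2")   # -> {"image": 16, "video": 2}
    nullable_kvs("image=16,image=8")   # ArgumentTypeError: conflicting values
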
+ def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
+     """Check if the type hint is a specific type."""
+     return type_hint is type or get_origin(type_hint) is type
+
+
+ def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool:
+     """Check if the type hints contain a specific type."""
+     return any(is_type(type_hint, type) for type_hint in type_hints)
+
+
+ def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT:
+     """Get the specific type from the type hints."""
+     return next((th for th in type_hints if is_type(th, type)), None)
+
+
+ def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]:
+     """Convert Literal type hints to argparse kwargs."""
+     type_hint = get_type(type_hints, Literal)
+     choices = get_args(type_hint)
+     choice_type = type(choices[0])
+     if not all(isinstance(choice, choice_type) for choice in choices):
+         raise ValueError(
+             "All choices must be of the same type. "
+             f"Got {choices} with types {[type(c) for c in choices]}")
+     return {"type": choice_type, "choices": sorted(choices)}
+
+
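
A quick sketch of how these helpers resolve a hint set containing a Literal; the hint set below is hypothetical, but the behavior follows directly from the functions above (is_type matches Literal aliases via get_origin, and literal_to_kwargs sorts the choices):

    hints = {Literal["awq", "gptq", "fp8"], type(None)}
    contains_type(hints, Literal)   # -> True
    literal_to_kwargs(hints)        # -> {"type": str, "choices": ["awq", "fp8", "gptq"]}
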
+ def is_not_builtin(type_hint: TypeHint) -> bool:
+     """Check if the class is not a built-in type."""
+     return type_hint.__module__ != "builtins"
+
+
+ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
+     cls_docs = get_attr_docs(cls)
+     kwargs = {}
+     for field in fields(cls):
+         # Get the set of possible types for the field
+         type_hints: set[TypeHint] = set()
+         if get_origin(field.type) in {Union, Annotated}:
+             type_hints.update(get_args(field.type))
+         else:
+             type_hints.add(field.type)
+
+         # If the field is a dataclass, we can use the model_validate_json
+         generator = (th for th in type_hints if is_dataclass(th))
+         dataclass_cls = next(generator, None)
+
+         # Get the default value of the field
+         if field.default is not MISSING:
+             default = field.default
+         elif field.default_factory is not MISSING:
+             if is_dataclass(field.default_factory) and is_in_doc_build():
+                 default = {}
+             else:
+                 default = field.default_factory()
+
+         # Get the help text for the field
+         name = field.name
+         help = cls_docs[name].strip()
+         # Escape % for argparse
+         help = help.replace("%", "%%")
+
+         # Initialise the kwargs dictionary for the field
+         kwargs[name] = {"default": default, "help": help}
+
+         # Set other kwargs based on the type hints
+         json_tip = """\n\nShould either be a valid JSON string or JSON keys
+         passed individually. For example, the following sets of arguments are
+         equivalent:\n\n
+         - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
+         - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n"""
+         if dataclass_cls is not None:
+             dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x))
+             # Special case for configs with a from_cli method
+             if hasattr(dataclass_cls, "from_cli"):
+                 from_cli = dataclass_cls.from_cli
+                 dataclass_init = lambda x, f=from_cli: f(x)
+             kwargs[name]["type"] = dataclass_init
+             kwargs[name]["help"] += json_tip
+         elif contains_type(type_hints, bool):
+             # Creates --no-<name> and --<name> flags
+             kwargs[name]["action"] = argparse.BooleanOptionalAction
+         elif contains_type(type_hints, Literal):
+             kwargs[name].update(literal_to_kwargs(type_hints))
+         elif contains_type(type_hints, tuple):
+             type_hint = get_type(type_hints, tuple)
+             types = get_args(type_hint)
+             tuple_type = types[0]
+             assert all(t is tuple_type for t in types if t is not Ellipsis), (
+                 "All non-Ellipsis tuple elements must be of the same "
+                 f"type. Got {types}.")
+             kwargs[name]["type"] = tuple_type
+             kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types)
+         elif contains_type(type_hints, list):
+             type_hint = get_type(type_hints, list)
+             types = get_args(type_hint)
+             assert len(types) == 1, (
+                 "List type must have exactly one type. Got "
+                 f"{type_hint} with types {types}")
+             kwargs[name]["type"] = types[0]
+             kwargs[name]["nargs"] = "+"
+         elif contains_type(type_hints, int):
+             kwargs[name]["type"] = int
+             # Special case for large integers
+             if name in {"max_model_len"}:
+                 kwargs[name]["type"] = human_readable_int
+         elif contains_type(type_hints, float):
+             kwargs[name]["type"] = float
+         elif contains_type(type_hints,
+                            dict) and (contains_type(type_hints, str) or any(
+                                is_not_builtin(th) for th in type_hints)):
+             kwargs[name]["type"] = union_dict_and_str
+         elif contains_type(type_hints, dict):
+             # Dict arguments will always be optional
+             kwargs[name]["type"] = parse_type(json.loads)
+             kwargs[name]["help"] += json_tip
+         elif (contains_type(type_hints, str)
+               or any(is_not_builtin(th) for th in type_hints)):
+             kwargs[name]["type"] = str
+         else:
+             raise ValueError(
+                 f"Unsupported type {type_hints} for argument {name}.")
+
+         # If the type hint was a sequence of literals, use the helper function
+         # to update the type and choices
+         if get_origin(kwargs[name].get("type")) is Literal:
+             kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]}))
+
+         # If None is in type_hints, make the argument optional.
+         # But not if it's a bool, argparse will handle this better.
+         if type(None) in type_hints and not contains_type(type_hints, bool):
+             kwargs[name]["type"] = optional_type(kwargs[name]["type"])
+             if kwargs[name].get("choices"):
+                 kwargs[name]["choices"].append("None")
+     return kwargs
+
+
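
get_kwargs is the bridge between the config dataclasses and argparse: each dataclass field becomes a flag whose type, default, choices, and help text are derived from the field definition and its attribute docstring (via get_attr_docs). The usage pattern below mirrors what add_cli_args does further down in this file; a plain argparse.ArgumentParser stands in for vLLM's FlexibleArgumentParser:

    parser = argparse.ArgumentParser()
    kwargs = get_kwargs(ModelConfig)
    # A bool field arrives with action=BooleanOptionalAction, so both
    # --enforce-eager and --no-enforce-eager are generated; a Literal
    # field arrives with its type and sorted choices filled in.
    parser.add_argument("--enforce-eager", **kwargs["enforce_eager"])
    parser.add_argument("--tokenizer-mode", **kwargs["tokenizer_mode"])
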
257
+ @dataclass
258
+ class EngineArgs:
259
+ """Arguments for vLLM engine."""
260
+ model: str = ModelConfig.model
261
+ served_model_name: Optional[Union[
262
+ str, List[str]]] = ModelConfig.served_model_name
263
+ tokenizer: Optional[str] = ModelConfig.tokenizer
264
+ hf_config_path: Optional[str] = ModelConfig.hf_config_path
265
+ task: TaskOption = ModelConfig.task
266
+ skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
267
+ enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
268
+ tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
269
+ trust_remote_code: bool = ModelConfig.trust_remote_code
270
+ allowed_local_media_path: str = ModelConfig.allowed_local_media_path
271
+ download_dir: Optional[str] = LoadConfig.download_dir
272
+ load_format: str = LoadConfig.load_format
273
+ config_format: str = ModelConfig.config_format
274
+ dtype: ModelDType = ModelConfig.dtype
275
+ kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
276
+ seed: Optional[int] = ModelConfig.seed
277
+ max_model_len: Optional[int] = ModelConfig.max_model_len
278
+ cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
279
+ "cuda_graph_sizes")
280
+ # Note: Specifying a custom executor backend by passing a class
281
+ # is intended for expert use only. The API may change without
282
+ # notice.
283
+ distributed_executor_backend: Optional[Union[
284
+ DistributedExecutorBackend,
285
+ Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
286
+ # number of P/D disaggregation (or other disaggregation) workers
287
+ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
288
+ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
289
+ data_parallel_size: int = ParallelConfig.data_parallel_size
290
+ data_parallel_size_local: Optional[int] = None
291
+ data_parallel_address: Optional[str] = None
292
+ data_parallel_rpc_port: Optional[int] = None
293
+ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
294
+ max_parallel_loading_workers: Optional[
295
+ int] = ParallelConfig.max_parallel_loading_workers
296
+ block_size: Optional[BlockSize] = CacheConfig.block_size
297
+ enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
298
+ prefix_caching_hash_algo: PrefixCachingHashAlgo = \
299
+ CacheConfig.prefix_caching_hash_algo
300
+ disable_sliding_window: bool = ModelConfig.disable_sliding_window
301
+ disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
302
+ use_v2_block_manager: bool = True
303
+ swap_space: float = CacheConfig.swap_space
304
+ cpu_offload_gb: float = CacheConfig.cpu_offload_gb
305
+ gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
306
+ max_num_batched_tokens: Optional[
307
+ int] = SchedulerConfig.max_num_batched_tokens
308
+ max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
309
+ max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
310
+ long_prefill_token_threshold: int = \
311
+ SchedulerConfig.long_prefill_token_threshold
312
+ max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
313
+ max_logprobs: int = ModelConfig.max_logprobs
314
+ disable_log_stats: bool = False
315
+ revision: Optional[str] = ModelConfig.revision
316
+ code_revision: Optional[str] = ModelConfig.code_revision
317
+ rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
318
+ rope_theta: Optional[float] = ModelConfig.rope_theta
319
+ hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
320
+ hf_overrides: Optional[HfOverrides] = \
321
+ get_field(ModelConfig, "hf_overrides")
322
+ tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
323
+ quantization: Optional[QuantizationMethods] = ModelConfig.quantization
324
+ enforce_eager: bool = ModelConfig.enforce_eager
325
+ max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
326
+ disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
327
+ # The following three fields are deprecated and will be removed in a future
328
+ # release. Setting them will have no effect. Please remove them from your
329
+ # configurations.
330
+ tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
331
+ tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
332
+ tokenizer_pool_extra_config: dict = \
333
+ get_field(TokenizerPoolConfig, "extra_config")
334
+ limit_mm_per_prompt: dict[str, int] = \
+ get_field(MultiModalConfig, "limit_per_prompt")
+ mm_processor_kwargs: Optional[Dict[str, Any]] = \
+ MultiModalConfig.mm_processor_kwargs
+ disable_mm_preprocessor_cache: bool = \
+ MultiModalConfig.disable_mm_preprocessor_cache
+ # LoRA fields
+ enable_lora: bool = False
+ enable_lora_bias: bool = LoRAConfig.bias_enabled
+ max_loras: int = LoRAConfig.max_loras
+ max_lora_rank: int = LoRAConfig.max_lora_rank
+ fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
+ max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
+ lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
+ lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
+ long_lora_scaling_factors: Optional[tuple[float, ...]] = \
+ LoRAConfig.long_lora_scaling_factors
+ # PromptAdapter fields
+ enable_prompt_adapter: bool = False
+ max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters
+ max_prompt_adapter_token: int = \
+ PromptAdapterConfig.max_prompt_adapter_token
+
+ device: Device = DeviceConfig.device
+ num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
+ multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
+ ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
+ num_gpu_blocks_override: Optional[
+ int] = CacheConfig.num_gpu_blocks_override
+ num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
+ model_loader_extra_config: dict = \
+ get_field(LoadConfig, "model_loader_extra_config")
+ ignore_patterns: Optional[Union[str,
+ List[str]]] = LoadConfig.ignore_patterns
+ preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
+
+ scheduler_delay_factor: float = SchedulerConfig.delay_factor
+ enable_chunked_prefill: Optional[
+ bool] = SchedulerConfig.enable_chunked_prefill
+ disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
+
+ guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
+ guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
+ guided_decoding_disable_any_whitespace: bool = \
+ DecodingConfig.disable_any_whitespace
+ guided_decoding_disable_additional_properties: bool = \
+ DecodingConfig.disable_additional_properties
+ logits_processor_pattern: Optional[
+ str] = ModelConfig.logits_processor_pattern
+
+ speculative_config: Optional[Dict[str, Any]] = None
+
+ qlora_adapter_name_or_path: Optional[str] = None
+ show_hidden_metrics_for_version: Optional[str] = \
+ ObservabilityConfig.show_hidden_metrics_for_version
+ otlp_traces_endpoint: Optional[str] = \
+ ObservabilityConfig.otlp_traces_endpoint
+ collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
+ ObservabilityConfig.collect_detailed_traces
+ disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
+ scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
+ scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
+
+ override_neuron_config: dict[str, Any] = \
+ get_field(ModelConfig, "override_neuron_config")
+ override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
+ ModelConfig.override_pooler_config
+ compilation_config: Optional[CompilationConfig] = None
+ worker_cls: str = ParallelConfig.worker_cls
+ worker_extension_cls: str = ParallelConfig.worker_extension_cls
+
+ kv_transfer_config: Optional[KVTransferConfig] = None
+ kv_events_config: Optional[KVEventsConfig] = None
+
+ generation_config: str = ModelConfig.generation_config
+ enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
+ override_generation_config: dict[str, Any] = \
+ get_field(ModelConfig, "override_generation_config")
+ model_impl: str = ModelConfig.model_impl
+
+ calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
+
+ additional_config: Optional[Dict[str, Any]] = None
+ enable_reasoning: Optional[bool] = None # DEPRECATED
+ reasoning_parser: str = DecodingConfig.reasoning_backend
+
+ use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
+ pt_load_map_location: str = LoadConfig.pt_load_map_location
+
+ def __post_init__(self):
+ # support `EngineArgs(compilation_config={...})`
+ # without having to manually construct a
+ # CompilationConfig object
+ if isinstance(self.compilation_config, (int, dict)):
+ self.compilation_config = CompilationConfig.from_cli(
+ str(self.compilation_config))
+ if self.qlora_adapter_name_or_path is not None:
+ warnings.warn(
+ "The `qlora_adapter_name_or_path` is deprecated "
+ "and will be removed in v0.10.0. ",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ # Setup plugins
+ from vllm.plugins import load_general_plugins
+ load_general_plugins()
+
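+ # Usage sketch (illustrative values): `compilation_config` may be passed
+ # as an int level or a dict; __post_init__ above coerces it via
+ # CompilationConfig.from_cli:
+ #     args = EngineArgs(model="facebook/opt-125m", compilation_config=3)
+ #     assert isinstance(args.compilation_config, CompilationConfig)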
+ @staticmethod
+ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+ """Shared CLI arguments for vLLM engine."""
+
+ # Model arguments
+ model_kwargs = get_kwargs(ModelConfig)
+ model_group = parser.add_argument_group(
+ title="ModelConfig",
+ description=ModelConfig.__doc__,
+ )
+ if 'serve' not in sys.argv[1:] and '--help' not in sys.argv[1:]:
+ model_group.add_argument("--model", **model_kwargs["model"])
+ model_group.add_argument("--task", **model_kwargs["task"])
+ model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
+ model_group.add_argument("--tokenizer-mode",
+ **model_kwargs["tokenizer_mode"])
+ model_group.add_argument("--trust-remote-code",
+ **model_kwargs["trust_remote_code"])
+ model_group.add_argument("--dtype", **model_kwargs["dtype"])
+ model_group.add_argument("--seed", **model_kwargs["seed"])
+ model_group.add_argument("--hf-config-path",
+ **model_kwargs["hf_config_path"])
+ model_group.add_argument("--allowed-local-media-path",
+ **model_kwargs["allowed_local_media_path"])
+ model_group.add_argument("--revision", **model_kwargs["revision"])
+ model_group.add_argument("--code-revision",
+ **model_kwargs["code_revision"])
+ model_group.add_argument("--rope-scaling",
+ **model_kwargs["rope_scaling"])
+ model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
+ model_group.add_argument("--tokenizer-revision",
+ **model_kwargs["tokenizer_revision"])
+ model_group.add_argument("--max-model-len",
+ **model_kwargs["max_model_len"])
+ model_group.add_argument("--quantization", "-q",
+ **model_kwargs["quantization"])
+ model_group.add_argument("--enforce-eager",
+ **model_kwargs["enforce_eager"])
+ model_group.add_argument("--max-seq-len-to-capture",
+ **model_kwargs["max_seq_len_to_capture"])
+ model_group.add_argument("--max-logprobs",
+ **model_kwargs["max_logprobs"])
+ model_group.add_argument("--disable-sliding-window",
+ **model_kwargs["disable_sliding_window"])
+ model_group.add_argument("--disable-cascade-attn",
+ **model_kwargs["disable_cascade_attn"])
+ model_group.add_argument("--skip-tokenizer-init",
+ **model_kwargs["skip_tokenizer_init"])
+ model_group.add_argument("--enable-prompt-embeds",
+ **model_kwargs["enable_prompt_embeds"])
+ model_group.add_argument("--served-model-name",
+ **model_kwargs["served_model_name"])
+ # This one is a special case because it is the
+ # opposite of ModelConfig.use_async_output_proc
+ model_group.add_argument(
+ "--disable-async-output-proc",
+ action="store_true",
+ default=EngineArgs.disable_async_output_proc,
+ help="Disable async output processing. This may result in "
+ "lower performance.")
+ model_group.add_argument("--config-format",
+ choices=[f.value for f in ConfigFormat],
+ **model_kwargs["config_format"])
+ # This one is a special case because it can be a bool
+ # or a str. TODO: Handle this in get_kwargs
+ model_group.add_argument("--hf-token",
+ type=str,
+ nargs="?",
+ const=True,
+ default=model_kwargs["hf_token"]["default"],
+ help=model_kwargs["hf_token"]["help"])
+ model_group.add_argument("--hf-overrides",
+ **model_kwargs["hf_overrides"])
+ model_group.add_argument("--override-neuron-config",
+ **model_kwargs["override_neuron_config"])
+ model_group.add_argument("--override-pooler-config",
+ **model_kwargs["override_pooler_config"])
+ model_group.add_argument("--logits-processor-pattern",
+ **model_kwargs["logits_processor_pattern"])
+ model_group.add_argument("--generation-config",
+ **model_kwargs["generation_config"])
+ model_group.add_argument("--override-generation-config",
+ **model_kwargs["override_generation_config"])
+ model_group.add_argument("--enable-sleep-mode",
+ **model_kwargs["enable_sleep_mode"])
+ model_group.add_argument("--model-impl",
+ choices=[f.value for f in ModelImpl],
+ **model_kwargs["model_impl"])
+
+ # Model loading arguments
+ load_kwargs = get_kwargs(LoadConfig)
+ load_group = parser.add_argument_group(
+ title="LoadConfig",
+ description=LoadConfig.__doc__,
+ )
+ load_group.add_argument("--load-format",
+ choices=[f.value for f in LoadFormat],
+ **load_kwargs["load_format"])
+ load_group.add_argument("--download-dir",
+ **load_kwargs["download_dir"])
+ load_group.add_argument("--model-loader-extra-config",
+ **load_kwargs["model_loader_extra_config"])
+ load_group.add_argument("--ignore-patterns",
+ **load_kwargs["ignore_patterns"])
+ load_group.add_argument("--use-tqdm-on-load",
+ **load_kwargs["use_tqdm_on_load"])
+ load_group.add_argument(
+ "--qlora-adapter-name-or-path",
+ type=str,
+ default=None,
+ help="The `--qlora-adapter-name-or-path` has no effect, do not set"
+ " it, and it will be removed in v0.10.0.",
+ deprecated=True,
+ )
+ load_group.add_argument('--pt-load-map-location',
+ **load_kwargs["pt_load_map_location"])
+
+ # Guided decoding arguments
+ guided_decoding_kwargs = get_kwargs(DecodingConfig)
+ guided_decoding_group = parser.add_argument_group(
+ title="DecodingConfig",
+ description=DecodingConfig.__doc__,
+ )
+ guided_decoding_group.add_argument("--guided-decoding-backend",
+ **guided_decoding_kwargs["backend"])
+ guided_decoding_group.add_argument(
+ "--guided-decoding-disable-fallback",
+ **guided_decoding_kwargs["disable_fallback"])
+ guided_decoding_group.add_argument(
+ "--guided-decoding-disable-any-whitespace",
+ **guided_decoding_kwargs["disable_any_whitespace"])
+ guided_decoding_group.add_argument(
+ "--guided-decoding-disable-additional-properties",
+ **guided_decoding_kwargs["disable_additional_properties"])
+ guided_decoding_group.add_argument(
+ "--enable-reasoning",
+ action=argparse.BooleanOptionalAction,
+ deprecated=True,
+ help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
+ "of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
+ "parser backend instead. This flag (`--enable-reasoning`) will be "
+ "removed in v0.10.0. When `--reasoning-parser` is specified, "
+ "reasoning mode is automatically enabled.")
+ guided_decoding_group.add_argument(
+ "--reasoning-parser",
+ # These choices are a special case because they are not static
+ choices=list(ReasoningParserManager.reasoning_parsers),
+ **guided_decoding_kwargs["reasoning_backend"])
+
+ # Parallel arguments
+ parallel_kwargs = get_kwargs(ParallelConfig)
+ parallel_group = parser.add_argument_group(
+ title="ParallelConfig",
+ description=ParallelConfig.__doc__,
+ )
+ parallel_group.add_argument(
+ "--distributed-executor-backend",
+ **parallel_kwargs["distributed_executor_backend"])
+ parallel_group.add_argument(
+ "--pipeline-parallel-size", "-pp",
+ **parallel_kwargs["pipeline_parallel_size"])
+ parallel_group.add_argument("--tensor-parallel-size", "-tp",
+ **parallel_kwargs["tensor_parallel_size"])
+ parallel_group.add_argument("--data-parallel-size", "-dp",
+ **parallel_kwargs["data_parallel_size"])
+ parallel_group.add_argument('--data-parallel-size-local',
+ '-dpl',
+ type=int,
+ help='Number of data parallel replicas '
+ 'to run on this node.')
+ parallel_group.add_argument('--data-parallel-address',
+ '-dpa',
+ type=str,
+ help='Address of data parallel cluster '
+ 'head-node.')
+ parallel_group.add_argument('--data-parallel-rpc-port',
+ '-dpp',
+ type=int,
+ help='Port for data parallel RPC '
+ 'communication.')
+ parallel_group.add_argument(
+ "--enable-expert-parallel",
+ **parallel_kwargs["enable_expert_parallel"])
+ parallel_group.add_argument(
+ "--max-parallel-loading-workers",
+ **parallel_kwargs["max_parallel_loading_workers"])
+ parallel_group.add_argument(
+ "--ray-workers-use-nsight",
+ **parallel_kwargs["ray_workers_use_nsight"])
+ parallel_group.add_argument(
+ "--disable-custom-all-reduce",
+ **parallel_kwargs["disable_custom_all_reduce"])
+ parallel_group.add_argument("--worker-cls",
+ **parallel_kwargs["worker_cls"])
+ parallel_group.add_argument("--worker-extension-cls",
+ **parallel_kwargs["worker_extension_cls"])
+
+ # KV cache arguments
+ cache_kwargs = get_kwargs(CacheConfig)
+ cache_group = parser.add_argument_group(
+ title="CacheConfig",
+ description=CacheConfig.__doc__,
+ )
+ cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
+ cache_group.add_argument("--gpu-memory-utilization",
+ **cache_kwargs["gpu_memory_utilization"])
+ cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
+ cache_group.add_argument("--kv-cache-dtype",
+ **cache_kwargs["cache_dtype"])
+ cache_group.add_argument("--num-gpu-blocks-override",
+ **cache_kwargs["num_gpu_blocks_override"])
+ cache_group.add_argument("--enable-prefix-caching",
+ **cache_kwargs["enable_prefix_caching"])
+ cache_group.add_argument("--prefix-caching-hash-algo",
+ **cache_kwargs["prefix_caching_hash_algo"])
+ cache_group.add_argument("--cpu-offload-gb",
+ **cache_kwargs["cpu_offload_gb"])
+ cache_group.add_argument("--calculate-kv-scales",
+ **cache_kwargs["calculate_kv_scales"])
+
+ # Tokenizer arguments
+ tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
+ tokenizer_group = parser.add_argument_group(
+ title="TokenizerPoolConfig",
+ description=TokenizerPoolConfig.__doc__,
+ )
+ tokenizer_group.add_argument("--tokenizer-pool-size",
+ **tokenizer_kwargs["pool_size"])
+ tokenizer_group.add_argument("--tokenizer-pool-type",
+ **tokenizer_kwargs["pool_type"])
+ tokenizer_group.add_argument("--tokenizer-pool-extra-config",
+ **tokenizer_kwargs["extra_config"])
+
+ # Multimodal related configs
+ multimodal_kwargs = get_kwargs(MultiModalConfig)
+ multimodal_group = parser.add_argument_group(
+ title="MultiModalConfig",
+ description=MultiModalConfig.__doc__,
+ )
+ multimodal_group.add_argument("--limit-mm-per-prompt",
+ **multimodal_kwargs["limit_per_prompt"])
+ multimodal_group.add_argument(
+ "--mm-processor-kwargs",
+ **multimodal_kwargs["mm_processor_kwargs"])
+ multimodal_group.add_argument(
+ "--disable-mm-preprocessor-cache",
+ **multimodal_kwargs["disable_mm_preprocessor_cache"])
+
+ # LoRA related configs
+ lora_kwargs = get_kwargs(LoRAConfig)
+ lora_group = parser.add_argument_group(
+ title="LoRAConfig",
+ description=LoRAConfig.__doc__,
+ )
+ lora_group.add_argument(
+ "--enable-lora",
+ action=argparse.BooleanOptionalAction,
+ help="If True, enable handling of LoRA adapters.")
+ lora_group.add_argument("--enable-lora-bias",
+ **lora_kwargs["bias_enabled"])
+ lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
+ lora_group.add_argument("--max-lora-rank",
+ **lora_kwargs["max_lora_rank"])
+ lora_group.add_argument("--lora-extra-vocab-size",
+ **lora_kwargs["lora_extra_vocab_size"])
+ lora_group.add_argument(
+ "--lora-dtype",
+ **lora_kwargs["lora_dtype"],
+ )
+ lora_group.add_argument("--long-lora-scaling-factors",
+ **lora_kwargs["long_lora_scaling_factors"])
+ lora_group.add_argument("--max-cpu-loras",
+ **lora_kwargs["max_cpu_loras"])
+ lora_group.add_argument("--fully-sharded-loras",
+ **lora_kwargs["fully_sharded_loras"])
+
+ # PromptAdapter related configs
+ prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig)
+ prompt_adapter_group = parser.add_argument_group(
+ title="PromptAdapterConfig",
+ description=PromptAdapterConfig.__doc__,
+ )
+ prompt_adapter_group.add_argument(
+ "--enable-prompt-adapter",
+ action=argparse.BooleanOptionalAction,
+ help="If True, enable handling of PromptAdapters.")
+ prompt_adapter_group.add_argument(
+ "--max-prompt-adapters",
+ **prompt_adapter_kwargs["max_prompt_adapters"])
+ prompt_adapter_group.add_argument(
+ "--max-prompt-adapter-token",
+ **prompt_adapter_kwargs["max_prompt_adapter_token"])
+
+ # Device arguments
+ device_kwargs = get_kwargs(DeviceConfig)
+ device_group = parser.add_argument_group(
+ title="DeviceConfig",
+ description=DeviceConfig.__doc__,
+ )
+ device_group.add_argument("--device",
+ **device_kwargs["device"],
+ deprecated=True)
+
+ # Speculative arguments
+ speculative_group = parser.add_argument_group(
+ title="SpeculativeConfig",
+ description=SpeculativeConfig.__doc__,
+ )
+ speculative_group.add_argument(
+ "--speculative-config",
+ type=json.loads,
+ default=None,
+ help="The configurations for speculative decoding. Should be a "
+ "JSON string.")
+
+ # Observability arguments
+ observability_kwargs = get_kwargs(ObservabilityConfig)
+ observability_group = parser.add_argument_group(
+ title="ObservabilityConfig",
+ description=ObservabilityConfig.__doc__,
+ )
+ observability_group.add_argument(
+ "--show-hidden-metrics-for-version",
+ **observability_kwargs["show_hidden_metrics_for_version"])
+ observability_group.add_argument(
+ "--otlp-traces-endpoint",
+ **observability_kwargs["otlp_traces_endpoint"])
+ # TODO: generalise this special case
+ choices = observability_kwargs["collect_detailed_traces"]["choices"]
+ metavar = f"{{{','.join(choices)}}}"
+ observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
+ observability_kwargs["collect_detailed_traces"]["choices"] += [
+ ",".join(p)
+ for p in permutations(get_args(DetailedTraceModules), r=2)
+ ]
+ observability_group.add_argument(
+ "--collect-detailed-traces",
+ **observability_kwargs["collect_detailed_traces"])
+
+ # Scheduler arguments
+ scheduler_kwargs = get_kwargs(SchedulerConfig)
+ scheduler_group = parser.add_argument_group(
+ title="SchedulerConfig",
+ description=SchedulerConfig.__doc__,
+ )
+ scheduler_group.add_argument(
+ "--max-num-batched-tokens",
+ **scheduler_kwargs["max_num_batched_tokens"])
+ scheduler_group.add_argument("--max-num-seqs",
+ **scheduler_kwargs["max_num_seqs"])
+ scheduler_group.add_argument(
+ "--max-num-partial-prefills",
+ **scheduler_kwargs["max_num_partial_prefills"])
+ scheduler_group.add_argument(
+ "--max-long-partial-prefills",
+ **scheduler_kwargs["max_long_partial_prefills"])
+ scheduler_group.add_argument('--cuda-graph-sizes',
+ **scheduler_kwargs["cuda_graph_sizes"])
+ scheduler_group.add_argument(
+ "--long-prefill-token-threshold",
+ **scheduler_kwargs["long_prefill_token_threshold"])
+ scheduler_group.add_argument("--num-lookahead-slots",
+ **scheduler_kwargs["num_lookahead_slots"])
+ scheduler_group.add_argument("--scheduler-delay-factor",
+ **scheduler_kwargs["delay_factor"])
+ scheduler_group.add_argument("--preemption-mode",
+ **scheduler_kwargs["preemption_mode"])
+ scheduler_group.add_argument("--num-scheduler-steps",
+ **scheduler_kwargs["num_scheduler_steps"])
+ scheduler_group.add_argument(
+ "--multi-step-stream-outputs",
+ **scheduler_kwargs["multi_step_stream_outputs"])
+ scheduler_group.add_argument("--scheduling-policy",
+ **scheduler_kwargs["policy"])
+ scheduler_group.add_argument(
+ "--enable-chunked-prefill",
+ **scheduler_kwargs["enable_chunked_prefill"])
+ scheduler_group.add_argument(
+ "--disable-chunked-mm-input",
+ **scheduler_kwargs["disable_chunked_mm_input"])
+ scheduler_group.add_argument("--scheduler-cls",
+ **scheduler_kwargs["scheduler_cls"])
+
+ # vLLM arguments
+ vllm_kwargs = get_kwargs(VllmConfig)
+ vllm_group = parser.add_argument_group(
+ title="VllmConfig",
+ description=VllmConfig.__doc__,
+ )
+ vllm_group.add_argument("--kv-transfer-config",
+ **vllm_kwargs["kv_transfer_config"])
+ vllm_group.add_argument('--kv-events-config',
+ **vllm_kwargs["kv_events_config"])
+ vllm_group.add_argument("--compilation-config", "-O",
+ **vllm_kwargs["compilation_config"])
+ vllm_group.add_argument("--additional-config",
+ **vllm_kwargs["additional_config"])
+
+ # Other arguments
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ default=True,
+ deprecated=True,
+ help='[DEPRECATED] block manager v1 has been '
+ 'removed and SelfAttnBlockSpaceManager (i.e. '
+ 'block manager v2) is now the default. '
+ 'Setting this flag to True or False'
+ ' has no effect on vLLM behavior.')
+ parser.add_argument('--disable-log-stats',
+ action='store_true',
+ help='Disable logging statistics.')
+
+ return parser
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace):
+ # Get the list of attributes of this dataclass.
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
+ # Set the attributes from the parsed arguments.
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+ return engine_args
+
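+ # Usage sketch (hypothetical argv): round-tripping CLI flags into an
+ # EngineArgs instance via the two methods above:
+ #     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+ #     args = parser.parse_args(["--model", "facebook/opt-125m", "-tp", "2"])
+ #     engine_args = EngineArgs.from_cli_args(args)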
+ def create_model_config(self) -> ModelConfig:
+ # gguf file needs a specific model loader and doesn't use hf_repo
+ if check_gguf_file(self.model):
+ self.quantization = self.load_format = "gguf"
+
+ # NOTE: This is to allow model loading from S3 in CI
+ if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+ and self.model in MODELS_ON_S3
+ and self.load_format == LoadFormat.AUTO): # noqa: E501
+ self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+ self.load_format = LoadFormat.RUNAI_STREAMER
+
+ return ModelConfig(
+ model=self.model,
+ hf_config_path=self.hf_config_path,
+ task=self.task,
+ tokenizer=self.tokenizer,
+ tokenizer_mode=self.tokenizer_mode,
+ trust_remote_code=self.trust_remote_code,
+ allowed_local_media_path=self.allowed_local_media_path,
+ dtype=self.dtype,
+ seed=self.seed,
+ revision=self.revision,
+ code_revision=self.code_revision,
+ rope_scaling=self.rope_scaling,
+ rope_theta=self.rope_theta,
+ hf_token=self.hf_token,
+ hf_overrides=self.hf_overrides,
+ tokenizer_revision=self.tokenizer_revision,
+ max_model_len=self.max_model_len,
+ quantization=self.quantization,
+ enforce_eager=self.enforce_eager,
+ max_seq_len_to_capture=self.max_seq_len_to_capture,
+ max_logprobs=self.max_logprobs,
+ disable_sliding_window=self.disable_sliding_window,
+ disable_cascade_attn=self.disable_cascade_attn,
+ skip_tokenizer_init=self.skip_tokenizer_init,
+ enable_prompt_embeds=self.enable_prompt_embeds,
+ served_model_name=self.served_model_name,
+ limit_mm_per_prompt=self.limit_mm_per_prompt,
+ use_async_output_proc=not self.disable_async_output_proc,
+ config_format=self.config_format,
+ mm_processor_kwargs=self.mm_processor_kwargs,
+ disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+ override_neuron_config=self.override_neuron_config,
+ override_pooler_config=self.override_pooler_config,
+ logits_processor_pattern=self.logits_processor_pattern,
+ generation_config=self.generation_config,
+ override_generation_config=self.override_generation_config,
+ enable_sleep_mode=self.enable_sleep_mode,
+ model_impl=self.model_impl,
+ )
+
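+ # Usage sketch (hypothetical path): a local GGUF checkpoint makes
+ # create_model_config flip both quantization and load_format to "gguf"
+ # before ModelConfig is constructed:
+ #     args = EngineArgs(model="/models/llama-3-8b.Q4_K_M.gguf")
+ #     model_config = args.create_model_config()
+ #     # now args.quantization == args.load_format == "gguf"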
+ def create_load_config(self) -> LoadConfig:
+
+ if self.quantization == "bitsandbytes":
+ self.load_format = "bitsandbytes"
+
+ return LoadConfig(
+ load_format=self.load_format,
+ download_dir=self.download_dir,
+ model_loader_extra_config=self.model_loader_extra_config,
+ ignore_patterns=self.ignore_patterns,
+ use_tqdm_on_load=self.use_tqdm_on_load,
+ pt_load_map_location=self.pt_load_map_location,
+ )
+
+ def create_speculative_config(
+ self,
+ target_model_config: ModelConfig,
+ target_parallel_config: ParallelConfig,
+ enable_chunked_prefill: bool,
+ disable_log_stats: bool,
+ ) -> Optional["SpeculativeConfig"]:
+ """Initializes and returns a SpeculativeConfig object based on
+ `speculative_config`.
+
+ This function utilizes `speculative_config` to create a
+ SpeculativeConfig object. The `speculative_config` can either be
+ provided as a JSON string input via CLI arguments or directly as a
+ dictionary from the engine.
+ """
+ if self.speculative_config is None:
+ return None
+
+ # Note(Shangming): These parameters are not obtained from the cli arg
+ # '--speculative-config' and must be passed in when creating the engine
+ # config.
+ self.speculative_config.update({
+ "target_model_config": target_model_config,
+ "target_parallel_config": target_parallel_config,
+ "enable_chunked_prefill": enable_chunked_prefill,
+ "disable_log_stats": disable_log_stats,
+ })
+ speculative_config = SpeculativeConfig.from_dict(
+ self.speculative_config)
+
+ return speculative_config
+
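+ # Usage sketch: `--speculative-config` is parsed with json.loads, so the
+ # equivalent dict can be passed directly; the "method" key below mirrors
+ # the checks in _is_v1_supported_oracle (a real config may need more keys):
+ #     engine_args = EngineArgs(model="facebook/opt-125m",
+ #                              speculative_config={"method": "ngram"})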
+ def create_engine_config(
+ self,
+ usage_context: Optional[UsageContext] = None,
+ ) -> VllmConfig:
+ """
+ Create the VllmConfig.
+
+ NOTE: for autoselection of V0 vs V1 engine, we need to
+ create the ModelConfig first, since ModelConfig's attrs
+ (e.g. the model arch) are needed to make the decision.
+
+ This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
+ unspecified by the user.
+
+ If VLLM_USE_V1 is specified by the user but the VllmConfig
+ is incompatible, we raise an error.
+ """
+ from vllm.platforms import current_platform
+ current_platform.pre_register_and_update()
+
+ device_config = DeviceConfig(device=current_platform.device_type)
+ model_config = self.create_model_config()
+
+ # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
+ # and fall back to V0 for experimental or unsupported features.
+ # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
+ # features and raise error for unsupported features.
+ # * If VLLM_USE_V1=0, we disable V1.
+ use_v1 = False
+ try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
+ if try_v1 and self._is_v1_supported_oracle(model_config):
+ use_v1 = True
+
+ # If user explicitly set VLLM_USE_V1, sanity check we respect it.
+ if envs.is_set("VLLM_USE_V1"):
+ assert use_v1 == envs.VLLM_USE_V1
+ # Otherwise, set the VLLM_USE_V1 variable globally.
+ else:
+ envs.set_vllm_use_v1(use_v1)
+
+ # Set default arguments for V0 or V1 Engine.
+ if use_v1:
+ self._set_default_args_v1(usage_context)
+ else:
+ self._set_default_args_v0(model_config)
+
+ assert self.enable_chunked_prefill is not None
+
+ if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
+ assert self.enforce_eager, (
+ "Cuda graph is not supported with DualChunkFlashAttention. "
+ "To run the model in eager mode, set 'enforce_eager=True' "
+ "or use '--enforce-eager' in the CLI.")
+ assert current_platform.is_cuda(), (
+ "DualChunkFlashAttention is only supported on CUDA platform.")
+ assert not use_v1, (
+ "DualChunkFlashAttention is not supported on V1 engine. "
1019
+ "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")
+
+ cache_config = CacheConfig(
+ block_size=self.block_size,
+ gpu_memory_utilization=self.gpu_memory_utilization,
+ swap_space=self.swap_space,
+ cache_dtype=self.kv_cache_dtype,
+ is_attention_free=model_config.is_attention_free,
+ num_gpu_blocks_override=self.num_gpu_blocks_override,
+ sliding_window=model_config.get_sliding_window(),
+ enable_prefix_caching=self.enable_prefix_caching,
+ prefix_caching_hash_algo=self.prefix_caching_hash_algo,
+ cpu_offload_gb=self.cpu_offload_gb,
+ calculate_kv_scales=self.calculate_kv_scales,
+ )
+
+ # Get the current placement group if Ray is initialized and
+ # we are in a Ray actor. If so, then the placement group will be
+ # passed to spawned processes.
+ placement_group = None
+ if is_in_ray_actor():
+ import ray
+
+ # This call initializes Ray automatically if it is not initialized,
+ # but we should not do this here.
+ placement_group = ray.util.get_current_placement_group()
+
+ # Local DP size defaults to global DP size if not set.
+ data_parallel_size_local = self.data_parallel_size if (
+ self.data_parallel_size_local
+ is None) else self.data_parallel_size_local
+
+ # DP address, used in multi-node case for torch distributed group
+ # and ZMQ sockets.
+ data_parallel_address = self.data_parallel_address if (
+ self.data_parallel_address
+ is not None) else ParallelConfig.data_parallel_master_ip
+
+ # This port is only used when there are remote data parallel engines,
+ # otherwise the local IPC transport is used.
+ data_parallel_rpc_port = self.data_parallel_rpc_port if (
+ self.data_parallel_rpc_port
+ is not None) else ParallelConfig.data_parallel_rpc_port
+
+ parallel_config = ParallelConfig(
+ pipeline_parallel_size=self.pipeline_parallel_size,
+ tensor_parallel_size=self.tensor_parallel_size,
+ data_parallel_size=self.data_parallel_size,
+ data_parallel_size_local=data_parallel_size_local,
+ data_parallel_master_ip=data_parallel_address,
+ data_parallel_rpc_port=data_parallel_rpc_port,
+ enable_expert_parallel=self.enable_expert_parallel,
+ max_parallel_loading_workers=self.max_parallel_loading_workers,
+ disable_custom_all_reduce=self.disable_custom_all_reduce,
+ ray_workers_use_nsight=self.ray_workers_use_nsight,
+ placement_group=placement_group,
+ distributed_executor_backend=self.distributed_executor_backend,
+ worker_cls=self.worker_cls,
+ worker_extension_cls=self.worker_extension_cls,
+ )
+
+ speculative_config = self.create_speculative_config(
+ target_model_config=model_config,
+ target_parallel_config=parallel_config,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ disable_log_stats=self.disable_log_stats,
+ )
+
+ # Reminder: Please update docs/features/compatibility_matrix.md
+ # if the feature combo becomes valid.
+ if self.num_scheduler_steps > 1:
+ if speculative_config is not None:
+ raise ValueError("Speculative decoding is not supported with "
+ "multi-step (--num-scheduler-steps > 1)")
+ if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
+ raise ValueError("Multi-Step Chunked-Prefill is not supported "
+ "for pipeline-parallel-size > 1")
+ from vllm.platforms import current_platform
+ if current_platform.is_cpu():
+ logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
+ "currently not supported for CPUs and has been "
+ "disabled.")
+ self.num_scheduler_steps = 1
+
+ # Make sure num_lookahead_slots is set to the higher value, depending
+ # on whether we are using speculative decoding or multi-step.
+ num_lookahead_slots = max(self.num_lookahead_slots,
+ self.num_scheduler_steps - 1)
+ num_lookahead_slots = num_lookahead_slots \
+ if speculative_config is None \
+ else speculative_config.num_lookahead_slots
+
+ scheduler_config = SchedulerConfig(
+ runner_type=model_config.runner_type,
+ max_num_batched_tokens=self.max_num_batched_tokens,
+ max_num_seqs=self.max_num_seqs,
+ max_model_len=model_config.max_model_len,
+ cuda_graph_sizes=self.cuda_graph_sizes,
+ num_lookahead_slots=num_lookahead_slots,
+ delay_factor=self.scheduler_delay_factor,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ disable_chunked_mm_input=self.disable_chunked_mm_input,
+ is_multimodal_model=model_config.is_multimodal_model,
+ preemption_mode=self.preemption_mode,
+ num_scheduler_steps=self.num_scheduler_steps,
+ multi_step_stream_outputs=self.multi_step_stream_outputs,
+ send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
+ and parallel_config.use_ray),
+ policy=self.scheduling_policy,
+ scheduler_cls=self.scheduler_cls,
+ max_num_partial_prefills=self.max_num_partial_prefills,
+ max_long_partial_prefills=self.max_long_partial_prefills,
+ long_prefill_token_threshold=self.long_prefill_token_threshold,
+ )
+
+ lora_config = LoRAConfig(
+ bias_enabled=self.enable_lora_bias,
+ max_lora_rank=self.max_lora_rank,
+ max_loras=self.max_loras,
+ fully_sharded_loras=self.fully_sharded_loras,
+ lora_extra_vocab_size=self.lora_extra_vocab_size,
+ long_lora_scaling_factors=self.long_lora_scaling_factors,
+ lora_dtype=self.lora_dtype,
+ max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
+ and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+
+ # bitsandbytes pre-quantized models need a specific model loader
+ if model_config.quantization == "bitsandbytes":
+ self.quantization = self.load_format = "bitsandbytes"
+
+ load_config = self.create_load_config()
+
+ prompt_adapter_config = PromptAdapterConfig(
+ max_prompt_adapters=self.max_prompt_adapters,
+ max_prompt_adapter_token=self.max_prompt_adapter_token) \
+ if self.enable_prompt_adapter else None
+
+ decoding_config = DecodingConfig(
+ backend=self.guided_decoding_backend,
+ disable_fallback=self.guided_decoding_disable_fallback,
+ disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
+ disable_additional_properties=\
+ self.guided_decoding_disable_additional_properties,
+ reasoning_backend=self.reasoning_parser
+ )
+
+ observability_config = ObservabilityConfig(
+ show_hidden_metrics_for_version=self.
+ show_hidden_metrics_for_version,
+ otlp_traces_endpoint=self.otlp_traces_endpoint,
+ collect_detailed_traces=self.collect_detailed_traces,
+ )
+
+ config = VllmConfig(
+ model_config=model_config,
+ cache_config=cache_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ speculative_config=speculative_config,
+ load_config=load_config,
+ decoding_config=decoding_config,
+ observability_config=observability_config,
+ prompt_adapter_config=prompt_adapter_config,
+ compilation_config=self.compilation_config,
+ kv_transfer_config=self.kv_transfer_config,
+ kv_events_config=self.kv_events_config,
+ additional_config=self.additional_config,
+ )
+
+ return config
+
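+ # Usage sketch: the one-call path from engine arguments to a complete
+ # VllmConfig (other UsageContext values are equally valid):
+ #     engine_args = EngineArgs(model="facebook/opt-125m")
+ #     vllm_config = engine_args.create_engine_config(
+ #         usage_context=UsageContext.LLM_CLASS)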
+ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
+ """Oracle for whether to use V0 or V1 Engine by default."""
+
+ #############################################################
+ # Unsupported Feature Flags on V1.
+
+ if self.load_format == LoadFormat.SHARDED_STATE.value:
+ _raise_or_fallback(
+ feature_name=f"--load_format {self.load_format}",
+ recommend_to_remove=False)
+ return False
+
+ if (self.logits_processor_pattern
+ != EngineArgs.logits_processor_pattern):
+ _raise_or_fallback(feature_name="--logits-processor-pattern",
+ recommend_to_remove=False)
+ return False
+
+ if self.preemption_mode != SchedulerConfig.preemption_mode:
+ _raise_or_fallback(feature_name="--preemption-mode",
+ recommend_to_remove=True)
+ return False
+
+ if (self.disable_async_output_proc
+ != EngineArgs.disable_async_output_proc):
+ _raise_or_fallback(feature_name="--disable-async-output-proc",
+ recommend_to_remove=True)
+ return False
+
+ if self.scheduling_policy != SchedulerConfig.policy:
+ _raise_or_fallback(feature_name="--scheduling-policy",
+ recommend_to_remove=False)
+ return False
+
+ if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
+ _raise_or_fallback(feature_name="--num-scheduler-steps",
+ recommend_to_remove=True)
+ return False
+
+ if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
+ _raise_or_fallback(feature_name="--scheduler-delay-factor",
+ recommend_to_remove=True)
+ return False
+
+ if self.guided_decoding_backend not in get_args(
+ GuidedDecodingBackendV1):
+ _raise_or_fallback(
+ feature_name=
+ f"--guided-decoding-backend={self.guided_decoding_backend}",
+ recommend_to_remove=False)
+ return False
+
+ # Need at least Ampere for now (FA support required).
+ # Skip this check if we are running on a non-GPU platform,
+ # or if the device capability is not available
+ # (e.g. in a Ray actor without GPUs).
+ from vllm.platforms import current_platform
+ if (current_platform.is_cuda()
+ and current_platform.get_device_capability()
+ and current_platform.get_device_capability().major < 8):
+ _raise_or_fallback(feature_name="Compute Capability < 8.0",
+ recommend_to_remove=False)
+ return False
+
+ # No Fp8 KV cache so far.
+ if self.kv_cache_dtype != "auto":
+ fp8_attention = self.kv_cache_dtype.startswith("fp8")
+ will_use_fa = (
+ current_platform.is_cuda()
+ and not envs.is_set("VLLM_ATTENTION_BACKEND")
+ ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
+ supported = False
+ if current_platform.is_rocm():
+ supported = True
+ elif fp8_attention and will_use_fa:
+ from vllm.attention.utils.fa_utils import (
+ flash_attn_supports_fp8)
+ supported = flash_attn_supports_fp8()
+ if not supported:
+ _raise_or_fallback(feature_name="--kv-cache-dtype",
+ recommend_to_remove=False)
+ return False
+
+ # No Prompt Adapter so far.
+ if self.enable_prompt_adapter:
+ _raise_or_fallback(feature_name="--enable-prompt-adapter",
+ recommend_to_remove=False)
+ return False
+
+ # No text embedding inputs so far.
+ if self.enable_prompt_embeds:
+ _raise_or_fallback(feature_name="--enable-prompt-embeds",
+ recommend_to_remove=False)
+ return False
+
+ # Only Fp16 and Bf16 dtypes since we only support FA.
+ V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+ if model_config.dtype not in V1_SUPPORTED_DTYPES:
+ _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+ recommend_to_remove=False)
+ return False
+
+ # No Embedding Models so far.
+ if model_config.task not in ["generate"]:
+ _raise_or_fallback(feature_name=f"--task {model_config.task}",
+ recommend_to_remove=False)
+ return False
+
+ # No Mamba or Encoder-Decoder so far.
+ if not model_config.is_v1_compatible:
+ _raise_or_fallback(feature_name=model_config.architectures,
+ recommend_to_remove=False)
+ return False
+
+ # No Concurrent Partial Prefills so far.
+ if (self.max_num_partial_prefills
+ != SchedulerConfig.max_num_partial_prefills
+ or self.max_long_partial_prefills
+ != SchedulerConfig.max_long_partial_prefills):
+ _raise_or_fallback(feature_name="Concurrent Partial Prefill",
+ recommend_to_remove=False)
+ return False
+
+ # No OTLP observability so far.
+ if (self.otlp_traces_endpoint or self.collect_detailed_traces):
+ _raise_or_fallback(feature_name="--otlp-traces-endpoint",
+ recommend_to_remove=False)
+ return False
+
+ # V1 supports N-gram, Medusa, and Eagle speculative decoding.
+ is_ngram_enabled = False
+ is_eagle_enabled = False
+ is_medusa_enabled = False
+ if self.speculative_config is not None:
+ # This is supported but experimental (handled below).
+ speculative_method = self.speculative_config.get("method")
+ if speculative_method:
+ if speculative_method in ("ngram", "[ngram]"):
+ is_ngram_enabled = True
+ elif speculative_method == "medusa":
+ is_medusa_enabled = True
+ elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"):
+ is_eagle_enabled = True
+ else:
+ speculative_model = self.speculative_config.get("model")
+ if speculative_model in ("ngram", "[ngram]"):
+ is_ngram_enabled = True
+ if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled):
+ # Other speculative decoding methods are not supported yet.
+ _raise_or_fallback(feature_name="Speculative Decoding",
+ recommend_to_remove=False)
+ return False
+
+ # No XFormers so far.
+ V1_BACKENDS = [
+ "FLASH_ATTN_VLLM_V1",
+ "FLASH_ATTN",
+ "PALLAS",
+ "PALLAS_VLLM_V1",
+ "TRITON_ATTN_VLLM_V1",
+ "TRITON_MLA",
+ "FLASHMLA",
+ "FLASHINFER",
+ "FLASHINFER_VLLM_V1",
+ "ROCM_AITER_MLA",
+ ]
+ if (envs.is_set("VLLM_ATTENTION_BACKEND")
+ and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
+ name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
+ _raise_or_fallback(feature_name=name, recommend_to_remove=True)
+ return False
+
+ # Platforms must decide if they can support v1 for this model
+ if not current_platform.supports_v1(model_config=model_config):
+ _raise_or_fallback(
+ feature_name=f"device type={current_platform.device_type}",
+ recommend_to_remove=False)
+ return False
+ #############################################################
+ # Experimental Features - allow users to opt in.
+
+ # Signal handlers require running in the main thread.
+ if (threading.current_thread() != threading.main_thread()
+ and _warn_or_fallback("Engine in background thread")):
+ return False
+
+ if (self.pipeline_parallel_size > 1
+ and self.distributed_executor_backend
+ not in ("ray", "mp", "external_launcher")):
+ name = "Pipeline Parallelism without Ray distributed executor " \
+ "or multiprocessing executor or external launcher"
+ _raise_or_fallback(feature_name=name, recommend_to_remove=False)
+ return False
+
+ # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
+ v0_hardware = not any(
+ (current_platform.is_cuda(), current_platform.is_tpu()))
+ if v0_hardware and _warn_or_fallback( # noqa: SIM103
+ current_platform.device_name):
+ return False
+ #############################################################
+
+ return True
+
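+ # Behavior summary of the oracle above (derived from its comments):
+ #   VLLM_USE_V1 unset -> oracle True selects V1, otherwise fall back to V0
+ #   VLLM_USE_V1=1     -> unsupported features raise NotImplementedError
+ #   VLLM_USE_V1=0     -> V1 is never used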
+ def _set_default_args_v0(self, model_config: ModelConfig) -> None:
+ """Set Default Arguments for V0 Engine."""
+
+ max_model_len = model_config.max_model_len
+ use_long_context = max_model_len > 32768
+ if self.enable_chunked_prefill is None:
+ # Chunked prefill not supported for Multimodal or MLA in V0.
+ if model_config.is_multimodal_model or model_config.use_mla:
+ self.enable_chunked_prefill = False
+
+ # Enable chunked prefill by default for long context (> 32K)
+ # models to avoid OOM errors in initial memory profiling phase.
+ elif use_long_context:
+ from vllm.platforms import current_platform
+ is_gpu = current_platform.is_cuda()
+ use_sliding_window = (model_config.get_sliding_window()
+ is not None)
+ use_spec_decode = self.speculative_config is not None
+
+ if (is_gpu and not use_sliding_window and not use_spec_decode
+ and not self.enable_lora
+ and not self.enable_prompt_adapter
+ and model_config.runner_type != "pooling"):
+ self.enable_chunked_prefill = True
+ logger.warning(
+ "Chunked prefill is enabled by default for models "
+ "with max_model_len > 32K. Chunked prefill might "
+ "not work with some features or models. If you "
+ "encounter any issues, please disable by launching "
+ "with --enable-chunked-prefill=False.")
+
+ if self.enable_chunked_prefill is None:
+ self.enable_chunked_prefill = False
+
+ if not self.enable_chunked_prefill and use_long_context:
+ logger.warning(
+ "The model has a long context length (%s). This may cause"
+ "OOM during the initial memory profiling phase, or result "
+ "in low performance due to small KV cache size. Consider "
+ "setting --max-model-len to a smaller value.", max_model_len)
+ elif (self.enable_chunked_prefill
+ and model_config.runner_type == "pooling"):
+ msg = "Chunked prefill is not supported for pooling models"
+ raise ValueError(msg)
+
+ # if using prefix caching, we must set a hash algo
+ if self.enable_prefix_caching:
+ # Disable prefix caching for multimodal models for VLLM_V0.
+ if model_config.is_multimodal_model:
+ logger.warning(
+ "--enable-prefix-caching is not supported for multimodal "
+ "models in V0 and has been disabled.")
+ self.enable_prefix_caching = False
+
+ # VLLM_V0 only supports builtin hash algo for prefix caching.
+ if self.prefix_caching_hash_algo == "sha256":
+ raise ValueError(
+ "sha256 is not supported for prefix caching in V0 engine. "
+ "Please use 'builtin'.")
+
+ # Set max_num_seqs to 256 for VLLM_V0.
+ if self.max_num_seqs is None:
+ self.max_num_seqs = 256
+
+ def _set_default_args_v1(self, usage_context: UsageContext) -> None:
+ """Set Default Arguments for V1 Engine."""
+
+ # V1 always uses chunked prefills.
+ self.enable_chunked_prefill = True
+
+ # V1 enables prefix caching by default.
+ if self.enable_prefix_caching is None:
+ self.enable_prefix_caching = True
+
+ # V1 should use the new scheduler by default.
+ # Swap it only if this arg is set to the original V0 default
+ if self.scheduler_cls == EngineArgs.scheduler_cls:
+ self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"
+
+ # When no user override, set the default values based on the usage
+ # context.
+ # Use different default values for different hardware.
+
+ # Try to query the device name on the current platform. If it fails,
+ # it may be because the platform that imports vLLM is not the same
+ # as the platform that vLLM is running on (e.g. the case of scaling
+ # vLLM with Ray) and has no GPUs. In this case we use the default
+ # values for non-H100/H200 GPUs.
+ from vllm.platforms import current_platform
+ try:
+ device_memory = current_platform.get_device_total_memory()
+ device_name = current_platform.get_device_name().lower()
+ except Exception:
+ # This is only used to set default_max_num_batched_tokens
+ device_memory = 0
+
+ # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
+ # throughput, see PR #17885 for more details.
+ # So here we do an extra device name check to prevent such regression.
+ if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
+ # For GPUs like H100 and MI300x, use larger default values.
+ default_max_num_batched_tokens = {
+ UsageContext.LLM_CLASS: 16384,
+ UsageContext.OPENAI_API_SERVER: 8192,
+ }
+ default_max_num_seqs = 1024
+ else:
+ # TODO(woosuk): Tune the default values for other hardware.
+ default_max_num_batched_tokens = {
+ UsageContext.LLM_CLASS: 8192,
+ UsageContext.OPENAI_API_SERVER: 2048,
+ }
+ default_max_num_seqs = 256
+
+ # tpu specific default values.
+ if current_platform.is_tpu():
+ default_max_num_batched_tokens_tpu = {
+ UsageContext.LLM_CLASS: {
+ 'V6E': 2048,
+ 'V5E': 1024,
+ 'V5P': 512,
+ },
+ UsageContext.OPENAI_API_SERVER: {
+ 'V6E': 1024,
+ 'V5E': 512,
+ 'V5P': 256,
+ }
+ }
+
+ use_context_value = usage_context.value if usage_context else None
+ if (self.max_num_batched_tokens is None
+ and usage_context in default_max_num_batched_tokens):
+ if current_platform.is_tpu():
+ chip_name = current_platform.get_device_name()
+ if chip_name in default_max_num_batched_tokens_tpu[
+ usage_context]:
+ self.max_num_batched_tokens = \
+ default_max_num_batched_tokens_tpu[
+ usage_context][chip_name]
+ else:
+ self.max_num_batched_tokens = \
+ default_max_num_batched_tokens[usage_context]
+ else:
+ self.max_num_batched_tokens = default_max_num_batched_tokens[
+ usage_context]
+ logger.debug(
+ "Setting max_num_batched_tokens to %d for %s usage context.",
+ self.max_num_batched_tokens, use_context_value)
+
+ if self.max_num_seqs is None:
+ self.max_num_seqs = default_max_num_seqs
+
+ logger.debug("Setting max_num_seqs to %d for %s usage context.",
+ self.max_num_seqs, use_context_value)
+
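+ # Defaults summary for _set_default_args_v1 (read off the branches above):
+ # GPUs with >= 70 GiB memory that are not A100s get max_num_batched_tokens
+ # 16384 (LLM) / 8192 (API server) and max_num_seqs 1024; everything else
+ # gets 8192 / 2048 and 256, with per-chip TPU overrides.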
+
+ @dataclass
+ class AsyncEngineArgs(EngineArgs):
+ """Arguments for asynchronous vLLM engine."""
+ disable_log_requests: bool = False
+
+ @staticmethod
+ def add_cli_args(parser: FlexibleArgumentParser,
+ async_args_only: bool = False) -> FlexibleArgumentParser:
+ # Initialize plugins to update the parser; for example, a plugin may
+ # add a new kind of quantization method to the --quantization argument
+ # or a new device to the --device argument.
+ load_general_plugins()
+ if not async_args_only:
+ parser = EngineArgs.add_cli_args(parser)
+ parser.add_argument('--disable-log-requests',
+ action='store_true',
+ help='Disable logging requests.')
+ from vllm.platforms import current_platform
+ current_platform.pre_register_and_update(parser)
+ return parser
+
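+ # Usage sketch (hypothetical argv): building the async parser, which adds
+ # --disable-log-requests on top of the shared EngineArgs flags:
+ #     parser = AsyncEngineArgs.add_cli_args(FlexibleArgumentParser())
+ #     args = parser.parse_args(["--model", "facebook/opt-125m",
+ #                               "--disable-log-requests"])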
+
+ def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
+ if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+ raise NotImplementedError(
+ f"VLLM_USE_V1=1 is not supported with {feature_name}.")
+ msg = f"{feature_name} is not supported by the V1 Engine. "
+ msg += "Falling back to V0. "
+ if recommend_to_remove:
+ msg += f"We recommend to remove {feature_name} from your config "
+ msg += "in favor of the V1 Engine."
+ logger.warning(msg)
+
+
+ def _warn_or_fallback(feature_name: str) -> bool:
+ if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+ logger.warning(
+ "Detected VLLM_USE_V1=1 with %s. Usage should "
+ "be considered experimental. Please report any "
+ "issues on Github.", feature_name)
+ should_exit = False
+ else:
+ logger.info(
+ "%s is experimental on VLLM_USE_V1=1. "
+ "Falling back to V0 Engine.", feature_name)
+ should_exit = True
+ return should_exit
+
+
+ def human_readable_int(value):
+ """Parse human-readable integers like '1k', '2M', etc.
1603
+ Including decimal values with decimal multipliers.
+
+ Examples:
+ - '1k' -> 1,000
+ - '1K' -> 1,024
+ - '25.6k' -> 25,600
+ """
+ value = value.strip()
+ match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
+ if match:
+ decimal_multiplier = {
+ 'k': 10**3,
+ 'm': 10**6,
+ 'g': 10**9,
+ # 't' is accepted by the regex above, so map it as well.
+ 't': 10**12,
+ }
+ binary_multiplier = {
+ 'K': 2**10,
+ 'M': 2**20,
+ 'G': 2**30,
+ # Likewise for the binary 'T' suffix.
+ 'T': 2**40,
+ }
+
+ number, suffix = match.groups()
+ if suffix in decimal_multiplier:
+ mult = decimal_multiplier[suffix]
+ return int(float(number) * mult)
+ elif suffix in binary_multiplier:
+ mult = binary_multiplier[suffix]
+ # Do not allow decimals with binary multipliers
+ try:
+ return int(number) * mult
+ except ValueError as e:
+ raise argparse.ArgumentTypeError("Decimals are not allowed " \
+ f"with binary suffixes like {suffix}. Did you mean to use " \
+ f"{number}{suffix.lower()} instead?") from e
+
+ # Regular plain number.
+ return int(value)
+
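+ # Worked examples for human_readable_int (decimal vs. binary suffixes):
+ #     human_readable_int("1k")    -> 1000
+ #     human_readable_int("1K")    -> 1024
+ #     human_readable_int("25.6k") -> 25600
+ #     human_readable_int("2M")    -> 2097152
+ #     human_readable_int("2.5M")  -> argparse.ArgumentTypeError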
+
+ # These functions are used by sphinx to build the documentation
+ def _engine_args_parser():
+ return EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+
+ def _async_engine_args_parser():
+ return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
+ async_args_only=True)