vllm-cpu-amxbf16 0.9.1 (vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1708 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # yapf: disable
+ import argparse
+ import dataclasses
+ import json
+ import sys
+ import threading
+ import warnings
+ from dataclasses import MISSING, dataclass, fields, is_dataclass
+ from itertools import permutations
+ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
+                     Type, TypeVar, Union, cast, get_args, get_origin)
+
+ import regex as re
+ import torch
+ from pydantic import TypeAdapter, ValidationError
+ from typing_extensions import TypeIs, deprecated
+
+ import vllm.envs as envs
+ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
+                          ConfigFormat, ConfigType, DecodingConfig,
+                          DetailedTraceModules, Device, DeviceConfig,
+                          DistributedExecutorBackend, GuidedDecodingBackend,
+                          GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
+                          KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
+                          ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
+                          ObservabilityConfig, ParallelConfig, PoolerConfig,
+                          PrefixCachingHashAlgo, PromptAdapterConfig,
+                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
+                          TaskOption, TokenizerMode, TokenizerPoolConfig,
+                          VllmConfig, get_attr_docs, get_field)
+ from vllm.executor.executor_base import ExecutorBase
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import QuantizationMethods
+ from vllm.plugins import load_general_plugins
+ from vllm.reasoning import ReasoningParserManager
+ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
+ from vllm.transformers_utils.utils import check_gguf_file
+ from vllm.usage.usage_lib import UsageContext
+ from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
+                         GiB_bytes, get_ip, is_in_ray_actor)
+
+ # yapf: enable
+
+ logger = init_logger(__name__)
+
+ # object is used to allow for special typing forms
+ T = TypeVar("T")
+ TypeHint = Union[type[Any], object]
+ TypeHintT = Union[type[T], object]
+
+
+ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
+
+     def _parse_type(val: str) -> T:
+         try:
+             if return_type is json.loads and not re.match("^{.*}$", val):
+                 return cast(T, nullable_kvs(val))
+             return return_type(val)
+         except ValueError as e:
+             raise argparse.ArgumentTypeError(
+                 f"Value {val} cannot be converted to {return_type}.") from e
+
+     return _parse_type
+
+
+ def optional_type(
+         return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
+
+     def _optional_type(val: str) -> Optional[T]:
+         if val == "" or val == "None":
+             return None
+         return parse_type(return_type)(val)
+
+     return _optional_type
+
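For illustration, a standalone sketch of the converter pattern these two helpers implement. `optional_int` below is a hypothetical stand-in for `optional_type(int)`; only the null-handling behaviour is reproduced:

    import argparse

    def optional_int(val: str):
        # Mirrors optional_type(int): "" or "None" parse to None.
        return None if val in ("", "None") else int(val)

    parser = argparse.ArgumentParser()
    parser.add_argument("--max-model-len", type=optional_int, default=None)
    args = parser.parse_args(["--max-model-len", "None"])
    assert args.max_model_len is None
    args = parser.parse_args(["--max-model-len", "4096"])
    assert args.max_model_len == 4096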
+
+ def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
+     if not re.match("^{.*}$", val):
+         return str(val)
+     return optional_type(json.loads)(val)
+
+
+ @deprecated(
+     "Passing a JSON argument as a string containing comma separated key=value "
+     "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON "
+     "string instead.")
+ def nullable_kvs(val: str) -> dict[str, int]:
+     """Parses a string containing comma separated key [str] to value [int]
+     pairs into a dictionary.
+
+     Args:
+         val: String value to be parsed.
+
+     Returns:
+         Dictionary with parsed values.
+     """
+     out_dict: dict[str, int] = {}
+     for item in val.split(","):
+         kv_parts = [part.lower().strip() for part in item.split("=")]
+         if len(kv_parts) != 2:
+             raise argparse.ArgumentTypeError(
+                 "Each item should be in the form KEY=VALUE")
+         key, value = kv_parts
+
+         try:
+             parsed_value = int(value)
+         except ValueError as exc:
+             msg = f"Failed to parse value of item {key}={value}"
+             raise argparse.ArgumentTypeError(msg) from exc
+
+         if key in out_dict and out_dict[key] != parsed_value:
+             raise argparse.ArgumentTypeError(
+                 f"Conflicting values specified for key: {key}")
+         out_dict[key] = parsed_value
+
+     return out_dict
+
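For illustration, a standalone sketch of the deprecated "KEY=VALUE,KEY=VALUE" format that nullable_kvs accepts. `kvs` is a hypothetical simplification; the real function also rejects malformed items and conflicting duplicate keys:

    def kvs(val: str) -> dict[str, int]:
        # "image=16,video=2" -> {"image": 16, "video": 2}
        return {k.strip().lower(): int(v)
                for k, v in (item.split("=") for item in val.split(","))}

    assert kvs("image=16,video=2") == {"image": 16, "video": 2}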
+
+ def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
+     """Check if the type hint is a specific type."""
+     return type_hint is type or get_origin(type_hint) is type
+
+
+ def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool:
+     """Check if the type hints contain a specific type."""
+     return any(is_type(type_hint, type) for type_hint in type_hints)
+
+
+ def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT:
+     """Get the specific type from the type hints."""
+     return next((th for th in type_hints if is_type(th, type)), None)
+
+
+ def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]:
+     """Convert Literal type hints to argparse kwargs."""
+     type_hint = get_type(type_hints, Literal)
+     choices = get_args(type_hint)
+     choice_type = type(choices[0])
+     if not all(isinstance(choice, choice_type) for choice in choices):
+         raise ValueError(
+             "All choices must be of the same type. "
+             f"Got {choices} with types {[type(c) for c in choices]}")
+     return {"type": choice_type, "choices": sorted(choices)}
+
+
+ def is_not_builtin(type_hint: TypeHint) -> bool:
+     """Check if the class is not a built-in type."""
+     return type_hint.__module__ != "builtins"
+
+
+ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
+     """Extract type hints from Annotated or Union type hints."""
+     type_hints: set[TypeHint] = set()
+     origin = get_origin(type_hint)
+     args = get_args(type_hint)
+
+     if origin is Annotated:
+         type_hints.update(get_type_hints(args[0]))
+     elif origin is Union:
+         for arg in args:
+             type_hints.update(get_type_hints(arg))
+     else:
+         type_hints.add(type_hint)
+
+     return type_hints
+
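For illustration, what get_type_hints above computes for a typical config annotation: Optional[X] is Union[X, None], so the hint flattens into a set containing the Literal and NoneType; downstream, the Literal becomes argparse choices and NoneType makes the argument nullable. A standalone check of the underlying typing behaviour:

    from typing import Literal, Optional, Union, get_args, get_origin

    hint = Optional[Literal["auto", "slow"]]
    # Optional[...] is sugar for Union[..., None]:
    assert get_origin(hint) is Union
    assert get_args(hint) == (Literal["auto", "slow"], type(None))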
+
+ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
+     cls_docs = get_attr_docs(cls)
+     kwargs = {}
+     for field in fields(cls):
+         # Get the set of possible types for the field
+         type_hints: set[TypeHint] = get_type_hints(field.type)
+
+         # If the field is a dataclass, we can use the model_validate_json
+         generator = (th for th in type_hints if is_dataclass(th))
+         dataclass_cls = next(generator, None)
+
+         # Get the default value of the field
+         if field.default is not MISSING:
+             default = field.default
+         elif field.default_factory is not MISSING:
+             default = field.default_factory()
+
+         # Get the help text for the field
+         name = field.name
+         help = cls_docs[name].strip()
+         # Escape % for argparse
+         help = help.replace("%", "%%")
+
+         # Initialise the kwargs dictionary for the field
+         kwargs[name] = {"default": default, "help": help}
+
+         # Set other kwargs based on the type hints
+         json_tip = """\n\nShould either be a valid JSON string or JSON keys
+         passed individually. For example, the following sets of arguments are
+         equivalent:\n\n
+         - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
+         - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n"""
+         if dataclass_cls is not None:
+
+             def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
+                 try:
+                     if hasattr(cls, "from_cli"):
+                         return cls.from_cli(val)
+                     return TypeAdapter(cls).validate_json(val)
+                 except ValidationError as e:
+                     raise argparse.ArgumentTypeError(repr(e)) from e
+
+             kwargs[name]["type"] = parse_dataclass
+             kwargs[name]["help"] += json_tip
+         elif contains_type(type_hints, bool):
+             # Creates --no-<name> and --<name> flags
+             kwargs[name]["action"] = argparse.BooleanOptionalAction
+         elif contains_type(type_hints, Literal):
+             kwargs[name].update(literal_to_kwargs(type_hints))
+         elif contains_type(type_hints, tuple):
+             type_hint = get_type(type_hints, tuple)
+             types = get_args(type_hint)
+             tuple_type = types[0]
+             assert all(t is tuple_type for t in types if t is not Ellipsis), (
+                 "All non-Ellipsis tuple elements must be of the same "
+                 f"type. Got {types}.")
+             kwargs[name]["type"] = tuple_type
+             kwargs[name]["nargs"] = "+" if Ellipsis in types else len(types)
+         elif contains_type(type_hints, list):
+             type_hint = get_type(type_hints, list)
+             types = get_args(type_hint)
+             assert len(types) == 1, (
+                 "List type must have exactly one type. Got "
+                 f"{type_hint} with types {types}")
+             kwargs[name]["type"] = types[0]
+             kwargs[name]["nargs"] = "+"
+         elif contains_type(type_hints, int):
+             kwargs[name]["type"] = int
+             # Special case for large integers
+             if name in {"max_model_len", "max_num_batched_tokens"}:
+                 kwargs[name]["type"] = human_readable_int
+         elif contains_type(type_hints, float):
+             kwargs[name]["type"] = float
+         elif (contains_type(type_hints, dict)
+               and (contains_type(type_hints, str)
+                    or any(is_not_builtin(th) for th in type_hints))):
+             kwargs[name]["type"] = union_dict_and_str
+         elif contains_type(type_hints, dict):
+             kwargs[name]["type"] = parse_type(json.loads)
+             kwargs[name]["help"] += json_tip
+         elif (contains_type(type_hints, str)
+               or any(is_not_builtin(th) for th in type_hints)):
+             kwargs[name]["type"] = str
+         else:
+             raise ValueError(
+                 f"Unsupported type {type_hints} for argument {name}.")
+
+         # If the type hint was a sequence of literals, use the helper function
+         # to update the type and choices
+         if get_origin(kwargs[name].get("type")) is Literal:
+             kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]}))
+
+         # If None is in type_hints, make the argument optional.
+         # But not if it's a bool, argparse will handle this better.
+         if type(None) in type_hints and not contains_type(type_hints, bool):
+             kwargs[name]["type"] = optional_type(kwargs[name]["type"])
+             if kwargs[name].get("choices"):
+                 kwargs[name]["choices"].append("None")
+     return kwargs
+
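For illustration, a standalone, heavily simplified sketch of the dataclass-to-argparse pattern that get_kwargs implements. `ToyConfig` is hypothetical; the real function also derives help text from attribute docstrings and handles Literal, dict, tuple, and nested-dataclass fields:

    import argparse
    from dataclasses import dataclass, fields

    @dataclass
    class ToyConfig:
        block_size: int = 16
        enable_prefix_caching: bool = False

    parser = argparse.ArgumentParser()
    for f in fields(ToyConfig):
        flag = f"--{f.name.replace('_', '-')}"
        if f.type is bool:
            # bool fields become paired --flag / --no-flag options
            parser.add_argument(flag, action=argparse.BooleanOptionalAction,
                                default=f.default)
        else:
            parser.add_argument(flag, type=f.type, default=f.default)

    args = parser.parse_args(["--block-size", "32",
                              "--no-enable-prefix-caching"])
    assert args.block_size == 32 and args.enable_prefix_caching is False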
271
+
272
+ @dataclass
273
+ class EngineArgs:
274
+ """Arguments for vLLM engine."""
275
+ model: str = ModelConfig.model
276
+ served_model_name: Optional[Union[
277
+ str, List[str]]] = ModelConfig.served_model_name
278
+ tokenizer: Optional[str] = ModelConfig.tokenizer
279
+ hf_config_path: Optional[str] = ModelConfig.hf_config_path
280
+ task: TaskOption = ModelConfig.task
281
+ skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
282
+ enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
283
+ tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
284
+ trust_remote_code: bool = ModelConfig.trust_remote_code
285
+ allowed_local_media_path: str = ModelConfig.allowed_local_media_path
286
+ download_dir: Optional[str] = LoadConfig.download_dir
287
+ load_format: str = LoadConfig.load_format
288
+ config_format: str = ModelConfig.config_format
289
+ dtype: ModelDType = ModelConfig.dtype
290
+ kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
291
+ seed: Optional[int] = ModelConfig.seed
292
+ max_model_len: Optional[int] = ModelConfig.max_model_len
293
+ cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
294
+ "cuda_graph_sizes")
295
+ # Note: Specifying a custom executor backend by passing a class
296
+ # is intended for expert use only. The API may change without
297
+ # notice.
298
+ distributed_executor_backend: Optional[Union[
299
+ DistributedExecutorBackend,
300
+ Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
301
+ # number of P/D disaggregation (or other disaggregation) workers
302
+ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
303
+ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
304
+ data_parallel_size: int = ParallelConfig.data_parallel_size
305
+ data_parallel_size_local: Optional[int] = None
306
+ data_parallel_address: Optional[str] = None
307
+ data_parallel_rpc_port: Optional[int] = None
308
+ data_parallel_backend: str = ParallelConfig.data_parallel_backend
309
+ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
310
+ max_parallel_loading_workers: Optional[
311
+ int] = ParallelConfig.max_parallel_loading_workers
312
+ block_size: Optional[BlockSize] = CacheConfig.block_size
313
+ enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
314
+ prefix_caching_hash_algo: PrefixCachingHashAlgo = \
315
+ CacheConfig.prefix_caching_hash_algo
316
+ disable_sliding_window: bool = ModelConfig.disable_sliding_window
317
+    disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
+    use_v2_block_manager: bool = True
+    swap_space: float = CacheConfig.swap_space
+    cpu_offload_gb: float = CacheConfig.cpu_offload_gb
+    gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+    max_num_batched_tokens: Optional[
+        int] = SchedulerConfig.max_num_batched_tokens
+    max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
+    max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
+    long_prefill_token_threshold: int = \
+        SchedulerConfig.long_prefill_token_threshold
+    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
+    max_logprobs: int = ModelConfig.max_logprobs
+    disable_log_stats: bool = False
+    revision: Optional[str] = ModelConfig.revision
+    code_revision: Optional[str] = ModelConfig.code_revision
+    rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
+    rope_theta: Optional[float] = ModelConfig.rope_theta
+    hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
+    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
+    tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
+    quantization: Optional[QuantizationMethods] = ModelConfig.quantization
+    enforce_eager: bool = ModelConfig.enforce_eager
+    max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
+    disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
+    # The following three fields are deprecated and will be removed in a future
+    # release. Setting them will have no effect. Please remove them from your
+    # configurations.
+    tokenizer_pool_size: int = TokenizerPoolConfig.pool_size
+    tokenizer_pool_type: str = TokenizerPoolConfig.pool_type
+    tokenizer_pool_extra_config: dict = \
+        get_field(TokenizerPoolConfig, "extra_config")
+    limit_mm_per_prompt: dict[str, int] = \
+        get_field(MultiModalConfig, "limit_per_prompt")
+    mm_processor_kwargs: Optional[Dict[str, Any]] = \
+        MultiModalConfig.mm_processor_kwargs
+    disable_mm_preprocessor_cache: bool = \
+        MultiModalConfig.disable_mm_preprocessor_cache
+    # LoRA fields
+    enable_lora: bool = False
+    enable_lora_bias: bool = LoRAConfig.bias_enabled
+    max_loras: int = LoRAConfig.max_loras
+    max_lora_rank: int = LoRAConfig.max_lora_rank
+    fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
+    max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
+    lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
+    lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
+    long_lora_scaling_factors: Optional[tuple[float, ...]] = \
+        LoRAConfig.long_lora_scaling_factors
+    # PromptAdapter fields
+    enable_prompt_adapter: bool = False
+    max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters
+    max_prompt_adapter_token: int = \
+        PromptAdapterConfig.max_prompt_adapter_token
+
+    device: Device = DeviceConfig.device
+    num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
+    multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
+    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
+    num_gpu_blocks_override: Optional[
+        int] = CacheConfig.num_gpu_blocks_override
+    num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
+    model_loader_extra_config: dict = \
+        get_field(LoadConfig, "model_loader_extra_config")
+    ignore_patterns: Optional[Union[str,
+                                    List[str]]] = LoadConfig.ignore_patterns
+    preemption_mode: Optional[str] = SchedulerConfig.preemption_mode
+
+    scheduler_delay_factor: float = SchedulerConfig.delay_factor
+    enable_chunked_prefill: Optional[
+        bool] = SchedulerConfig.enable_chunked_prefill
+    disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
+
+    disable_hybrid_kv_cache_manager: bool = (
+        SchedulerConfig.disable_hybrid_kv_cache_manager)
+
+    guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
+    guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
+    guided_decoding_disable_any_whitespace: bool = \
+        DecodingConfig.disable_any_whitespace
+    guided_decoding_disable_additional_properties: bool = \
+        DecodingConfig.disable_additional_properties
+    logits_processor_pattern: Optional[
+        str] = ModelConfig.logits_processor_pattern
+
+    speculative_config: Optional[Dict[str, Any]] = None
+
+    qlora_adapter_name_or_path: Optional[str] = None
+    show_hidden_metrics_for_version: Optional[str] = \
+        ObservabilityConfig.show_hidden_metrics_for_version
+    otlp_traces_endpoint: Optional[str] = \
+        ObservabilityConfig.otlp_traces_endpoint
+    collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
+        ObservabilityConfig.collect_detailed_traces
+    disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
+    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
+    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
+
+    override_neuron_config: dict[str, Any] = \
+        get_field(ModelConfig, "override_neuron_config")
+    override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
+        ModelConfig.override_pooler_config
+    compilation_config: CompilationConfig = \
+        get_field(VllmConfig, "compilation_config")
+    worker_cls: str = ParallelConfig.worker_cls
+    worker_extension_cls: str = ParallelConfig.worker_extension_cls
+
+    kv_transfer_config: Optional[KVTransferConfig] = None
+    kv_events_config: Optional[KVEventsConfig] = None
+
+    generation_config: str = ModelConfig.generation_config
+    enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
+    override_generation_config: dict[str, Any] = \
+        get_field(ModelConfig, "override_generation_config")
+    model_impl: str = ModelConfig.model_impl
+
+    calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
+
+    additional_config: dict[str, Any] = \
+        get_field(VllmConfig, "additional_config")
+    enable_reasoning: Optional[bool] = None  # DEPRECATED
+    reasoning_parser: str = DecodingConfig.reasoning_backend
+
+    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
+    pt_load_map_location: str = LoadConfig.pt_load_map_location
+
+    enable_multimodal_encoder_data_parallel: bool = \
+        ParallelConfig.enable_multimodal_encoder_data_parallel
+
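The defaults above deliberately mirror the config dataclasses, and mutable defaults are routed through get_field so each EngineArgs instance gets its own copy. A minimal, self-contained sketch of that pattern (the _Demo* names and _get_field helper are hypothetical, not vLLM APIs):

import dataclasses
from dataclasses import MISSING, dataclass, field, fields

@dataclass
class _DemoConfig:
    level: int = 0
    overrides: dict = field(default_factory=dict)  # mutable default

def _get_field(cls, name: str) -> dataclasses.Field:
    # Re-expose a config field so its default_factory (rather than a
    # shared dict instance) becomes the default of the mirroring argument.
    f = next(f for f in fields(cls) if f.name == name)
    if f.default_factory is not MISSING:
        return field(default_factory=f.default_factory)
    return field(default=f.default)

@dataclass
class _DemoArgs:
    level: int = _DemoConfig.level                          # plain default
    overrides: dict = _get_field(_DemoConfig, "overrides")  # mutable default

# Each instance gets a fresh dict, not a shared one.
assert _DemoArgs().overrides is not _DemoArgs().overrides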
+    def __post_init__(self):
+        # support `EngineArgs(compilation_config={...})`
+        # without having to manually construct a
+        # CompilationConfig object
+        if isinstance(self.compilation_config, (int, dict)):
+            self.compilation_config = CompilationConfig.from_cli(
+                str(self.compilation_config))
+        if self.qlora_adapter_name_or_path is not None:
+            warnings.warn(
+                "The `qlora_adapter_name_or_path` is deprecated "
+                "and will be removed in v0.10.0. ",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        # Setup plugins
+        from vllm.plugins import load_general_plugins
+        load_general_plugins()
+
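A sketch of what this normalization buys callers (the model name is illustrative; an `-O3`-style integer level is what CompilationConfig.from_cli is handed here):

from vllm.config import CompilationConfig
from vllm.engine.arg_utils import EngineArgs

# An int (or dict) is accepted and converted in __post_init__ via
# CompilationConfig.from_cli(str(...)).
args = EngineArgs(model="facebook/opt-125m", compilation_config=3)
assert isinstance(args.compilation_config, CompilationConfig)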
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+        """Shared CLI arguments for vLLM engine."""
+
+        # Model arguments
+        model_kwargs = get_kwargs(ModelConfig)
+        model_group = parser.add_argument_group(
+            title="ModelConfig",
+            description=ModelConfig.__doc__,
+        )
+        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
+            model_group.add_argument("--model", **model_kwargs["model"])
+        model_group.add_argument("--task", **model_kwargs["task"])
+        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
+        model_group.add_argument("--tokenizer-mode",
+                                 **model_kwargs["tokenizer_mode"])
+        model_group.add_argument("--trust-remote-code",
+                                 **model_kwargs["trust_remote_code"])
+        model_group.add_argument("--dtype", **model_kwargs["dtype"])
+        model_group.add_argument("--seed", **model_kwargs["seed"])
+        model_group.add_argument("--hf-config-path",
+                                 **model_kwargs["hf_config_path"])
+        model_group.add_argument("--allowed-local-media-path",
+                                 **model_kwargs["allowed_local_media_path"])
+        model_group.add_argument("--revision", **model_kwargs["revision"])
+        model_group.add_argument("--code-revision",
+                                 **model_kwargs["code_revision"])
+        model_group.add_argument("--rope-scaling",
+                                 **model_kwargs["rope_scaling"])
+        model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
+        model_group.add_argument("--tokenizer-revision",
+                                 **model_kwargs["tokenizer_revision"])
+        model_group.add_argument("--max-model-len",
+                                 **model_kwargs["max_model_len"])
+        model_group.add_argument("--quantization", "-q",
+                                 **model_kwargs["quantization"])
+        model_group.add_argument("--enforce-eager",
+                                 **model_kwargs["enforce_eager"])
+        model_group.add_argument("--max-seq-len-to-capture",
+                                 **model_kwargs["max_seq_len_to_capture"])
+        model_group.add_argument("--max-logprobs",
+                                 **model_kwargs["max_logprobs"])
+        model_group.add_argument("--disable-sliding-window",
+                                 **model_kwargs["disable_sliding_window"])
+        model_group.add_argument("--disable-cascade-attn",
+                                 **model_kwargs["disable_cascade_attn"])
+        model_group.add_argument("--skip-tokenizer-init",
+                                 **model_kwargs["skip_tokenizer_init"])
+        model_group.add_argument("--enable-prompt-embeds",
+                                 **model_kwargs["enable_prompt_embeds"])
+        model_group.add_argument("--served-model-name",
+                                 **model_kwargs["served_model_name"])
+        # This one is a special case because it is the
+        # opposite of ModelConfig.use_async_output_proc
+        model_group.add_argument(
+            "--disable-async-output-proc",
+            action="store_true",
+            default=EngineArgs.disable_async_output_proc,
+            help="Disable async output processing. This may result in "
+            "lower performance.")
+        model_group.add_argument("--config-format",
+                                 choices=[f.value for f in ConfigFormat],
+                                 **model_kwargs["config_format"])
+        # This one is a special case because it can be a bool
+        # or a str. TODO: Handle this in get_kwargs
+ model_group.add_argument("--hf-token",
530
+ type=str,
531
+ nargs="?",
532
+ const=True,
533
+ default=model_kwargs["hf_token"]["default"],
534
+ help=model_kwargs["hf_token"]["help"])
535
+ model_group.add_argument("--hf-overrides",
536
+ **model_kwargs["hf_overrides"])
537
+ model_group.add_argument("--override-neuron-config",
538
+ **model_kwargs["override_neuron_config"])
539
+ model_group.add_argument("--override-pooler-config",
540
+ **model_kwargs["override_pooler_config"])
541
+ model_group.add_argument("--logits-processor-pattern",
542
+ **model_kwargs["logits_processor_pattern"])
543
+ model_group.add_argument("--generation-config",
544
+ **model_kwargs["generation_config"])
545
+ model_group.add_argument("--override-generation-config",
546
+ **model_kwargs["override_generation_config"])
547
+ model_group.add_argument("--enable-sleep-mode",
548
+ **model_kwargs["enable_sleep_mode"])
549
+ model_group.add_argument("--model-impl",
550
+ choices=[f.value for f in ModelImpl],
551
+ **model_kwargs["model_impl"])
552
+
553
+ # Model loading arguments
554
+ load_kwargs = get_kwargs(LoadConfig)
555
+ load_group = parser.add_argument_group(
556
+ title="LoadConfig",
557
+ description=LoadConfig.__doc__,
558
+ )
559
+ load_group.add_argument("--load-format",
560
+ choices=[f.value for f in LoadFormat],
561
+ **load_kwargs["load_format"])
562
+ load_group.add_argument("--download-dir",
563
+ **load_kwargs["download_dir"])
564
+ load_group.add_argument("--model-loader-extra-config",
565
+ **load_kwargs["model_loader_extra_config"])
566
+ load_group.add_argument("--ignore-patterns",
567
+ **load_kwargs["ignore_patterns"])
568
+ load_group.add_argument("--use-tqdm-on-load",
569
+ **load_kwargs["use_tqdm_on_load"])
570
+ load_group.add_argument(
571
+ "--qlora-adapter-name-or-path",
572
+ type=str,
573
+ default=None,
574
+ help="The `--qlora-adapter-name-or-path` has no effect, do not set"
575
+ " it, and it will be removed in v0.10.0.",
576
+ deprecated=True,
577
+ )
578
+ load_group.add_argument('--pt-load-map-location',
579
+ **load_kwargs["pt_load_map_location"])
580
+
581
+ # Guided decoding arguments
582
+ guided_decoding_kwargs = get_kwargs(DecodingConfig)
583
+ guided_decoding_group = parser.add_argument_group(
584
+ title="DecodingConfig",
585
+ description=DecodingConfig.__doc__,
586
+ )
587
+ guided_decoding_group.add_argument("--guided-decoding-backend",
588
+ **guided_decoding_kwargs["backend"])
589
+ guided_decoding_group.add_argument(
590
+ "--guided-decoding-disable-fallback",
591
+ **guided_decoding_kwargs["disable_fallback"])
592
+ guided_decoding_group.add_argument(
593
+ "--guided-decoding-disable-any-whitespace",
594
+ **guided_decoding_kwargs["disable_any_whitespace"])
595
+ guided_decoding_group.add_argument(
596
+ "--guided-decoding-disable-additional-properties",
597
+ **guided_decoding_kwargs["disable_additional_properties"])
598
+ guided_decoding_group.add_argument(
599
+ "--enable-reasoning",
600
+ action=argparse.BooleanOptionalAction,
601
+ deprecated=True,
602
+ help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
603
+ "of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
604
+ "parser backend instead. This flag (`--enable-reasoning`) will be "
605
+ "removed in v0.10.0. When `--reasoning-parser` is specified, "
606
+ "reasoning mode is automatically enabled.")
607
+ guided_decoding_group.add_argument(
608
+ "--reasoning-parser",
609
+            # These choices are a special case because they are not static
+            choices=list(ReasoningParserManager.reasoning_parsers),
+            **guided_decoding_kwargs["reasoning_backend"])
+
+        # Parallel arguments
+        parallel_kwargs = get_kwargs(ParallelConfig)
+        parallel_group = parser.add_argument_group(
+            title="ParallelConfig",
+            description=ParallelConfig.__doc__,
+        )
+        parallel_group.add_argument(
+            "--distributed-executor-backend",
+            **parallel_kwargs["distributed_executor_backend"])
+        parallel_group.add_argument(
+            "--pipeline-parallel-size", "-pp",
+            **parallel_kwargs["pipeline_parallel_size"])
+        parallel_group.add_argument("--tensor-parallel-size", "-tp",
+                                    **parallel_kwargs["tensor_parallel_size"])
+        parallel_group.add_argument("--data-parallel-size", "-dp",
+                                    **parallel_kwargs["data_parallel_size"])
+        parallel_group.add_argument('--data-parallel-size-local',
+                                    '-dpl',
+                                    type=int,
+                                    help='Number of data parallel replicas '
+                                    'to run on this node.')
+        parallel_group.add_argument('--data-parallel-address',
+                                    '-dpa',
+                                    type=str,
+                                    help='Address of data parallel cluster '
+                                    'head-node.')
+        parallel_group.add_argument('--data-parallel-rpc-port',
+                                    '-dpp',
+                                    type=int,
+                                    help='Port for data parallel RPC '
+                                    'communication.')
+        parallel_group.add_argument('--data-parallel-backend',
+                                    '-dpb',
+                                    type=str,
+                                    default='mp',
+                                    help='Backend for data parallel, either '
+                                    '"mp" or "ray".')
+        parallel_group.add_argument(
+            "--enable-expert-parallel",
+            **parallel_kwargs["enable_expert_parallel"])
+        parallel_group.add_argument(
+            "--max-parallel-loading-workers",
+            **parallel_kwargs["max_parallel_loading_workers"])
+        parallel_group.add_argument(
+            "--ray-workers-use-nsight",
+            **parallel_kwargs["ray_workers_use_nsight"])
+        parallel_group.add_argument(
+            "--disable-custom-all-reduce",
+            **parallel_kwargs["disable_custom_all_reduce"])
+        parallel_group.add_argument("--worker-cls",
+                                    **parallel_kwargs["worker_cls"])
+        parallel_group.add_argument("--worker-extension-cls",
+                                    **parallel_kwargs["worker_extension_cls"])
+        parallel_group.add_argument(
+            "--enable-multimodal-encoder-data-parallel",
+            **parallel_kwargs["enable_multimodal_encoder_data_parallel"])
+
+        # KV cache arguments
+        cache_kwargs = get_kwargs(CacheConfig)
+        cache_group = parser.add_argument_group(
+            title="CacheConfig",
+            description=CacheConfig.__doc__,
+        )
+        cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
+        cache_group.add_argument("--gpu-memory-utilization",
+                                 **cache_kwargs["gpu_memory_utilization"])
+        cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
+        cache_group.add_argument("--kv-cache-dtype",
+                                 **cache_kwargs["cache_dtype"])
+        cache_group.add_argument("--num-gpu-blocks-override",
+                                 **cache_kwargs["num_gpu_blocks_override"])
+        cache_group.add_argument("--enable-prefix-caching",
+                                 **cache_kwargs["enable_prefix_caching"])
+        cache_group.add_argument("--prefix-caching-hash-algo",
+                                 **cache_kwargs["prefix_caching_hash_algo"])
+        cache_group.add_argument("--cpu-offload-gb",
+                                 **cache_kwargs["cpu_offload_gb"])
+        cache_group.add_argument("--calculate-kv-scales",
+                                 **cache_kwargs["calculate_kv_scales"])
+
+        # Tokenizer arguments
+        tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
+        tokenizer_group = parser.add_argument_group(
+            title="TokenizerPoolConfig",
+            description=TokenizerPoolConfig.__doc__,
+        )
+        tokenizer_group.add_argument("--tokenizer-pool-size",
+                                     **tokenizer_kwargs["pool_size"])
+        tokenizer_group.add_argument("--tokenizer-pool-type",
+                                     **tokenizer_kwargs["pool_type"])
+        tokenizer_group.add_argument("--tokenizer-pool-extra-config",
+                                     **tokenizer_kwargs["extra_config"])
+
+        # Multimodal related configs
+        multimodal_kwargs = get_kwargs(MultiModalConfig)
+        multimodal_group = parser.add_argument_group(
+            title="MultiModalConfig",
+            description=MultiModalConfig.__doc__,
+        )
+        multimodal_group.add_argument("--limit-mm-per-prompt",
+                                      **multimodal_kwargs["limit_per_prompt"])
+        multimodal_group.add_argument(
+            "--mm-processor-kwargs",
+            **multimodal_kwargs["mm_processor_kwargs"])
+        multimodal_group.add_argument(
+            "--disable-mm-preprocessor-cache",
+            **multimodal_kwargs["disable_mm_preprocessor_cache"])
+
+        # LoRA related configs
+        lora_kwargs = get_kwargs(LoRAConfig)
+        lora_group = parser.add_argument_group(
+            title="LoRAConfig",
+            description=LoRAConfig.__doc__,
+        )
+        lora_group.add_argument(
+            "--enable-lora",
+            action=argparse.BooleanOptionalAction,
+            help="If True, enable handling of LoRA adapters.")
+        lora_group.add_argument("--enable-lora-bias",
+                                **lora_kwargs["bias_enabled"])
+        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
+        lora_group.add_argument("--max-lora-rank",
+                                **lora_kwargs["max_lora_rank"])
+        lora_group.add_argument("--lora-extra-vocab-size",
+                                **lora_kwargs["lora_extra_vocab_size"])
+        lora_group.add_argument(
+            "--lora-dtype",
+            **lora_kwargs["lora_dtype"],
+        )
+        lora_group.add_argument("--long-lora-scaling-factors",
+                                **lora_kwargs["long_lora_scaling_factors"])
+        lora_group.add_argument("--max-cpu-loras",
+                                **lora_kwargs["max_cpu_loras"])
+        lora_group.add_argument("--fully-sharded-loras",
+                                **lora_kwargs["fully_sharded_loras"])
+
+        # PromptAdapter related configs
+        prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig)
+        prompt_adapter_group = parser.add_argument_group(
+            title="PromptAdapterConfig",
+            description=PromptAdapterConfig.__doc__,
+        )
+        prompt_adapter_group.add_argument(
+            "--enable-prompt-adapter",
+            action=argparse.BooleanOptionalAction,
+            help="If True, enable handling of PromptAdapters.")
+        prompt_adapter_group.add_argument(
+            "--max-prompt-adapters",
+            **prompt_adapter_kwargs["max_prompt_adapters"])
+        prompt_adapter_group.add_argument(
+            "--max-prompt-adapter-token",
+            **prompt_adapter_kwargs["max_prompt_adapter_token"])
+
+        # Device arguments
+        device_kwargs = get_kwargs(DeviceConfig)
+        device_group = parser.add_argument_group(
+            title="DeviceConfig",
+            description=DeviceConfig.__doc__,
+        )
+        device_group.add_argument("--device",
+                                  **device_kwargs["device"],
+                                  deprecated=True)
+
+        # Speculative arguments
+        speculative_group = parser.add_argument_group(
+            title="SpeculativeConfig",
+            description=SpeculativeConfig.__doc__,
+        )
+        speculative_group.add_argument(
+            "--speculative-config",
+            type=json.loads,
+            default=None,
+            help="The configurations for speculative decoding. Should be a "
+            "JSON string.")
+
+        # Observability arguments
+        observability_kwargs = get_kwargs(ObservabilityConfig)
+        observability_group = parser.add_argument_group(
+            title="ObservabilityConfig",
+            description=ObservabilityConfig.__doc__,
+        )
+        observability_group.add_argument(
+            "--show-hidden-metrics-for-version",
+            **observability_kwargs["show_hidden_metrics_for_version"])
+        observability_group.add_argument(
+            "--otlp-traces-endpoint",
+            **observability_kwargs["otlp_traces_endpoint"])
+        # TODO: generalise this special case
+        choices = observability_kwargs["collect_detailed_traces"]["choices"]
+        metavar = f"{{{','.join(choices)}}}"
+        observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
+        observability_kwargs["collect_detailed_traces"]["choices"] += [
+            ",".join(p)
+            for p in permutations(get_args(DetailedTraceModules), r=2)
+        ]
+        observability_group.add_argument(
+            "--collect-detailed-traces",
+            **observability_kwargs["collect_detailed_traces"])
+
+        # Scheduler arguments
+        scheduler_kwargs = get_kwargs(SchedulerConfig)
+        scheduler_group = parser.add_argument_group(
+            title="SchedulerConfig",
+            description=SchedulerConfig.__doc__,
+        )
+        scheduler_group.add_argument(
+            "--max-num-batched-tokens",
+            **scheduler_kwargs["max_num_batched_tokens"])
+        scheduler_group.add_argument("--max-num-seqs",
+                                     **scheduler_kwargs["max_num_seqs"])
+        scheduler_group.add_argument(
+            "--max-num-partial-prefills",
+            **scheduler_kwargs["max_num_partial_prefills"])
+        scheduler_group.add_argument(
+            "--max-long-partial-prefills",
+            **scheduler_kwargs["max_long_partial_prefills"])
+        scheduler_group.add_argument('--cuda-graph-sizes',
+                                     **scheduler_kwargs["cuda_graph_sizes"])
+        scheduler_group.add_argument(
+            "--long-prefill-token-threshold",
+            **scheduler_kwargs["long_prefill_token_threshold"])
+        scheduler_group.add_argument("--num-lookahead-slots",
+                                     **scheduler_kwargs["num_lookahead_slots"])
+        scheduler_group.add_argument("--scheduler-delay-factor",
+                                     **scheduler_kwargs["delay_factor"])
+        scheduler_group.add_argument("--preemption-mode",
+                                     **scheduler_kwargs["preemption_mode"])
+        scheduler_group.add_argument("--num-scheduler-steps",
+                                     **scheduler_kwargs["num_scheduler_steps"])
+        scheduler_group.add_argument(
+            "--multi-step-stream-outputs",
+            **scheduler_kwargs["multi_step_stream_outputs"])
+        scheduler_group.add_argument("--scheduling-policy",
+                                     **scheduler_kwargs["policy"])
+        scheduler_group.add_argument(
+            "--enable-chunked-prefill",
+            **scheduler_kwargs["enable_chunked_prefill"])
+        scheduler_group.add_argument(
+            "--disable-chunked-mm-input",
+            **scheduler_kwargs["disable_chunked_mm_input"])
+        scheduler_group.add_argument("--scheduler-cls",
+                                     **scheduler_kwargs["scheduler_cls"])
+        scheduler_group.add_argument(
+            "--disable-hybrid-kv-cache-manager",
+            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
+
+        # vLLM arguments
+        vllm_kwargs = get_kwargs(VllmConfig)
+        vllm_group = parser.add_argument_group(
+            title="VllmConfig",
+            description=VllmConfig.__doc__,
+        )
+        vllm_group.add_argument("--kv-transfer-config",
+                                **vllm_kwargs["kv_transfer_config"])
+        vllm_group.add_argument('--kv-events-config',
+                                **vllm_kwargs["kv_events_config"])
+        vllm_group.add_argument("--compilation-config", "-O",
+                                **vllm_kwargs["compilation_config"])
+        vllm_group.add_argument("--additional-config",
+                                **vllm_kwargs["additional_config"])
+
+        # Other arguments
+        parser.add_argument('--use-v2-block-manager',
+                            action='store_true',
+                            default=True,
+                            deprecated=True,
+                            help='[DEPRECATED] block manager v1 has been '
+                            'removed and SelfAttnBlockSpaceManager (i.e. '
+                            'block manager v2) is now the default. '
+                            'Setting this flag to True or False'
+                            ' has no effect on vLLM behavior.')
+        parser.add_argument('--disable-log-stats',
+                            action='store_true',
+                            help='Disable logging statistics.')
+
+        return parser
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # Get the list of attributes of this dataclass.
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        # Set the attributes from the parsed arguments.
+        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+        return engine_args
+
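The intended round trip, for reference (FlexibleArgumentParser lives in vllm.utils; the model name is illustrative):

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
ns = parser.parse_args(["--model", "facebook/opt-125m",
                        "--max-num-seqs", "64"])
engine_args = EngineArgs.from_cli_args(ns)
print(engine_args.model, engine_args.max_num_seqs)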
+    def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
+        return ModelConfig(
+            model=self.model,
+            hf_config_path=self.hf_config_path,
+            task=self.task,
+            tokenizer=self.tokenizer,
+            tokenizer_mode=self.tokenizer_mode,
+            trust_remote_code=self.trust_remote_code,
+            allowed_local_media_path=self.allowed_local_media_path,
+            dtype=self.dtype,
+            seed=self.seed,
+            revision=self.revision,
+            code_revision=self.code_revision,
+            rope_scaling=self.rope_scaling,
+            rope_theta=self.rope_theta,
+            hf_token=self.hf_token,
+            hf_overrides=self.hf_overrides,
+            tokenizer_revision=self.tokenizer_revision,
+            max_model_len=self.max_model_len,
+            quantization=self.quantization,
+            enforce_eager=self.enforce_eager,
+            max_seq_len_to_capture=self.max_seq_len_to_capture,
+            max_logprobs=self.max_logprobs,
+            disable_sliding_window=self.disable_sliding_window,
+            disable_cascade_attn=self.disable_cascade_attn,
+            skip_tokenizer_init=self.skip_tokenizer_init,
+            enable_prompt_embeds=self.enable_prompt_embeds,
+            served_model_name=self.served_model_name,
+            limit_mm_per_prompt=self.limit_mm_per_prompt,
+            use_async_output_proc=not self.disable_async_output_proc,
+            config_format=self.config_format,
+            mm_processor_kwargs=self.mm_processor_kwargs,
+            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+            override_neuron_config=self.override_neuron_config,
+            override_pooler_config=self.override_pooler_config,
+            logits_processor_pattern=self.logits_processor_pattern,
+            generation_config=self.generation_config,
+            override_generation_config=self.override_generation_config,
+            enable_sleep_mode=self.enable_sleep_mode,
+            model_impl=self.model_impl,
+        )
+
+    def create_load_config(self) -> LoadConfig:
+
+        if self.quantization == "bitsandbytes":
+            self.load_format = "bitsandbytes"
+
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+            use_tqdm_on_load=self.use_tqdm_on_load,
+            pt_load_map_location=self.pt_load_map_location,
+        )
+
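A quick check of the quantization/load-format coupling above (model name illustrative; create_load_config only builds a LoadConfig, so this is cheap to run):

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="some/bnb-prequantized-model",
                         quantization="bitsandbytes")
load_config = engine_args.create_load_config()
print(load_config.load_format)  # "bitsandbytes"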
+    def create_speculative_config(
+        self,
+        target_model_config: ModelConfig,
+        target_parallel_config: ParallelConfig,
+        enable_chunked_prefill: bool,
+        disable_log_stats: bool,
+    ) -> Optional["SpeculativeConfig"]:
+        """Initializes and returns a SpeculativeConfig object based on
+        `speculative_config`.
+
+        This function utilizes `speculative_config` to create a
+        SpeculativeConfig object. The `speculative_config` can either be
+        provided as a JSON string input via CLI arguments or directly as a
+        dictionary from the engine.
+        """
+        if self.speculative_config is None:
+            return None
+
+        # Note(Shangming): These parameters are not obtained from the cli arg
+        # '--speculative-config' and must be passed in when creating the engine
+        # config.
+        self.speculative_config.update({
+            "target_model_config": target_model_config,
+            "target_parallel_config": target_parallel_config,
+            "enable_chunked_prefill": enable_chunked_prefill,
+            "disable_log_stats": disable_log_stats,
+        })
+        speculative_config = SpeculativeConfig.from_dict(
+            self.speculative_config)
+
+        return speculative_config
+
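For reference, the same dict that `--speculative-config` parses from JSON can be passed programmatically; the keys below (method, num_speculative_tokens, prompt_lookup_max) are illustrative SpeculativeConfig fields, shown under that assumption:

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 4,
        "prompt_lookup_max": 4,
    },
)
# create_speculative_config() is called later from create_engine_config(),
# which injects target_model_config etc. before SpeculativeConfig.from_dict.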
+    def create_engine_config(
+        self,
+        usage_context: Optional[UsageContext] = None,
+    ) -> VllmConfig:
+        """
+        Create the VllmConfig.
+
+        NOTE: for autoselection of V0 vs V1 engine, we need to
+        create the ModelConfig first, since ModelConfig's attrs
+        (e.g. the model arch) are needed to make the decision.
+
+        This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
+        unspecified by the user.
+
+        If VLLM_USE_V1 is specified by the user but the VllmConfig
+        is incompatible, we raise an error.
+        """
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()
+
+        device_config = DeviceConfig(device=current_platform.device_type)
+        model_config = self.create_model_config()
+
+        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
+        #   and fall back to V0 for experimental or unsupported features.
+        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
+        #   features and raise error for unsupported features.
+        # * If VLLM_USE_V1=0, we disable V1.
+        use_v1 = False
+        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
+        if try_v1 and self._is_v1_supported_oracle(model_config):
+            use_v1 = True
+
+        # If user explicitly set VLLM_USE_V1, sanity check we respect it.
+        if envs.is_set("VLLM_USE_V1"):
+            assert use_v1 == envs.VLLM_USE_V1
+        # Otherwise, set the VLLM_USE_V1 variable globally.
+        else:
+            envs.set_vllm_use_v1(use_v1)
+
+        # Set default arguments for V0 or V1 Engine.
+        if use_v1:
+            self._set_default_args_v1(usage_context)
+        else:
+            self._set_default_args_v0(model_config)
+
+        assert self.enable_chunked_prefill is not None
+
+        if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
+            assert self.enforce_eager, (
+ "Cuda graph is not supported with DualChunkFlashAttention. "
1048
+ "To run the model in eager mode, set 'enforce_eager=True' "
1049
+ "or use '--enforce-eager' in the CLI.")
1050
+ assert current_platform.is_cuda(), (
1051
+ "DualChunkFlashAttention is only supported on CUDA platform.")
1052
+ assert not use_v1, (
1053
+ "DualChunkFlashAttention is not supported on V1 engine. "
1054
+ "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")
1055
+
+        cache_config = CacheConfig(
+            block_size=self.block_size,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+            swap_space=self.swap_space,
+            cache_dtype=self.kv_cache_dtype,
+            is_attention_free=model_config.is_attention_free,
+            num_gpu_blocks_override=self.num_gpu_blocks_override,
+            sliding_window=model_config.get_sliding_window(),
+            enable_prefix_caching=self.enable_prefix_caching,
+            prefix_caching_hash_algo=self.prefix_caching_hash_algo,
+            cpu_offload_gb=self.cpu_offload_gb,
+            calculate_kv_scales=self.calculate_kv_scales,
+        )
+
+        # Get the current placement group if Ray is initialized and
+        # we are in a Ray actor. If so, then the placement group will be
+        # passed to spawned processes.
+        placement_group = None
+        if is_in_ray_actor():
+            import ray
+
+            # This call initializes Ray automatically if it is not initialized,
+            # but we should not do this here.
+            placement_group = ray.util.get_current_placement_group()
+
+        # Local DP size defaults to global DP size if not set.
+        data_parallel_size_local = self.data_parallel_size if (
+            self.data_parallel_size_local
+            is None) else self.data_parallel_size_local
+
+        # DP address, used in multi-node case for torch distributed group
+        # and ZMQ sockets.
+        if self.data_parallel_address is None:
+            if self.data_parallel_backend == "ray":
+                host_ip = get_ip()
+                logger.info(
+                    "Using host IP %s as ray-based data parallel address",
+                    host_ip)
+                data_parallel_address = host_ip
+            else:
+                assert self.data_parallel_backend == "mp", (
+                    "data_parallel_backend can only be ray or mp, got "
+                    f"{self.data_parallel_backend}")
+                data_parallel_address = ParallelConfig.data_parallel_master_ip
+        else:
+            data_parallel_address = self.data_parallel_address
+
+        # This port is only used when there are remote data parallel engines,
+        # otherwise the local IPC transport is used.
+        data_parallel_rpc_port = self.data_parallel_rpc_port if (
+            self.data_parallel_rpc_port
+            is not None) else ParallelConfig.data_parallel_rpc_port
+
+        data_parallel_backend = self.data_parallel_backend
+
+        parallel_config = ParallelConfig(
+            pipeline_parallel_size=self.pipeline_parallel_size,
+            tensor_parallel_size=self.tensor_parallel_size,
+            data_parallel_size=self.data_parallel_size,
+            data_parallel_size_local=data_parallel_size_local,
+            data_parallel_master_ip=data_parallel_address,
+            data_parallel_rpc_port=data_parallel_rpc_port,
+            data_parallel_backend=data_parallel_backend,
+            enable_expert_parallel=self.enable_expert_parallel,
+            max_parallel_loading_workers=self.max_parallel_loading_workers,
+            disable_custom_all_reduce=self.disable_custom_all_reduce,
+            ray_workers_use_nsight=self.ray_workers_use_nsight,
+            placement_group=placement_group,
+            distributed_executor_backend=self.distributed_executor_backend,
+            worker_cls=self.worker_cls,
+            worker_extension_cls=self.worker_extension_cls,
+            enable_multimodal_encoder_data_parallel=self.
+            enable_multimodal_encoder_data_parallel,
+        )
+
+        speculative_config = self.create_speculative_config(
+            target_model_config=model_config,
+            target_parallel_config=parallel_config,
+            enable_chunked_prefill=self.enable_chunked_prefill,
+            disable_log_stats=self.disable_log_stats,
+        )
+
+        # Reminder: Please update docs/features/compatibility_matrix.md
+        # if the feature combo becomes valid.
+        if self.num_scheduler_steps > 1:
+            if speculative_config is not None:
+                raise ValueError("Speculative decoding is not supported with "
+                                 "multi-step (--num-scheduler-steps > 1)")
+            if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
+                raise ValueError("Multi-Step Chunked-Prefill is not supported "
+                                 "for pipeline-parallel-size > 1")
+            from vllm.platforms import current_platform
+            if current_platform.is_cpu():
+                logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
+                               "currently not supported for CPUs and has been "
+                               "disabled.")
+                self.num_scheduler_steps = 1
+
+        # Make sure num_lookahead_slots is set to the higher value, depending
+        # on whether we are using speculative decoding or multi-step
+        num_lookahead_slots = max(self.num_lookahead_slots,
+                                  self.num_scheduler_steps - 1)
+        num_lookahead_slots = num_lookahead_slots \
+            if speculative_config is None \
+            else speculative_config.num_lookahead_slots
+
+        scheduler_config = SchedulerConfig(
+            runner_type=model_config.runner_type,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            max_num_seqs=self.max_num_seqs,
+            max_model_len=model_config.max_model_len,
+            cuda_graph_sizes=self.cuda_graph_sizes,
+            num_lookahead_slots=num_lookahead_slots,
+            delay_factor=self.scheduler_delay_factor,
+            enable_chunked_prefill=self.enable_chunked_prefill,
+            disable_chunked_mm_input=self.disable_chunked_mm_input,
+            is_multimodal_model=model_config.is_multimodal_model,
+            preemption_mode=self.preemption_mode,
+            num_scheduler_steps=self.num_scheduler_steps,
+            multi_step_stream_outputs=self.multi_step_stream_outputs,
+            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
+                             and parallel_config.use_ray),
+            policy=self.scheduling_policy,
+            scheduler_cls=self.scheduler_cls,
+            max_num_partial_prefills=self.max_num_partial_prefills,
+            max_long_partial_prefills=self.max_long_partial_prefills,
+            long_prefill_token_threshold=self.long_prefill_token_threshold,
+            disable_hybrid_kv_cache_manager=self.
+            disable_hybrid_kv_cache_manager,
+        )
+
+        lora_config = LoRAConfig(
+            bias_enabled=self.enable_lora_bias,
+            max_lora_rank=self.max_lora_rank,
+            max_loras=self.max_loras,
+            fully_sharded_loras=self.fully_sharded_loras,
+            lora_extra_vocab_size=self.lora_extra_vocab_size,
+            long_lora_scaling_factors=self.long_lora_scaling_factors,
+            lora_dtype=self.lora_dtype,
+            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
+            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+
+        # bitsandbytes pre-quantized models need a specific model loader
+ if model_config.quantization == "bitsandbytes":
1200
+ self.quantization = self.load_format = "bitsandbytes"
1201
+
1202
+ load_config = self.create_load_config()
1203
+
1204
+ prompt_adapter_config = PromptAdapterConfig(
1205
+ max_prompt_adapters=self.max_prompt_adapters,
1206
+ max_prompt_adapter_token=self.max_prompt_adapter_token) \
1207
+ if self.enable_prompt_adapter else None
1208
+
1209
+ decoding_config = DecodingConfig(
1210
+ backend=self.guided_decoding_backend,
1211
+ disable_fallback=self.guided_decoding_disable_fallback,
1212
+ disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
1213
+ disable_additional_properties=\
1214
+ self.guided_decoding_disable_additional_properties,
1215
+ reasoning_backend=self.reasoning_parser
1216
+ )
1217
+
1218
+ observability_config = ObservabilityConfig(
1219
+ show_hidden_metrics_for_version=self.
1220
+ show_hidden_metrics_for_version,
1221
+ otlp_traces_endpoint=self.otlp_traces_endpoint,
1222
+ collect_detailed_traces=self.collect_detailed_traces,
1223
+ )
1224
+
1225
+ config = VllmConfig(
1226
+ model_config=model_config,
1227
+ cache_config=cache_config,
1228
+ parallel_config=parallel_config,
1229
+ scheduler_config=scheduler_config,
1230
+ device_config=device_config,
1231
+ lora_config=lora_config,
1232
+ speculative_config=speculative_config,
1233
+ load_config=load_config,
1234
+ decoding_config=decoding_config,
1235
+ observability_config=observability_config,
1236
+ prompt_adapter_config=prompt_adapter_config,
1237
+ compilation_config=self.compilation_config,
1238
+ kv_transfer_config=self.kv_transfer_config,
1239
+ kv_events_config=self.kv_events_config,
1240
+ additional_config=self.additional_config,
1241
+ )
1242
+
1243
+ return config
1244
+
1245
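End-to-end usage of this method, as a sketch (UsageContext is imported from vllm.usage.usage_lib; the model name is illustrative, and this will fetch the model config from the Hub):

from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext

engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config = engine_args.create_engine_config(
    usage_context=UsageContext.LLM_CLASS)
print(type(vllm_config.scheduler_config).__name__)  # SchedulerConfig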
+    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
+        """Oracle for whether to use V0 or V1 Engine by default."""
+
+        #############################################################
+        # Unsupported Feature Flags on V1.
+
+        if self.load_format == LoadFormat.SHARDED_STATE.value:
+            _raise_or_fallback(
+                feature_name=f"--load_format {self.load_format}",
+                recommend_to_remove=False)
+            return False
+
+        if (self.logits_processor_pattern
+                != EngineArgs.logits_processor_pattern):
+            _raise_or_fallback(feature_name="--logits-processor-pattern",
+                               recommend_to_remove=False)
+            return False
+
+        if self.preemption_mode != SchedulerConfig.preemption_mode:
+            _raise_or_fallback(feature_name="--preemption-mode",
+                               recommend_to_remove=True)
+            return False
+
+        if (self.disable_async_output_proc
+                != EngineArgs.disable_async_output_proc):
+            _raise_or_fallback(feature_name="--disable-async-output-proc",
+                               recommend_to_remove=True)
+            return False
+
+        if self.scheduling_policy != SchedulerConfig.policy:
+            _raise_or_fallback(feature_name="--scheduling-policy",
+                               recommend_to_remove=False)
+            return False
+
+        if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
+            _raise_or_fallback(feature_name="--num-scheduler-steps",
+                               recommend_to_remove=True)
+            return False
+
+        if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
+            _raise_or_fallback(feature_name="--scheduler-delay-factor",
+                               recommend_to_remove=True)
+            return False
+
+        if self.guided_decoding_backend not in get_args(
+                GuidedDecodingBackendV1):
+            _raise_or_fallback(
+                feature_name=
+                f"--guided-decoding-backend={self.guided_decoding_backend}",
+                recommend_to_remove=False)
+            return False
+
+        # Need at least Ampere for now (FA support required).
+        # Skip this check if we are running on a non-GPU platform,
+        # or if the device capability is not available
+        # (e.g. in a Ray actor without GPUs).
+        from vllm.platforms import CpuArchEnum, current_platform
+        if (current_platform.is_cuda()
+                and current_platform.get_device_capability()
+                and current_platform.get_device_capability().major < 8):
+            _raise_or_fallback(feature_name="Compute Capability < 8.0",
+                               recommend_to_remove=False)
+            return False
+
+        # No Fp8 KV cache so far.
+        if self.kv_cache_dtype != "auto":
+            fp8_attention = self.kv_cache_dtype.startswith("fp8")
+            will_use_fa = (
+                current_platform.is_cuda()
+                and not envs.is_set("VLLM_ATTENTION_BACKEND")
+            ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
+            supported = False
+            if current_platform.is_rocm():
+                supported = True
+            elif fp8_attention and will_use_fa:
+                from vllm.attention.utils.fa_utils import (
+                    flash_attn_supports_fp8)
+                supported = flash_attn_supports_fp8()
+            if not supported:
+                _raise_or_fallback(feature_name="--kv-cache-dtype",
+                                   recommend_to_remove=False)
+                return False
+
+        # No Prompt Adapter so far.
+        if self.enable_prompt_adapter:
+            _raise_or_fallback(feature_name="--enable-prompt-adapter",
+                               recommend_to_remove=False)
+            return False
+
+        # No text embedding inputs so far.
+        if self.enable_prompt_embeds:
+            _raise_or_fallback(feature_name="--enable-prompt-embeds",
+                               recommend_to_remove=False)
+            return False
+
+        # Only Fp16 and Bf16 dtypes since we only support FA.
+        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+        if model_config.dtype not in V1_SUPPORTED_DTYPES:
+            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+                               recommend_to_remove=False)
+            return False
+
+        # No Embedding Models so far.
+        if model_config.task not in ["generate"]:
+            _raise_or_fallback(feature_name=f"--task {model_config.task}",
+                               recommend_to_remove=False)
+            return False
+
+        # No Mamba or Encoder-Decoder so far.
+        if not model_config.is_v1_compatible:
+            _raise_or_fallback(feature_name=model_config.architectures,
+                               recommend_to_remove=False)
+            return False
+
+        # No Concurrent Partial Prefills so far.
+        if (self.max_num_partial_prefills
+                != SchedulerConfig.max_num_partial_prefills
+                or self.max_long_partial_prefills
+                != SchedulerConfig.max_long_partial_prefills):
+            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
+                               recommend_to_remove=False)
+            return False
+
+        # No OTLP observability so far.
+        if (self.otlp_traces_endpoint or self.collect_detailed_traces):
+            _raise_or_fallback(feature_name="--otlp-traces-endpoint",
+                               recommend_to_remove=False)
+            return False
+
+        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
+        is_ngram_enabled = False
+        is_eagle_enabled = False
+        is_medusa_enabled = False
+        if self.speculative_config is not None:
+            # This is supported but experimental (handled below).
+            speculative_method = self.speculative_config.get("method")
+            if speculative_method:
+                if speculative_method in ("ngram", "[ngram]"):
+                    is_ngram_enabled = True
+                elif speculative_method == "medusa":
+                    is_medusa_enabled = True
+                elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"):
+                    is_eagle_enabled = True
+            else:
+                speculative_model = self.speculative_config.get("model")
+                if speculative_model in ("ngram", "[ngram]"):
+                    is_ngram_enabled = True
+            if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled):
+                # Other speculative decoding methods are not supported yet.
+                _raise_or_fallback(feature_name="Speculative Decoding",
+                                   recommend_to_remove=False)
+                return False
+
+        # No XFormers so far.
+        V1_BACKENDS = [
+            "FLASH_ATTN_VLLM_V1",
+            "FLASH_ATTN",
+            "PALLAS",
+            "PALLAS_VLLM_V1",
+            "TRITON_ATTN_VLLM_V1",
+            "TRITON_MLA",
+            "CUTLASS_MLA_VLLM_V1",
+            "FLASHMLA",
+            "FLASHINFER",
+            "FLASHINFER_VLLM_V1",
+            "ROCM_AITER_MLA",
+            "TORCH_SDPA_VLLM_V1",
+            "FLEX_ATTENTION",
+        ]
+        if (envs.is_set("VLLM_ATTENTION_BACKEND")
+                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
+            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
+            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
+            return False
+
+        # Platforms must decide if they can support v1 for this model
+        if not current_platform.supports_v1(model_config=model_config):
+            _raise_or_fallback(
+                feature_name=f"device type={current_platform.device_type}",
+                recommend_to_remove=False)
+            return False
+        #############################################################
+        # Experimental Features - allow users to opt in.
+
+        # Signal handlers require running in the main thread.
+        if (threading.current_thread() != threading.main_thread()
+                and _warn_or_fallback("Engine in background thread")):
+            return False
+
+        if (self.pipeline_parallel_size > 1
+                and self.distributed_executor_backend
+                not in (ParallelConfig.distributed_executor_backend, "ray",
+                        "mp", "external_launcher")):
+            name = "Pipeline Parallelism without Ray distributed executor " \
+                    "or multiprocessing executor or external launcher"
+            _raise_or_fallback(feature_name=name, recommend_to_remove=False)
+            return False
+
+        # Non-[CUDA, TPU] may be supported on V1, but off by default for now.
+        v0_hardware = not any(
+            (current_platform.is_cuda(), current_platform.is_tpu(),
+             (current_platform.is_cpu()
+              and current_platform.get_cpu_architecture() == CpuArchEnum.X86)))
+        if v0_hardware and _warn_or_fallback(  # noqa: SIM103
+                current_platform.device_name):
+            return False
+        #############################################################
+
+        return True
+
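The oracle's contract in one sketch: with VLLM_USE_V1=1 set explicitly, an unsupported flag turns the silent fallback into a hard error via _raise_or_fallback (defined later in this module). The model name is illustrative:

import os
os.environ["VLLM_USE_V1"] = "1"  # vllm.envs reads this lazily

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="facebook/opt-125m",
                         num_scheduler_steps=8)  # a V0-only feature
# engine_args.create_engine_config() would now raise NotImplementedError
# ("VLLM_USE_V1=1 is not supported with --num-scheduler-steps.") instead
# of silently falling back to the V0 engine.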
+    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
+        """Set Default Arguments for V0 Engine."""
+
+        max_model_len = model_config.max_model_len
+        use_long_context = max_model_len > 32768
+        if self.enable_chunked_prefill is None:
+            # Chunked prefill not supported for Multimodal or MLA in V0.
+            if model_config.is_multimodal_model or model_config.use_mla:
+                self.enable_chunked_prefill = False
+
+            # Enable chunked prefill by default for long context (> 32K)
+            # models to avoid OOM errors in initial memory profiling phase.
+            elif use_long_context:
+                from vllm.platforms import current_platform
+                is_gpu = current_platform.is_cuda()
+                use_sliding_window = (model_config.get_sliding_window()
+                                      is not None)
+                use_spec_decode = self.speculative_config is not None
+
+                if (is_gpu and not use_sliding_window and not use_spec_decode
+                        and not self.enable_lora
+                        and not self.enable_prompt_adapter
+                        and model_config.runner_type != "pooling"):
+                    self.enable_chunked_prefill = True
+                    logger.warning(
+                        "Chunked prefill is enabled by default for models "
+                        "with max_model_len > 32K. Chunked prefill might "
+                        "not work with some features or models. If you "
+                        "encounter any issues, please disable by launching "
+                        "with --enable-chunked-prefill=False.")
+
+        if self.enable_chunked_prefill is None:
+            self.enable_chunked_prefill = False
+
+        if not self.enable_chunked_prefill and use_long_context:
+            logger.warning(
+ "The model has a long context length (%s). This may cause"
1492
+ "OOM during the initial memory profiling phase, or result "
1493
+ "in low performance due to small KV cache size. Consider "
1494
+ "setting --max-model-len to a smaller value.", max_model_len)
1495
+ elif (self.enable_chunked_prefill
1496
+ and model_config.runner_type == "pooling"):
1497
+ msg = "Chunked prefill is not supported for pooling models"
1498
+ raise ValueError(msg)
1499
+
1500
+ # if using prefix caching, we must set a hash algo
1501
+ if self.enable_prefix_caching:
1502
+ # Disable prefix caching for multimodal models for VLLM_V0.
1503
+ if model_config.is_multimodal_model:
1504
+ logger.warning(
1505
+ "--enable-prefix-caching is not supported for multimodal "
1506
+ "models in V0 and has been disabled.")
1507
+ self.enable_prefix_caching = False
1508
+
1509
+ # VLLM_V0 only supports builtin hash algo for prefix caching.
1510
+ if self.prefix_caching_hash_algo == "sha256":
1511
+ raise ValueError(
1512
+ "sha256 is not supported for prefix caching in V0 engine. "
1513
+ "Please use 'builtin'.")
1514
+
1515
+ # Set max_num_seqs to 256 for VLLM_V0.
1516
+ if self.max_num_seqs is None:
1517
+ self.max_num_seqs = 256
1518
+
1519
+    def _set_default_args_v1(self, usage_context: UsageContext) -> None:
+        """Set Default Arguments for V1 Engine."""
+
+        # V1 always uses chunked prefills.
+        self.enable_chunked_prefill = True
+
+        # V1 enables prefix caching by default.
+        if self.enable_prefix_caching is None:
+            self.enable_prefix_caching = True
+
+        # V1 should use the new scheduler by default.
+        # Swap it only if this arg is set to the original V0 default
+        if self.scheduler_cls == EngineArgs.scheduler_cls:
+            self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"
+
+        # When no user override, set the default values based on the usage
+        # context.
+        # Use different default values for different hardware.
+
+        # Try to query the device name on the current platform. If it fails,
+        # it may be because the platform that imports vLLM is not the same
+        # as the platform that vLLM is running on (e.g. the case of scaling
+        # vLLM with Ray) and has no GPUs. In this case we use the default
+        # values for non-H100/H200 GPUs.
+        from vllm.platforms import current_platform
+        try:
+            device_memory = current_platform.get_device_total_memory()
+            device_name = current_platform.get_device_name().lower()
+        except Exception:
+            # This is only used to set default_max_num_batched_tokens
+            device_memory = 0
+
+        # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
+        # throughput, see PR #17885 for more details.
+        # So here we do an extra device name check to prevent such regression.
+        if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
+            # For GPUs like H100 and MI300x, use larger default values.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 16384,
+                UsageContext.OPENAI_API_SERVER: 8192,
+            }
+            default_max_num_seqs = 1024
+        else:
+            # TODO(woosuk): Tune the default values for other hardware.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 8192,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+            default_max_num_seqs = 256
+
+        # tpu specific default values.
+        if current_platform.is_tpu():
+            default_max_num_batched_tokens_tpu = {
+                UsageContext.LLM_CLASS: {
+                    'V6E': 2048,
+                    'V5E': 1024,
+                    'V5P': 512,
+                },
+                UsageContext.OPENAI_API_SERVER: {
+                    'V6E': 1024,
+                    'V5E': 512,
+                    'V5P': 256,
+                }
+            }
+
+        use_context_value = usage_context.value if usage_context else None
+        if (self.max_num_batched_tokens is None
+                and usage_context in default_max_num_batched_tokens):
+            if current_platform.is_tpu():
+                chip_name = current_platform.get_device_name()
+                if chip_name in default_max_num_batched_tokens_tpu[
+                        usage_context]:
+                    self.max_num_batched_tokens = \
+                        default_max_num_batched_tokens_tpu[
+                            usage_context][chip_name]
+                else:
+                    self.max_num_batched_tokens = \
+                        default_max_num_batched_tokens[usage_context]
+            else:
+                self.max_num_batched_tokens = default_max_num_batched_tokens[
+                    usage_context]
+            logger.debug(
+                "Setting max_num_batched_tokens to %d for %s usage context.",
+                self.max_num_batched_tokens, use_context_value)
+
+        if self.max_num_seqs is None:
+            self.max_num_seqs = default_max_num_seqs
+
+            logger.debug("Setting max_num_seqs to %d for %s usage context.",
+                         self.max_num_seqs, use_context_value)
+
+
+@dataclass
+class AsyncEngineArgs(EngineArgs):
+    """Arguments for asynchronous vLLM engine."""
+    disable_log_requests: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: FlexibleArgumentParser,
+                     async_args_only: bool = False) -> FlexibleArgumentParser:
+        # Initialize plugins to update the parser; for example, a plugin may
+        # add a new kind of quantization method to the --quantization argument
+        # or a new device to the --device argument.
+        load_general_plugins()
+        if not async_args_only:
+            parser = EngineArgs.add_cli_args(parser)
+        parser.add_argument('--disable-log-requests',
+                            action='store_true',
+                            help='Disable logging requests.')
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update(parser)
+        return parser
+
+
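AsyncEngineArgs usage mirrors the synchronous path; a sketch (model name illustrative):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser

parser = AsyncEngineArgs.add_cli_args(FlexibleArgumentParser())
ns = parser.parse_args(["--model", "facebook/opt-125m",
                        "--disable-log-requests"])
async_args = AsyncEngineArgs.from_cli_args(ns)
print(async_args.disable_log_requests)  # True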
+def _raise_or_fallback(feature_name: str, recommend_to_remove: bool):
+    if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+        raise NotImplementedError(
+            f"VLLM_USE_V1=1 is not supported with {feature_name}.")
+    msg = f"{feature_name} is not supported by the V1 Engine. "
+    msg += "Falling back to V0. "
+    if recommend_to_remove:
+ msg += f"We recommend to remove {feature_name} from your config "
1641
+ msg += "in favor of the V1 Engine."
1642
+ logger.warning(msg)
1643
+
1644
+
1645
+def _warn_or_fallback(feature_name: str) -> bool:
+    if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
+        logger.warning(
+            "Detected VLLM_USE_V1=1 with %s. Usage should "
+            "be considered experimental. Please report any "
+ "issues on Github.", feature_name)
1651
+        should_exit = False
+    else:
+        logger.info(
+            "%s is experimental on VLLM_USE_V1=1. "
+            "Falling back to V0 Engine.", feature_name)
+        should_exit = True
+    return should_exit
+
+
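Contract of the two helpers above, shown directly (assuming VLLM_USE_V1 is not set in the environment):

from vllm.engine.arg_utils import _warn_or_fallback

# With VLLM_USE_V1 unset: logs an info message and returns True,
# i.e. "fall back to V0". With VLLM_USE_V1=1: logs an "experimental"
# warning and returns False, i.e. proceed on V1.
print(_warn_or_fallback("Engine in background thread"))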
+def human_readable_int(value):
+    """Parse human-readable integers like '1k', '2M', etc.
+    Decimal values are allowed with decimal (lowercase) multipliers.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+    """
+    value = value.strip()
+    match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
+    if match:
+        decimal_multiplier = {
+            'k': 10**3,
+            'm': 10**6,
+            'g': 10**9,
+            # 't' is accepted by the regex above, so it needs a multiplier
+            # too; otherwise '1t' would fall through to int('1t') and fail.
+            't': 10**12,
+        }
+        binary_multiplier = {
+            'K': 2**10,
+            'M': 2**20,
+            'G': 2**30,
+            'T': 2**40,
+        }
+
+        number, suffix = match.groups()
+        if suffix in decimal_multiplier:
+            mult = decimal_multiplier[suffix]
+            return int(float(number) * mult)
+        elif suffix in binary_multiplier:
+            mult = binary_multiplier[suffix]
+            # Do not allow decimals with binary multipliers
+            try:
+                return int(number) * mult
+            except ValueError as e:
+                raise argparse.ArgumentTypeError("Decimals are not allowed " \
+                    f"with binary suffixes like {suffix}. Did you mean to use " \
+                    f"{number}{suffix.lower()} instead?") from e
+
+    # Regular plain number.
+    return int(value)
+
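Behavior of the parser above at a glance (the lowercase/uppercase split is the decimal-vs-binary convention from the docstring):

from vllm.engine.arg_utils import human_readable_int

print(human_readable_int("25.6k"))  # 25600   (decimal suffix)
print(human_readable_int("1K"))     # 1024    (binary suffix)
print(human_readable_int("512"))    # 512     (plain integer)
# human_readable_int("1.5K") raises argparse.ArgumentTypeError:
# decimals are not allowed with binary suffixes.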
+
+# These functions are used by Sphinx to build the documentation.
+def _engine_args_parser():
+    return EngineArgs.add_cli_args(FlexibleArgumentParser())
+
+
+def _async_engine_args_parser():
+    return AsyncEngineArgs.add_cli_args(FlexibleArgumentParser(),
+                                        async_args_only=True)