vllm_cpu-0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
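The wheel filename above encodes the build's compatibility tags (CPython 3.13 interpreter and ABI, manylinux 2.17, aarch64). As a minimal illustrative sketch, and assuming the third-party `packaging` library is available, the tags can be parsed and checked against the running interpreter; this is not part of the diff itself, just a way to read the filename:

```python
# Illustrative only: parse the wheel filename from the page title and check
# whether its tags match the current environment (requires "packaging").
from packaging.tags import sys_tags
from packaging.utils import parse_wheel_filename

WHEEL = "vllm_cpu-0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl"

name, version, build, tags = parse_wheel_filename(WHEEL)
print(name, version)                 # vllm-cpu 0.12.0
for tag in tags:
    print(tag)                       # cp313-cp313-manylinux_2_17_aarch64

# The wheel is installable only if one of its tags is supported here.
supported = set(sys_tags())
print("installable on this interpreter:", any(t in supported for t in tags))
```

On a non-aarch64 or non-CPython-3.13 environment the final check prints False, which is the expected behavior for a platform-specific CPU wheel.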
Files changed (1600)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +107 -0
  3. vllm/_aiter_ops.py +1018 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2925 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +434 -0
  16. vllm/attention/backends/registry.py +286 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +975 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +469 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +51 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +136 -0
  37. vllm/attention/selector.py +268 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +117 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +41 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  56. vllm/benchmarks/sweep/serve.py +448 -0
  57. vllm/benchmarks/sweep/serve_sla.py +492 -0
  58. vllm/benchmarks/sweep/server.py +114 -0
  59. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  60. vllm/benchmarks/sweep/utils.py +4 -0
  61. vllm/benchmarks/throughput.py +799 -0
  62. vllm/collect_env.py +857 -0
  63. vllm/compilation/__init__.py +0 -0
  64. vllm/compilation/activation_quant_fusion.py +209 -0
  65. vllm/compilation/backends.py +827 -0
  66. vllm/compilation/base_static_graph.py +57 -0
  67. vllm/compilation/caching.py +180 -0
  68. vllm/compilation/collective_fusion.py +1234 -0
  69. vllm/compilation/compiler_interface.py +639 -0
  70. vllm/compilation/counter.py +48 -0
  71. vllm/compilation/cuda_graph.py +208 -0
  72. vllm/compilation/decorators.py +614 -0
  73. vllm/compilation/fix_functionalization.py +253 -0
  74. vllm/compilation/fusion.py +374 -0
  75. vllm/compilation/fusion_attn.py +359 -0
  76. vllm/compilation/fx_utils.py +91 -0
  77. vllm/compilation/inductor_pass.py +133 -0
  78. vllm/compilation/matcher_utils.py +315 -0
  79. vllm/compilation/monitor.py +62 -0
  80. vllm/compilation/noop_elimination.py +134 -0
  81. vllm/compilation/partition_rules.py +72 -0
  82. vllm/compilation/pass_manager.py +136 -0
  83. vllm/compilation/piecewise_backend.py +121 -0
  84. vllm/compilation/post_cleanup.py +21 -0
  85. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  86. vllm/compilation/sequence_parallelism.py +363 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  88. vllm/compilation/vllm_inductor_pass.py +173 -0
  89. vllm/compilation/wrapper.py +260 -0
  90. vllm/config/__init__.py +102 -0
  91. vllm/config/cache.py +220 -0
  92. vllm/config/compilation.py +1154 -0
  93. vllm/config/device.py +75 -0
  94. vllm/config/ec_transfer.py +110 -0
  95. vllm/config/kv_events.py +56 -0
  96. vllm/config/kv_transfer.py +114 -0
  97. vllm/config/load.py +124 -0
  98. vllm/config/lora.py +96 -0
  99. vllm/config/model.py +2274 -0
  100. vllm/config/multimodal.py +247 -0
  101. vllm/config/observability.py +131 -0
  102. vllm/config/parallel.py +653 -0
  103. vllm/config/pooler.py +124 -0
  104. vllm/config/scheduler.py +297 -0
  105. vllm/config/speculative.py +643 -0
  106. vllm/config/speech_to_text.py +38 -0
  107. vllm/config/structured_outputs.py +94 -0
  108. vllm/config/utils.py +324 -0
  109. vllm/config/vllm.py +1353 -0
  110. vllm/connections.py +189 -0
  111. vllm/device_allocator/__init__.py +0 -0
  112. vllm/device_allocator/cumem.py +327 -0
  113. vllm/distributed/__init__.py +6 -0
  114. vllm/distributed/communication_op.py +43 -0
  115. vllm/distributed/device_communicators/__init__.py +0 -0
  116. vllm/distributed/device_communicators/all2all.py +490 -0
  117. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  118. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  119. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  120. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  121. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  122. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  123. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  124. vllm/distributed/device_communicators/pynccl.py +386 -0
  125. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  126. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  127. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  128. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  129. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  130. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  131. vllm/distributed/device_communicators/symm_mem.py +156 -0
  132. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  133. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  134. vllm/distributed/ec_transfer/__init__.py +14 -0
  135. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  136. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  137. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  138. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  139. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  140. vllm/distributed/eplb/__init__.py +8 -0
  141. vllm/distributed/eplb/async_worker.py +115 -0
  142. vllm/distributed/eplb/eplb_state.py +1154 -0
  143. vllm/distributed/eplb/rebalance_algo.py +260 -0
  144. vllm/distributed/eplb/rebalance_execute.py +532 -0
  145. vllm/distributed/kv_events.py +371 -0
  146. vllm/distributed/kv_transfer/README.md +29 -0
  147. vllm/distributed/kv_transfer/__init__.py +20 -0
  148. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  150. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  151. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  152. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/base.py +575 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2480 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  173. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  174. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  175. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  177. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  178. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1790 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2106 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +188 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  192. vllm/entrypoints/api_server.py +184 -0
  193. vllm/entrypoints/chat_utils.py +1837 -0
  194. vllm/entrypoints/cli/__init__.py +13 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  201. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  202. vllm/entrypoints/cli/collect_env.py +38 -0
  203. vllm/entrypoints/cli/main.py +79 -0
  204. vllm/entrypoints/cli/openai.py +256 -0
  205. vllm/entrypoints/cli/run_batch.py +68 -0
  206. vllm/entrypoints/cli/serve.py +249 -0
  207. vllm/entrypoints/cli/types.py +29 -0
  208. vllm/entrypoints/constants.py +10 -0
  209. vllm/entrypoints/context.py +572 -0
  210. vllm/entrypoints/dynamic_lora.py +57 -0
  211. vllm/entrypoints/harmony_utils.py +535 -0
  212. vllm/entrypoints/launcher.py +175 -0
  213. vllm/entrypoints/llm.py +1762 -0
  214. vllm/entrypoints/logger.py +84 -0
  215. vllm/entrypoints/openai/__init__.py +0 -0
  216. vllm/entrypoints/openai/api_server.py +1891 -0
  217. vllm/entrypoints/openai/cli_args.py +302 -0
  218. vllm/entrypoints/openai/orca_metrics.py +120 -0
  219. vllm/entrypoints/openai/protocol.py +2465 -0
  220. vllm/entrypoints/openai/run_batch.py +631 -0
  221. vllm/entrypoints/openai/serving_chat.py +1782 -0
  222. vllm/entrypoints/openai/serving_completion.py +716 -0
  223. vllm/entrypoints/openai/serving_engine.py +1478 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_responses.py +2032 -0
  226. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  227. vllm/entrypoints/openai/serving_tokens.py +281 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  231. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  232. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  233. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  234. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  235. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  236. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  237. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  238. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  239. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  240. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  241. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +322 -0
  242. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  243. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  244. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +324 -0
  245. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  246. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  247. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  248. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  249. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  250. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  251. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  252. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  253. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  254. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  255. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  256. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  257. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  258. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  259. vllm/entrypoints/openai/utils.py +49 -0
  260. vllm/entrypoints/pooling/__init__.py +16 -0
  261. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  262. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  263. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  264. vllm/entrypoints/pooling/classify/serving.py +237 -0
  265. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  266. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  267. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  268. vllm/entrypoints/pooling/embed/serving.py +697 -0
  269. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  270. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  271. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  272. vllm/entrypoints/pooling/pooling/serving.py +348 -0
  273. vllm/entrypoints/pooling/score/__init__.py +0 -0
  274. vllm/entrypoints/pooling/score/api_router.py +149 -0
  275. vllm/entrypoints/pooling/score/protocol.py +145 -0
  276. vllm/entrypoints/pooling/score/serving.py +505 -0
  277. vllm/entrypoints/renderer.py +409 -0
  278. vllm/entrypoints/responses_utils.py +148 -0
  279. vllm/entrypoints/sagemaker/__init__.py +4 -0
  280. vllm/entrypoints/sagemaker/routes.py +118 -0
  281. vllm/entrypoints/score_utils.py +240 -0
  282. vllm/entrypoints/ssl.py +78 -0
  283. vllm/entrypoints/tool.py +143 -0
  284. vllm/entrypoints/tool_server.py +234 -0
  285. vllm/entrypoints/utils.py +319 -0
  286. vllm/env_override.py +378 -0
  287. vllm/envs.py +1710 -0
  288. vllm/forward_context.py +358 -0
  289. vllm/inputs/__init__.py +44 -0
  290. vllm/inputs/data.py +359 -0
  291. vllm/inputs/parse.py +137 -0
  292. vllm/inputs/preprocess.py +716 -0
  293. vllm/logger.py +298 -0
  294. vllm/logging_utils/__init__.py +13 -0
  295. vllm/logging_utils/dump_input.py +83 -0
  296. vllm/logging_utils/formatter.py +127 -0
  297. vllm/logging_utils/lazy.py +20 -0
  298. vllm/logging_utils/log_time.py +34 -0
  299. vllm/logits_process.py +121 -0
  300. vllm/logprobs.py +206 -0
  301. vllm/lora/__init__.py +0 -0
  302. vllm/lora/layers/__init__.py +42 -0
  303. vllm/lora/layers/base.py +66 -0
  304. vllm/lora/layers/base_linear.py +165 -0
  305. vllm/lora/layers/column_parallel_linear.py +577 -0
  306. vllm/lora/layers/fused_moe.py +747 -0
  307. vllm/lora/layers/logits_processor.py +203 -0
  308. vllm/lora/layers/replicated_linear.py +70 -0
  309. vllm/lora/layers/row_parallel_linear.py +176 -0
  310. vllm/lora/layers/utils.py +74 -0
  311. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  312. vllm/lora/lora_weights.py +227 -0
  313. vllm/lora/models.py +903 -0
  314. vllm/lora/ops/__init__.py +0 -0
  315. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  316. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  317. vllm/lora/ops/torch_ops/__init__.py +20 -0
  318. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  319. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  320. vllm/lora/ops/triton_ops/__init__.py +21 -0
  321. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +661 -0
  322. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  323. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  324. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  325. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  326. vllm/lora/ops/triton_ops/utils.py +295 -0
  327. vllm/lora/ops/xla_ops/__init__.py +6 -0
  328. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  329. vllm/lora/peft_helper.py +128 -0
  330. vllm/lora/punica_wrapper/__init__.py +10 -0
  331. vllm/lora/punica_wrapper/punica_base.py +493 -0
  332. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  333. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  334. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  335. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  336. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  337. vllm/lora/punica_wrapper/utils.py +150 -0
  338. vllm/lora/request.py +100 -0
  339. vllm/lora/resolver.py +88 -0
  340. vllm/lora/utils.py +306 -0
  341. vllm/lora/worker_manager.py +268 -0
  342. vllm/model_executor/__init__.py +11 -0
  343. vllm/model_executor/custom_op.py +194 -0
  344. vllm/model_executor/layers/__init__.py +0 -0
  345. vllm/model_executor/layers/activation.py +595 -0
  346. vllm/model_executor/layers/attention_layer_base.py +32 -0
  347. vllm/model_executor/layers/batch_invariant.py +1058 -0
  348. vllm/model_executor/layers/conv.py +256 -0
  349. vllm/model_executor/layers/fla/__init__.py +8 -0
  350. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  351. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  352. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  353. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  354. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  355. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  356. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  357. vllm/model_executor/layers/fla/ops/index.py +41 -0
  358. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  359. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  360. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  361. vllm/model_executor/layers/fla/ops/op.py +60 -0
  362. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  363. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  364. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  365. vllm/model_executor/layers/fused_moe/__init__.py +110 -0
  366. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  367. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  368. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  369. vllm/model_executor/layers/fused_moe/config.py +938 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  645. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  646. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  647. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  648. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  649. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  650. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  651. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  652. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  653. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  654. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  655. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  656. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +821 -0
  657. vllm/model_executor/layers/fused_moe/fused_moe.py +2172 -0
  658. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +121 -0
  659. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +136 -0
  660. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  661. vllm/model_executor/layers/fused_moe/layer.py +2152 -0
  662. vllm/model_executor/layers/fused_moe/modular_kernel.py +1332 -0
  663. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  664. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  665. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  666. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  667. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  668. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  669. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  670. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  671. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  672. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  673. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  674. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  675. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +559 -0
  676. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  677. vllm/model_executor/layers/kda.py +442 -0
  678. vllm/model_executor/layers/layernorm.py +442 -0
  679. vllm/model_executor/layers/lightning_attn.py +735 -0
  680. vllm/model_executor/layers/linear.py +1424 -0
  681. vllm/model_executor/layers/logits_processor.py +106 -0
  682. vllm/model_executor/layers/mamba/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/abstract.py +68 -0
  684. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  685. vllm/model_executor/layers/mamba/mamba_mixer.py +527 -0
  686. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  687. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  688. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  689. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  690. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  691. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  692. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  693. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  694. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  695. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  696. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  697. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  698. vllm/model_executor/layers/mla.py +176 -0
  699. vllm/model_executor/layers/pooler.py +817 -0
  700. vllm/model_executor/layers/quantization/__init__.py +179 -0
  701. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  702. vllm/model_executor/layers/quantization/awq.py +277 -0
  703. vllm/model_executor/layers/quantization/awq_marlin.py +718 -0
  704. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  705. vllm/model_executor/layers/quantization/base_config.py +170 -0
  706. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  707. vllm/model_executor/layers/quantization/bitsandbytes.py +644 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +963 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2387 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  725. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  726. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  727. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  728. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  729. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  730. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  731. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  732. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  733. vllm/model_executor/layers/quantization/experts_int8.py +225 -0
  734. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  735. vllm/model_executor/layers/quantization/fp8.py +1348 -0
  736. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  737. vllm/model_executor/layers/quantization/gguf.py +687 -0
  738. vllm/model_executor/layers/quantization/gptq.py +393 -0
  739. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  740. vllm/model_executor/layers/quantization/gptq_marlin.py +842 -0
  741. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  742. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  743. vllm/model_executor/layers/quantization/inc.py +65 -0
  744. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  745. vllm/model_executor/layers/quantization/ipex_quant.py +470 -0
  746. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  752. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  753. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  754. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  755. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  756. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  759. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  760. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  761. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  762. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  763. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  764. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  765. vllm/model_executor/layers/quantization/modelopt.py +1637 -0
  766. vllm/model_executor/layers/quantization/moe_wna16.py +528 -0
  767. vllm/model_executor/layers/quantization/mxfp4.py +1175 -0
  768. vllm/model_executor/layers/quantization/petit.py +319 -0
  769. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  770. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  771. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  772. vllm/model_executor/layers/quantization/quark/quark_moe.py +653 -0
  773. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  774. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  775. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  776. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  777. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  778. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  779. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  780. vllm/model_executor/layers/quantization/rtn.py +639 -0
  781. vllm/model_executor/layers/quantization/schema.py +90 -0
  782. vllm/model_executor/layers/quantization/torchao.py +380 -0
  783. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  784. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  785. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  786. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1002. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +333 -0
  1003. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +311 -0
  1004. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  1005. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1006. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  1007. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1008. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1009. vllm/model_executor/layers/quantization/utils/marlin_utils.py +674 -0
  1010. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1011. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1012. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1013. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1014. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +183 -0
  1015. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1016. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1017. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1018. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1019. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1020. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1021. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  1022. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  1023. vllm/model_executor/layers/resampler.py +283 -0
  1024. vllm/model_executor/layers/rotary_embedding/__init__.py +292 -0
  1025. vllm/model_executor/layers/rotary_embedding/base.py +240 -0
  1026. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1027. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1028. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1029. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1030. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1031. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1032. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1033. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1034. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1035. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1036. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1037. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1038. vllm/model_executor/layers/rotary_embedding/xdrope.py +102 -0
  1039. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1040. vllm/model_executor/layers/utils.py +251 -0
  1041. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1042. vllm/model_executor/model_loader/__init__.py +150 -0
  1043. vllm/model_executor/model_loader/base_loader.py +57 -0
  1044. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1045. vllm/model_executor/model_loader/default_loader.py +321 -0
  1046. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1047. vllm/model_executor/model_loader/gguf_loader.py +349 -0
  1048. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1049. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1050. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1051. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1052. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1053. vllm/model_executor/model_loader/tpu.py +118 -0
  1054. vllm/model_executor/model_loader/utils.py +296 -0
  1055. vllm/model_executor/model_loader/weight_utils.py +1147 -0
  1056. vllm/model_executor/models/__init__.py +44 -0
  1057. vllm/model_executor/models/adapters.py +543 -0
  1058. vllm/model_executor/models/afmoe.py +697 -0
  1059. vllm/model_executor/models/aimv2.py +248 -0
  1060. vllm/model_executor/models/apertus.py +569 -0
  1061. vllm/model_executor/models/arcee.py +428 -0
  1062. vllm/model_executor/models/arctic.py +634 -0
  1063. vllm/model_executor/models/aria.py +655 -0
  1064. vllm/model_executor/models/aya_vision.py +450 -0
  1065. vllm/model_executor/models/baichuan.py +494 -0
  1066. vllm/model_executor/models/bailing_moe.py +645 -0
  1067. vllm/model_executor/models/bamba.py +516 -0
  1068. vllm/model_executor/models/bee.py +157 -0
  1069. vllm/model_executor/models/bert.py +925 -0
  1070. vllm/model_executor/models/bert_with_rope.py +732 -0
  1071. vllm/model_executor/models/blip.py +350 -0
  1072. vllm/model_executor/models/blip2.py +695 -0
  1073. vllm/model_executor/models/bloom.py +390 -0
  1074. vllm/model_executor/models/chameleon.py +1098 -0
  1075. vllm/model_executor/models/chatglm.py +499 -0
  1076. vllm/model_executor/models/clip.py +1005 -0
  1077. vllm/model_executor/models/cohere2_vision.py +472 -0
  1078. vllm/model_executor/models/commandr.py +470 -0
  1079. vllm/model_executor/models/config.py +510 -0
  1080. vllm/model_executor/models/dbrx.py +485 -0
  1081. vllm/model_executor/models/deepencoder.py +676 -0
  1082. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1083. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1084. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1085. vllm/model_executor/models/deepseek_v2.py +1715 -0
  1086. vllm/model_executor/models/deepseek_vl2.py +644 -0
  1087. vllm/model_executor/models/dots1.py +566 -0
  1088. vllm/model_executor/models/dots_ocr.py +874 -0
  1089. vllm/model_executor/models/ernie45.py +53 -0
  1090. vllm/model_executor/models/ernie45_moe.py +755 -0
  1091. vllm/model_executor/models/ernie45_vl.py +1710 -0
  1092. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1093. vllm/model_executor/models/ernie_mtp.py +279 -0
  1094. vllm/model_executor/models/exaone.py +525 -0
  1095. vllm/model_executor/models/exaone4.py +517 -0
  1096. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1097. vllm/model_executor/models/falcon.py +544 -0
  1098. vllm/model_executor/models/falcon_h1.py +680 -0
  1099. vllm/model_executor/models/flex_olmo.py +155 -0
  1100. vllm/model_executor/models/fuyu.py +373 -0
  1101. vllm/model_executor/models/gemma.py +426 -0
  1102. vllm/model_executor/models/gemma2.py +436 -0
  1103. vllm/model_executor/models/gemma3.py +577 -0
  1104. vllm/model_executor/models/gemma3_mm.py +665 -0
  1105. vllm/model_executor/models/gemma3n.py +1167 -0
  1106. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1107. vllm/model_executor/models/glm.py +23 -0
  1108. vllm/model_executor/models/glm4.py +298 -0
  1109. vllm/model_executor/models/glm4_1v.py +1854 -0
  1110. vllm/model_executor/models/glm4_moe.py +738 -0
  1111. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1112. vllm/model_executor/models/glm4v.py +785 -0
  1113. vllm/model_executor/models/gpt2.py +397 -0
  1114. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1115. vllm/model_executor/models/gpt_j.py +345 -0
  1116. vllm/model_executor/models/gpt_neox.py +343 -0
  1117. vllm/model_executor/models/gpt_oss.py +745 -0
  1118. vllm/model_executor/models/granite.py +476 -0
  1119. vllm/model_executor/models/granite_speech.py +913 -0
  1120. vllm/model_executor/models/granitemoe.py +561 -0
  1121. vllm/model_executor/models/granitemoehybrid.py +704 -0
  1122. vllm/model_executor/models/granitemoeshared.py +328 -0
  1123. vllm/model_executor/models/gritlm.py +245 -0
  1124. vllm/model_executor/models/grok1.py +555 -0
  1125. vllm/model_executor/models/h2ovl.py +554 -0
  1126. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1127. vllm/model_executor/models/hunyuan_vision.py +1028 -0
  1128. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1129. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1130. vllm/model_executor/models/idefics3.py +718 -0
  1131. vllm/model_executor/models/interfaces.py +1148 -0
  1132. vllm/model_executor/models/interfaces_base.py +243 -0
  1133. vllm/model_executor/models/intern_vit.py +454 -0
  1134. vllm/model_executor/models/internlm2.py +454 -0
  1135. vllm/model_executor/models/internlm2_ve.py +139 -0
  1136. vllm/model_executor/models/interns1.py +830 -0
  1137. vllm/model_executor/models/interns1_vit.py +433 -0
  1138. vllm/model_executor/models/internvl.py +1452 -0
  1139. vllm/model_executor/models/jais.py +397 -0
  1140. vllm/model_executor/models/jamba.py +609 -0
  1141. vllm/model_executor/models/jina_vl.py +147 -0
  1142. vllm/model_executor/models/keye.py +1765 -0
  1143. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1144. vllm/model_executor/models/kimi_linear.py +658 -0
  1145. vllm/model_executor/models/kimi_vl.py +578 -0
  1146. vllm/model_executor/models/lfm2.py +516 -0
  1147. vllm/model_executor/models/lfm2_moe.py +746 -0
  1148. vllm/model_executor/models/lightonocr.py +195 -0
  1149. vllm/model_executor/models/llama.py +704 -0
  1150. vllm/model_executor/models/llama4.py +857 -0
  1151. vllm/model_executor/models/llama4_eagle.py +216 -0
  1152. vllm/model_executor/models/llama_eagle.py +213 -0
  1153. vllm/model_executor/models/llama_eagle3.py +375 -0
  1154. vllm/model_executor/models/llava.py +842 -0
  1155. vllm/model_executor/models/llava_next.py +583 -0
  1156. vllm/model_executor/models/llava_next_video.py +467 -0
  1157. vllm/model_executor/models/llava_onevision.py +923 -0
  1158. vllm/model_executor/models/longcat_flash.py +743 -0
  1159. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1160. vllm/model_executor/models/mamba.py +276 -0
  1161. vllm/model_executor/models/mamba2.py +288 -0
  1162. vllm/model_executor/models/medusa.py +179 -0
  1163. vllm/model_executor/models/midashenglm.py +828 -0
  1164. vllm/model_executor/models/mimo.py +188 -0
  1165. vllm/model_executor/models/mimo_mtp.py +294 -0
  1166. vllm/model_executor/models/minicpm.py +657 -0
  1167. vllm/model_executor/models/minicpm3.py +234 -0
  1168. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1169. vllm/model_executor/models/minicpmo.py +768 -0
  1170. vllm/model_executor/models/minicpmv.py +1744 -0
  1171. vllm/model_executor/models/minimax_m2.py +546 -0
  1172. vllm/model_executor/models/minimax_text_01.py +1010 -0
  1173. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1174. vllm/model_executor/models/mistral3.py +637 -0
  1175. vllm/model_executor/models/mistral_large_3.py +63 -0
  1176. vllm/model_executor/models/mistral_large_3_eagle.py +165 -0
  1177. vllm/model_executor/models/mixtral.py +599 -0
  1178. vllm/model_executor/models/mllama4.py +1151 -0
  1179. vllm/model_executor/models/mlp_speculator.py +235 -0
  1180. vllm/model_executor/models/modernbert.py +452 -0
  1181. vllm/model_executor/models/module_mapping.py +74 -0
  1182. vllm/model_executor/models/molmo.py +1553 -0
  1183. vllm/model_executor/models/moonvit.py +686 -0
  1184. vllm/model_executor/models/mpt.py +335 -0
  1185. vllm/model_executor/models/nano_nemotron_vl.py +1732 -0
  1186. vllm/model_executor/models/nemotron.py +502 -0
  1187. vllm/model_executor/models/nemotron_h.py +850 -0
  1188. vllm/model_executor/models/nemotron_nas.py +473 -0
  1189. vllm/model_executor/models/nemotron_vl.py +653 -0
  1190. vllm/model_executor/models/nvlm_d.py +216 -0
  1191. vllm/model_executor/models/olmo.py +413 -0
  1192. vllm/model_executor/models/olmo2.py +455 -0
  1193. vllm/model_executor/models/olmoe.py +494 -0
  1194. vllm/model_executor/models/opencua.py +271 -0
  1195. vllm/model_executor/models/openpangu.py +1051 -0
  1196. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1197. vllm/model_executor/models/opt.py +426 -0
  1198. vllm/model_executor/models/orion.py +366 -0
  1199. vllm/model_executor/models/ouro.py +508 -0
  1200. vllm/model_executor/models/ovis.py +559 -0
  1201. vllm/model_executor/models/ovis2_5.py +673 -0
  1202. vllm/model_executor/models/paddleocr_vl.py +1380 -0
  1203. vllm/model_executor/models/paligemma.py +412 -0
  1204. vllm/model_executor/models/persimmon.py +376 -0
  1205. vllm/model_executor/models/phi.py +370 -0
  1206. vllm/model_executor/models/phi3.py +18 -0
  1207. vllm/model_executor/models/phi3v.py +737 -0
  1208. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1209. vllm/model_executor/models/phi4mm.py +1253 -0
  1210. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1211. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1212. vllm/model_executor/models/phimoe.py +670 -0
  1213. vllm/model_executor/models/pixtral.py +1380 -0
  1214. vllm/model_executor/models/plamo2.py +966 -0
  1215. vllm/model_executor/models/plamo3.py +441 -0
  1216. vllm/model_executor/models/qwen.py +363 -0
  1217. vllm/model_executor/models/qwen2.py +569 -0
  1218. vllm/model_executor/models/qwen2_5_omni_thinker.py +1220 -0
  1219. vllm/model_executor/models/qwen2_5_vl.py +1594 -0
  1220. vllm/model_executor/models/qwen2_audio.py +473 -0
  1221. vllm/model_executor/models/qwen2_moe.py +590 -0
  1222. vllm/model_executor/models/qwen2_rm.py +123 -0
  1223. vllm/model_executor/models/qwen2_vl.py +1593 -0
  1224. vllm/model_executor/models/qwen3.py +332 -0
  1225. vllm/model_executor/models/qwen3_moe.py +738 -0
  1226. vllm/model_executor/models/qwen3_next.py +1390 -0
  1227. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1228. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1765 -0
  1229. vllm/model_executor/models/qwen3_vl.py +1686 -0
  1230. vllm/model_executor/models/qwen3_vl_moe.py +470 -0
  1231. vllm/model_executor/models/qwen_vl.py +803 -0
  1232. vllm/model_executor/models/radio.py +555 -0
  1233. vllm/model_executor/models/registry.py +1183 -0
  1234. vllm/model_executor/models/roberta.py +259 -0
  1235. vllm/model_executor/models/rvl.py +107 -0
  1236. vllm/model_executor/models/seed_oss.py +493 -0
  1237. vllm/model_executor/models/siglip.py +1245 -0
  1238. vllm/model_executor/models/siglip2navit.py +723 -0
  1239. vllm/model_executor/models/skyworkr1v.py +953 -0
  1240. vllm/model_executor/models/smolvlm.py +38 -0
  1241. vllm/model_executor/models/solar.py +485 -0
  1242. vllm/model_executor/models/stablelm.py +359 -0
  1243. vllm/model_executor/models/starcoder2.py +366 -0
  1244. vllm/model_executor/models/step3_text.py +555 -0
  1245. vllm/model_executor/models/step3_vl.py +1149 -0
  1246. vllm/model_executor/models/swin.py +514 -0
  1247. vllm/model_executor/models/tarsier.py +619 -0
  1248. vllm/model_executor/models/telechat2.py +153 -0
  1249. vllm/model_executor/models/teleflm.py +78 -0
  1250. vllm/model_executor/models/terratorch.py +319 -0
  1251. vllm/model_executor/models/transformers/__init__.py +127 -0
  1252. vllm/model_executor/models/transformers/base.py +464 -0
  1253. vllm/model_executor/models/transformers/causal.py +65 -0
  1254. vllm/model_executor/models/transformers/legacy.py +90 -0
  1255. vllm/model_executor/models/transformers/moe.py +325 -0
  1256. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1257. vllm/model_executor/models/transformers/pooling.py +119 -0
  1258. vllm/model_executor/models/transformers/utils.py +213 -0
  1259. vllm/model_executor/models/ultravox.py +686 -0
  1260. vllm/model_executor/models/utils.py +832 -0
  1261. vllm/model_executor/models/vision.py +552 -0
  1262. vllm/model_executor/models/voxtral.py +842 -0
  1263. vllm/model_executor/models/whisper.py +963 -0
  1264. vllm/model_executor/models/zamba2.py +980 -0
  1265. vllm/model_executor/parameter.py +642 -0
  1266. vllm/model_executor/utils.py +94 -0
  1267. vllm/model_executor/warmup/__init__.py +0 -0
  1268. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1269. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1270. vllm/multimodal/__init__.py +40 -0
  1271. vllm/multimodal/audio.py +142 -0
  1272. vllm/multimodal/base.py +26 -0
  1273. vllm/multimodal/cache.py +830 -0
  1274. vllm/multimodal/evs.py +294 -0
  1275. vllm/multimodal/hasher.py +106 -0
  1276. vllm/multimodal/image.py +130 -0
  1277. vllm/multimodal/inputs.py +1036 -0
  1278. vllm/multimodal/parse.py +544 -0
  1279. vllm/multimodal/processing.py +2240 -0
  1280. vllm/multimodal/profiling.py +369 -0
  1281. vllm/multimodal/registry.py +357 -0
  1282. vllm/multimodal/utils.py +523 -0
  1283. vllm/multimodal/video.py +333 -0
  1284. vllm/outputs.py +345 -0
  1285. vllm/platforms/__init__.py +277 -0
  1286. vllm/platforms/cpu.py +410 -0
  1287. vllm/platforms/cuda.py +642 -0
  1288. vllm/platforms/interface.py +656 -0
  1289. vllm/platforms/rocm.py +513 -0
  1290. vllm/platforms/tpu.py +275 -0
  1291. vllm/platforms/xpu.py +261 -0
  1292. vllm/plugins/__init__.py +81 -0
  1293. vllm/plugins/io_processors/__init__.py +68 -0
  1294. vllm/plugins/io_processors/interface.py +77 -0
  1295. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1296. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1297. vllm/pooling_params.py +230 -0
  1298. vllm/profiler/__init__.py +0 -0
  1299. vllm/profiler/gpu_profiler.py +216 -0
  1300. vllm/profiler/layerwise_profile.py +392 -0
  1301. vllm/profiler/utils.py +151 -0
  1302. vllm/py.typed +2 -0
  1303. vllm/ray/__init__.py +0 -0
  1304. vllm/ray/lazy_utils.py +30 -0
  1305. vllm/ray/ray_env.py +79 -0
  1306. vllm/reasoning/__init__.py +92 -0
  1307. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1308. vllm/reasoning/basic_parsers.py +162 -0
  1309. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1310. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1311. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1312. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1313. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1314. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1315. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1316. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1317. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1318. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1319. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1320. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1321. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1322. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1323. vllm/sampling_params.py +597 -0
  1324. vllm/scalar_type.py +355 -0
  1325. vllm/scripts.py +17 -0
  1326. vllm/sequence.py +98 -0
  1327. vllm/tasks.py +13 -0
  1328. vllm/third_party/__init__.py +0 -0
  1329. vllm/third_party/pynvml.py +6140 -0
  1330. vllm/tokenizers/__init__.py +24 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +124 -0
  1333. vllm/tokenizers/mistral.py +554 -0
  1334. vllm/tokenizers/protocol.py +111 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tracing.py +135 -0
  1337. vllm/transformers_utils/__init__.py +26 -0
  1338. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1339. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1340. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1341. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1342. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1343. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1344. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1345. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1346. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1347. vllm/transformers_utils/config.py +1081 -0
  1348. vllm/transformers_utils/config_parser_base.py +20 -0
  1349. vllm/transformers_utils/configs/__init__.py +84 -0
  1350. vllm/transformers_utils/configs/afmoe.py +87 -0
  1351. vllm/transformers_utils/configs/arctic.py +216 -0
  1352. vllm/transformers_utils/configs/chatglm.py +75 -0
  1353. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1354. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1355. vllm/transformers_utils/configs/eagle.py +90 -0
  1356. vllm/transformers_utils/configs/falcon.py +89 -0
  1357. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1358. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1359. vllm/transformers_utils/configs/jais.py +243 -0
  1360. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1361. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1362. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1363. vllm/transformers_utils/configs/medusa.py +65 -0
  1364. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1365. vllm/transformers_utils/configs/mistral.py +235 -0
  1366. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1367. vllm/transformers_utils/configs/moonvit.py +33 -0
  1368. vllm/transformers_utils/configs/nemotron.py +214 -0
  1369. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1370. vllm/transformers_utils/configs/olmo3.py +83 -0
  1371. vllm/transformers_utils/configs/ovis.py +182 -0
  1372. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1373. vllm/transformers_utils/configs/radio.py +89 -0
  1374. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1375. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1376. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1377. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1378. vllm/transformers_utils/configs/ultravox.py +118 -0
  1379. vllm/transformers_utils/dynamic_module.py +59 -0
  1380. vllm/transformers_utils/gguf_utils.py +209 -0
  1381. vllm/transformers_utils/processor.py +423 -0
  1382. vllm/transformers_utils/processors/__init__.py +23 -0
  1383. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1384. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1385. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1386. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1387. vllm/transformers_utils/processors/ovis.py +453 -0
  1388. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1389. vllm/transformers_utils/repo_utils.py +287 -0
  1390. vllm/transformers_utils/runai_utils.py +104 -0
  1391. vllm/transformers_utils/s3_utils.py +95 -0
  1392. vllm/transformers_utils/tokenizer.py +127 -0
  1393. vllm/transformers_utils/tokenizer_base.py +33 -0
  1394. vllm/transformers_utils/utils.py +184 -0
  1395. vllm/triton_utils/__init__.py +20 -0
  1396. vllm/triton_utils/importing.py +103 -0
  1397. vllm/usage/__init__.py +0 -0
  1398. vllm/usage/usage_lib.py +294 -0
  1399. vllm/utils/__init__.py +66 -0
  1400. vllm/utils/argparse_utils.py +504 -0
  1401. vllm/utils/async_utils.py +310 -0
  1402. vllm/utils/cache.py +214 -0
  1403. vllm/utils/collection_utils.py +112 -0
  1404. vllm/utils/counter.py +45 -0
  1405. vllm/utils/deep_gemm.py +399 -0
  1406. vllm/utils/flashinfer.py +532 -0
  1407. vllm/utils/func_utils.py +236 -0
  1408. vllm/utils/gc_utils.py +151 -0
  1409. vllm/utils/hashing.py +81 -0
  1410. vllm/utils/import_utils.py +449 -0
  1411. vllm/utils/jsontree.py +158 -0
  1412. vllm/utils/math_utils.py +32 -0
  1413. vllm/utils/mem_constants.py +13 -0
  1414. vllm/utils/mem_utils.py +232 -0
  1415. vllm/utils/nccl.py +64 -0
  1416. vllm/utils/network_utils.py +331 -0
  1417. vllm/utils/platform_utils.py +59 -0
  1418. vllm/utils/profiling.py +56 -0
  1419. vllm/utils/registry.py +51 -0
  1420. vllm/utils/serial_utils.py +169 -0
  1421. vllm/utils/system_utils.py +265 -0
  1422. vllm/utils/tensor_schema.py +255 -0
  1423. vllm/utils/torch_utils.py +647 -0
  1424. vllm/v1/__init__.py +0 -0
  1425. vllm/v1/attention/__init__.py +0 -0
  1426. vllm/v1/attention/backends/__init__.py +0 -0
  1427. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1428. vllm/v1/attention/backends/flash_attn.py +1050 -0
  1429. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1430. vllm/v1/attention/backends/flex_attention.py +945 -0
  1431. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1432. vllm/v1/attention/backends/linear_attn.py +77 -0
  1433. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1434. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1435. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1436. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1437. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1438. vllm/v1/attention/backends/mla/common.py +2069 -0
  1439. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1440. vllm/v1/attention/backends/mla/flashattn_mla.py +340 -0
  1441. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1442. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1443. vllm/v1/attention/backends/mla/flashmla_sparse.py +551 -0
  1444. vllm/v1/attention/backends/mla/indexer.py +369 -0
  1445. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1446. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1447. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1448. vllm/v1/attention/backends/pallas.py +436 -0
  1449. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1450. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1451. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1452. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1453. vllm/v1/attention/backends/tree_attn.py +428 -0
  1454. vllm/v1/attention/backends/triton_attn.py +377 -0
  1455. vllm/v1/attention/backends/utils.py +1149 -0
  1456. vllm/v1/core/__init__.py +0 -0
  1457. vllm/v1/core/block_pool.py +466 -0
  1458. vllm/v1/core/encoder_cache_manager.py +343 -0
  1459. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1460. vllm/v1/core/kv_cache_manager.py +408 -0
  1461. vllm/v1/core/kv_cache_metrics.py +96 -0
  1462. vllm/v1/core/kv_cache_utils.py +1471 -0
  1463. vllm/v1/core/sched/__init__.py +0 -0
  1464. vllm/v1/core/sched/async_scheduler.py +68 -0
  1465. vllm/v1/core/sched/interface.py +187 -0
  1466. vllm/v1/core/sched/output.py +230 -0
  1467. vllm/v1/core/sched/request_queue.py +217 -0
  1468. vllm/v1/core/sched/scheduler.py +1726 -0
  1469. vllm/v1/core/sched/utils.py +72 -0
  1470. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1471. vllm/v1/cudagraph_dispatcher.py +183 -0
  1472. vllm/v1/engine/__init__.py +214 -0
  1473. vllm/v1/engine/async_llm.py +874 -0
  1474. vllm/v1/engine/coordinator.py +377 -0
  1475. vllm/v1/engine/core.py +1421 -0
  1476. vllm/v1/engine/core_client.py +1406 -0
  1477. vllm/v1/engine/detokenizer.py +351 -0
  1478. vllm/v1/engine/exceptions.py +18 -0
  1479. vllm/v1/engine/input_processor.py +636 -0
  1480. vllm/v1/engine/llm_engine.py +416 -0
  1481. vllm/v1/engine/logprobs.py +189 -0
  1482. vllm/v1/engine/output_processor.py +658 -0
  1483. vllm/v1/engine/parallel_sampling.py +145 -0
  1484. vllm/v1/engine/processor.py +20 -0
  1485. vllm/v1/engine/utils.py +1068 -0
  1486. vllm/v1/executor/__init__.py +6 -0
  1487. vllm/v1/executor/abstract.py +352 -0
  1488. vllm/v1/executor/multiproc_executor.py +888 -0
  1489. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1490. vllm/v1/executor/ray_executor.py +626 -0
  1491. vllm/v1/executor/ray_utils.py +465 -0
  1492. vllm/v1/executor/uniproc_executor.py +183 -0
  1493. vllm/v1/kv_cache_interface.py +404 -0
  1494. vllm/v1/kv_offload/__init__.py +0 -0
  1495. vllm/v1/kv_offload/abstract.py +161 -0
  1496. vllm/v1/kv_offload/arc_manager.py +237 -0
  1497. vllm/v1/kv_offload/backend.py +97 -0
  1498. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1499. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1500. vllm/v1/kv_offload/cpu.py +86 -0
  1501. vllm/v1/kv_offload/factory.py +56 -0
  1502. vllm/v1/kv_offload/lru_manager.py +139 -0
  1503. vllm/v1/kv_offload/mediums.py +39 -0
  1504. vllm/v1/kv_offload/spec.py +66 -0
  1505. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1506. vllm/v1/kv_offload/worker/cpu_gpu.py +191 -0
  1507. vllm/v1/kv_offload/worker/worker.py +144 -0
  1508. vllm/v1/metrics/__init__.py +0 -0
  1509. vllm/v1/metrics/loggers.py +1268 -0
  1510. vllm/v1/metrics/prometheus.py +82 -0
  1511. vllm/v1/metrics/ray_wrappers.py +194 -0
  1512. vllm/v1/metrics/reader.py +257 -0
  1513. vllm/v1/metrics/stats.py +431 -0
  1514. vllm/v1/outputs.py +237 -0
  1515. vllm/v1/pool/__init__.py +0 -0
  1516. vllm/v1/pool/metadata.py +82 -0
  1517. vllm/v1/request.py +280 -0
  1518. vllm/v1/sample/__init__.py +0 -0
  1519. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1520. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1521. vllm/v1/sample/logits_processor/interface.py +106 -0
  1522. vllm/v1/sample/logits_processor/state.py +165 -0
  1523. vllm/v1/sample/metadata.py +44 -0
  1524. vllm/v1/sample/ops/__init__.py +0 -0
  1525. vllm/v1/sample/ops/bad_words.py +52 -0
  1526. vllm/v1/sample/ops/logprobs.py +25 -0
  1527. vllm/v1/sample/ops/penalties.py +57 -0
  1528. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1529. vllm/v1/sample/rejection_sampler.py +805 -0
  1530. vllm/v1/sample/sampler.py +319 -0
  1531. vllm/v1/sample/tpu/__init__.py +0 -0
  1532. vllm/v1/sample/tpu/metadata.py +120 -0
  1533. vllm/v1/sample/tpu/sampler.py +215 -0
  1534. vllm/v1/serial_utils.py +532 -0
  1535. vllm/v1/spec_decode/__init__.py +0 -0
  1536. vllm/v1/spec_decode/eagle.py +1325 -0
  1537. vllm/v1/spec_decode/medusa.py +73 -0
  1538. vllm/v1/spec_decode/metadata.py +66 -0
  1539. vllm/v1/spec_decode/metrics.py +225 -0
  1540. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1541. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1542. vllm/v1/spec_decode/utils.py +121 -0
  1543. vllm/v1/structured_output/__init__.py +338 -0
  1544. vllm/v1/structured_output/backend_guidance.py +265 -0
  1545. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1546. vllm/v1/structured_output/backend_outlines.py +324 -0
  1547. vllm/v1/structured_output/backend_types.py +136 -0
  1548. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1549. vllm/v1/structured_output/request.py +94 -0
  1550. vllm/v1/structured_output/utils.py +469 -0
  1551. vllm/v1/utils.py +414 -0
  1552. vllm/v1/worker/__init__.py +0 -0
  1553. vllm/v1/worker/block_table.py +343 -0
  1554. vllm/v1/worker/cpu_model_runner.py +122 -0
  1555. vllm/v1/worker/cpu_worker.py +210 -0
  1556. vllm/v1/worker/dp_utils.py +250 -0
  1557. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1558. vllm/v1/worker/gpu/README.md +4 -0
  1559. vllm/v1/worker/gpu/__init__.py +0 -0
  1560. vllm/v1/worker/gpu/async_utils.py +97 -0
  1561. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1562. vllm/v1/worker/gpu/block_table.py +314 -0
  1563. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1564. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1565. vllm/v1/worker/gpu/input_batch.py +430 -0
  1566. vllm/v1/worker/gpu/model_runner.py +1007 -0
  1567. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1568. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1569. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1570. vllm/v1/worker/gpu/sample/metadata.py +179 -0
  1571. vllm/v1/worker/gpu/sample/penalties.py +154 -0
  1572. vllm/v1/worker/gpu/sample/sampler.py +75 -0
  1573. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1574. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1575. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1576. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +83 -0
  1577. vllm/v1/worker/gpu/states.py +309 -0
  1578. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1579. vllm/v1/worker/gpu_input_batch.py +971 -0
  1580. vllm/v1/worker/gpu_model_runner.py +5360 -0
  1581. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1582. vllm/v1/worker/gpu_worker.py +922 -0
  1583. vllm/v1/worker/kv_connector_model_runner_mixin.py +309 -0
  1584. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1585. vllm/v1/worker/tpu_input_batch.py +583 -0
  1586. vllm/v1/worker/tpu_model_runner.py +2196 -0
  1587. vllm/v1/worker/tpu_worker.py +351 -0
  1588. vllm/v1/worker/ubatch_utils.py +73 -0
  1589. vllm/v1/worker/ubatching.py +231 -0
  1590. vllm/v1/worker/utils.py +365 -0
  1591. vllm/v1/worker/worker_base.py +377 -0
  1592. vllm/v1/worker/xpu_model_runner.py +48 -0
  1593. vllm/v1/worker/xpu_worker.py +198 -0
  1594. vllm/version.py +39 -0
  1595. vllm/vllm_flash_attn/.gitkeep +0 -0
  1596. vllm_cpu-0.12.0.dist-info/METADATA +300 -0
  1597. vllm_cpu-0.12.0.dist-info/RECORD +1600 -0
  1598. vllm_cpu-0.12.0.dist-info/WHEEL +5 -0
  1599. vllm_cpu-0.12.0.dist-info/entry_points.txt +5 -0
  1600. vllm_cpu-0.12.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2106 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import argparse
5
+ import copy
6
+ import dataclasses
7
+ import functools
8
+ import json
9
+ import sys
10
+ from collections.abc import Callable
11
+ from dataclasses import MISSING, dataclass, fields, is_dataclass
12
+ from itertools import permutations
13
+ from types import UnionType
14
+ from typing import (
15
+ TYPE_CHECKING,
16
+ Annotated,
17
+ Any,
18
+ Literal,
19
+ TypeAlias,
20
+ TypeVar,
21
+ Union,
22
+ cast,
23
+ get_args,
24
+ get_origin,
25
+ )
26
+
27
+ import huggingface_hub
28
+ import regex as re
29
+ import torch
30
+ from pydantic import TypeAdapter, ValidationError
31
+ from pydantic.fields import FieldInfo
32
+ from typing_extensions import TypeIs
33
+
34
+ import vllm.envs as envs
35
+ from vllm.attention.backends.registry import AttentionBackendEnum
36
+ from vllm.config import (
37
+ CacheConfig,
38
+ CompilationConfig,
39
+ ConfigType,
40
+ DeviceConfig,
41
+ ECTransferConfig,
42
+ EPLBConfig,
43
+ KVEventsConfig,
44
+ KVTransferConfig,
45
+ LoadConfig,
46
+ LoRAConfig,
47
+ ModelConfig,
48
+ MultiModalConfig,
49
+ ObservabilityConfig,
50
+ ParallelConfig,
51
+ PoolerConfig,
52
+ SchedulerConfig,
53
+ SpeculativeConfig,
54
+ StructuredOutputsConfig,
55
+ VllmConfig,
56
+ get_attr_docs,
57
+ )
58
+ from vllm.config.cache import (
59
+ BlockSize,
60
+ CacheDType,
61
+ KVOffloadingBackend,
62
+ MambaDType,
63
+ PrefixCachingHashAlgo,
64
+ )
65
+ from vllm.config.device import Device
66
+ from vllm.config.model import (
67
+ ConvertOption,
68
+ HfOverrides,
69
+ LogprobsMode,
70
+ ModelDType,
71
+ RunnerOption,
72
+ TaskOption,
73
+ TokenizerMode,
74
+ )
75
+ from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
76
+ from vllm.config.observability import DetailedTraceModules
77
+ from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
78
+ from vllm.config.scheduler import SchedulerPolicy
79
+ from vllm.config.utils import get_field
80
+ from vllm.config.vllm import OptimizationLevel
81
+ from vllm.logger import init_logger, suppress_logging
82
+ from vllm.platforms import CpuArchEnum, current_platform
83
+ from vllm.plugins import load_general_plugins
84
+ from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
85
+ from vllm.transformers_utils.config import (
86
+ is_interleaved,
87
+ maybe_override_with_speculators,
88
+ )
89
+ from vllm.transformers_utils.repo_utils import get_model_path
90
+ from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
91
+ from vllm.utils.argparse_utils import FlexibleArgumentParser
92
+ from vllm.utils.mem_constants import GiB_bytes
93
+ from vllm.utils.network_utils import get_ip
94
+ from vllm.v1.sample.logits_processor import LogitsProcessor
95
+
96
+ if TYPE_CHECKING:
97
+ from vllm.model_executor.layers.quantization import QuantizationMethods
98
+ from vllm.model_executor.model_loader import LoadFormats
99
+ from vllm.usage.usage_lib import UsageContext
100
+ from vllm.v1.executor import Executor
101
+ else:
102
+ Executor = Any
103
+ QuantizationMethods = Any
104
+ LoadFormats = Any
105
+ UsageContext = Any
106
+
107
+ logger = init_logger(__name__)
108
+
109
+ # object is used to allow for special typing forms
110
+ T = TypeVar("T")
111
+ TypeHint: TypeAlias = type[Any] | object
112
+ TypeHintT: TypeAlias = type[T] | object
113
+
114
+
115
+ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
116
+ def _parse_type(val: str) -> T:
117
+ try:
118
+ return return_type(val)
119
+ except ValueError as e:
120
+ raise argparse.ArgumentTypeError(
121
+ f"Value {val} cannot be converted to {return_type}."
122
+ ) from e
123
+
124
+ return _parse_type
125
+
126
+
127
+ def optional_type(return_type: Callable[[str], T]) -> Callable[[str], T | None]:
128
+ def _optional_type(val: str) -> T | None:
129
+ if val == "" or val == "None":
130
+ return None
131
+ return parse_type(return_type)(val)
132
+
133
+ return _optional_type
134
+
135
+
136
+ def union_dict_and_str(val: str) -> str | dict[str, str] | None:
137
+ if not re.match(r"(?s)^\s*{.*}\s*$", val):
138
+ return str(val)
139
+ return optional_type(json.loads)(val)
140
+
141
+
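Illustrative aside, not part of the package contents: a standalone sketch of how these two wrappers treat raw CLI strings. The helpers are re-implemented inline (slightly simplified; the real union_dict_and_str uses a regex that also requires a closing brace) so the snippet runs without vLLM installed, and the example values are arbitrary.

    import json

    def _optional_int(val: str) -> int | None:
        # mirrors optional_type(int): "" and "None" both mean "unset"
        return None if val in ("", "None") else int(val)

    def _dict_or_str(val: str) -> str | dict:
        # simplified stand-in for union_dict_and_str: JSON objects become dicts,
        # anything else is passed through as a plain string
        return json.loads(val) if val.strip().startswith("{") else val

    assert _optional_int("None") is None and _optional_int("8192") == 8192
    assert _dict_or_str('{"video": {"num_frames": 32}}') == {"video": {"num_frames": 32}}
    assert _dict_or_str("facebook/opt-125m") == "facebook/opt-125m"
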
142
+ def is_type(type_hint: TypeHint, type: TypeHintT) -> TypeIs[TypeHintT]:
143
+ """Check if the type hint is a specific type."""
144
+ return type_hint is type or get_origin(type_hint) is type
145
+
146
+
147
+ def contains_type(type_hints: set[TypeHint], type: TypeHintT) -> bool:
148
+ """Check if the type hints contain a specific type."""
149
+ return any(is_type(type_hint, type) for type_hint in type_hints)
150
+
151
+
152
+ def get_type(type_hints: set[TypeHint], type: TypeHintT) -> TypeHintT:
153
+ """Get the specific type from the type hints."""
154
+ return next((th for th in type_hints if is_type(th, type)), None)
155
+
156
+
157
+ def literal_to_kwargs(type_hints: set[TypeHint]) -> dict[str, Any]:
158
+ """Get the `type` and `choices` from a `Literal` type hint in `type_hints`.
159
+
160
+ If `type_hints` also contains `str`, we use `metavar` instead of `choices`.
161
+ """
162
+ type_hint = get_type(type_hints, Literal)
163
+ options = get_args(type_hint)
164
+ option_type = type(options[0])
165
+ if not all(isinstance(option, option_type) for option in options):
166
+ raise ValueError(
167
+ "All options must be of the same type. "
168
+ f"Got {options} with types {[type(c) for c in options]}"
169
+ )
170
+ kwarg = "metavar" if contains_type(type_hints, str) else "choices"
171
+ return {"type": option_type, kwarg: sorted(options)}
172
+
173
+
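Illustrative aside, not part of the package contents: what literal_to_kwargs produces for a typical Literal-typed field, reproduced with the stock typing helpers so it runs standalone. The Literal alias below is hypothetical, chosen only for illustration.

    from typing import Literal, get_args

    ExampleChoice = Literal["cpu", "cuda", "tpu"]   # hypothetical field type
    options = get_args(ExampleChoice)               # ("cpu", "cuda", "tpu")
    kwargs = {"type": type(options[0]), "choices": sorted(options)}
    assert kwargs == {"type": str, "choices": ["cpu", "cuda", "tpu"]}
    # If the field's union also admits an arbitrary `str`, the helper swaps
    # "choices" for "metavar" so free-form values are still accepted.
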
174
+ def collection_to_kwargs(type_hints: set[TypeHint], type: TypeHint) -> dict[str, Any]:
175
+ type_hint = get_type(type_hints, type)
176
+ types = get_args(type_hint)
177
+ elem_type = types[0]
178
+
179
+ # Handle Ellipsis
180
+ assert all(t is elem_type for t in types if t is not Ellipsis), (
181
+ f"All non-Ellipsis elements must be of the same type. Got {types}."
182
+ )
183
+
184
+ # Handle Union types
185
+ if get_origin(elem_type) in {Union, UnionType}:
186
+ # Union for Union[X, Y] and UnionType for X | Y
187
+ assert str in get_args(elem_type), (
188
+ "If element can have multiple types, one must be 'str' "
189
+ f"(i.e. 'list[int | str]'). Got {elem_type}."
190
+ )
191
+ elem_type = str
192
+
193
+ return {
194
+ "type": elem_type,
195
+ "nargs": "+" if type is not tuple or Ellipsis in types else len(types),
196
+ }
197
+
198
+
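Illustrative aside, not part of the package contents: the nargs rule above, checked directly against the stock typing helpers it relies on.

    from typing import get_args

    variadic = get_args(tuple[int, ...])       # (int, Ellipsis)
    fixed = get_args(tuple[int, int, int])     # (int, int, int)
    # A tuple containing Ellipsis (and any list/set) accepts a variable count ("+");
    # a fixed-arity tuple pins nargs to its length.
    assert ("+" if Ellipsis in variadic else len(variadic)) == "+"
    assert ("+" if Ellipsis in fixed else len(fixed)) == 3
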
199
+ def is_not_builtin(type_hint: TypeHint) -> bool:
200
+ """Check if the class is not a built-in type."""
201
+ return type_hint.__module__ != "builtins"
202
+
203
+
204
+ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
205
+ """Extract type hints from Annotated or Union type hints."""
206
+ type_hints: set[TypeHint] = set()
207
+ origin = get_origin(type_hint)
208
+ args = get_args(type_hint)
209
+
210
+ if origin is Annotated:
211
+ type_hints.update(get_type_hints(args[0]))
212
+ elif origin in {Union, UnionType}:
213
+ # Union for Union[X, Y] and UnionType for X | Y
214
+ for arg in args:
215
+ type_hints.update(get_type_hints(arg))
216
+ else:
217
+ type_hints.add(type_hint)
218
+
219
+ return type_hints
220
+
221
+
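Illustrative aside, not part of the package contents: the shapes get_type_hints has to unwrap, verified with the standard typing introspection functions it builds on.

    from types import UnionType
    from typing import Annotated, Union, get_args, get_origin

    hint = Annotated[int | None, "field docstring"]
    assert get_origin(hint) is Annotated            # Annotated unwraps to args[0]
    inner = get_args(hint)[0]                       # int | None
    assert get_origin(inner) in {Union, UnionType}  # X | Y vs Union[X, Y]
    assert set(get_args(inner)) == {int, type(None)}
    # get_type_hints(hint) would therefore flatten this hint to {int, NoneType}.
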
222
+ def is_online_quantization(quantization: Any) -> bool:
223
+ return quantization in ["inc"]
224
+
225
+
226
+ NEEDS_HELP = (
227
+ any("--help" in arg for arg in sys.argv) # vllm SUBCOMMAND --help
228
+ or (argv0 := sys.argv[0]).endswith("mkdocs") # mkdocs SUBCOMMAND
229
+ or argv0.endswith("mkdocs/__main__.py") # python -m mkdocs SUBCOMMAND
230
+ )
231
+
232
+
233
+ @functools.lru_cache(maxsize=30)
234
+ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
235
+ # Save time only getting attr docs if we're generating help text
236
+ cls_docs = get_attr_docs(cls) if NEEDS_HELP else {}
237
+ kwargs = {}
238
+ for field in fields(cls):
239
+ # Get the set of possible types for the field
240
+ type_hints: set[TypeHint] = get_type_hints(field.type)
241
+
242
+ # If the field is a dataclass, we can use the model_validate_json
243
+ generator = (th for th in type_hints if is_dataclass(th))
244
+ dataclass_cls = next(generator, None)
245
+
246
+ # Get the default value of the field
247
+ if field.default is not MISSING:
248
+ default = field.default
249
+ # Handle pydantic.Field defaults
250
+ if isinstance(default, FieldInfo):
251
+ if default.default_factory is None:
252
+ default = default.default
253
+ else:
254
+ # VllmConfig's Fields have default_factory set to config classes.
255
+ # These could emit logs on init, which would be confusing.
256
+ with suppress_logging():
257
+ default = default.default_factory()
258
+ elif field.default_factory is not MISSING:
259
+ default = field.default_factory()
260
+
261
+ # Get the help text for the field
262
+ name = field.name
263
+ help = cls_docs.get(name, "").strip()
264
+ # Escape % for argparse
265
+ help = help.replace("%", "%%")
266
+
267
+ # Initialise the kwargs dictionary for the field
268
+ kwargs[name] = {"default": default, "help": help}
269
+
270
+ # Set other kwargs based on the type hints
271
+ json_tip = (
272
+ "Should either be a valid JSON string or JSON keys passed individually."
273
+ )
274
+ if dataclass_cls is not None:
275
+
276
+ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
277
+ try:
278
+ return TypeAdapter(cls).validate_json(val)
279
+ except ValidationError as e:
280
+ raise argparse.ArgumentTypeError(repr(e)) from e
281
+
282
+ kwargs[name]["type"] = parse_dataclass
283
+ kwargs[name]["help"] += f"\n\n{json_tip}"
284
+ elif contains_type(type_hints, bool):
285
+ # Creates --no-<name> and --<name> flags
286
+ kwargs[name]["action"] = argparse.BooleanOptionalAction
287
+ elif contains_type(type_hints, Literal):
288
+ kwargs[name].update(literal_to_kwargs(type_hints))
289
+ elif contains_type(type_hints, tuple):
290
+ kwargs[name].update(collection_to_kwargs(type_hints, tuple))
291
+ elif contains_type(type_hints, list):
292
+ kwargs[name].update(collection_to_kwargs(type_hints, list))
293
+ elif contains_type(type_hints, set):
294
+ kwargs[name].update(collection_to_kwargs(type_hints, set))
295
+ elif contains_type(type_hints, int):
296
+ kwargs[name]["type"] = int
297
+ # Special case for large integers
298
+ human_readable_ints = {
299
+ "max_model_len",
300
+ "max_num_batched_tokens",
301
+ "kv_cache_memory_bytes",
302
+ }
303
+ if name in human_readable_ints:
304
+ kwargs[name]["type"] = human_readable_int
305
+ kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
306
+ elif contains_type(type_hints, float):
307
+ kwargs[name]["type"] = float
308
+ elif contains_type(type_hints, dict) and (
309
+ contains_type(type_hints, str)
310
+ or any(is_not_builtin(th) for th in type_hints)
311
+ ):
312
+ kwargs[name]["type"] = union_dict_and_str
313
+ elif contains_type(type_hints, dict):
314
+ kwargs[name]["type"] = parse_type(json.loads)
315
+ kwargs[name]["help"] += f"\n\n{json_tip}"
316
+ elif contains_type(type_hints, str) or any(
317
+ is_not_builtin(th) for th in type_hints
318
+ ):
319
+ kwargs[name]["type"] = str
320
+ else:
321
+ raise ValueError(f"Unsupported type {type_hints} for argument {name}.")
322
+
323
+ # If the type hint was a sequence of literals, use the helper function
324
+ # to update the type and choices
325
+ if get_origin(kwargs[name].get("type")) is Literal:
326
+ kwargs[name].update(literal_to_kwargs({kwargs[name]["type"]}))
327
+
328
+ # If None is in type_hints, make the argument optional.
329
+ # But not if it's a bool, argparse will handle this better.
330
+ if type(None) in type_hints and not contains_type(type_hints, bool):
331
+ kwargs[name]["type"] = optional_type(kwargs[name]["type"])
332
+ if kwargs[name].get("choices"):
333
+ kwargs[name]["choices"].append("None")
334
+ return kwargs
335
+
336
+
337
+ def get_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
338
+ """Return argparse kwargs for the given Config dataclass.
339
+
340
+ If `--help` or `mkdocs` are not present in the command line command, the
341
+ attribute documentation will not be included in the help output.
342
+
343
+ The heavy computation is cached via functools.lru_cache, and a deep copy
344
+ is returned so callers can mutate the dictionary without affecting the
345
+ cached version.
346
+ """
347
+ return copy.deepcopy(_compute_kwargs(cls))
348
+
349
+
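Illustrative aside, not part of the package contents: the cache-then-copy pattern that get_kwargs describes, shown on a toy function so the guarantee (callers may mutate the result without poisoning the cache) is concrete.

    import copy
    import functools

    @functools.lru_cache(maxsize=None)
    def _toy_compute() -> dict:
        return {"default": 0, "help": "toy field"}

    def toy_get() -> dict:
        # return a deep copy so the lru_cache'd dict stays pristine
        return copy.deepcopy(_toy_compute())

    first = toy_get()
    first["default"] = 123            # mutate the caller's copy...
    assert toy_get()["default"] == 0  # ...the cached original is unchanged
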
350
+ @dataclass
351
+ class EngineArgs:
352
+ """Arguments for vLLM engine."""
353
+
354
+ model: str = ModelConfig.model
355
+ served_model_name: str | list[str] | None = ModelConfig.served_model_name
356
+ tokenizer: str | None = ModelConfig.tokenizer
357
+ hf_config_path: str | None = ModelConfig.hf_config_path
358
+ runner: RunnerOption = ModelConfig.runner
359
+ convert: ConvertOption = ModelConfig.convert
360
+ task: TaskOption | None = ModelConfig.task
361
+ skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
362
+ enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
363
+ tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
364
+ trust_remote_code: bool = ModelConfig.trust_remote_code
365
+ allowed_local_media_path: str = ModelConfig.allowed_local_media_path
366
+ allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
367
+ download_dir: str | None = LoadConfig.download_dir
368
+ safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
369
+ load_format: str | LoadFormats = LoadConfig.load_format
370
+ config_format: str = ModelConfig.config_format
371
+ dtype: ModelDType = ModelConfig.dtype
372
+ kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
373
+ seed: int | None = 0
374
+ max_model_len: int | None = ModelConfig.max_model_len
375
+ cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
376
+ cudagraph_capture_sizes: list[int] | None = (
377
+ CompilationConfig.cudagraph_capture_sizes
378
+ )
379
+ max_cudagraph_capture_size: int | None = get_field(
380
+ CompilationConfig, "max_cudagraph_capture_size"
381
+ )
382
+ # Note: Specifying a custom executor backend by passing a class
383
+ # is intended for expert use only. The API may change without
384
+ # notice.
385
+ distributed_executor_backend: (
386
+ str | DistributedExecutorBackend | type[Executor] | None
387
+ ) = ParallelConfig.distributed_executor_backend
388
+ # number of P/D disaggregation (or other disaggregation) workers
389
+ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
390
+ master_addr: str = ParallelConfig.master_addr
391
+ master_port: int = ParallelConfig.master_port
392
+ nnodes: int = ParallelConfig.nnodes
393
+ node_rank: int = ParallelConfig.node_rank
394
+ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
395
+ prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size
396
+ decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
397
+ dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size
398
+ cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size
399
+ data_parallel_size: int = ParallelConfig.data_parallel_size
400
+ data_parallel_rank: int | None = None
401
+ data_parallel_start_rank: int | None = None
402
+ data_parallel_size_local: int | None = None
403
+ data_parallel_address: str | None = None
404
+ data_parallel_rpc_port: int | None = None
405
+ data_parallel_hybrid_lb: bool = False
406
+ data_parallel_external_lb: bool = False
407
+ data_parallel_backend: str = ParallelConfig.data_parallel_backend
408
+ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
409
+ all2all_backend: str | None = ParallelConfig.all2all_backend
410
+ enable_dbo: bool = ParallelConfig.enable_dbo
411
+ dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
412
+ dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
413
+ disable_nccl_for_dp_synchronization: bool = (
414
+ ParallelConfig.disable_nccl_for_dp_synchronization
415
+ )
416
+ eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
417
+ enable_eplb: bool = ParallelConfig.enable_eplb
418
+ expert_placement_strategy: ExpertPlacementStrategy = (
419
+ ParallelConfig.expert_placement_strategy
420
+ )
421
+ _api_process_count: int = ParallelConfig._api_process_count
422
+ _api_process_rank: int = ParallelConfig._api_process_rank
423
+ max_parallel_loading_workers: int | None = (
424
+ ParallelConfig.max_parallel_loading_workers
425
+ )
426
+ block_size: BlockSize | None = CacheConfig.block_size
427
+ enable_prefix_caching: bool | None = None
428
+ prefix_caching_hash_algo: PrefixCachingHashAlgo = (
429
+ CacheConfig.prefix_caching_hash_algo
430
+ )
431
+ disable_sliding_window: bool = ModelConfig.disable_sliding_window
432
+ disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
433
+ swap_space: float = CacheConfig.swap_space
434
+ cpu_offload_gb: float = CacheConfig.cpu_offload_gb
435
+ gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
436
+ kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
437
+ max_num_batched_tokens: int | None = None
438
+ max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
439
+ max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
440
+ long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
441
+ max_num_seqs: int | None = None
442
+ max_logprobs: int = ModelConfig.max_logprobs
443
+ logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
444
+ disable_log_stats: bool = False
445
+ aggregate_engine_logging: bool = False
446
+ revision: str | None = ModelConfig.revision
447
+ code_revision: str | None = ModelConfig.code_revision
448
+ hf_token: bool | str | None = ModelConfig.hf_token
449
+ hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
450
+ tokenizer_revision: str | None = ModelConfig.tokenizer_revision
451
+ quantization: QuantizationMethods | None = ModelConfig.quantization
452
+ enforce_eager: bool = ModelConfig.enforce_eager
453
+ disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
454
+ limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
455
+ MultiModalConfig, "limit_per_prompt"
456
+ )
457
+ enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
458
+ interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
459
+ media_io_kwargs: dict[str, dict[str, Any]] = get_field(
460
+ MultiModalConfig, "media_io_kwargs"
461
+ )
462
+ mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
463
+ disable_mm_preprocessor_cache: bool = False # DEPRECATED
464
+ mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
465
+ mm_processor_cache_type: MMCacheType | None = (
466
+ MultiModalConfig.mm_processor_cache_type
467
+ )
468
+ mm_shm_cache_max_object_size_mb: int = (
469
+ MultiModalConfig.mm_shm_cache_max_object_size_mb
470
+ )
471
+ mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
472
+ mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
473
+ MultiModalConfig.mm_encoder_attn_backend
474
+ )
475
+ io_processor_plugin: str | None = None
476
+ skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
477
+ video_pruning_rate: float = MultiModalConfig.video_pruning_rate
478
+ # LoRA fields
479
+ enable_lora: bool = False
480
+ max_loras: int = LoRAConfig.max_loras
481
+ max_lora_rank: int = LoRAConfig.max_lora_rank
482
+ default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
483
+ fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
484
+ max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
485
+ lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
486
+
487
+ ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
488
+ num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
489
+ model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
490
+ ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
491
+
492
+ enable_chunked_prefill: bool | None = None
493
+ disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
494
+
495
+ disable_hybrid_kv_cache_manager: bool = (
496
+ SchedulerConfig.disable_hybrid_kv_cache_manager
497
+ )
498
+
499
+ structured_outputs_config: StructuredOutputsConfig = get_field(
500
+ VllmConfig, "structured_outputs_config"
501
+ )
502
+ reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
503
+ reasoning_parser_plugin: str | None = None
504
+
505
+ logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
506
+
507
+ speculative_config: dict[str, Any] | None = None
508
+
509
+ show_hidden_metrics_for_version: str | None = (
510
+ ObservabilityConfig.show_hidden_metrics_for_version
511
+ )
512
+ otlp_traces_endpoint: str | None = ObservabilityConfig.otlp_traces_endpoint
513
+ collect_detailed_traces: list[DetailedTraceModules] | None = (
514
+ ObservabilityConfig.collect_detailed_traces
515
+ )
516
+ kv_cache_metrics: bool = ObservabilityConfig.kv_cache_metrics
517
+ kv_cache_metrics_sample: float = get_field(
518
+ ObservabilityConfig, "kv_cache_metrics_sample"
519
+ )
520
+ scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
521
+ scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls
522
+
523
+ pooler_config: PoolerConfig | None = ModelConfig.pooler_config
524
+ compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
525
+ worker_cls: str = ParallelConfig.worker_cls
526
+ worker_extension_cls: str = ParallelConfig.worker_extension_cls
527
+
528
+ kv_transfer_config: KVTransferConfig | None = None
529
+ kv_events_config: KVEventsConfig | None = None
530
+
531
+ ec_transfer_config: ECTransferConfig | None = None
532
+
533
+ generation_config: str = ModelConfig.generation_config
534
+ enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
535
+ override_generation_config: dict[str, Any] = get_field(
536
+ ModelConfig, "override_generation_config"
537
+ )
538
+ model_impl: str = ModelConfig.model_impl
539
+ override_attention_dtype: str = ModelConfig.override_attention_dtype
540
+
541
+ calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
542
+ mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
543
+ mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype
544
+ mamba_block_size: int | None = get_field(CacheConfig, "mamba_block_size")
545
+
546
+ additional_config: dict[str, Any] = get_field(VllmConfig, "additional_config")
547
+
548
+ use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
549
+ pt_load_map_location: str = LoadConfig.pt_load_map_location
550
+
551
+ # DEPRECATED
552
+ enable_multimodal_encoder_data_parallel: bool = False
553
+
554
+ logits_processors: list[str | type[LogitsProcessor]] | None = (
555
+ ModelConfig.logits_processors
556
+ )
557
+ """Custom logitproc types"""
558
+
559
+ async_scheduling: bool | None = SchedulerConfig.async_scheduling
560
+
561
+ stream_interval: int = SchedulerConfig.stream_interval
562
+
563
+ kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
564
+ optimization_level: OptimizationLevel = VllmConfig.optimization_level
565
+
566
+ kv_offloading_size: float | None = CacheConfig.kv_offloading_size
567
+ kv_offloading_backend: KVOffloadingBackend | None = (
568
+ CacheConfig.kv_offloading_backend
569
+ )
570
+ tokens_only: bool = False
571
+
572
+ def __post_init__(self):
573
+ # support `EngineArgs(compilation_config={...})`
574
+ # without having to manually construct a
575
+ # CompilationConfig object
576
+ if isinstance(self.compilation_config, dict):
577
+ self.compilation_config = CompilationConfig(**self.compilation_config)
578
+ if isinstance(self.eplb_config, dict):
579
+ self.eplb_config = EPLBConfig(**self.eplb_config)
580
+ # Setup plugins
581
+ from vllm.plugins import load_general_plugins
582
+
583
+ load_general_plugins()
584
+ # When using HF offline, replace the model and tokenizer IDs with local model paths
585
+ if huggingface_hub.constants.HF_HUB_OFFLINE:
586
+ model_id = self.model
587
+ self.model = get_model_path(self.model, self.revision)
588
+ if model_id is not self.model:
589
+ logger.info(
590
+ "HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]",
591
+ model_id,
592
+ self.model,
593
+ )
594
+ if self.tokenizer is not None:
595
+ tokenizer_id = self.tokenizer
596
+ self.tokenizer = get_model_path(self.tokenizer, self.tokenizer_revision)
597
+ if tokenizer_id is not self.tokenizer:
598
+ logger.info(
599
+ "HF_HUB_OFFLINE is True, replace tokenizer_id [%s] "
600
+ "to tokenizer_path [%s]",
601
+ tokenizer_id,
602
+ self.tokenizer,
603
+ )
604
+
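Illustrative aside, not part of the package contents: a usage sketch of the dict-to-config conversion that __post_init__ performs. The import path is an assumption (this hunk does not name its file), and the snippet only does anything where vllm is installed, so it is guarded.

    try:
        # assumed import location; not confirmed by this hunk
        from vllm.engine.arg_utils import EngineArgs
    except ImportError:
        EngineArgs = None

    if EngineArgs is not None:
        engine_args = EngineArgs(
            model="facebook/opt-125m",
            compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8]},
        )
        # __post_init__ has converted the plain dict into a CompilationConfig.
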
605
+ @staticmethod
606
+ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
607
+ """Shared CLI arguments for vLLM engine."""
608
+
609
+ # Model arguments
610
+ model_kwargs = get_kwargs(ModelConfig)
611
+ model_group = parser.add_argument_group(
612
+ title="ModelConfig",
613
+ description=ModelConfig.__doc__,
614
+ )
615
+ if not ("serve" in sys.argv[1:] and "--help" in sys.argv[1:]):
616
+ model_group.add_argument("--model", **model_kwargs["model"])
617
+ model_group.add_argument("--runner", **model_kwargs["runner"])
618
+ model_group.add_argument("--convert", **model_kwargs["convert"])
619
+ model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
620
+ model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
621
+ model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
622
+ model_group.add_argument(
623
+ "--trust-remote-code", **model_kwargs["trust_remote_code"]
624
+ )
625
+ model_group.add_argument("--dtype", **model_kwargs["dtype"])
626
+ model_group.add_argument("--seed", **model_kwargs["seed"])
627
+ model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
628
+ model_group.add_argument(
629
+ "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
630
+ )
631
+ model_group.add_argument(
632
+ "--allowed-media-domains", **model_kwargs["allowed_media_domains"]
633
+ )
634
+ model_group.add_argument("--revision", **model_kwargs["revision"])
635
+ model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
636
+ model_group.add_argument(
637
+ "--tokenizer-revision", **model_kwargs["tokenizer_revision"]
638
+ )
639
+ model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
640
+ model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
641
+ model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
642
+ model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
643
+ model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
644
+ model_group.add_argument(
645
+ "--disable-sliding-window", **model_kwargs["disable_sliding_window"]
646
+ )
647
+ model_group.add_argument(
648
+ "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
649
+ )
650
+ model_group.add_argument(
651
+ "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
652
+ )
653
+ model_group.add_argument(
654
+ "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
655
+ )
656
+ model_group.add_argument(
657
+ "--served-model-name", **model_kwargs["served_model_name"]
658
+ )
659
+ model_group.add_argument("--config-format", **model_kwargs["config_format"])
660
+ # This one is a special case because it can be a bool
661
+ # or str. TODO: Handle this in get_kwargs
662
+ model_group.add_argument(
663
+ "--hf-token",
664
+ type=str,
665
+ nargs="?",
666
+ const=True,
667
+ default=model_kwargs["hf_token"]["default"],
668
+ help=model_kwargs["hf_token"]["help"],
669
+ )
670
+ model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"])
671
+ model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"])
672
+ model_group.add_argument(
673
+ "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"]
674
+ )
675
+ model_group.add_argument(
676
+ "--generation-config", **model_kwargs["generation_config"]
677
+ )
678
+ model_group.add_argument(
679
+ "--override-generation-config", **model_kwargs["override_generation_config"]
680
+ )
681
+ model_group.add_argument(
682
+ "--enable-sleep-mode", **model_kwargs["enable_sleep_mode"]
683
+ )
684
+ model_group.add_argument("--model-impl", **model_kwargs["model_impl"])
685
+ model_group.add_argument(
686
+ "--override-attention-dtype", **model_kwargs["override_attention_dtype"]
687
+ )
688
+ model_group.add_argument(
689
+ "--logits-processors", **model_kwargs["logits_processors"]
690
+ )
691
+ model_group.add_argument(
692
+ "--io-processor-plugin", **model_kwargs["io_processor_plugin"]
693
+ )
694
+
695
+ # Model loading arguments
696
+ load_kwargs = get_kwargs(LoadConfig)
697
+ load_group = parser.add_argument_group(
698
+ title="LoadConfig",
699
+ description=LoadConfig.__doc__,
700
+ )
701
+ load_group.add_argument("--load-format", **load_kwargs["load_format"])
702
+ load_group.add_argument("--download-dir", **load_kwargs["download_dir"])
703
+ load_group.add_argument(
704
+ "--safetensors-load-strategy", **load_kwargs["safetensors_load_strategy"]
705
+ )
706
+ load_group.add_argument(
707
+ "--model-loader-extra-config", **load_kwargs["model_loader_extra_config"]
708
+ )
709
+ load_group.add_argument("--ignore-patterns", **load_kwargs["ignore_patterns"])
710
+ load_group.add_argument("--use-tqdm-on-load", **load_kwargs["use_tqdm_on_load"])
711
+ load_group.add_argument(
712
+ "--pt-load-map-location", **load_kwargs["pt_load_map_location"]
713
+ )
714
+
715
+ # Structured outputs arguments
716
+ structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
717
+ structured_outputs_group = parser.add_argument_group(
718
+ title="StructuredOutputsConfig",
719
+ description=StructuredOutputsConfig.__doc__,
720
+ )
721
+ structured_outputs_group.add_argument(
722
+ "--reasoning-parser",
723
+ # Choices need to be validated after parsing to include plugins
724
+ **structured_outputs_kwargs["reasoning_parser"],
725
+ )
726
+ structured_outputs_group.add_argument(
727
+ "--reasoning-parser-plugin",
728
+ **structured_outputs_kwargs["reasoning_parser_plugin"],
729
+ )
730
+
731
+ # Parallel arguments
732
+ parallel_kwargs = get_kwargs(ParallelConfig)
733
+ parallel_group = parser.add_argument_group(
734
+ title="ParallelConfig",
735
+ description=ParallelConfig.__doc__,
736
+ )
737
+ parallel_group.add_argument(
738
+ "--distributed-executor-backend",
739
+ **parallel_kwargs["distributed_executor_backend"],
740
+ )
741
+ parallel_group.add_argument(
742
+ "--pipeline-parallel-size",
743
+ "-pp",
744
+ **parallel_kwargs["pipeline_parallel_size"],
745
+ )
746
+ parallel_group.add_argument("--master-addr", **parallel_kwargs["master_addr"])
747
+ parallel_group.add_argument("--master-port", **parallel_kwargs["master_port"])
748
+ parallel_group.add_argument("--nnodes", "-n", **parallel_kwargs["nnodes"])
749
+ parallel_group.add_argument("--node-rank", "-r", **parallel_kwargs["node_rank"])
750
+ parallel_group.add_argument(
751
+ "--tensor-parallel-size", "-tp", **parallel_kwargs["tensor_parallel_size"]
752
+ )
753
+ parallel_group.add_argument(
754
+ "--decode-context-parallel-size",
755
+ "-dcp",
756
+ **parallel_kwargs["decode_context_parallel_size"],
757
+ )
758
+ parallel_group.add_argument(
759
+ "--dcp-kv-cache-interleave-size",
760
+ **parallel_kwargs["dcp_kv_cache_interleave_size"],
761
+ )
762
+ parallel_group.add_argument(
763
+ "--cp-kv-cache-interleave-size",
764
+ **parallel_kwargs["cp_kv_cache_interleave_size"],
765
+ )
766
+ parallel_group.add_argument(
767
+ "--prefill-context-parallel-size",
768
+ "-pcp",
769
+ **parallel_kwargs["prefill_context_parallel_size"],
770
+ )
771
+ parallel_group.add_argument(
772
+ "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"]
773
+ )
774
+ parallel_group.add_argument(
775
+ "--data-parallel-rank",
776
+ "-dpn",
777
+ type=int,
778
+ help="Data parallel rank of this instance. "
779
+ "When set, enables external load balancer mode.",
780
+ )
781
+ parallel_group.add_argument(
782
+ "--data-parallel-start-rank",
783
+ "-dpr",
784
+ type=int,
785
+ help="Starting data parallel rank for secondary nodes.",
786
+ )
787
+ parallel_group.add_argument(
788
+ "--data-parallel-size-local",
789
+ "-dpl",
790
+ type=int,
791
+ help="Number of data parallel replicas to run on this node.",
792
+ )
793
+ parallel_group.add_argument(
794
+ "--data-parallel-address",
795
+ "-dpa",
796
+ type=str,
797
+ help="Address of data parallel cluster head-node.",
798
+ )
799
+ parallel_group.add_argument(
800
+ "--data-parallel-rpc-port",
801
+ "-dpp",
802
+ type=int,
803
+ help="Port for data parallel RPC communication.",
804
+ )
805
+ parallel_group.add_argument(
806
+ "--data-parallel-backend",
807
+ "-dpb",
808
+ type=str,
809
+ default="mp",
810
+ help='Backend for data parallel, either "mp" or "ray".',
811
+ )
812
+ parallel_group.add_argument(
813
+ "--data-parallel-hybrid-lb",
814
+ "-dph",
815
+ **parallel_kwargs["data_parallel_hybrid_lb"],
816
+ )
817
+ parallel_group.add_argument(
818
+ "--data-parallel-external-lb",
819
+ "-dpe",
820
+ **parallel_kwargs["data_parallel_external_lb"],
821
+ )
822
+ parallel_group.add_argument(
823
+ "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]
824
+ )
825
+ parallel_group.add_argument(
826
+ "--all2all-backend", **parallel_kwargs["all2all_backend"]
827
+ )
828
+ parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"])
829
+ parallel_group.add_argument(
830
+ "--dbo-decode-token-threshold",
831
+ **parallel_kwargs["dbo_decode_token_threshold"],
832
+ )
833
+ parallel_group.add_argument(
834
+ "--dbo-prefill-token-threshold",
835
+ **parallel_kwargs["dbo_prefill_token_threshold"],
836
+ )
837
+ parallel_group.add_argument(
838
+ "--disable-nccl-for-dp-synchronization",
839
+ **parallel_kwargs["disable_nccl_for_dp_synchronization"],
840
+ )
841
+ parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"])
842
+ parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"])
843
+ parallel_group.add_argument(
844
+ "--expert-placement-strategy",
845
+ **parallel_kwargs["expert_placement_strategy"],
846
+ )
847
+
848
+ parallel_group.add_argument(
849
+ "--max-parallel-loading-workers",
850
+ **parallel_kwargs["max_parallel_loading_workers"],
851
+ )
852
+ parallel_group.add_argument(
853
+ "--ray-workers-use-nsight", **parallel_kwargs["ray_workers_use_nsight"]
854
+ )
855
+ parallel_group.add_argument(
856
+ "--disable-custom-all-reduce",
857
+ **parallel_kwargs["disable_custom_all_reduce"],
858
+ )
859
+ parallel_group.add_argument("--worker-cls", **parallel_kwargs["worker_cls"])
860
+ parallel_group.add_argument(
861
+ "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]
862
+ )
863
+ parallel_group.add_argument(
864
+ "--enable-multimodal-encoder-data-parallel",
865
+ action="store_true",
866
+ deprecated=True,
867
+ )
868
+
869
+ # KV cache arguments
870
+ cache_kwargs = get_kwargs(CacheConfig)
871
+ cache_group = parser.add_argument_group(
872
+ title="CacheConfig",
873
+ description=CacheConfig.__doc__,
874
+ )
875
+ cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
876
+ cache_group.add_argument(
877
+ "--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
878
+ )
879
+ cache_group.add_argument(
880
+ "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
881
+ )
882
+ cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
883
+ cache_group.add_argument("--kv-cache-dtype", **cache_kwargs["cache_dtype"])
884
+ cache_group.add_argument(
885
+ "--num-gpu-blocks-override", **cache_kwargs["num_gpu_blocks_override"]
886
+ )
887
+ cache_group.add_argument(
888
+ "--enable-prefix-caching",
889
+ **{
890
+ **cache_kwargs["enable_prefix_caching"],
891
+ "default": None,
892
+ },
893
+ )
894
+ cache_group.add_argument(
895
+ "--prefix-caching-hash-algo", **cache_kwargs["prefix_caching_hash_algo"]
896
+ )
897
+ cache_group.add_argument("--cpu-offload-gb", **cache_kwargs["cpu_offload_gb"])
898
+ cache_group.add_argument(
899
+ "--calculate-kv-scales", **cache_kwargs["calculate_kv_scales"]
900
+ )
901
+ cache_group.add_argument(
902
+ "--kv-sharing-fast-prefill", **cache_kwargs["kv_sharing_fast_prefill"]
903
+ )
904
+ cache_group.add_argument(
905
+ "--mamba-cache-dtype", **cache_kwargs["mamba_cache_dtype"]
906
+ )
907
+ cache_group.add_argument(
908
+ "--mamba-ssm-cache-dtype", **cache_kwargs["mamba_ssm_cache_dtype"]
909
+ )
910
+ cache_group.add_argument(
911
+ "--mamba-block-size", **cache_kwargs["mamba_block_size"]
912
+ )
913
+ cache_group.add_argument(
914
+ "--kv-offloading-size", **cache_kwargs["kv_offloading_size"]
915
+ )
916
+ cache_group.add_argument(
917
+ "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"]
918
+ )
919
+
920
+ # Multimodal related configs
921
+ multimodal_kwargs = get_kwargs(MultiModalConfig)
922
+ multimodal_group = parser.add_argument_group(
923
+ title="MultiModalConfig",
924
+ description=MultiModalConfig.__doc__,
925
+ )
926
+ multimodal_group.add_argument(
927
+ "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"]
928
+ )
929
+ multimodal_group.add_argument(
930
+ "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
931
+ )
932
+ multimodal_group.add_argument(
933
+ "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
934
+ )
935
+ multimodal_group.add_argument(
936
+ "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
937
+ )
938
+ multimodal_group.add_argument(
939
+ "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]
940
+ )
941
+ multimodal_group.add_argument(
942
+ "--disable-mm-preprocessor-cache", action="store_true", deprecated=True
943
+ )
944
+ multimodal_group.add_argument(
945
+ "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"]
946
+ )
947
+ multimodal_group.add_argument(
948
+ "--mm-shm-cache-max-object-size-mb",
949
+ **multimodal_kwargs["mm_shm_cache_max_object_size_mb"],
950
+ )
951
+ multimodal_group.add_argument(
952
+ "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"]
953
+ )
954
+ multimodal_group.add_argument(
955
+ "--mm-encoder-attn-backend",
956
+ **multimodal_kwargs["mm_encoder_attn_backend"],
957
+ )
958
+ multimodal_group.add_argument(
959
+ "--interleave-mm-strings", **multimodal_kwargs["interleave_mm_strings"]
960
+ )
961
+ multimodal_group.add_argument(
962
+ "--skip-mm-profiling", **multimodal_kwargs["skip_mm_profiling"]
963
+ )
964
+
965
+ multimodal_group.add_argument(
966
+ "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"]
967
+ )
968
+
969
+ # LoRA related configs
970
+ lora_kwargs = get_kwargs(LoRAConfig)
971
+ lora_group = parser.add_argument_group(
972
+ title="LoRAConfig",
973
+ description=LoRAConfig.__doc__,
974
+ )
975
+ lora_group.add_argument(
976
+ "--enable-lora",
977
+ action=argparse.BooleanOptionalAction,
978
+ help="If True, enable handling of LoRA adapters.",
979
+ )
980
+ lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
981
+ lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"])
982
+ lora_group.add_argument(
983
+ "--lora-dtype",
984
+ **lora_kwargs["lora_dtype"],
985
+ )
986
+ lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
987
+ lora_group.add_argument(
988
+ "--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
989
+ )
990
+ lora_group.add_argument("--default-mm-loras", **lora_kwargs["default_mm_loras"])
991
+
992
+ # Observability arguments
993
+ observability_kwargs = get_kwargs(ObservabilityConfig)
994
+ observability_group = parser.add_argument_group(
995
+ title="ObservabilityConfig",
996
+ description=ObservabilityConfig.__doc__,
997
+ )
998
+ observability_group.add_argument(
999
+ "--show-hidden-metrics-for-version",
1000
+ **observability_kwargs["show_hidden_metrics_for_version"],
1001
+ )
1002
+ observability_group.add_argument(
1003
+ "--otlp-traces-endpoint", **observability_kwargs["otlp_traces_endpoint"]
1004
+ )
1005
+ # TODO: generalise this special case
1006
+ choices = observability_kwargs["collect_detailed_traces"]["choices"]
1007
+ metavar = f"{{{','.join(choices)}}}"
1008
+ observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
1009
+ observability_kwargs["collect_detailed_traces"]["choices"] += [
1010
+ ",".join(p) for p in permutations(get_args(DetailedTraceModules), r=2)
1011
+ ]
1012
+ observability_group.add_argument(
1013
+ "--collect-detailed-traces",
1014
+ **observability_kwargs["collect_detailed_traces"],
1015
+ )
1016
+ observability_group.add_argument(
1017
+ "--kv-cache-metrics", **observability_kwargs["kv_cache_metrics"]
1018
+ )
1019
+ observability_group.add_argument(
1020
+ "--kv-cache-metrics-sample",
1021
+ **observability_kwargs["kv_cache_metrics_sample"],
1022
+ )
1023
+
1024
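The special case above extends the --collect-detailed-traces choices with every ordered pair of trace modules, joined by commas. A self-contained sketch of that expansion (the module names here are assumed, not read from this diff):

    from itertools import permutations

    modules = ("model", "worker")  # assumed DetailedTraceModules values
    extra = [",".join(p) for p in permutations(modules, r=2)]
    print(extra)  # ['model,worker', 'worker,model']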
+ # Scheduler arguments
1025
+ scheduler_kwargs = get_kwargs(SchedulerConfig)
1026
+ scheduler_group = parser.add_argument_group(
1027
+ title="SchedulerConfig",
1028
+ description=SchedulerConfig.__doc__,
1029
+ )
1030
+ scheduler_group.add_argument(
1031
+ "--max-num-batched-tokens",
1032
+ **{
1033
+ **scheduler_kwargs["max_num_batched_tokens"],
1034
+ "default": None,
1035
+ },
1036
+ )
1037
+ scheduler_group.add_argument(
1038
+ "--max-num-seqs",
1039
+ **{
1040
+ **scheduler_kwargs["max_num_seqs"],
1041
+ "default": None,
1042
+ },
1043
+ )
1044
+ scheduler_group.add_argument(
1045
+ "--max-num-partial-prefills", **scheduler_kwargs["max_num_partial_prefills"]
1046
+ )
1047
+ scheduler_group.add_argument(
1048
+ "--max-long-partial-prefills",
1049
+ **scheduler_kwargs["max_long_partial_prefills"],
1050
+ )
1051
+ scheduler_group.add_argument(
1052
+ "--long-prefill-token-threshold",
1053
+ **scheduler_kwargs["long_prefill_token_threshold"],
1054
+ )
1055
+ # multi-step scheduling has been removed; corresponding arguments
1056
+ # are no longer supported.
1057
+ scheduler_group.add_argument(
1058
+ "--scheduling-policy", **scheduler_kwargs["policy"]
1059
+ )
1060
+ scheduler_group.add_argument(
1061
+ "--enable-chunked-prefill",
1062
+ **{
1063
+ **scheduler_kwargs["enable_chunked_prefill"],
1064
+ "default": None,
1065
+ },
1066
+ )
1067
+ scheduler_group.add_argument(
1068
+ "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]
1069
+ )
1070
+ scheduler_group.add_argument(
1071
+ "--scheduler-cls", **scheduler_kwargs["scheduler_cls"]
1072
+ )
1073
+ scheduler_group.add_argument(
1074
+ "--disable-hybrid-kv-cache-manager",
1075
+ **scheduler_kwargs["disable_hybrid_kv_cache_manager"],
1076
+ )
1077
+ scheduler_group.add_argument(
1078
+ "--async-scheduling", **scheduler_kwargs["async_scheduling"]
1079
+ )
1080
+ scheduler_group.add_argument(
1081
+ "--stream-interval", **scheduler_kwargs["stream_interval"]
1082
+ )
1083
+
1084
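The **{**scheduler_kwargs[...], "default": None} spellings above are a plain dict merge that overrides a single argparse keyword so the real default can be resolved later from the usage context. A minimal sketch of the pattern:

    base = {"type": int, "default": 8192, "help": "tokens per batch"}
    overridden = {**base, "default": None}
    # overridden == {"type": int, "default": None, "help": "tokens per batch"}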
+ # Compilation arguments
1085
+ compilation_kwargs = get_kwargs(CompilationConfig)
1086
+ compilation_group = parser.add_argument_group(
1087
+ title="CompilationConfig",
1088
+ description=CompilationConfig.__doc__,
1089
+ )
1090
+ compilation_group.add_argument(
1091
+ "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
1092
+ )
1093
+ compilation_kwargs["cudagraph_capture_sizes"]["help"] = (
1094
+ "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0,"
1095
+ " whichever is soonest. Please use --cudagraph-capture-sizes instead."
1096
+ )
1097
+ compilation_group.add_argument(
1098
+ "--cuda-graph-sizes",
1099
+ **compilation_kwargs["cudagraph_capture_sizes"],
1100
+ deprecated=True,
1101
+ )
1102
+ compilation_group.add_argument(
1103
+ "--max-cudagraph-capture-size",
1104
+ **compilation_kwargs["max_cudagraph_capture_size"],
1105
+ )
1106
+
1107
+ # vLLM arguments
1108
+ vllm_kwargs = get_kwargs(VllmConfig)
1109
+ vllm_group = parser.add_argument_group(
1110
+ title="VllmConfig",
1111
+ description=VllmConfig.__doc__,
1112
+ )
1113
+ # We construct SpeculativeConfig using fields from other configs in
1114
+ # create_engine_config. So we set the type to a JSON string here to
1115
+ # delay the Pydantic validation that comes with SpeculativeConfig.
1116
+ vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
1117
+ vllm_group.add_argument(
1118
+ "--speculative-config", **vllm_kwargs["speculative_config"]
1119
+ )
1120
+ vllm_group.add_argument(
1121
+ "--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]
1122
+ )
1123
+ vllm_group.add_argument("--kv-events-config", **vllm_kwargs["kv_events_config"])
1124
+ vllm_group.add_argument(
1125
+ "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
1126
+ )
1127
+ vllm_group.add_argument(
1128
+ "--compilation-config", "-cc", **vllm_kwargs["compilation_config"]
1129
+ )
1130
+ vllm_group.add_argument(
1131
+ "--additional-config", **vllm_kwargs["additional_config"]
1132
+ )
1133
+ vllm_group.add_argument(
1134
+ "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"]
1135
+ )
1136
+
1137
+ vllm_group.add_argument(
1138
+ "--optimization-level", **vllm_kwargs["optimization_level"]
1139
+ )
1140
+
1141
+ # Other arguments
1142
+ parser.add_argument(
1143
+ "--disable-log-stats",
1144
+ action="store_true",
1145
+ help="Disable logging statistics.",
1146
+ )
1147
+
1148
+ parser.add_argument(
1149
+ "--aggregate-engine-logging",
1150
+ action="store_true",
1151
+ help="Log aggregate rather than per-engine statistics "
1152
+ "when using data parallelism.",
1153
+ )
1154
+ return parser
1155
+
1156
+ @classmethod
1157
+ def from_cli_args(cls, args: argparse.Namespace):
1158
+ # Get the list of attributes of this dataclass.
1159
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
1160
+ # Set the attributes from the parsed arguments.
1161
+ engine_args = cls(
1162
+ **{attr: getattr(args, attr) for attr in attrs if hasattr(args, attr)}
1163
+ )
1164
+ return engine_args
1165
+
1166
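Putting add_cli_args and from_cli_args together; the FlexibleArgumentParser import path and the argv values are assumptions for illustration only:

    from vllm import EngineArgs                    # import path assumed
    from vllm.utils import FlexibleArgumentParser  # import path assumed

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    ns = parser.parse_args(["--model", "some/model", "--max-num-seqs", "64"])
    engine_args = EngineArgs.from_cli_args(ns)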
+ def create_model_config(self) -> ModelConfig:
1167
+ # gguf file needs a specific model loader
1168
+ if is_gguf(self.model):
1169
+ self.quantization = self.load_format = "gguf"
1170
+
1171
+ # NOTE(woosuk): In V1, we use separate processes for workers (unless
1172
+ # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
1173
+ # doesn't affect the user process.
1174
+ if self.seed is None:
1175
+ logger.warning_once(
1176
+ "`seed=None` is equivalent to `seed=0` in V1 Engine. "
1177
+ "You will no longer be allowed to pass `None` in v0.13.",
1178
+ scope="local",
1179
+ )
1180
+
1181
+ self.seed = 0
1182
+ if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
1183
+ logger.warning(
1184
+ "The global random seed is set to %d. Since "
1185
+ "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
1186
+ "affect the random state of the Python process that "
1187
+ "launched vLLM.",
1188
+ self.seed,
1189
+ )
1190
+
1191
+ if self.disable_mm_preprocessor_cache:
1192
+ logger.warning_once(
1193
+ "`--disable-mm-preprocessor-cache` is deprecated "
1194
+ "and will be removed in v0.13. "
1195
+ "Please use `--mm-processor-cache-gb 0` instead.",
1196
+ scope="local",
1197
+ )
1198
+
1199
+ self.mm_processor_cache_gb = 0
1200
+ elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
1201
+ logger.warning_once(
1202
+ "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
1203
+ "and will be removed in v0.13. "
1204
+ "Please use `--mm-processor-cache-gb %d` instead.",
1205
+ envs.VLLM_MM_INPUT_CACHE_GIB,
1206
+ scope="local",
1207
+ )
1208
+
1209
+ self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
1210
+
1211
+ if self.enable_multimodal_encoder_data_parallel:
1212
+ logger.warning_once(
1213
+ "--enable-multimodal-encoder-data-parallel` is deprecated "
1214
+ "and will be removed in v0.13. "
1215
+ "Please use `--mm-encoder-tp-mode data` instead.",
1216
+ scope="local",
1217
+ )
1218
+
1219
+ self.mm_encoder_tp_mode = "data"
1220
+
1221
+ return ModelConfig(
1222
+ model=self.model,
1223
+ hf_config_path=self.hf_config_path,
1224
+ runner=self.runner,
1225
+ convert=self.convert,
1226
+ task=self.task,
1227
+ tokenizer=self.tokenizer,
1228
+ tokenizer_mode=self.tokenizer_mode,
1229
+ trust_remote_code=self.trust_remote_code,
1230
+ allowed_local_media_path=self.allowed_local_media_path,
1231
+ allowed_media_domains=self.allowed_media_domains,
1232
+ dtype=self.dtype,
1233
+ seed=self.seed,
1234
+ revision=self.revision,
1235
+ code_revision=self.code_revision,
1236
+ hf_token=self.hf_token,
1237
+ hf_overrides=self.hf_overrides,
1238
+ tokenizer_revision=self.tokenizer_revision,
1239
+ max_model_len=self.max_model_len,
1240
+ quantization=self.quantization,
1241
+ enforce_eager=self.enforce_eager,
1242
+ max_logprobs=self.max_logprobs,
1243
+ logprobs_mode=self.logprobs_mode,
1244
+ disable_sliding_window=self.disable_sliding_window,
1245
+ disable_cascade_attn=self.disable_cascade_attn,
1246
+ skip_tokenizer_init=self.skip_tokenizer_init,
1247
+ enable_prompt_embeds=self.enable_prompt_embeds,
1248
+ served_model_name=self.served_model_name,
1249
+ limit_mm_per_prompt=self.limit_mm_per_prompt,
1250
+ enable_mm_embeds=self.enable_mm_embeds,
1251
+ interleave_mm_strings=self.interleave_mm_strings,
1252
+ media_io_kwargs=self.media_io_kwargs,
1253
+ skip_mm_profiling=self.skip_mm_profiling,
1254
+ config_format=self.config_format,
1255
+ mm_processor_kwargs=self.mm_processor_kwargs,
1256
+ mm_processor_cache_gb=self.mm_processor_cache_gb,
1257
+ mm_processor_cache_type=self.mm_processor_cache_type,
1258
+ mm_shm_cache_max_object_size_mb=self.mm_shm_cache_max_object_size_mb,
1259
+ mm_encoder_tp_mode=self.mm_encoder_tp_mode,
1260
+ mm_encoder_attn_backend=self.mm_encoder_attn_backend,
1261
+ pooler_config=self.pooler_config,
1262
+ logits_processor_pattern=self.logits_processor_pattern,
1263
+ generation_config=self.generation_config,
1264
+ override_generation_config=self.override_generation_config,
1265
+ enable_sleep_mode=self.enable_sleep_mode,
1266
+ model_impl=self.model_impl,
1267
+ override_attention_dtype=self.override_attention_dtype,
1268
+ logits_processors=self.logits_processors,
1269
+ video_pruning_rate=self.video_pruning_rate,
1270
+ io_processor_plugin=self.io_processor_plugin,
1271
+ )
1272
+
1273
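The deprecation handling above reduces to these non-deprecated equivalents (placeholder model name):

    args = EngineArgs(
        model="some/multimodal-model",  # placeholder
        mm_processor_cache_gb=0,        # replaces --disable-mm-preprocessor-cache
        mm_encoder_tp_mode="data",      # replaces --enable-multimodal-encoder-data-parallel
    )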
+ def validate_tensorizer_args(self):
1274
+ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
1275
+
1276
+ for key in self.model_loader_extra_config:
1277
+ if key in TensorizerConfig._fields:
1278
+ self.model_loader_extra_config["tensorizer_config"][key] = (
1279
+ self.model_loader_extra_config[key]
1280
+ )
1281
+
1282
+ def create_load_config(self) -> LoadConfig:
1283
+ if self.quantization == "bitsandbytes":
1284
+ self.load_format = "bitsandbytes"
1285
+
1286
+ if self.load_format == "tensorizer":
1287
+ if hasattr(self.model_loader_extra_config, "to_serializable"):
1288
+ self.model_loader_extra_config = (
1289
+ self.model_loader_extra_config.to_serializable()
1290
+ )
1291
+ self.model_loader_extra_config["tensorizer_config"] = {}
1292
+ self.model_loader_extra_config["tensorizer_config"]["tensorizer_dir"] = (
1293
+ self.model
1294
+ )
1295
+ self.validate_tensorizer_args()
1296
+
1297
+ return LoadConfig(
1298
+ load_format=self.load_format,
1299
+ download_dir=self.download_dir,
1300
+ safetensors_load_strategy=self.safetensors_load_strategy,
1301
+ device="cpu" if is_online_quantization(self.quantization) else None,
1302
+ model_loader_extra_config=self.model_loader_extra_config,
1303
+ ignore_patterns=self.ignore_patterns,
1304
+ use_tqdm_on_load=self.use_tqdm_on_load,
1305
+ pt_load_map_location=self.pt_load_map_location,
1306
+ )
1307
+
1308
+ def create_speculative_config(
1309
+ self,
1310
+ target_model_config: ModelConfig,
1311
+ target_parallel_config: ParallelConfig,
1312
+ ) -> SpeculativeConfig | None:
1313
+ """Initializes and returns a SpeculativeConfig object based on
1314
+ `speculative_config`.
1315
+
1316
+ This function utilizes `speculative_config` to create a
1317
+ SpeculativeConfig object. The `speculative_config` can either be
1318
+ provided as a JSON string input via CLI arguments or directly as a
1319
+ dictionary from the engine.
1320
+ """
1321
+ if self.speculative_config is None:
1322
+ return None
1323
+
1324
+ # Note(Shangming): These parameters are not obtained from the cli arg
1325
+ # '--speculative-config' and must be passed in when creating the engine
1326
+ # config.
1327
+ self.speculative_config.update(
1328
+ {
1329
+ "target_model_config": target_model_config,
1330
+ "target_parallel_config": target_parallel_config,
1331
+ }
1332
+ )
1333
+ return SpeculativeConfig(**self.speculative_config)
1334
+
1335
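On the CLI, --speculative-config is parsed from a JSON string (see add_cli_args above); the equivalent dict form, with placeholder values for keys that SpeculativeConfig is assumed to accept:

    args = EngineArgs(
        model="some/target-model",  # placeholder
        speculative_config={
            "method": "ngram",            # assumed key/value
            "num_speculative_tokens": 3,  # assumed key
        },
    )
    # target_model_config and target_parallel_config are injected later by
    # create_speculative_config(), as noted in its docstring.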
+ def create_engine_config(
1336
+ self,
1337
+ usage_context: UsageContext | None = None,
1338
+ headless: bool = False,
1339
+ ) -> VllmConfig:
1340
+ """
1341
+ Create the VllmConfig.
1342
+
1343
+ NOTE: If VllmConfig is incompatible, we raise an error.
1344
+ """
1345
+ current_platform.pre_register_and_update()
1346
+
1347
+ device_config = DeviceConfig(device=cast(Device, current_platform.device_type))
1348
+
1349
+ # Check if the model is a speculator and override model/tokenizer/config
1350
+ # BEFORE creating ModelConfig, so the config is created with the target model
1351
+ # Skip speculator detection for cloud storage models (e.g. S3, GCS) since
1352
+ # HuggingFace cannot load configs directly from S3 URLs. S3 models can still
1353
+ # use speculators with explicit --speculative-config.
1354
+ if not is_cloud_storage(self.model):
1355
+ (self.model, self.tokenizer, self.speculative_config) = (
1356
+ maybe_override_with_speculators(
1357
+ model=self.model,
1358
+ tokenizer=self.tokenizer,
1359
+ revision=self.revision,
1360
+ trust_remote_code=self.trust_remote_code,
1361
+ vllm_speculative_config=self.speculative_config,
1362
+ )
1363
+ )
1364
+
1365
+ model_config = self.create_model_config()
1366
+ self.model = model_config.model
1367
+ self.tokenizer = model_config.tokenizer
1368
+
1369
+ self._check_feature_supported(model_config)
1370
+ self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
1371
+ self._set_default_max_num_seqs_and_batched_tokens_args(
1372
+ usage_context, model_config
1373
+ )
1374
+
1375
+ sliding_window: int | None = None
1376
+ if not is_interleaved(model_config.hf_text_config):
1377
+ # Only set CacheConfig.sliding_window if the model is all sliding
1378
+ # window. Otherwise CacheConfig.sliding_window will override the
1379
+ # global layers in interleaved sliding window models.
1380
+ sliding_window = model_config.get_sliding_window()
1381
+
1382
+ # Note(hc): In the current implementation of decode context
1383
+ # parallel(DCP), tp_size needs to be divisible by dcp_size,
1384
+ # because the world size does not change by dcp, it simply
1385
+ # reuses the GPUs of TP group, and split one TP group into
1386
+ # tp_size//dcp_size DCP groups.
1387
+ assert self.tensor_parallel_size % self.decode_context_parallel_size == 0, (
1388
+ f"tp_size={self.tensor_parallel_size} must be divisible by"
1389
+ f"dcp_size={self.decode_context_parallel_size}."
1390
+ )
1391
+
1392
+ cache_config = CacheConfig(
1393
+ block_size=self.block_size,
1394
+ gpu_memory_utilization=self.gpu_memory_utilization,
1395
+ kv_cache_memory_bytes=self.kv_cache_memory_bytes,
1396
+ swap_space=self.swap_space,
1397
+ cache_dtype=self.kv_cache_dtype,
1398
+ is_attention_free=model_config.is_attention_free,
1399
+ num_gpu_blocks_override=self.num_gpu_blocks_override,
1400
+ sliding_window=sliding_window,
1401
+ enable_prefix_caching=self.enable_prefix_caching,
1402
+ prefix_caching_hash_algo=self.prefix_caching_hash_algo,
1403
+ cpu_offload_gb=self.cpu_offload_gb,
1404
+ calculate_kv_scales=self.calculate_kv_scales,
1405
+ kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
1406
+ mamba_cache_dtype=self.mamba_cache_dtype,
1407
+ mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
1408
+ mamba_block_size=self.mamba_block_size,
1409
+ kv_offloading_size=self.kv_offloading_size,
1410
+ kv_offloading_backend=self.kv_offloading_backend,
1411
+ )
1412
+
1413
+ ray_runtime_env = None
1414
+ if is_ray_initialized():
1415
+ # Ray Serve LLM calls `create_engine_config` in the context
1416
+ # of a Ray task, therefore we check is_ray_initialized()
1417
+ # as opposed to is_in_ray_actor().
1418
+ import ray
1419
+
1420
+ ray_runtime_env = ray.get_runtime_context().runtime_env
1421
+ # Avoid logging sensitive environment variables
1422
+ sanitized_env = ray_runtime_env.to_dict() if ray_runtime_env else {}
1423
+ if "env_vars" in sanitized_env:
1424
+ sanitized_env["env_vars"] = {
1425
+ k: "***" for k in sanitized_env["env_vars"]
1426
+ }
1427
+ logger.info("Using ray runtime env (env vars redacted): %s", sanitized_env)
1428
+
1429
+ # Get the current placement group if Ray is initialized and
1430
+ # we are in a Ray actor. If so, then the placement group will be
1431
+ # passed to spawned processes.
1432
+ placement_group = None
1433
+ if is_in_ray_actor():
1434
+ import ray
1435
+
1436
+ # This call initializes Ray automatically if it is not initialized,
1437
+ # but we should not do this here.
1438
+ placement_group = ray.util.get_current_placement_group()
1439
+
1440
+ assert not headless or not self.data_parallel_hybrid_lb, (
1441
+ "data_parallel_hybrid_lb is not applicable in headless mode"
1442
+ )
1443
+ assert not (self.data_parallel_hybrid_lb and self.data_parallel_external_lb), (
1444
+ "data_parallel_hybrid_lb and data_parallel_external_lb cannot both be True."
1445
+ )
1446
+ assert self.data_parallel_backend == "mp" or self.nnodes == 1, (
1447
+ "nnodes > 1 is only supported with data_parallel_backend=mp"
1448
+ )
1449
+ inferred_data_parallel_rank = 0
1450
+ if self.nnodes > 1:
1451
+ world_size = (
1452
+ self.data_parallel_size
1453
+ * self.pipeline_parallel_size
1454
+ * self.tensor_parallel_size
1455
+ )
1456
+ world_size_within_dp = (
1457
+ self.pipeline_parallel_size * self.tensor_parallel_size
1458
+ )
1459
+ local_world_size = world_size // self.nnodes
1460
+ assert world_size % self.nnodes == 0, (
1461
+ f"world_size={world_size} must be divisible by nnodes={self.nnodes}."
1462
+ )
1463
+ assert self.node_rank < self.nnodes, (
1464
+ f"node_rank={self.node_rank} must be less than nnodes={self.nnodes}."
1465
+ )
1466
+ inferred_data_parallel_rank = (
1467
+ self.node_rank * local_world_size
1468
+ ) // world_size_within_dp
1469
+ if self.data_parallel_size > 1 and self.data_parallel_external_lb:
1470
+ self.data_parallel_rank = inferred_data_parallel_rank
1471
+ logger.info(
1472
+ "Inferred data_parallel_rank %d from node_rank %d for external lb",
1473
+ self.data_parallel_rank,
1474
+ self.node_rank,
1475
+ )
1476
+ elif self.data_parallel_size_local is None:
1477
+ # Infer data parallel size local for internal dplb:
1478
+ self.data_parallel_size_local = max(
1479
+ local_world_size // world_size_within_dp, 1
1480
+ )
1481
+ data_parallel_external_lb = (
1482
+ self.data_parallel_external_lb or self.data_parallel_rank is not None
1483
+ )
1484
+ # Local DP size = 1, use pure-external LB.
1485
+ if data_parallel_external_lb:
1486
+ assert self.data_parallel_rank is not None, (
1487
+ "data_parallel_rank or node_rank must be specified if "
1488
+ "data_parallel_external_lb is enable."
1489
+ )
1490
+ assert self.data_parallel_size_local in (1, None), (
1491
+ "data_parallel_size_local must be 1 or None when data_parallel_rank "
1492
+ "is set"
1493
+ )
1494
+ data_parallel_size_local = 1
1495
+ # Use full external lb if we have local_size of 1.
1496
+ self.data_parallel_hybrid_lb = False
1497
+ elif self.data_parallel_size_local is not None:
1498
+ data_parallel_size_local = self.data_parallel_size_local
1499
+
1500
+ if self.data_parallel_start_rank and not headless:
1501
+ # Infer hybrid LB mode.
1502
+ self.data_parallel_hybrid_lb = True
1503
+
1504
+ if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
1505
+ # Use full external lb if we have local_size of 1.
1506
+ logger.warning(
1507
+ "data_parallel_hybrid_lb is not eligible when "
1508
+ "data_parallel_size_local = 1, autoswitch to "
1509
+ "data_parallel_external_lb."
1510
+ )
1511
+ data_parallel_external_lb = True
1512
+ self.data_parallel_hybrid_lb = False
1513
+
1514
+ if data_parallel_size_local == self.data_parallel_size:
1515
+ # Disable hybrid LB mode if set for a single node
1516
+ self.data_parallel_hybrid_lb = False
1517
+
1518
+ self.data_parallel_rank = (
1519
+ self.data_parallel_start_rank or inferred_data_parallel_rank
1520
+ )
1521
+ if self.nnodes > 1:
1522
+ logger.info(
1523
+ "Inferred data_parallel_rank %d from node_rank %d",
1524
+ self.data_parallel_rank,
1525
+ self.node_rank,
1526
+ )
1527
+ else:
1528
+ assert not self.data_parallel_hybrid_lb, (
1529
+ "data_parallel_size_local must be set to use data_parallel_hybrid_lb."
1530
+ )
1531
+
1532
+ if self.data_parallel_backend == "ray" and (
1533
+ envs.VLLM_RAY_DP_PACK_STRATEGY == "span"
1534
+ ):
1535
+ # Data parallel size defaults to 1 if DP ranks are spanning
1536
+ # multiple nodes
1537
+ data_parallel_size_local = 1
1538
+ else:
1539
+ # Otherwise local DP size defaults to global DP size if not set
1540
+ data_parallel_size_local = self.data_parallel_size
1541
+
1542
+ # DP address, used in multi-node case for torch distributed group
1543
+ # and ZMQ sockets.
1544
+ if self.data_parallel_address is None:
1545
+ if self.data_parallel_backend == "ray":
1546
+ host_ip = get_ip()
1547
+ logger.info(
1548
+ "Using host IP %s as ray-based data parallel address", host_ip
1549
+ )
1550
+ data_parallel_address = host_ip
1551
+ else:
1552
+ assert self.data_parallel_backend == "mp", (
1553
+ "data_parallel_backend can only be ray or mp, got %s",
1554
+ f"{self.data_parallel_backend}"
1555
+ )
1556
+ data_parallel_address = (
1557
+ self.master_addr or ParallelConfig.data_parallel_master_ip
1558
+ )
1559
+ else:
1560
+ data_parallel_address = self.data_parallel_address
1561
+
1562
+ # This port is only used when there are remote data parallel engines,
1563
+ # otherwise the local IPC transport is used.
1564
+ data_parallel_rpc_port = (
1565
+ self.data_parallel_rpc_port
1566
+ if (self.data_parallel_rpc_port is not None)
1567
+ else ParallelConfig.data_parallel_rpc_port
1568
+ )
1569
+
1570
+ if self.tokens_only and not model_config.skip_tokenizer_init:
1571
+ model_config.skip_tokenizer_init = True
1572
+ logger.info("Skipping tokenizer initialization for tokens-only mode.")
1573
+
1574
+ if self.async_scheduling and not self.disable_nccl_for_dp_synchronization:
1575
+ logger.info(
1576
+ "Disabling NCCL for DP synchronization when using async scheduling."
1577
+ )
1578
+ self.disable_nccl_for_dp_synchronization = True
1579
+
1580
+ parallel_config = ParallelConfig(
1581
+ pipeline_parallel_size=self.pipeline_parallel_size,
1582
+ tensor_parallel_size=self.tensor_parallel_size,
1583
+ prefill_context_parallel_size=self.prefill_context_parallel_size,
1584
+ data_parallel_size=self.data_parallel_size,
1585
+ data_parallel_rank=self.data_parallel_rank or 0,
1586
+ data_parallel_external_lb=data_parallel_external_lb,
1587
+ data_parallel_size_local=data_parallel_size_local,
1588
+ master_addr=self.master_addr,
1589
+ master_port=self.master_port,
1590
+ nnodes=self.nnodes,
1591
+ node_rank=self.node_rank,
1592
+ data_parallel_master_ip=data_parallel_address,
1593
+ data_parallel_rpc_port=data_parallel_rpc_port,
1594
+ data_parallel_backend=self.data_parallel_backend,
1595
+ data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
1596
+ enable_expert_parallel=self.enable_expert_parallel,
1597
+ all2all_backend=self.all2all_backend,
1598
+ enable_dbo=self.enable_dbo,
1599
+ dbo_decode_token_threshold=self.dbo_decode_token_threshold,
1600
+ dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
1601
+ disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
1602
+ enable_eplb=self.enable_eplb,
1603
+ eplb_config=self.eplb_config,
1604
+ expert_placement_strategy=self.expert_placement_strategy,
1605
+ max_parallel_loading_workers=self.max_parallel_loading_workers,
1606
+ disable_custom_all_reduce=self.disable_custom_all_reduce,
1607
+ ray_workers_use_nsight=self.ray_workers_use_nsight,
1608
+ ray_runtime_env=ray_runtime_env,
1609
+ placement_group=placement_group,
1610
+ distributed_executor_backend=self.distributed_executor_backend,
1611
+ worker_cls=self.worker_cls,
1612
+ worker_extension_cls=self.worker_extension_cls,
1613
+ decode_context_parallel_size=self.decode_context_parallel_size,
1614
+ dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size,
1615
+ cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size,
1616
+ _api_process_count=self._api_process_count,
1617
+ _api_process_rank=self._api_process_rank,
1618
+ )
1619
+
1620
+ speculative_config = self.create_speculative_config(
1621
+ target_model_config=model_config,
1622
+ target_parallel_config=parallel_config,
1623
+ )
1624
+
1625
+ scheduler_config = SchedulerConfig(
1626
+ runner_type=model_config.runner_type,
1627
+ max_num_batched_tokens=self.max_num_batched_tokens,
1628
+ max_num_seqs=self.max_num_seqs,
1629
+ max_model_len=model_config.max_model_len,
1630
+ enable_chunked_prefill=self.enable_chunked_prefill,
1631
+ disable_chunked_mm_input=self.disable_chunked_mm_input,
1632
+ is_multimodal_model=model_config.is_multimodal_model,
1633
+ is_encoder_decoder=model_config.is_encoder_decoder,
1634
+ policy=self.scheduling_policy,
1635
+ scheduler_cls=self.scheduler_cls,
1636
+ max_num_partial_prefills=self.max_num_partial_prefills,
1637
+ max_long_partial_prefills=self.max_long_partial_prefills,
1638
+ long_prefill_token_threshold=self.long_prefill_token_threshold,
1639
+ disable_hybrid_kv_cache_manager=self.disable_hybrid_kv_cache_manager,
1640
+ async_scheduling=self.async_scheduling,
1641
+ stream_interval=self.stream_interval,
1642
+ )
1643
+
1644
+ if not model_config.is_multimodal_model and self.default_mm_loras:
1645
+ raise ValueError(
1646
+ "Default modality-specific LoRA(s) were provided for a "
1647
+ "non multimodal model"
1648
+ )
1649
+
1650
+ lora_config = (
1651
+ LoRAConfig(
1652
+ max_lora_rank=self.max_lora_rank,
1653
+ max_loras=self.max_loras,
1654
+ default_mm_loras=self.default_mm_loras,
1655
+ fully_sharded_loras=self.fully_sharded_loras,
1656
+ lora_dtype=self.lora_dtype,
1657
+ max_cpu_loras=self.max_cpu_loras
1658
+ if self.max_cpu_loras and self.max_cpu_loras > 0
1659
+ else None,
1660
+ )
1661
+ if self.enable_lora
1662
+ else None
1663
+ )
1664
+
1665
+ if (
1666
+ lora_config is not None
1667
+ and speculative_config is not None
1668
+ and scheduler_config.max_num_batched_tokens
1669
+ < (
1670
+ scheduler_config.max_num_seqs
1671
+ * (speculative_config.num_speculative_tokens + 1)
1672
+ )
1673
+ ):
1674
+ raise ValueError(
1675
+ "Consider increasing max_num_batched_tokens or "
1676
+ "decreasing num_speculative_tokens"
1677
+ )
1678
+
1679
+ # bitsandbytes pre-quantized model need a specific model loader
1680
+ if model_config.quantization == "bitsandbytes":
1681
+ self.quantization = self.load_format = "bitsandbytes"
1682
+
1683
+ load_config = self.create_load_config()
1684
+
1685
+ # Pass reasoning_parser into StructuredOutputsConfig
1686
+ if self.reasoning_parser:
1687
+ self.structured_outputs_config.reasoning_parser = self.reasoning_parser
1688
+
1689
+ if self.reasoning_parser_plugin:
1690
+ self.structured_outputs_config.reasoning_parser_plugin = (
1691
+ self.reasoning_parser_plugin
1692
+ )
1693
+
1694
+ observability_config = ObservabilityConfig(
1695
+ show_hidden_metrics_for_version=self.show_hidden_metrics_for_version,
1696
+ otlp_traces_endpoint=self.otlp_traces_endpoint,
1697
+ collect_detailed_traces=self.collect_detailed_traces,
1698
+ kv_cache_metrics=self.kv_cache_metrics,
1699
+ kv_cache_metrics_sample=self.kv_cache_metrics_sample,
1700
+ )
1701
+
1702
+ # Compilation config overrides
1703
+ compilation_config = copy.deepcopy(self.compilation_config)
1704
+ if self.cuda_graph_sizes is not None:
1705
+ logger.warning(
1706
+ "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or "
1707
+ "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes "
1708
+ "instead."
1709
+ )
1710
+ if compilation_config.cudagraph_capture_sizes is not None:
1711
+ raise ValueError(
1712
+ "cuda_graph_sizes and compilation_config."
1713
+ "cudagraph_capture_sizes are mutually exclusive"
1714
+ )
1715
+ compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes
1716
+ if self.cudagraph_capture_sizes is not None:
1717
+ if compilation_config.cudagraph_capture_sizes is not None:
1718
+ raise ValueError(
1719
+ "cudagraph_capture_sizes and compilation_config."
1720
+ "cudagraph_capture_sizes are mutually exclusive"
1721
+ )
1722
+ compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
1723
+ if self.max_cudagraph_capture_size is not None:
1724
+ if compilation_config.max_cudagraph_capture_size is not None:
1725
+ raise ValueError(
1726
+ "max_cudagraph_capture_size and compilation_config."
1727
+ "max_cudagraph_capture_size are mutually exclusive"
1728
+ )
1729
+ compilation_config.max_cudagraph_capture_size = (
1730
+ self.max_cudagraph_capture_size
1731
+ )
1732
+ config = VllmConfig(
1733
+ model_config=model_config,
1734
+ cache_config=cache_config,
1735
+ parallel_config=parallel_config,
1736
+ scheduler_config=scheduler_config,
1737
+ device_config=device_config,
1738
+ lora_config=lora_config,
1739
+ speculative_config=speculative_config,
1740
+ load_config=load_config,
1741
+ structured_outputs_config=self.structured_outputs_config,
1742
+ observability_config=observability_config,
1743
+ compilation_config=compilation_config,
1744
+ kv_transfer_config=self.kv_transfer_config,
1745
+ kv_events_config=self.kv_events_config,
1746
+ ec_transfer_config=self.ec_transfer_config,
1747
+ additional_config=self.additional_config,
1748
+ optimization_level=self.optimization_level,
1749
+ )
1750
+
1751
+ return config
1752
+
1753
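Of the mutually exclusive spellings validated above, the non-deprecated one looks like this (sizes are placeholders):

    args = EngineArgs(
        model="some/model",                    # placeholder
        cudagraph_capture_sizes=[1, 2, 4, 8],  # preferred over --cuda-graph-sizes
    )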
+ def _check_feature_supported(self, model_config: ModelConfig):
1754
+ """Raise an error if the feature is not supported."""
1755
+ if self.logits_processor_pattern != EngineArgs.logits_processor_pattern:
1756
+ _raise_unsupported_error(feature_name="--logits-processor-pattern")
1757
+
1758
+ # Concurrent partial prefills are not supported yet.
1759
+ if (
1760
+ self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills
1761
+ or self.max_long_partial_prefills
1762
+ != SchedulerConfig.max_long_partial_prefills
1763
+ ):
1764
+ _raise_unsupported_error(feature_name="Concurrent Partial Prefill")
1765
+
1766
+ # N-gram, Medusa, and Eagle are supported for speculative decoding.
1767
+ if self.speculative_config is not None:
1768
+ # speculative_config could still be a dict at this point
1769
+ if isinstance(self.speculative_config, dict):
1770
+ method = self.speculative_config.get("method", None)
1771
+ else:
1772
+ method = self.speculative_config.method
1773
+
1774
+ if method == "draft_model":
1775
+ raise NotImplementedError(
1776
+ "Draft model speculative decoding is not supported yet. "
1777
+ "Please consider using other speculative decoding methods "
1778
+ "such as ngram, medusa, eagle, or mtp."
1779
+ )
1780
+
1781
+ if self.pipeline_parallel_size > 1:
1782
+ supports_pp = getattr(
1783
+ self.distributed_executor_backend, "supports_pp", False
1784
+ )
1785
+ if not supports_pp and self.distributed_executor_backend not in (
1786
+ ParallelConfig.distributed_executor_backend,
1787
+ "ray",
1788
+ "mp",
1789
+ "external_launcher",
1790
+ ):
1791
+ name = (
1792
+ "Pipeline Parallelism without Ray distributed "
1793
+ "executor or multiprocessing executor or external "
1794
+ "launcher"
1795
+ )
1796
+ _raise_unsupported_error(feature_name=name)
1797
+
1798
+ @classmethod
1799
+ def get_batch_defaults(
1800
+ cls,
1801
+ world_size: int,
1802
+ ) -> tuple[dict[UsageContext | None, int], dict[UsageContext | None, int]]:
1803
+ from vllm.usage.usage_lib import UsageContext
1804
+
1805
+ default_max_num_batched_tokens: dict[UsageContext | None, int]
1806
+ default_max_num_seqs: dict[UsageContext | None, int]
1807
+
1808
+ # When no user override, set the default values based on the usage
1809
+ # context.
1810
+ # Use different default values for different hardware.
1811
+
1812
+ # Try to query the device name on the current platform. If it fails,
1813
+ # it may be because the platform that imports vLLM is not the same
1814
+ # as the platform that vLLM is running on (e.g. the case of scaling
1815
+ # vLLM with Ray) and has no GPUs. In this case we use the default
1816
+ # values for non-H100/H200 GPUs.
1817
+ try:
1818
+ device_memory = current_platform.get_device_total_memory()
1819
+ device_name = current_platform.get_device_name().lower()
1820
+ except Exception:
1821
+ # This is only used to set default_max_num_batched_tokens
1822
+ device_memory = 0
1823
+
1824
+ # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
1825
+ # throughput, see PR #17885 for more details.
1826
+ # So here we do an extra device name check to prevent such regression.
1827
+ if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
1828
+ # For GPUs like H100 and MI300x, use larger default values.
1829
+ default_max_num_batched_tokens = {
1830
+ UsageContext.LLM_CLASS: 16384,
1831
+ UsageContext.OPENAI_API_SERVER: 8192,
1832
+ }
1833
+ default_max_num_seqs = {
1834
+ UsageContext.LLM_CLASS: 1024,
1835
+ UsageContext.OPENAI_API_SERVER: 1024,
1836
+ }
1837
+ else:
1838
+ # TODO(woosuk): Tune the default values for other hardware.
1839
+ default_max_num_batched_tokens = {
1840
+ UsageContext.LLM_CLASS: 8192,
1841
+ UsageContext.OPENAI_API_SERVER: 2048,
1842
+ }
1843
+ default_max_num_seqs = {
1844
+ UsageContext.LLM_CLASS: 256,
1845
+ UsageContext.OPENAI_API_SERVER: 256,
1846
+ }
1847
+
1848
+ # tpu specific default values.
1849
+ if current_platform.is_tpu():
1850
+ chip_name = current_platform.get_device_name()
1851
+
1852
+ if chip_name == "V6E":
1853
+ default_max_num_batched_tokens = {
1854
+ UsageContext.LLM_CLASS: 2048,
1855
+ UsageContext.OPENAI_API_SERVER: 1024,
1856
+ }
1857
+ elif chip_name == "V5E":
1858
+ default_max_num_batched_tokens = {
1859
+ UsageContext.LLM_CLASS: 1024,
1860
+ UsageContext.OPENAI_API_SERVER: 512,
1861
+ }
1862
+ elif chip_name == "V5P":
1863
+ default_max_num_batched_tokens = {
1864
+ UsageContext.LLM_CLASS: 512,
1865
+ UsageContext.OPENAI_API_SERVER: 256,
1866
+ }
1867
+
1868
+ # cpu specific default values.
1869
+ if current_platform.is_cpu():
1870
+ default_max_num_batched_tokens = {
1871
+ UsageContext.LLM_CLASS: 4096 * world_size,
1872
+ UsageContext.OPENAI_API_SERVER: 2048 * world_size,
1873
+ }
1874
+ default_max_num_seqs = {
1875
+ UsageContext.LLM_CLASS: 256 * world_size,
1876
+ UsageContext.OPENAI_API_SERVER: 128 * world_size,
1877
+ }
1878
+
1879
+ return default_max_num_batched_tokens, default_max_num_seqs
1880
+
1881
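A worked example of the CPU branch above (this wheel targets CPU): with tensor_parallel_size=2 and pipeline_parallel_size=1, world_size is 2, so an OpenAI API server would default to 2048 * 2 = 4096 batched tokens and 128 * 2 = 256 sequences:

    tokens_defaults, seqs_defaults = EngineArgs.get_batch_defaults(world_size=2)
    # On a CPU platform these dicts map UsageContext.OPENAI_API_SERVER
    # to 4096 and 256 respectively.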
+ def _set_default_chunked_prefill_and_prefix_caching_args(
1882
+ self, model_config: ModelConfig
1883
+ ) -> None:
1884
+ default_chunked_prefill = model_config.is_chunked_prefill_supported
1885
+ default_prefix_caching = model_config.is_prefix_caching_supported
1886
+
1887
+ if self.prefill_context_parallel_size > 1:
1888
+ default_chunked_prefill = False
1889
+ default_prefix_caching = False
1890
+ logger.warning_once(
1891
+ "--prefill-context-parallel-size > 1 is not compatible with "
1892
+ "chunked prefill and prefix caching now. Chunked prefill "
1893
+ "and prefix caching have been disabled by default.",
1894
+ scope="local",
1895
+ )
1896
+
1897
+ if self.enable_chunked_prefill is None:
1898
+ self.enable_chunked_prefill = default_chunked_prefill
1899
+
1900
+ logger.debug(
1901
+ "%s chunked prefill by default",
1902
+ "Enabling" if default_chunked_prefill else "Disabling",
1903
+ )
1904
+ elif (
1905
+ model_config.runner_type == "generate"
1906
+ and not self.enable_chunked_prefill
1907
+ and default_chunked_prefill
1908
+ ):
1909
+ logger.warning_once(
1910
+ "This model does not officially support disabling chunked prefill. "
1911
+ "Disabling this manually may cause the engine to crash "
1912
+ "or produce incorrect outputs.",
1913
+ scope="local",
1914
+ )
1915
+ elif (
1916
+ model_config.runner_type == "pooling"
1917
+ and self.enable_chunked_prefill
1918
+ and not default_chunked_prefill
1919
+ ):
1920
+ logger.warning_once(
1921
+ "This model does not officially support chunked prefill. "
1922
+ "Enabling this manually may cause the engine to crash "
1923
+ "or produce incorrect outputs.",
1924
+ scope="local",
1925
+ )
1926
+
1927
+ if self.enable_prefix_caching is None:
1928
+ self.enable_prefix_caching = default_prefix_caching
1929
+
1930
+ logger.debug(
1931
+ "%s prefix caching by default",
1932
+ "Enabling" if default_prefix_caching else "Disabling",
1933
+ )
1934
+ elif (
1935
+ model_config.runner_type == "pooling"
1936
+ and self.enable_prefix_caching
1937
+ and not default_prefix_caching
1938
+ ):
1939
+ logger.warning_once(
1940
+ "This model does not officially support prefix caching. "
1941
+ "Enabling this manually may cause the engine to crash "
1942
+ "or produce incorrect outputs.",
1943
+ scope="local",
1944
+ )
1945
+
1946
+ # Disable chunked prefill and prefix caching for:
1947
+ # POWER (ppc64le)/s390x/RISCV CPUs in V1
1948
+ if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
1949
+ CpuArchEnum.POWERPC,
1950
+ CpuArchEnum.S390X,
1951
+ CpuArchEnum.RISCV,
1952
+ ):
1953
+ logger.info(
1954
+ "Chunked prefill is not supported for ARM and POWER, "
1955
+ "S390X and RISC-V CPUs; "
1956
+ "disabling it for V1 backend."
1957
+ )
1958
+ self.enable_chunked_prefill = False
1959
+ logger.info(
1960
+ "Prefix caching is not supported for ARM and POWER, "
1961
+ "S390X and RISC-V CPUs; "
1962
+ "disabling it for V1 backend."
1963
+ )
1964
+ self.enable_prefix_caching = False
1965
+
1966
+ def _set_default_max_num_seqs_and_batched_tokens_args(
1967
+ self,
1968
+ usage_context: UsageContext | None,
1969
+ model_config: ModelConfig,
1970
+ ):
1971
+ world_size = self.pipeline_parallel_size * self.tensor_parallel_size
1972
+ (
1973
+ default_max_num_batched_tokens,
1974
+ default_max_num_seqs,
1975
+ ) = self.get_batch_defaults(world_size)
1976
+
1977
+ orig_max_num_batched_tokens = self.max_num_batched_tokens
1978
+ orig_max_num_seqs = self.max_num_seqs
1979
+
1980
+ if self.max_num_batched_tokens is None:
1981
+ self.max_num_batched_tokens = default_max_num_batched_tokens.get(
1982
+ usage_context,
1983
+ SchedulerConfig.DEFAULT_MAX_NUM_BATCHED_TOKENS,
1984
+ )
1985
+
1986
+ if self.max_num_seqs is None:
1987
+ self.max_num_seqs = default_max_num_seqs.get(
1988
+ usage_context,
1989
+ SchedulerConfig.DEFAULT_MAX_NUM_SEQS,
1990
+ )
1991
+
1992
+ if orig_max_num_batched_tokens is None:
1993
+ if not self.enable_chunked_prefill:
1994
+ # If max_model_len is too short, use the default for higher throughput.
1995
+ self.max_num_batched_tokens = max(
1996
+ model_config.max_model_len,
1997
+ self.max_num_batched_tokens,
1998
+ )
1999
+
2000
+ # When using default settings,
2001
+ # Ensure max_num_batched_tokens does not exceed model limit.
2002
+ # Some models (e.g., Whisper) have embeddings tied to max length.
2003
+ self.max_num_batched_tokens = min(
2004
+ self.max_num_seqs * model_config.max_model_len,
2005
+ self.max_num_batched_tokens,
2006
+ )
2007
+
2008
+ logger.debug(
2009
+ "Defaulting max_num_batched_tokens to %d for %s usage context.",
2010
+ self.max_num_batched_tokens,
2011
+ usage_context.value if usage_context else None,
2012
+ )
2013
+
2014
+ if orig_max_num_seqs is None:
2015
+ assert self.max_num_batched_tokens is not None # For type checking
2016
+ self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)
2017
+
2018
+ logger.debug(
2019
+ "Defaulting max_num_seqs to %d for %s usage context.",
2020
+ self.max_num_seqs,
2021
+ usage_context.value if usage_context else None,
2022
+ )
2023
+
2024
+
2025
+ @dataclass
2026
+ class AsyncEngineArgs(EngineArgs):
2027
+ """Arguments for asynchronous vLLM engine."""
2028
+
2029
+ enable_log_requests: bool = False
2030
+
2031
+ @staticmethod
2032
+ def add_cli_args(
2033
+ parser: FlexibleArgumentParser, async_args_only: bool = False
2034
+ ) -> FlexibleArgumentParser:
2035
+ # Initialize plugin to update the parser, for example, The plugin may
2036
+ # add a new kind of quantization method to --quantization argument or
2037
+ # a new device to --device argument.
2038
+ load_general_plugins()
2039
+ if not async_args_only:
2040
+ parser = EngineArgs.add_cli_args(parser)
2041
+ parser.add_argument(
2042
+ "--enable-log-requests",
2043
+ action=argparse.BooleanOptionalAction,
2044
+ default=AsyncEngineArgs.enable_log_requests,
2045
+ help="Enable logging requests.",
2046
+ )
2047
+ parser.add_argument(
2048
+ "--disable-log-requests",
2049
+ action=argparse.BooleanOptionalAction,
2050
+ default=not AsyncEngineArgs.enable_log_requests,
2051
+ help="[DEPRECATED] Disable logging requests.",
2052
+ deprecated=True,
2053
+ )
2054
+ current_platform.pre_register_and_update(parser)
2055
+ return parser
2056
+
2057
+
2058
+ def _raise_unsupported_error(feature_name: str):
2059
+ msg = (
2060
+ f"{feature_name} is not supported. We recommend to "
2061
+ f"remove {feature_name} from your config."
2062
+ )
2063
+ raise NotImplementedError(msg)
2064
+
2065
+
2066
+ def human_readable_int(value):
2067
+ """Parse human-readable integers like '1k', '2M', etc.
2068
+ Decimal values (e.g. '25.6k') are only supported with decimal (lowercase) multipliers.
2069
+
2070
+ Examples:
2071
+ - '1k' -> 1,000
2072
+ - '1K' -> 1,024
2073
+ - '25.6k' -> 25,600
2074
+ """
2075
+ value = value.strip()
2076
+ match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
2077
+ if match:
2078
+ decimal_multiplier = {
2079
+ "k": 10**3,
2080
+ "m": 10**6,
2081
+ "g": 10**9,
2082
+ }
2083
+ binary_multiplier = {
2084
+ "K": 2**10,
2085
+ "M": 2**20,
2086
+ "G": 2**30,
2087
+ }
2088
+
2089
+ number, suffix = match.groups()
2090
+ if suffix in decimal_multiplier:
2091
+ mult = decimal_multiplier[suffix]
2092
+ return int(float(number) * mult)
2093
+ elif suffix in binary_multiplier:
2094
+ mult = binary_multiplier[suffix]
2095
+ # Do not allow decimals with binary multipliers
2096
+ try:
2097
+ return int(number) * mult
2098
+ except ValueError as e:
2099
+ raise argparse.ArgumentTypeError(
2100
+ "Decimals are not allowed "
2101
+ f"with binary suffixes like {suffix}. Did you mean to use "
2102
+ f"{number}{suffix.lower()} instead?"
2103
+ ) from e
2104
+
2105
+ # Regular plain number.
2106
+ return int(value)
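Usage matching the docstring examples above; '1.5K' is rejected because decimals are only allowed with the decimal (lowercase) suffixes:

    assert human_readable_int("1k") == 1_000    # decimal suffix
    assert human_readable_int("1K") == 1_024    # binary suffix
    assert human_readable_int("25.6k") == 25_600
    # human_readable_int("1.5K") raises argparse.ArgumentTypeError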